In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

In [None]:
# Sample dataset: five short sentences, each tagged 'positive' or 'neutral'.
# Kept as a plain dict of equal-length lists (column name -> values).
data = dict(
    text=[
        "Natural language processing is fascinating.",
        "Machine learning and data science are closely related.",
        "Data science involves data analysis, machine learning, and more.",
        "I love learning about artificial intelligence.",
        "Data scientists use machine learning algorithms.",
    ],
    label=['positive', 'neutral', 'neutral', 'positive', 'neutral'],
)

In [None]:
# Convert to DataFrame: one row per sentence, with the dict keys
# ('text', 'label') becoming the column names.
df = pd.DataFrame(data=data)

In [None]:
# Step 1: Preprocess the text
def preprocess_text(text):
    """Normalize a raw sentence into a space-separated string of content words.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, the remainder is tokenized with NLTK's ``word_tokenize``, and
    tokens found in NLTK's English stop-word list are dropped.

    Parameters
    ----------
    text : str
        Raw input sentence.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; an empty string when
        no token survives.

    Notes
    -----
    Relies on NLTK's tokenizer and stop-word data being installed locally
    (typically a one-time ``nltk.download``); NLTK raises ``LookupError``
    when a required resource is missing.
    """
    # Build the stop-word set once per call instead of calling
    # stopwords.words('english') for every token; membership tests on a set
    # are constant-time, whereas the list returned by NLTK is scanned linearly.
    stop_words = set(stopwords.words('english'))

    # Delete all ASCII punctuation in a single pass over the lower-cased text.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    text = text.lower().translate(strip_punctuation)

    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return ' '.join(tokens)


In [None]:
# Clean every sentence element-wise; the raw text column is overwritten
# with its preprocessed form, so later cells see only the cleaned text.
df['text'] = df['text'].map(preprocess_text)

In [None]:
# Step 2: Generate the Bag of Words
# Targets come straight from the label column; features are the per-document
# token counts produced by CountVectorizer with its default settings.
y = df['label']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw_documents=df['text'])
# NOTE(review): the vocabulary is learned from every row of df, i.e. before
# the train/test split performed later in the notebook, so rows that end up
# in the test set also shape the feature space. Consider fitting on the
# training rows only.

In [None]:
# Step 3: Split the data
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 4: Train a classifier
# fit() returns the estimator itself, so construction and training chain
# into one expression; the trailing bare name keeps the fitted estimator as
# the cell's displayed output.
classifier = MultinomialNB().fit(X_train, y_train)
classifier

In [None]:
# Step 5: Evaluate the classifier
# Predict labels for the held-out rows, then report overall accuracy and the
# per-class precision/recall/F1 table.
y_pred = classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)

In [None]:
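# Step 6: (Optional) Hyperparameter tuning
# Not implemented in this notebook. To improve model performance, search over
# settings such as MultinomialNB's smoothing parameter `alpha` and the
# CountVectorizer options using scikit-learn's GridSearchCV or RandomizedSearchCV.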

Vocabulary: ['analysis' 'closely' 'data' 'fascinating' 'involves' 'language'
 'learning' 'machine' 'natural' 'processing' 'related' 'science']
Bag of Words Array:
 [[0 0 0 1 0 1 0 0 1 1 0 0]
 [0 1 1 0 0 0 1 1 0 0 1 1]
 [1 0 2 0 1 0 1 1 0 0 0 1]]
