In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [2]:
# Step 1: Load the CSV Data
# Replace 'your_data.csv' with the path to your CSV file
data = pd.read_csv('sentiment_analysis_data.csv')

In [10]:
# Step 2: Preprocess the Data (e.g., convert text to lowercase)
data['text'] = data['text'].str.lower()

In [11]:
# Step 3: Encode Labels (if necessary)
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])

In [12]:
# Step 4: Split the Data into Training and Testing Sets
X = data['text']  # Features (text data)
y = data['sentiment']  # Labels (encoded sentiment)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [5]:
# Step 4: Convert Text Data to Numerical Format (TF-IDF)
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [13]:
# Step 5: Set up a Pipeline with TfidfVectorizer and MultinomialNB
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),  # Unigrams and Bigrams
    ('clf', MultinomialNB())
])

In [14]:
# Step 6: Hyperparameter Tuning using GridSearchCV
param_grid = {
    'tfidf__max_df': [0.75, 0.85, 1.0],
    'tfidf__min_df': [1, 2, 3],
    'clf__alpha': [0.5, 1.0]
}

In [15]:
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

60 fits failed out of a total of 90.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Karthick Selvam\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Karthick Selvam\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Karthick Selvam\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\pipeline.py", line 469, in fit
    Xt = self._

In [16]:
# Step 7: Evaluate the Best Model on Test Data
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Best Parameters: {'clf__alpha': 0.5, 'tfidf__max_df': 0.75, 'tfidf__min_df': 1}
Accuracy: 0.3333333333333333

Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1

    accuracy                           0.33         3
   macro avg       0.17      0.33      0.22         3
weighted avg       0.17      0.33      0.22         3



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Step 8: Make Predictions (Optional)
sample_text = ["I absolutely love this!", "This is horrible."]
sample_pred = best_model.predict(sample_text)
for text, pred in zip(sample_text, sample_pred):
    print(f"Text: {text} -> Sentiment: {label_encoder.inverse_transform([pred])[0]}")

Text: I absolutely love this! -> Sentiment: positive
Text: This is horrible. -> Sentiment: negative
