In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline

In [None]:
# Text encodings to try, in order, when reading the dataset CSV.
encodings = [
    "utf-8",
    "latin1",
]

In [None]:
def _score_row(classifier_name, vectorizer_name, y_true, y_pred, average):
    """Build one result record for a classifier / feature-extraction pair.

    Parameters
    ----------
    classifier_name, vectorizer_name : str
        Labels stored in the 'Classifier' and 'Feature Extraction' columns.
    y_true, y_pred : array-like
        Ground-truth and predicted labels for the test split.
    average : str
        Averaging mode forwarded to precision_score / recall_score / f1_score
        ('weighted', 'micro' or 'macro').

    Returns
    -------
    dict
        One row keyed by the result-table column names.
    """
    # zero_division=0 yields the same value as the default ("warn") but without
    # emitting an UndefinedMetricWarning for every never-predicted class.
    return {
        'Classifier': classifier_name,
        'Feature Extraction': vectorizer_name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, average=average, zero_division=0),
        'Recall': recall_score(y_true, y_pred, average=average, zero_division=0),
        'F1 Score': f1_score(y_true, y_pred, average=average, zero_division=0),
    }


# --- Load the dataset -------------------------------------------------------
# Only the read itself is guarded: a UnicodeDecodeError raised later (during
# modelling) must not be misreported as a file-encoding problem.
for encoding in encodings:
    try:
        df = pd.read_csv('Q2 Sentiment Analysis Dataset.csv', encoding=encoding)
    except UnicodeDecodeError:
        print(f"Failed to read with encoding {encoding}. Trying the next one.")
    else:
        # Stop at the first encoding that works so the experiment runs once.
        break
else:
    raise ValueError(f"Could not decode the dataset with any of: {encodings}")

# --- Train / test split -----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['sentiment'], test_size=0.2, random_state=42
)

# --- Models and feature extractors to compare -------------------------------
classifiers = {
    'Naive Bayes': MultinomialNB(),
    # The default iteration limit does not converge on this data
    # (ConvergenceWarning), so allow more iterations.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Seeded so repeated runs give the same scores.
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'Perceptron': Perceptron()
}
vectorizers = {
    'Bag of Words (Raw Counts)': CountVectorizer(),
    'Bag of Words (TfIDF)': TfidfVectorizer(),
    'N-grams': CountVectorizer(ngram_range=(1, 3))
}
result_columns = ['Classifier', 'Feature Extraction', 'Accuracy', 'Precision', 'Recall', 'F1 Score']

# --- Fit and score every classifier x vectorizer combination ----------------
# Rows are collected in plain lists and turned into DataFrames once at the end;
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
weighted_rows, micro_rows, macro_rows = [], [], []
for classifier_name, classifier in classifiers.items():
    for vectorizer_name, vectorizer in vectorizers.items():
        model = Pipeline([('vectorizer', vectorizer), ('classifier', classifier)])
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        weighted_rows.append(_score_row(classifier_name, vectorizer_name, y_test, y_pred, 'weighted'))
        micro_rows.append(_score_row(classifier_name, vectorizer_name, y_test, y_pred, 'micro'))
        macro_rows.append(_score_row(classifier_name, vectorizer_name, y_test, y_pred, 'macro'))

# Weighted-average metrics (the main comparison table).
results = pd.DataFrame(weighted_rows, columns=result_columns)
# Micro- and macro-averaged metrics for the same predictions. For single-label
# classification, micro-averaged precision/recall/F1 all equal accuracy.
micro_avg = pd.DataFrame(micro_rows, columns=result_columns)
macro_avg = pd.DataFrame(macro_rows, columns=result_columns)

print(results)
print("\nMicro Averages:")
print(micro_avg)
print("\nMacro Averages:")
print(macro_avg)


Failed to read with encoding utf-8. Trying the next one.


  _warn_prf(average, modifier, msg_start, len(result))
  results = results.append({
  _warn_prf(average, modifier, msg_start, len(result))
  results = results.append({
  _warn_prf(average, modifier, msg_start, len(result))
  results = results.append({
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  results = results.append({
  _warn_prf(average, modifier, msg_start, len(result))
  results = results.append({
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to

             Classifier         Feature Extraction  Accuracy  Precision  \
0           Naive Bayes  Bag of Words (Raw Counts)  0.726221   0.713165   
1           Naive Bayes       Bag of Words (TfIDF)  0.751928   0.766513   
2           Naive Bayes                    N-grams  0.742931   0.720286   
3   Logistic Regression  Bag of Words (Raw Counts)  0.748072   0.722980   
4   Logistic Regression       Bag of Words (TfIDF)  0.740360   0.725358   
5   Logistic Regression                    N-grams  0.739075   0.720634   
6         Random Forest  Bag of Words (Raw Counts)  0.741645   0.724348   
7         Random Forest       Bag of Words (TfIDF)  0.739075   0.728230   
8         Random Forest                    N-grams  0.730077   0.717329   
9                   SVM  Bag of Words (Raw Counts)  0.735219   0.735272   
10                  SVM       Bag of Words (TfIDF)  0.750643   0.746760   
11                  SVM                    N-grams  0.718509   0.719345   
12           Perceptron  

  results = results.append({
