# Multiple ways to do sentiment analysis

In [1]:
#! Import libraries.
import pandas as pd
from py_modules.ml_utils import *

#! Read the cleaned train and test splits (CSV) into DataFrames.
train_df, test_df = map(
    pd.read_csv,
    ("cleaned_aclImdb/train.csv", "cleaned_aclImdb/test.csv"),
)

## Naive Bayes

In [2]:
#! Naive Bayes with a Bag-of-Words vectorizer.
#! The results are bound to `*_nb_*` names: this section runs classifier_type="nb",
#! and the Logistic Regression section assigns `bow_lr_classifier` /
#! `tfidf_lr_classifier` itself, which would otherwise overwrite these.
bow_nb_classifier = train_and_evaluate_model(
    train_df, test_df, vectorizer_type="bow", classifier_type="nb"
)

#! Naive Bayes with a TF-IDF vectorizer.
tfidf_nb_classifier = train_and_evaluate_model(
    train_df, test_df, vectorizer_type="tfidf", classifier_type="nb"
)

Model evaluation for NB classifier with BOW vectorizer:
	Runtime: 4.58 seconds
	Accuracy: 82.24%
              precision    recall  f1-score   support

         neg       0.79      0.87      0.83     12500
         pos       0.86      0.77      0.81     12500

    accuracy                           0.82     25000
   macro avg       0.83      0.82      0.82     25000
weighted avg       0.83      0.82      0.82     25000

----------------------------------------------------------------------------------------------------
Model evaluation for NB classifier with TFIDF vectorizer:
	Runtime: 5.57 seconds
	Accuracy: 83.35%
              precision    recall  f1-score   support

         neg       0.81      0.88      0.84     12500
         pos       0.87      0.79      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.84      0.83      0.83     25000
weighted avg       0.84      0.83      0.83     25000

-------------------------------------------------

In [3]:
#! Naive Bayes with n-gram vectorizers, run in order: unigram, bigram, trigram.
#! The three runs differ only in `vectorizer_type`.
#! NOTE(review): the recorded UNIGRAM report matches the BOW report for NB
#! exactly, so "unigram" may be equivalent to "bow" -- confirm in ml_utils.
unigram_nb_classifier, bigram_nb_classifier, trigram_nb_classifier = [
    train_and_evaluate_model(
        train_df, test_df, vectorizer_type=ngram_kind, classifier_type="nb"
    )
    for ngram_kind in ("unigram", "bigram", "trigram")
]

Model evaluation for NB classifier with UNIGRAM vectorizer:
	Runtime: 5.49 seconds
	Accuracy: 82.24%
              precision    recall  f1-score   support

         neg       0.79      0.87      0.83     12500
         pos       0.86      0.77      0.81     12500

    accuracy                           0.82     25000
   macro avg       0.83      0.82      0.82     25000
weighted avg       0.83      0.82      0.82     25000

----------------------------------------------------------------------------------------------------
Model evaluation for NB classifier with BIGRAM vectorizer:
	Runtime: 14.54 seconds
	Accuracy: 84.36%
              precision    recall  f1-score   support

         neg       0.81      0.90      0.85     12500
         pos       0.88      0.79      0.83     12500

    accuracy                           0.84     25000
   macro avg       0.85      0.84      0.84     25000
weighted avg       0.85      0.84      0.84     25000

-------------------------------------------

In [4]:
#! Naive Bayes with TF-IDF n-gram vectorizers (unigram, bigram, trigram).
#! Each result gets its own `*_tfidf_nb_*` name; binding them to
#! `unigram_nb_classifier` etc. would discard the results of the plain
#! n-gram runs that already use those names.

#! Using a Unigram model with TF-IDF.
unigram_tfidf_nb_classifier = train_and_evaluate_model(
    train_df, test_df, vectorizer_type="unigram-tfidf", classifier_type="nb"
)

#! Using a Bigram model with TF-IDF.
bigram_tfidf_nb_classifier = train_and_evaluate_model(
    train_df, test_df, vectorizer_type="bigram-tfidf", classifier_type="nb"
)

#! Using a Trigram model with TF-IDF.
trigram_tfidf_nb_classifier = train_and_evaluate_model(
    train_df, test_df, vectorizer_type="trigram-tfidf", classifier_type="nb"
)

Model evaluation for NB classifier with UNIGRAM-TFIDF vectorizer:
	Runtime: 5.60 seconds
	Accuracy: 83.35%
              precision    recall  f1-score   support

         neg       0.81      0.88      0.84     12500
         pos       0.87      0.79      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.84      0.83      0.83     25000
weighted avg       0.84      0.83      0.83     25000

----------------------------------------------------------------------------------------------------
Model evaluation for NB classifier with BIGRAM-TFIDF vectorizer:
	Runtime: 15.14 seconds
	Accuracy: 85.13%
              precision    recall  f1-score   support

         neg       0.82      0.90      0.86     12500
         pos       0.89      0.80      0.84     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

-------------------------------

## Logistic Regression

In [5]:
#! Logistic Regression with a Bag-of-Words vectorizer, then with a TF-IDF one.
#! Both runs share every argument except `vectorizer_type`.
bow_lr_classifier, tfidf_lr_classifier = [
    train_and_evaluate_model(
        train_df, test_df, vectorizer_type=vectorizer_kind, classifier_type="lr"
    )
    for vectorizer_kind in ("bow", "tfidf")
]

Model evaluation for LR classifier with BOW vectorizer:
	Runtime: 12.79 seconds
	Accuracy: 86.31%
              precision    recall  f1-score   support

         neg       0.86      0.87      0.86     12500
         pos       0.87      0.86      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

----------------------------------------------------------------------------------------------------
Model evaluation for LR classifier with TFIDF vectorizer:
	Runtime: 7.93 seconds
	Accuracy: 87.95%
              precision    recall  f1-score   support

         neg       0.88      0.88      0.88     12500
         pos       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

------------------------------------------------

In [6]:
#! Logistic Regression with n-gram vectorizers, run in order: unigram, bigram, trigram.
#! The three runs differ only in `vectorizer_type`.
#! NOTE(review): the recorded UNIGRAM report matches the BOW report for LR
#! exactly, so "unigram" may be equivalent to "bow" -- confirm in ml_utils.
unigram_lr_classifier, bigram_lr_classifier, trigram_lr_classifier = [
    train_and_evaluate_model(
        train_df, test_df, vectorizer_type=ngram_kind, classifier_type="lr"
    )
    for ngram_kind in ("unigram", "bigram", "trigram")
]

Model evaluation for LR classifier with UNIGRAM vectorizer:
	Runtime: 13.15 seconds
	Accuracy: 86.31%
              precision    recall  f1-score   support

         neg       0.86      0.87      0.86     12500
         pos       0.87      0.86      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

----------------------------------------------------------------------------------------------------
Model evaluation for LR classifier with BIGRAM vectorizer:
	Runtime: 29.45 seconds
	Accuracy: 83.79%
              precision    recall  f1-score   support

         neg       0.85      0.83      0.84     12500
         pos       0.83      0.85      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

------------------------------------------

In [7]:
#! Logistic Regression with TF-IDF n-gram vectorizers (unigram, bigram, trigram).
#! Each result gets its own `*_tfidf_lr_*` name; binding them to
#! `unigram_lr_classifier` etc. would discard the results of the plain
#! n-gram runs that already use those names.

#! Using a Unigram model with TF-IDF.
unigram_tfidf_lr_classifier = train_and_evaluate_model(
    train_df, test_df, vectorizer_type="unigram-tfidf", classifier_type="lr"
)

#! Using a Bigram model with TF-IDF.
bigram_tfidf_lr_classifier = train_and_evaluate_model(
    train_df, test_df, vectorizer_type="bigram-tfidf", classifier_type="lr"
)

#! Using a Trigram model with TF-IDF.
trigram_tfidf_lr_classifier = train_and_evaluate_model(
    train_df, test_df, vectorizer_type="trigram-tfidf", classifier_type="lr"
)

Model evaluation for LR classifier with UNIGRAM-TFIDF vectorizer:
	Runtime: 8.40 seconds
	Accuracy: 87.95%
              precision    recall  f1-score   support

         neg       0.88      0.88      0.88     12500
         pos       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

----------------------------------------------------------------------------------------------------
Model evaluation for LR classifier with BIGRAM-TFIDF vectorizer:
	Runtime: 26.89 seconds
	Accuracy: 83.55%
              precision    recall  f1-score   support

         neg       0.84      0.83      0.83     12500
         pos       0.83      0.84      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

-------------------------------

## Support Vector Machine

In [8]:
#! Support Vector Machine with a Bag-of-Words vectorizer, then with a TF-IDF one.
#! Both runs share every argument except `vectorizer_type`.
bow_svm_classifier, tfidf_svm_classifier = [
    train_and_evaluate_model(
        train_df, test_df, vectorizer_type=vectorizer_kind, classifier_type="svm"
    )
    for vectorizer_kind in ("bow", "tfidf")
]

Model evaluation for SVM classifier with BOW vectorizer:
	Runtime: 13.15 seconds
	Accuracy: 84.50%
              precision    recall  f1-score   support

         neg       0.84      0.86      0.85     12500
         pos       0.85      0.83      0.84     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

----------------------------------------------------------------------------------------------------
Model evaluation for SVM classifier with TFIDF vectorizer:
	Runtime: 4.87 seconds
	Accuracy: 86.93%
              precision    recall  f1-score   support

         neg       0.86      0.88      0.87     12500
         pos       0.88      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000

----------------------------------------------

In [9]:
#! Support Vector Machine with n-gram vectorizers, run in order: unigram, bigram, trigram.
#! The three runs differ only in `vectorizer_type`.
#! NOTE(review): the recorded UNIGRAM report matches the BOW report for SVM
#! exactly, so "unigram" may be equivalent to "bow" -- confirm in ml_utils.
unigram_svm_classifier, bigram_svm_classifier, trigram_svm_classifier = [
    train_and_evaluate_model(
        train_df, test_df, vectorizer_type=ngram_kind, classifier_type="svm"
    )
    for ngram_kind in ("unigram", "bigram", "trigram")
]

Model evaluation for SVM classifier with UNIGRAM vectorizer:
	Runtime: 10.23 seconds
	Accuracy: 84.50%
              precision    recall  f1-score   support

         neg       0.84      0.86      0.85     12500
         pos       0.85      0.83      0.84     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

----------------------------------------------------------------------------------------------------
Model evaluation for SVM classifier with BIGRAM vectorizer:
	Runtime: 31.32 seconds
	Accuracy: 83.57%
              precision    recall  f1-score   support

         neg       0.84      0.82      0.83     12500
         pos       0.83      0.85      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

----------------------------------------

In [10]:
#! Support Vector Machine with TF-IDF n-gram vectorizers (unigram, bigram, trigram).
#! Each result gets its own `*_tfidf_svm_*` name; binding them to
#! `unigram_svm_classifier` etc. would discard the results of the plain
#! n-gram runs that already use those names.

#! Using a Unigram model with TF-IDF.
unigram_tfidf_svm_classifier = train_and_evaluate_model(
    train_df, test_df, vectorizer_type="unigram-tfidf", classifier_type="svm"
)

#! Using a Bigram model with TF-IDF.
bigram_tfidf_svm_classifier = train_and_evaluate_model(
    train_df, test_df, vectorizer_type="bigram-tfidf", classifier_type="svm"
)

#! Using a Trigram model with TF-IDF.
trigram_tfidf_svm_classifier = train_and_evaluate_model(
    train_df, test_df, vectorizer_type="trigram-tfidf", classifier_type="svm"
)

Model evaluation for SVM classifier with UNIGRAM-TFIDF vectorizer:
	Runtime: 4.93 seconds
	Accuracy: 86.93%
              precision    recall  f1-score   support

         neg       0.86      0.88      0.87     12500
         pos       0.88      0.86      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000

----------------------------------------------------------------------------------------------------
Model evaluation for SVM classifier with BIGRAM-TFIDF vectorizer:
	Runtime: 12.88 seconds
	Accuracy: 84.97%
              precision    recall  f1-score   support

         neg       0.85      0.85      0.85     12500
         pos       0.85      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

-----------------------------