In [1]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.corpus import wordnet

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

# For generating n-grams
from nltk.util import ngrams
from collections import Counter

# Libraries for Word2Vec and Logistic Regression
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, make_scorer


In [2]:
data = pd.read_csv("final_df.csv")

# FastText + Complement NB

FastText is an extension of Word2Vec developed by Facebook’s AI Research (FAIR). While Word2Vec treats each word as a unique token, FastText breaks words into character n-grams (subword information). This means that it can generate vectors for words that were not seen during training, as long as their subwords were seen.

In [3]:
from gensim.models import FastText
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Tokenize the processed reviews for FastText training
tokenized_reviews = [review.split() for review in data['processed_full_review']]

# Train the FastText model
fasttext_model = FastText(sentences=tokenized_reviews, vector_size=100, window=5, min_count=1, sg=1, workers=4, seed=42)

# Function to compute the average FastText vectors for each review
def get_average_fasttext(review, model, vector_size):
    words = review.split()
    word_vecs = [model.wv[word] for word in words if word in model.wv]
    if word_vecs:
        return np.mean(word_vecs, axis=0)
    else:
        return np.zeros(vector_size)

# Create the feature matrix by averaging word vectors for each review
vector_size = fasttext_model.vector_size
X = np.array([get_average_fasttext(review, fasttext_model, vector_size) for review in data['processed_full_review']])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, data['sentiment'], test_size=0.2, random_state=42)

# Initialize and train the Complement Naive Bayes model
nb_model = ComplementNB(alpha=5.0)
nb_model.fit(X_train, y_train)

# Make predictions
nb_predictions = nb_model.predict(X_test)

# Evaluate the model
print("Complement NB Accuracy:", accuracy_score(y_test, nb_predictions))
print("Complement NB Classification Report:\n", classification_report(y_test, nb_predictions, digits=4))


ValueError: Negative values in data passed to ComplementNB (input X)

# FastText + RF

In [9]:
from gensim.models import FastText
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Tokenize the processed reviews for FastText training
tokenized_reviews = [review.split() for review in data['processed_full_review']]

# Train the FastText model
fasttext_model = FastText(sentences=tokenized_reviews, vector_size=100, window=5, min_count=1, sg=1, workers=4, seed=42)

# Function to compute the average FastText vectors for each review
def get_average_fasttext(review, model, vector_size):
    words = review.split()
    word_vecs = [model.wv[word] for word in words if word in model.wv]
    if word_vecs:
        return np.mean(word_vecs, axis=0)
    else:
        return np.zeros(vector_size)

# Create the feature matrix by averaging word vectors for each review
vector_size = fasttext_model.vector_size
X = np.array([get_average_fasttext(review, fasttext_model, vector_size) for review in data['processed_full_review']])
y = data['sentiment']

# Stratified 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Initialize and train the Random Forest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)
    
    # Make predictions
    rf_predictions = rf_model.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, rf_predictions)
    accuracy_scores.append(accuracy)
    
    report = classification_report(y_test, rf_predictions, digits=4, output_dict=True)
    precision_scores.append(report["weighted avg"]["precision"])
    recall_scores.append(report["weighted avg"]["recall"])
    f1_scores.append(report["weighted avg"]["f1-score"])

    print(f"Fold Accuracy: {accuracy}")
    print(f"Fold Classification Report:\n", classification_report(y_test, rf_predictions, digits=4))

# Print average scores across all folds
print("\nAverage Accuracy across folds:", np.mean(accuracy_scores))
print("Average Precision across folds:", np.mean(precision_scores))
print("Average Recall across folds:", np.mean(recall_scores))
print("Average F1 Score across folds:", np.mean(f1_scores))


Fold Accuracy: 0.8415798611111112
Fold Classification Report:
               precision    recall  f1-score   support

    Negative     0.7628    0.7910    0.7767       488
     Neutral     0.5000    0.0987    0.1649       233
    Positive     0.8733    0.9665    0.9175      1583

    accuracy                         0.8416      2304
   macro avg     0.7120    0.6187    0.6197      2304
weighted avg     0.8121    0.8416    0.8116      2304

Fold Accuracy: 0.8511284722222222
Fold Classification Report:
               precision    recall  f1-score   support

    Negative     0.7844    0.8053    0.7947       488
     Neutral     0.5692    0.1588    0.2483       233
    Positive     0.8809    0.9672    0.9220      1583

    accuracy                         0.8511      2304
   macro avg     0.7449    0.6438    0.6550      2304
weighted avg     0.8289    0.8511    0.8269      2304

Fold Accuracy: 0.8333333333333334
Fold Classification Report:
               precision    recall  f1-score   sup

# FastText + Log Regression

In [10]:
from gensim.models import FastText
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Tokenize the processed reviews for FastText training
tokenized_reviews = [review.split() for review in data['processed_full_review']]

# Train the FastText model
fasttext_model = FastText(sentences=tokenized_reviews, vector_size=100, window=5, min_count=1, sg=1, workers=4, seed=42)

# Function to compute the average FastText vectors for each review
def get_average_fasttext(review, model, vector_size):
    words = review.split()
    word_vecs = [model.wv[word] for word in words if word in model.wv]
    if word_vecs:
        return np.mean(word_vecs, axis=0)
    else:
        return np.zeros(vector_size)

# Create the feature matrix by averaging word vectors for each review
vector_size = fasttext_model.vector_size
X = np.array([get_average_fasttext(review, fasttext_model, vector_size) for review in data['processed_full_review']])
y = data['sentiment']

# Stratified 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Initialize and train the Logistic Regression model
    clf = LogisticRegression(random_state=42, multi_class='multinomial', solver='lbfgs', max_iter=200)
    clf.fit(X_train, y_train)
    
    # Make predictions
    clf_predictions = clf.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, clf_predictions)
    accuracy_scores.append(accuracy)
    
    report = classification_report(y_test, clf_predictions, digits=4, output_dict=True)
    precision_scores.append(report["weighted avg"]["precision"])
    recall_scores.append(report["weighted avg"]["recall"])
    f1_scores.append(report["weighted avg"]["f1-score"])

    print(f"Fold Accuracy: {accuracy}")
    print(f"Fold Classification Report:\n", classification_report(y_test, clf_predictions, digits=4))

# Print average scores across all folds
print("\nAverage Accuracy across folds:", np.mean(accuracy_scores))
print("Average Precision across folds:", np.mean(precision_scores))
print("Average Recall across folds:", np.mean(recall_scores))
print("Average F1 Score across folds:", np.mean(f1_scores))




Fold Accuracy: 0.8493923611111112
Fold Classification Report:
               precision    recall  f1-score   support

    Negative     0.7627    0.7971    0.7796       488
     Neutral     0.5263    0.2146    0.3049       233
    Positive     0.8935    0.9589    0.9250      1583

    accuracy                         0.8494      2304
   macro avg     0.7275    0.6569    0.6698      2304
weighted avg     0.8286    0.8494    0.8315      2304





Fold Accuracy: 0.8550347222222222
Fold Classification Report:
               precision    recall  f1-score   support

    Negative     0.7853    0.8094    0.7972       488
     Neutral     0.5312    0.2189    0.3100       233
    Positive     0.8938    0.9627    0.9270      1583

    accuracy                         0.8550      2304
   macro avg     0.7368    0.6637    0.6781      2304
weighted avg     0.8342    0.8550    0.8371      2304





Fold Accuracy: 0.8415798611111112
Fold Classification Report:
               precision    recall  f1-score   support

    Negative     0.7548    0.8012    0.7773       488
     Neutral     0.4700    0.2017    0.2823       233
    Positive     0.8903    0.9482    0.9183      1583

    accuracy                         0.8416      2304
   macro avg     0.7050    0.6504    0.6593      2304
weighted avg     0.8191    0.8416    0.8241      2304





Fold Accuracy: 0.8441163699522363
Fold Classification Report:
               precision    recall  f1-score   support

    Negative     0.7923    0.8053    0.7988       488
     Neutral     0.4433    0.1845    0.2606       233
    Positive     0.8819    0.9532    0.9162      1582

    accuracy                         0.8441      2303
   macro avg     0.7058    0.6477    0.6585      2303
weighted avg     0.8185    0.8441    0.8250      2303





Fold Accuracy: 0.8532349109856708
Fold Classification Report:
               precision    recall  f1-score   support

    Negative     0.7596    0.8466    0.8008       489
     Neutral     0.5161    0.2069    0.2954       232
    Positive     0.9027    0.9501    0.9258      1582

    accuracy                         0.8532      2303
   macro avg     0.7262    0.6679    0.6740      2303
weighted avg     0.8334    0.8532    0.8357      2303


Average Accuracy across folds: 0.8486716450764703
Average Precision across folds: 0.8267647473776394
Average Recall across folds: 0.8486716450764703
Average F1 Score across folds: 0.8306925339486154
