In [None]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# WordNet lexical database
from nltk.corpus import wordnet

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

# For generating n-grams
from nltk.util import ngrams
from collections import Counter

# Libraries for Word2Vec and Logistic Regression
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, make_scorer


In [None]:
# Load the dataset from CSV; later cells read its `processed_full_review`
# (review text) and `sentiment` (target) columns.
data = pd.read_csv("final_df.csv")

In [None]:
# pip install gensim

# FastText Embeddings + Logistic Regression
FastText is an extension of Word2Vec developed by Facebook’s AI Research lab (FAIR). Whereas Word2Vec treats each word as a single, indivisible token, FastText breaks words into character n-grams (subword information). As a result, it can generate vectors for words that were not seen during training, as long as their subwords were seen.

In [None]:
from gensim.models import FastText
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, make_scorer
# Ensure the NLTK tokenizer data needed by `word_tokenize` is available.
# Recent NLTK releases load the `punkt_tab` resource instead of `punkt`, so
# both are fetched to keep this cell working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Define X (features) and y (target) based on multiclass sentiment.
# Missing reviews are replaced by empty strings (and every value is coerced to
# str) so that `.lower()` below cannot fail on NaN; the rows are kept, so X
# stays aligned with y.
X = data['processed_full_review'].fillna('').astype(str)  # Review text
y = data['sentiment']

# Step 1: Tokenize the sentences
tokenized_sentences = [nltk.word_tokenize(sentence.lower()) for sentence in X]

# Step 2: Train the FastText model
# NOTE(review): the embeddings are fitted on every review, including those that
# end up in the test split / validation folds further down, so the reported
# scores may be optimistic.
# NOTE(review): gensim documents that training with workers > 1 is not fully
# deterministic; use workers=1 if exact run-to-run reproducibility matters.
fasttext_model = FastText(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Step 3: Function to average word vectors for each sentence
def get_sentence_vector(sentence, model, vector_size):
    """Return the mean word vector of ``sentence``.

    Parameters
    ----------
    sentence : str
        Raw sentence; it is lower-cased and tokenized with
        ``nltk.word_tokenize``. Non-string input (e.g. ``None`` or the NaN of
        a missing DataFrame cell) and blank strings are treated as containing
        no words.
    model : object exposing word vectors through ``model.wv``
        Must support ``word in model.wv`` and ``model.wv[word]``.
    vector_size : int
        Length of the zero vector returned when no word vector is available.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model.wv``,
        or ``np.zeros(vector_size)`` when there are none.
    """
    # Guard against missing/blank text so `.lower()` is never called on a
    # non-string value; the result matches the "no words found" fallback.
    if not isinstance(sentence, str) or not sentence.strip():
        return np.zeros(vector_size)

    words = nltk.word_tokenize(sentence.lower())
    # NOTE(review): for gensim 4.x FastText vectors the membership test is
    # documented as always true (unknown words are composed from character
    # n-grams) — confirm for the installed version.
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(vector_size)  # Return a zero vector if no words are found
    return np.mean(word_vectors, axis=0)

# Step 4: Convert each sentence to its FastText vector representation.
# The zero-vector fallback length is read from the trained model instead of
# repeating the literal 100, so it cannot drift from the embedding size chosen
# when the FastText model was built.
X_vectors = np.array([
    get_sentence_vector(sentence, fasttext_model, fasttext_model.vector_size)
    for sentence in X
])

# Step 5: Split the data into training and testing sets (70% / 30%)
X_train, X_test, y_train, y_test = train_test_split(X_vectors, y, test_size=0.3, random_state=42)

# Step 6: Logistic regression model (for multiclass classification)
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Step 7: Predicting and evaluating on the held-out test split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Weighted F1 averages the per-class F1 scores weighted by class support.
f1 = f1_score(y_test, y_pred, average='weighted')

# Output the accuracy and F1 score
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"Model F1 Score: {f1:.2f}")

# Step 8: Implement Cross-Validation with F1 scoring.
# `cross_val_score` fits fresh clones of `model` on each fold of the full
# vectorized data set, so the fit from Step 6 is not reused here.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_f1_scores = cross_val_score(model, X_vectors, y, cv=kf, scoring=make_scorer(f1_score, average='weighted'))

# Output cross-validation F1 score results
print(f"Cross-Validation F1 Scores: {cv_f1_scores}")
print(f"Average Cross-Validation F1 Score: {np.mean(cv_f1_scores):.2f}")

# FastText Embeddings + random forest

In [None]:
# Ensure the NLTK tokenizer data needed by `word_tokenize` is available.
# Recent NLTK releases load the `punkt_tab` resource instead of `punkt`, so
# both are fetched to keep this cell working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# NOTE(review): this cell repeats the data preparation and FastText training of
# the logistic-regression cell above so that it can be run on its own; the
# embeddings are therefore retrained rather than shared between the two models.

# Define X (features) and y (target) based on multiclass sentiment.
# Missing reviews are replaced by empty strings (and every value is coerced to
# str) so that `.lower()` below cannot fail on NaN; the rows are kept, so X
# stays aligned with y.
X = data['processed_full_review'].fillna('').astype(str)  # Review text
y = data['sentiment']

# Step 1: Tokenize the sentences
tokenized_sentences = [nltk.word_tokenize(sentence.lower()) for sentence in X]

# Step 2: Train the FastText model
# NOTE(review): the embeddings are fitted on every review, including those that
# end up in the test split / validation folds further down, so the reported
# scores may be optimistic.
fasttext_model = FastText(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Step 3: Function to average word vectors for each sentence
def get_sentence_vector(sentence, model, vector_size):
    """Return the mean word vector of ``sentence``.

    This redefines the helper of the same name from the logistic-regression
    cell so that this cell can run on its own.

    Parameters
    ----------
    sentence : str
        Raw sentence; it is lower-cased and tokenized with
        ``nltk.word_tokenize``. Non-string input (e.g. ``None`` or the NaN of
        a missing DataFrame cell) and blank strings are treated as containing
        no words.
    model : object exposing word vectors through ``model.wv``
        Must support ``word in model.wv`` and ``model.wv[word]``.
    vector_size : int
        Length of the zero vector returned when no word vector is available.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model.wv``,
        or ``np.zeros(vector_size)`` when there are none.
    """
    # Guard against missing/blank text so `.lower()` is never called on a
    # non-string value; the result matches the "no words found" fallback.
    if not isinstance(sentence, str) or not sentence.strip():
        return np.zeros(vector_size)

    words = nltk.word_tokenize(sentence.lower())
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(vector_size)  # Return a zero vector if no words are found
    return np.mean(word_vectors, axis=0)

# RandomForestClassifier is not imported anywhere else in this notebook, so it
# is brought into scope here; without it Step 6 fails with a NameError.
from sklearn.ensemble import RandomForestClassifier

# Step 4: Convert each sentence to its FastText vector representation.
# The zero-vector fallback length is read from the trained model instead of
# repeating the literal 100, so it cannot drift from the embedding size chosen
# when the FastText model was built.
X_vectors = np.array([
    get_sentence_vector(sentence, fasttext_model, fasttext_model.vector_size)
    for sentence in X
])

# Step 5: Split the data into training and testing sets (70% / 30%)
X_train, X_test, y_train, y_test = train_test_split(X_vectors, y, test_size=0.3, random_state=42)

# Step 6: Random Forest model (for multiclass classification)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 7: Predicting and evaluating on the held-out test split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# Weighted F1 averages the per-class F1 scores weighted by class support.
f1 = f1_score(y_test, y_pred, average='weighted')

# Output the accuracy and F1 score
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"Model F1 Score: {f1:.2f}")

# Step 8: Implement Cross-Validation with F1 scoring.
# `cross_val_score` fits fresh clones of `model` on each fold of the full
# vectorized data set, so the fit from Step 6 is not reused here.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_f1_scores = cross_val_score(model, X_vectors, y, cv=kf, scoring=make_scorer(f1_score, average='weighted'))

# Output cross-validation F1 score results
print(f"Cross-Validation F1 Scores: {cv_f1_scores}")
print(f"Average Cross-Validation F1 Score: {np.mean(cv_f1_scores):.2f}")