In [2]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.corpus import wordnet

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

# For generating n-grams
from nltk.util import ngrams
from collections import Counter

In [3]:
data = pd.read_csv('final_df.csv')

data

Unnamed: 0,year,month,sentiment,processed_full_review
0,2024,3,Neutral,ok use airlin go singapor london heathrow issu...
1,2024,3,Negative,don give money book paid receiv email confirm ...
2,2024,3,Positive,best airlin world best airlin world seat food ...
3,2024,3,Negative,premium economi seat singapor airlin not worth...
4,2024,3,Negative,imposs get promis refund book flight full mont...
...,...,...,...,...
11513,2021,11,Negative,websit buggi paid first busi class ticket webs...
11514,2021,10,Negative,reduc level qualiti servic fear futur airlin t...
11515,2021,10,Negative,chang would cost usd book ticket singapor airl...
11516,2021,8,Negative,disappoint flight check secur check frankfurt ...


# SVM (linear) + CountVec

Linear kernel computes the dot product between 2 vectors, works best for lienarly separated data, more effective when features are numerous, as in text classification, where each word or term often represents a feature in high-dimensional space.

Less computationally less intensive and faster to train.

Class Imbalance Handling: We set `class_weight='balanced'` to automatically adjust the class weights inversely proportional to the class frequencies in the training data, helping the model pay more attention to minority classes.

Stratified K-fold: Maintain class distribution across the folds, which is important for imbalanced data.

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=1000)
count_matrix = count_vectorizer.fit_transform(data['processed_full_review'])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(count_matrix, data['sentiment'], test_size=0.2, random_state=42)

# Initialize the SVC model with specified parameters, using C to control L2 regularization
# A smaller C value increases L2 regularization strength
svm_model = SVC(kernel='linear', C=1, class_weight='balanced', random_state=42)  # Adjust C as needed for regularization strength

# Perform cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(svm_model, X_train, y_train, cv=skf, scoring='accuracy')

# Fit the model on the training data
svm_model.fit(X_train, y_train)

# Make predictions
svm_predictions = svm_model.predict(X_test)

# Print results
print("Cross-Validation Accuracy Scores:", cross_val_scores)
print("Mean Cross-Validation Accuracy:", cross_val_scores.mean())
print("SVM(linear) Accuracy:", accuracy_score(y_test, svm_predictions))
print("SVM(linear) Classification Report:\n", classification_report(y_test, svm_predictions, digits=4))


Cross-Validation Accuracy Scores: [0.80575149 0.81117743 0.81117743 0.8046663  0.81541802]
Mean Cross-Validation Accuracy: 0.809638135433954
SVM(linear) Accuracy: 0.8155381944444444
SVM(linear) Classification Report:
               precision    recall  f1-score   support

    Negative     0.7008    0.7574    0.7280       470
     Neutral     0.3514    0.4825    0.4067       228
    Positive     0.9528    0.8798    0.9149      1606

    accuracy                         0.8155      2304
   macro avg     0.6683    0.7066    0.6832      2304
weighted avg     0.8419    0.8155    0.8265      2304



# SVM (radial basis function (rbf)) + CountVec

RBF kernel, a.k.a. Gaussian kernel, is a non-linear kernel that maps data to a higher-dimensional space. Allows for non-linear separation, where classes cannot be separated by a single straight line, can capture complex patterns by creating flexible decision boundaries.

More computationally expensive and requires careful tuning of parameters to avoid overfitting.

RBF is advantageous if sentiment classes overlap in complex ways or if there are subtle patterns in word combiantions that are harder to capture linearly.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Initialize CountVectorizer
count_vectorizer = CountVectorizer(max_features=1000)
count_matrix = count_vectorizer.fit_transform(data['processed_full_review'])

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(count_matrix, data['sentiment'], test_size=0.2, random_state=42)

# Initialize the SVC model with specified parameters, using C to control L2 regularization
# A smaller C value increases L2 regularization strength
svm_model = SVC(kernel='rbf', C=1, class_weight='balanced', random_state=42)  # Adjust C as needed for regularization strength

# Perform cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cross_val_scores = cross_val_score(svm_model, X_train, y_train, cv=skf, scoring='accuracy')

# Fit the model on the training data
svm_model.fit(X_train, y_train)

# Make predictions
svm_predictions = svm_model.predict(X_test)

# Print results
print("Cross-Validation Accuracy Scores:", cross_val_scores)
print("Mean Cross-Validation Accuracy:", cross_val_scores.mean())
print("SVM(linear) Accuracy:", accuracy_score(y_test, svm_predictions))
print("SVM(linear) Classification Report:\n", classification_report(y_test, svm_predictions, digits=4))

Cross-Validation Accuracy Scores: [0.83396636 0.8307108  0.83776451 0.8475312  0.83170467]
Mean Cross-Validation Accuracy: 0.8363355078316699
SVM(linear) Accuracy: 0.8446180555555556
SVM(linear) Classification Report:
               precision    recall  f1-score   support

    Negative     0.7201    0.8319    0.7720       470
     Neutral     0.4402    0.5000    0.4682       228
    Positive     0.9594    0.8973    0.9273      1606

    accuracy                         0.8446      2304
   macro avg     0.7065    0.7431    0.7225      2304
weighted avg     0.8592    0.8446    0.8502      2304



## SVM + Word2Vec

In [15]:
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Define X (features) and y (target) based on multiclass sentiment
X = data['processed_full_review']  # Review text
y = data['sentiment']

# Encode sentiment labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Convert labels to numerical format

# Tokenize the sentences
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in X]

# Train the Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=2)

# Function to convert each sentence into an average word2vec vector
def sentence_to_avg_vector(sentence, model):
    words = [word for word in sentence if word in model.wv]  # Keep only words present in the Word2Vec vocabulary
    if len(words) > 0:
        return np.mean([model.wv[word] for word in words], axis=0)
    else:
        return np.zeros(model.vector_size)  # Return a zero vector if no words from the sentence are in the vocabulary

# Convert all tokenized sentences to their corresponding average word2vec vectors
X_word2vec = np.array([sentence_to_avg_vector(sentence, word2vec_model) for sentence in tokenized_sentences])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y_encoded, test_size=0.3, random_state=42)

# Define the SVM pipeline with scaling and increased max_iter
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Scaling step
    ('svm', SVC(kernel='rbf', max_iter=50000, C=1.0))  # Increased max_iter
])

# Fit the pipeline on the training data
svm_pipeline.fit(X_train, y_train)

# Evaluate on the test data
y_pred = svm_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"Model F1 Score: {f1:.2f}")

# Perform cross-validation with the pipeline
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_f1_scores = cross_val_score(svm_pipeline, X_word2vec, y_encoded, cv=kf, scoring=make_scorer(f1_score, average='weighted'))

# Output cross-validation F1 score results
print(f"Cross-Validation F1 Scores: {cv_f1_scores}")
print(f"Average Cross-Validation F1 Score: {np.mean(cv_f1_scores):.4f}")


Model Accuracy: 83.22%
Model F1 Score: 0.81
Cross-Validation F1 Scores: [0.8124384  0.81277222 0.82237754 0.82027412 0.80612755]
Average Cross-Validation F1 Score: 0.8148


## SVM + FastText 

In [17]:
from gensim.models import FastText
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np

# # Download NLTK data (only needs to be done once)
# nltk.download('punkt')

# Define X (features) and y (target)
X = data['processed_full_review']
y = data['sentiment']

# Encode sentiment labels to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Step 1: Tokenize sentences
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in X]

# Step 2: Train FastText model
fasttext_model = FastText(
    sentences=tokenized_sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1,  # Skip-gram model (usually better for classification)
    epochs=10
)

# Step 3: Convert each sentence to an average vector
def get_sentence_vector(sentence, model, vector_size):
    words = word_tokenize(sentence.lower())
    word_vectors = [
        model.wv[word] 
        for word in words 
        if word in model.wv
    ]
    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

# Convert all sentences to their corresponding average vectors
X_vectors = np.array([
    get_sentence_vector(sentence, fasttext_model, fasttext_model.vector_size) 
    for sentence in X
])

# Standardize the data
scaler = StandardScaler()
X_vectors_scaled = scaler.fit_transform(X_vectors)

# Step 4: Train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors_scaled, 
    y_encoded, 
    test_size=0.3, 
    random_state=42,
    stratify=y_encoded
)

# Step 5: Define and train the SVM model
model = SVC(
    kernel='rbf',
    max_iter=5000,  # Increased max_iter for better convergence
    C=1.0,
    gamma='scale',
    class_weight='balanced'
)

# Fit the model
print("Training SVM model...")
model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"\nModel Performance:")
print(f"Accuracy: {accuracy:.4f} ({accuracy * 100:.2f}%)")
print(f"F1 Score: {f1:.4f}")

# Step 7: Cross-validation with F1 score
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_f1_scores = cross_val_score(
    model, 
    X_vectors_scaled, 
    y_encoded, 
    cv=kf,
    scoring=make_scorer(f1_score, average='weighted')
)

print("\nCross-Validation Results:")
print(f"F1 Scores: {cv_f1_scores}")
print(f"Mean F1: {np.mean(cv_f1_scores):.4f}")
print(f"Std F1: {np.std(cv_f1_scores):.4f}")


[nltk_data] Downloading package punkt to
[nltk_data]     /home/dariusng2103/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Training SVM model...

Model Performance:
Accuracy: 0.8316 (83.16%)
F1 Score: 0.8458

Cross-Validation Results:
F1 Scores: [0.85204387 0.84201688 0.85859865 0.84342061 0.85172078]
Mean F1: 0.8496
Std F1: 0.0061
