# Enhancing Singapore Airlines' Service Through Automated Sentiment Analysis of Customer Reviews



**Motivation**



## Singapore Airlines Customer Reviews Dataset Information

The [Singapore Airlines Customer Reviews Dataset](https://www.kaggle.com/datasets/kanchana1990/singapore-airlines-reviews) aggregates 10,000 anonymized customer reviews, providing a broad perspective on the passenger experience with Singapore Airlines. 

The dimensions are shown below:
- **`published_date`**: Date and time of review publication.
- **`published_platform`**: Platform where the review was posted.
- **`rating`**: Customer satisfaction rating, from 1 (lowest) to 5 (highest).
- **`type`**: Specifies the content as a review.
- **`text`**: Detailed customer feedback.
- **`title`**: Summary of the review.
- **`helpful_votes`**: Number of users finding the review helpful.

## Importing Libraries

Please uncomment the code box below to pip install relevant dependencies for this notebook.

In [2]:
!pip3 install -r ../requirements.txt

In [1]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.corpus import wordnet

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

# For generating n-grams
from nltk.util import ngrams
from collections import Counter

# Libraries for Word2Vec and Logistic Regression
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, make_scorer


# Word2Vec + log regression


In [7]:
pip install gensim


Note: you may need to restart the kernel to use updated packages.


In [2]:
data = pd.read_csv("final_df.csv")

In [3]:
# Define X (features) and y (target) based on multiclass sentiment
X = data['processed_full_review']  # Review text
y = data['sentiment']

# Tokenize the sentences
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in X]

# Train the Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=2)

# Function to convert each sentence into an average word2vec vector
def sentence_to_avg_vector(sentence, model):
    words = [word for word in sentence if word in model.wv]  # Keep only words present in the Word2Vec vocabulary
    if len(words) > 0:
        return np.mean([model.wv[word] for word in words], axis=0)
    else:
        return np.zeros(model.vector_size)  # Return a zero vector if no words from the sentence are in the vocabulary

# Convert all tokenized sentences to their corresponding average word2vec vectors
X_word2vec = np.array([sentence_to_avg_vector(sentence, word2vec_model) for sentence in tokenized_sentences])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y, test_size=0.3, random_state=42)

# Train a logistic regression model
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate accuracy and F1 score
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"Model F1 Score: {f1:.2f}")

# Cross-validation with F1 score
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_f1_scores = cross_val_score(model, X_word2vec, y, cv=kf, scoring=make_scorer(f1_score, average='weighted'))

# Output cross-validation F1 score results
print(f"Cross-Validation F1 Scores: {cv_f1_scores}")
print(f"Average Cross-Validation F1 Score: {np.mean(cv_f1_scores):.2f}")

Model Accuracy: 83.45%
Model F1 Score: 0.82


Traceback (most recent call last):
  File "/home/dariusng2103/projects/mla_project/tf217/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/home/dariusng2103/projects/mla_project/tf217/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/home/dariusng2103/projects/mla_project/tf217/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/home/dariusng2103/projects/mla_project/tf217/lib/python3.12/site-packages/sklearn/utils/_response.py", line 204, in _get_response_values
    raise ValueError(
ValueError: pos_label=1 is not a valid label: It should be one of ['Negative' 'Neutral' 'Positive']

Traceback (most recent call last):
  File "/home/dariusng2103/projects/mla_project/tf217/lib/python3.12/sit

Cross-Validation F1 Scores: [nan nan nan nan nan]
Average Cross-Validation F1 Score: nan


Traceback (most recent call last):
  File "/home/dariusng2103/projects/mla_project/tf217/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 139, in __call__
    score = scorer._score(
            ^^^^^^^^^^^^^^
  File "/home/dariusng2103/projects/mla_project/tf217/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 371, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/home/dariusng2103/projects/mla_project/tf217/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 89, in _cached_call
    result, _ = _get_response_values(
                ^^^^^^^^^^^^^^^^^^^^^
  File "/home/dariusng2103/projects/mla_project/tf217/lib/python3.12/site-packages/sklearn/utils/_response.py", line 204, in _get_response_values
    raise ValueError(
ValueError: pos_label=1 is not a valid label: It should be one of ['Negative' 'Neutral' 'Positive']



# Word2Vec + random forest

In [14]:
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, make_scorer
import numpy as np

# Define X (features) and y (target) based on multiclass sentiment
X = data['processed_full_review']  # Review text
y = data['sentiment']

# Tokenize the sentences
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in X]

# Train the Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=2)

# Function to convert each sentence into an average word2vec vector
def sentence_to_avg_vector(sentence, model):
    words = [word for word in sentence if word in model.wv]  # Keep only words present in the Word2Vec vocabulary
    if len(words) > 0:
        return np.mean([model.wv[word] for word in words], axis=0)
    else:
        return np.zeros(model.vector_size)  # Return a zero vector if no words from the sentence are in the vocabulary

# Convert all tokenized sentences to their corresponding average word2vec vectors
X_word2vec = np.array([sentence_to_avg_vector(sentence, word2vec_model) for sentence in tokenized_sentences])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, y, test_size=0.3, random_state=42)

# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Model Accuracy: {accuracy * 100:.2f}%")
print(f"Model F1 Score: {f1:.2f}")

# Cross-validation with F1 score
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_f1_scores = cross_val_score(model, X_word2vec, y, cv=kf, scoring=make_scorer(f1_score, average='weighted'))

# Output cross-validation F1 score results
print(f"Cross-Validation F1 Scores: {cv_f1_scores}")
print(f"Average Cross-Validation F1 Score: {np.mean(cv_f1_scores):.2f}")

Model Accuracy: 82.23%
Model F1 Score: 0.80
Cross-Validation F1 Scores: [0.80198335 0.79488966 0.79785105 0.79883879 0.78189603]
Average Cross-Validation F1 Score: 0.80
