In [96]:
# !pip3 install -r requirements.txt

## Importing Libraries

In [97]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

## Data Preparation (Loading CSV)

Load the `singapore_airlines_reviews.csv` file into a pandas DataFrame.

In [98]:
# Read the raw review export from the working directory. `data_raw` is the
# source frame that the cleaning steps further down start from.
data_raw = pd.read_csv(filepath_or_buffer="singapore_airlines_reviews.csv")

# Print column dtypes and non-null counts for a first look at the data.
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   published_date      10000 non-null  object
 1   published_platform  10000 non-null  object
 2   rating              10000 non-null  int64 
 3   type                10000 non-null  object
 4   text                10000 non-null  object
 5   title               9999 non-null   object
 6   helpful_votes       10000 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 547.0+ KB


In [99]:
# Preview the first five raw rows.
data_raw.head(n=5)

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes
0,2024-03-12T14:41:14-04:00,Desktop,3,review,We used this airline to go from Singapore to L...,Ok,0
1,2024-03-11T19:39:13-04:00,Desktop,5,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,2024-03-11T12:20:23-04:00,Desktop,1,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0
3,2024-03-11T07:12:27-04:00,Desktop,5,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,2024-03-10T05:34:18-04:00,Desktop,2,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


# Data Cleaning

## Remove Duplicate Rows

In [100]:
# De-duplicate on full-row equality, keeping the first occurrence of each row.
# For this dataset nothing is removed: the row count stays at 10000.
data = data_raw.drop_duplicates(keep="first")

# Report the shape after de-duplication.
print("The new shape is: ", data.shape)

The new shape is:  (10000, 7)


## Check for Null Values



In [101]:
# Count missing values per column. Only `title` has a missing entry (one row);
# it is replaced with an empty string "" in the following cell.
data.isna().sum()

published_date        0
published_platform    0
rating                0
type                  0
text                  0
title                 1
helpful_votes         0
dtype: int64

In [102]:
# Fill missing values in the free-text columns with an empty string.
# The fill is restricted to `text` and `title` on purpose: a frame-wide
# fillna("") would also write "" into numeric columns if they ever contained
# NaN, silently turning them into mixed-type object columns.
data = data.fillna({"text": "", "title": ""})

In [103]:
# Sanity check: every column should now report zero missing values.
data.isna().sum()

published_date        0
published_platform    0
rating                0
type                  0
text                  0
title                 0
helpful_votes         0
dtype: int64

## Convert `published_date` to datetime by making a new column `date`

In [104]:
# Parse the `published_date` strings (which carry a UTC offset) and normalise
# them to a single timezone (UTC), storing the result in a new `date` column.
data = data.assign(date=pd.to_datetime(data["published_date"], utc=True))
print(data["date"].dtype)

datetime64[ns, UTC]


## Remove Outliers

### `text`

The `text` column of `data`, which is of string type, may contain values with unusually long lengths, indicating the presence of outliers. We will identify the outliers using the Z-score method.

1. Create a new column `text_length` in the DataFrame `data` by calculating the length of each review. (Set the value to 0 if the corresponding `text` value is NaN.)

2. Check the statistics of `text_length` using `describe()` method.

3. Calculate the mean and standard deviation of the `text_length` column.

4. Set the Z-score threshold for identifying outliers to 3.

5. Identify outliers of the `text_length` column and set the corresponding `text` to np.nan.

6. Drop the `text_length` column from the DataFrame.

In [105]:
# Character count of every review; a missing review counts as length 0.
data['text_length'] = data['text'].apply(
    lambda review: 0 if pd.isna(review) else len(review)
)
print(data.head(3))

# Keep a handle on the length column (the outlier cell reads `TL`) and print
# its descriptive statistics.
TL = data["text_length"]
stats_TL = TL.describe()
print(stats_TL)

              published_date published_platform  rating    type  \
0  2024-03-12T14:41:14-04:00            Desktop       3  review   
1  2024-03-11T19:39:13-04:00            Desktop       5  review   
2  2024-03-11T12:20:23-04:00            Desktop       1  review   

                                                text  \
0  We used this airline to go from Singapore to L...   
1  The service on Singapore Airlines Suites Class...   
2  Booked, paid and received email confirmation f...   

                                               title  helpful_votes  \
0                                                 Ok              0   
1  The service in Suites Class makes one feel lik...              0   
2                         Don’t give them your money              0   

                       date  text_length  
0 2024-03-12 18:41:14+00:00         1352  
1 2024-03-11 23:39:13+00:00         4666  
2 2024-03-11 16:20:23+00:00          420  
count    10000.000000
mean       556.526800
std  

In [106]:
# Z-score cut-off: a length whose |z| exceeds this value is treated as an outlier.
threshold = 3

# Mean and standard deviation of the review lengths, kept for inspection only;
# the zscore() call below computes its own statistics from `TL`.
mean_TL = TL.mean()
sd_TL = TL.std()

# Standardise the lengths, then blank out the `text` of every row whose length
# lies more than `threshold` standard deviations from the mean (either side).
z_score = zscore(TL)
data.loc[np.abs(z_score) > threshold, 'text'] = np.nan

# The helper column has served its purpose.
data = data.drop(columns="text_length")

data.shape

(10000, 8)

### `title`

Similarly, the `title` column of `data` (of type `str`) may also contain values with unusually long lengths, indicating the presence of outliers.

1. Create a new column `title_length` in the DataFrame `data` by calculating the length of each title. (Set the value to 0 if the corresponding `title` value is NaN.)

2. Check the statistics of `title_length` using `describe()` method and display its unique values.

3. Identify the outliers in `title_length` using the same Z-score threshold of 3 and set the corresponding value of `title` to np.nan.

4. Drop the `title_length` column from the DataFrame.

In [107]:
# Character count of every title; a missing title counts as length 0.
data['title_length'] = data['title'].apply(
    lambda heading: 0 if pd.isna(heading) else len(heading)
)
print(data.head(3))

# `TL` is rebound to the title lengths here (the outlier cell reads `TL`);
# print its descriptive statistics.
TL = data["title_length"]
stats_TL = TL.describe()
print(stats_TL)

              published_date published_platform  rating    type  \
0  2024-03-12T14:41:14-04:00            Desktop       3  review   
1  2024-03-11T19:39:13-04:00            Desktop       5  review   
2  2024-03-11T12:20:23-04:00            Desktop       1  review   

                                                text  \
0  We used this airline to go from Singapore to L...   
1                                                NaN   
2  Booked, paid and received email confirmation f...   

                                               title  helpful_votes  \
0                                                 Ok              0   
1  The service in Suites Class makes one feel lik...              0   
2                         Don’t give them your money              0   

                       date  title_length  
0 2024-03-12 18:41:14+00:00             2  
1 2024-03-11 23:39:13+00:00            51  
2 2024-03-11 16:20:23+00:00            26  
count    10000.000000
mean        28.409500
s

In [108]:
# Z-score cut-off: a length whose |z| exceeds this value is treated as an outlier.
threshold = 3

# Mean and standard deviation of the title lengths, kept for inspection only;
# the zscore() call below computes its own statistics from `TL`.
mean_TL = TL.mean()
sd_TL = TL.std()

# Standardise the lengths, then blank out the `title` of every row whose length
# lies more than `threshold` standard deviations from the mean (either side).
z_score = zscore(TL)
data.loc[np.abs(z_score) > threshold, 'title'] = np.nan

# The helper column has served its purpose.
data = data.drop(columns="title_length")
data.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes,date
0,2024-03-12T14:41:14-04:00,Desktop,3,review,We used this airline to go from Singapore to L...,Ok,0,2024-03-12 18:41:14+00:00
1,2024-03-11T19:39:13-04:00,Desktop,5,review,,The service in Suites Class makes one feel lik...,0,2024-03-11 23:39:13+00:00
2,2024-03-11T12:20:23-04:00,Desktop,1,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0,2024-03-11 16:20:23+00:00
3,2024-03-11T07:12:27-04:00,Desktop,5,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0,2024-03-11 11:12:27+00:00
4,2024-03-10T05:34:18-04:00,Desktop,2,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0,2024-03-10 09:34:18+00:00


In [109]:
# Inspect dtypes and non-null counts after the outlier handling: `text` and
# `title` now contain NaN in the rows that were blanked out as Z-score outliers.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   published_date      10000 non-null  object             
 1   published_platform  10000 non-null  object             
 2   rating              10000 non-null  int64              
 3   type                10000 non-null  object             
 4   text                9846 non-null   object             
 5   title               9834 non-null   object             
 6   helpful_votes       10000 non-null  int64              
 7   date                10000 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int64(2), object(5)
memory usage: 625.1+ KB


In [110]:
# Column dtypes: confirm each column has the type we expect.
print(data.dtypes)

# Count fully duplicated rows that are still present.
print("Remaining duplicate rows:", data.duplicated(keep="first").sum())

# List the distinct rating values to spot anything unexpected.
print("Unique ratings:", pd.unique(data["rating"]))

published_date                     object
published_platform                 object
rating                              int64
type                               object
text                               object
title                              object
helpful_votes                       int64
date                  datetime64[ns, UTC]
dtype: object
Remaining duplicate rows: 0
Unique ratings: [3 5 1 2 4]


In [111]:
# Missing values per column after the outlier rows were set to NaN.
data.isna().sum()

published_date          0
published_platform      0
rating                  0
type                    0
text                  154
title                 166
helpful_votes           0
date                    0
dtype: int64

### Create new column `full_review`
Since there are some rows with empty `text` and `title`, we will concatenate both columns (`text` and `title`) to form a new column `full_review`.
1. Replace `NaN` values in `text` and `title` with an empty string.

2. Combine `text` and `title` into `full_review`.

3. Strip any leading/trailing whitespaces in `full_review`.

4. Drop `text` and `title` columns.

In [93]:
# Build a single free-text column from `text` and `title`:
#   1) replace NaN in 'text' / 'title' (rows blanked as outliers) with ''
#   2) join them as "<text> <title>" into 'full_review'
#   3) strip the leading/trailing space left when one of the parts is empty
#   4) drop the now-redundant 'text' and 'title' columns
data = (
    data
    .fillna({'text': '', 'title': ''})
    .assign(full_review=lambda df: (df['text'] + " " + df['title']).str.strip())
    .drop(columns=['text', 'title'])
)

# Check that 'full_review' was added and that 'text' and 'title' have been dropped.
# The shape printed here is the shape AFTER the transformation, so it is
# labelled "new" (the original label said "old").
print(data.head())
print("\nThe new shape is:", data.shape)

              published_date published_platform  rating    type  \
0  2024-03-12T14:41:14-04:00            Desktop       3  review   
1  2024-03-11T19:39:13-04:00            Desktop       5  review   
2  2024-03-11T12:20:23-04:00            Desktop       1  review   
3  2024-03-11T07:12:27-04:00            Desktop       5  review   
4  2024-03-10T05:34:18-04:00            Desktop       2  review   

   helpful_votes                      date  \
0              0 2024-03-12 18:41:14+00:00   
1              0 2024-03-11 23:39:13+00:00   
2              0 2024-03-11 16:20:23+00:00   
3              0 2024-03-11 11:12:27+00:00   
4              0 2024-03-10 09:34:18+00:00   

                                         full_review  
0  We used this airline to go from Singapore to L...  
1  The service in Suites Class makes one feel lik...  
2  Booked, paid and received email confirmation f...  
3  Best airline in the world, seats, food, servic...  
4  Premium Economy Seating on Singapore Airli

In [94]:
# Rich preview of the first five rows of the reshaped frame.
data.head(n=5)

Unnamed: 0,published_date,published_platform,rating,type,helpful_votes,date,full_review
0,2024-03-12T14:41:14-04:00,Desktop,3,review,0,2024-03-12 18:41:14+00:00,We used this airline to go from Singapore to L...
1,2024-03-11T19:39:13-04:00,Desktop,5,review,0,2024-03-11 23:39:13+00:00,The service in Suites Class makes one feel lik...
2,2024-03-11T12:20:23-04:00,Desktop,1,review,0,2024-03-11 16:20:23+00:00,"Booked, paid and received email confirmation f..."
3,2024-03-11T07:12:27-04:00,Desktop,5,review,0,2024-03-11 11:12:27+00:00,"Best airline in the world, seats, food, servic..."
4,2024-03-10T05:34:18-04:00,Desktop,2,review,0,2024-03-10 09:34:18+00:00,Premium Economy Seating on Singapore Airlines ...


# Exploratory Data Analysis (EDA) and Feature Identification

## Summary Statistics

In [114]:
# Summary statistics (count, mean, std, quartiles, min/max) of the cleaned
# frame; the displayed result covers `rating` and `helpful_votes`.
data.describe()

Unnamed: 0,rating,helpful_votes
count,10000.0,10000.0
mean,4.0158,1.2752
std,1.346006,2.721618
min,1.0,0.0
25%,3.0,0.0
50%,5.0,1.0
75%,5.0,2.0
max,5.0,158.0


## Distribution of Ratings

## Temporal Analysis, using the `date` column

## Word Cloud

# Feature Engineering

# Feature Selection
Here we select the relevant features for the downstream review classification.
- `full_review`, `rating`, `date`.
- Create a new DataFrame (`data`) by selecting the specific columns mentioned above from the cleaned DataFrame `data`.

In [None]:
# Keep only the modelling features. They are taken from the cleaned frame
# `data`, not from `data_raw`: `full_review` and `date` are columns created
# during cleaning and do not exist in `data_raw`, so indexing `data_raw` with
# them raises a KeyError. `.copy()` gives an independent frame so that later
# column assignments do not operate on a slice of the wider frame.
data = data[['full_review', 'rating', 'date']].copy()
print(type(data))
print(data.head())

# Shape before dropping duplicates
print("\nThe old shape is: ", data.shape)