# Text Preprocessing

In [1]:
%pip install spacy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

## Load Dataset

In [3]:
import os

path = os.getcwd()
path = '/'.join(path.split('/')[:-1])

df = pd.read_csv(os.path.join(path, 'data/raw/airlines_reviews.csv'))

df.head()

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,yes
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,no
2,Food was plentiful and tasty,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,10,yes
3,“how much food was available,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,10,yes
4,“service was consistently good”,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,10,yes


In [4]:
columns_to_keep = ['Title', 'Airline', 'Type of Traveller', 'Class', 'Route', 'Month Flown', 'Overall Rating', 'Title', 'Reviews']

df = df[columns_to_keep].dropna(subset='Reviews')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8100 entries, 0 to 8099
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Title              8100 non-null   object
 1   Airline            8100 non-null   object
 2   Type of Traveller  8100 non-null   object
 3   Class              8100 non-null   object
 4   Route              8100 non-null   object
 5   Month Flown        8100 non-null   object
 6   Overall Rating     8100 non-null   int64 
 7   Title              8100 non-null   object
 8   Reviews            8100 non-null   object
dtypes: int64(1), object(8)
memory usage: 569.7+ KB


In [6]:
df.head()

Unnamed: 0,Title,Airline,Type of Traveller,Class,Route,Month Flown,Overall Rating,Title.1,Reviews
0,Flight was amazing,Singapore Airlines,Solo Leisure,Business Class,Jakarta to Singapore,December 2023,9,Flight was amazing,Flight was amazing. The crew onboard this fl...
1,seats on this aircraft are dreadful,Singapore Airlines,Solo Leisure,Economy Class,Phuket to Singapore,February 2024,3,seats on this aircraft are dreadful,Booking an emergency exit seat still meant h...
2,Food was plentiful and tasty,Singapore Airlines,Family Leisure,Economy Class,Siem Reap to Singapore,February 2024,10,Food was plentiful and tasty,Excellent performance on all fronts. I would...
3,“how much food was available,Singapore Airlines,Solo Leisure,Economy Class,Singapore to London Heathrow,February 2024,10,“how much food was available,Pretty comfortable flight considering I was f...
4,“service was consistently good”,Singapore Airlines,Family Leisure,Economy Class,Singapore to Phnom Penh,February 2024,10,“service was consistently good”,The service was consistently good from start ...


In [7]:
# !python -m spacy download en_core_web_sm

In [8]:
import re
import spacy

nlp = spacy.load('en_core_web_sm')
tokenizer = nlp.tokenizer



def clean_reviews(review, tokenizer):
    review = re.sub(r'[^a-zA-Z0-9\s]', '', review.lower())
    doc = nlp(review)
    return " ".join([token.lemma_ for token in doc if not token.is_stop])

In [9]:

from tqdm import tqdm
tqdm.pandas()

df['Cleaned Reviews'] = df['Reviews'].progress_apply(clean_reviews, tokenizer=tokenizer)
df.head()

100%|██████████| 8100/8100 [02:54<00:00, 46.29it/s]


Unnamed: 0,Title,Airline,Type of Traveller,Class,Route,Month Flown,Overall Rating,Title.1,Reviews,Cleaned Reviews
0,Flight was amazing,Singapore Airlines,Solo Leisure,Business Class,Jakarta to Singapore,December 2023,9,Flight was amazing,Flight was amazing. The crew onboard this fl...,flight amazing crew onboard flight welcomin...
1,seats on this aircraft are dreadful,Singapore Airlines,Solo Leisure,Economy Class,Phuket to Singapore,February 2024,3,seats on this aircraft are dreadful,Booking an emergency exit seat still meant h...,book emergency exit seat mean huge discomfo...
2,Food was plentiful and tasty,Singapore Airlines,Family Leisure,Economy Class,Siem Reap to Singapore,February 2024,10,Food was plentiful and tasty,Excellent performance on all fronts. I would...,excellent performance front definitely choo...
3,“how much food was available,Singapore Airlines,Solo Leisure,Economy Class,Singapore to London Heathrow,February 2024,10,“how much food was available,Pretty comfortable flight considering I was f...,pretty comfortable flight consider fly econo...
4,“service was consistently good”,Singapore Airlines,Family Leisure,Economy Class,Singapore to Phnom Penh,February 2024,10,“service was consistently good”,The service was consistently good from start ...,service consistently good start finish cabin...


In [11]:
path = os.getcwd()
path = '/'.join(path.split('/')[:-1])

os.makedirs(os.path.join(path, 'data/processed/'), exist_ok=True)
df.to_csv(os.path.join(path, 'data/processed/reviews.csv'), index=False)

## EDA