In [1]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [3]:
cwd = os.getcwd()

df = pd.read_csv(cwd+"/BA_reviews.csv", index_col=0)

In [4]:
df.head()

Unnamed: 0,reviews,stars,date,country
0,✅ Trip Verified | If you can’t fly First Class...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,19th July 2024,United States
1,Not Verified | Singapore to Heathrow. Busine...,1,16th July 2024,New Zealand
2,Not Verified | I reported my damaged/ruined ...,1,15th July 2024,United Kingdom
3,"Not Verified | On March 1st, I flew from Ber...",1,9th July 2024,Germany
4,Not Verified | The WORST customer experience! ...,1,5th July 2024,United Kingdom


In [5]:
df['verified'] = df.reviews.str.contains("Trip Verified")

In [6]:
df['verified']

0        True
1       False
2       False
3       False
4       False
        ...  
3695    False
3696    False
3697    False
3698    False
3699    False
Name: verified, Length: 3700, dtype: bool

In [10]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/karthikdoguparthi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/karthikdoguparthi/nltk_data...


True

In [11]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()


reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [12]:
df['corpus'] = corpus

In [13]:
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | If you can’t fly First Class...,\n\t\t\t\t\t\t\t\t\t\t\t\t\t5,19th July 2024,United States,True,fly first class fly british airway uncomfortab...
1,Not Verified | Singapore to Heathrow. Busine...,1,16th July 2024,New Zealand,False,verified singapore heathrow business class pre...
2,Not Verified | I reported my damaged/ruined ...,1,15th July 2024,United Kingdom,False,verified reported damaged ruined suitcase week...
3,"Not Verified | On March 1st, I flew from Ber...",1,9th July 2024,Germany,False,verified march st flew berlin paulo ba encount...
4,Not Verified | The WORST customer experience! ...,1,5th July 2024,United Kingdom,False,verified worst customer experience british air...


In [14]:
df.dtypes

reviews     object
stars       object
date        object
country     object
verified      bool
corpus      object
dtype: object

In [15]:
df.date = pd.to_datetime(df.date)

In [16]:
df.date.head()

0   2024-07-19
1   2024-07-16
2   2024-07-15
3   2024-07-09
4   2024-07-05
Name: date, dtype: datetime64[ns]

In [17]:
df.stars.unique()

array(['\n\t\t\t\t\t\t\t\t\t\t\t\t\t5', '1', '6', '3', '5', '9', '2', '8',
       '7', '4', '10', 'None'], dtype=object)

In [18]:
df.stars = df.stars.str.strip("\n\t\t\t\t\t\t\t\t\t\t\t\t\t")

In [19]:
df.stars.value_counts()

1       892
3       421
2       420
8       365
10      307
9       302
7       297
5       259
4       248
6       186
None      3
Name: stars, dtype: int64

In [20]:
df.drop(df[df.stars == "None"].index, axis=0, inplace=True)

In [21]:
df.stars.unique()

array(['5', '1', '6', '3', '9', '2', '8', '7', '4', '10'], dtype=object)

In [22]:
df.isnull().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3695
                       True     False     False        2
dtype: int64

In [23]:
df.country.isnull().value_counts()

False    3695
True        2
Name: country, dtype: int64

In [24]:
df.drop(df[df.country.isnull() == True].index, axis=0, inplace=True)

In [25]:
df.shape

(3695, 6)

In [26]:
df.reset_index(drop=True)

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | If you can’t fly First Class...,5,2024-07-19,United States,True,fly first class fly british airway uncomfortab...
1,Not Verified | Singapore to Heathrow. Busine...,1,2024-07-16,New Zealand,False,verified singapore heathrow business class pre...
2,Not Verified | I reported my damaged/ruined ...,1,2024-07-15,United Kingdom,False,verified reported damaged ruined suitcase week...
3,"Not Verified | On March 1st, I flew from Ber...",1,2024-07-09,Germany,False,verified march st flew berlin paulo ba encount...
4,Not Verified | The WORST customer experience! ...,1,2024-07-05,United Kingdom,False,verified worst customer experience british air...
...,...,...,...,...,...,...
3690,Having taken several domestic flights to Londo...,8,2014-06-25,United Kingdom,False,taken several domestic flight london heathrow ...
3691,Have flown with BA several times over past few...,10,2014-06-25,United Kingdom,False,flown ba several time past year mostly economy...
3692,LHR to VIE BA 704 23 June 2014. The meal servi...,3,2014-06-25,United Kingdom,False,lhr vie ba june meal service euro traveller ap...
3693,Flew from LHR to Hong Kong April 13th 2014 BA ...,3,2014-06-25,United Kingdom,False,flew lhr hong kong april th ba flight took hou...


In [27]:
df.to_csv(cwd + "/cleaned-BA-reviews.csv")