In [2]:
import nltk

In [2]:
import pandas as pd 
import numpy as np 

In [3]:
# Load the British Airways reviews CSV into the working DataFrame.
raw_reviews_path = "british_airways_reviews.csv"
df = pd.read_csv(raw_reviews_path)

In [4]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

Unnamed: 0,reviews,stars,date,country
0,Not Verified | Very good flight following an ...,9.0,20th January 2025,United Kingdom
1,Not Verified | An hour's delay due to late ar...,7.0,19th January 2025,United Kingdom
2,✅ Trip Verified | I booked through BA becaus...,1.0,15th January 2025,United Kingdom
3,✅ Trip Verified | British airways lost bags ...,1.0,9th January 2025,United States
4,✅ Trip Verified | The check in process and rew...,1.0,5th January 2025,Netherlands


In [5]:
# Inspect the column dtypes: 'date' is loaded as a plain string (object)
# column and still needs converting to datetime.
df.dtypes

reviews     object
stars      float64
date        object
country     object
dtype: object

In [6]:
# Flag each review whose text contains the "Trip Verified" marker and store
# the resulting boolean mask as a new 'verified' column.
has_verified_marker = df['reviews'].str.contains('Trip Verified')
df['verified'] = has_verified_marker

In [7]:
# Preview the first five rows to check the new 'verified' column.
df.head(n=5)

Unnamed: 0,reviews,stars,date,country,verified
0,Not Verified | Very good flight following an ...,9.0,20th January 2025,United Kingdom,False
1,Not Verified | An hour's delay due to late ar...,7.0,19th January 2025,United Kingdom,False
2,✅ Trip Verified | I booked through BA becaus...,1.0,15th January 2025,United Kingdom,True
3,✅ Trip Verified | British airways lost bags ...,1.0,9th January 2025,United States,True
4,✅ Trip Verified | The check in process and rew...,1.0,5th January 2025,Netherlands,True


Cleaning the 'reviews' column: keep only the review text (drop the verification prefix) and normalise it

In [8]:
# Normalise the review text: remove the verification banner, keep letters only,
# lowercase, drop English stopwords and lemmatize each remaining word (NLTK).
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

# Build the stopword set once; it is loop-invariant and was previously rebuilt
# for every single word of every review.
english_stopwords = set(stopwords.words("english"))

# Remove the leading "✅ Trip Verified | " / "Not Verified | " banner as a
# *prefix*. str.strip(chars) treats its argument as a set of characters and
# trims them from both ends, which also removes genuine leading/trailing
# letters of the review (e.g. "Terrible ..." -> "ble ...").
banner_pattern = r'^\s*(?:✅\s*)?(?:Trip Verified|Not Verified)\s*\|\s*'
reviews_data = df['reviews'].str.replace(banner_pattern, '', regex=True).str.strip()

cleaned_reviews = []

for review in reviews_data:
    # Replace every non-letter with a space so punctuation and digits vanish.
    review = re.sub('[^a-zA-Z]', ' ', review)
    review = review.lower()
    review = review.split()
    review = [lemma.lemmatize(word) for word in review if word not in english_stopwords]
    review = " ".join(review)
    cleaned_reviews.append(review)

In [9]:
# Sanity check: number of cleaned review strings produced by the loop
# (one per entry of reviews_data).
len(cleaned_reviews)

3500

In [10]:
# Attach the cleaned text as a new column; the list must have one entry per row of df.
df['cleaned_reviews'] = cleaned_reviews

In [11]:
# Preview the first five rows to compare 'reviews' with 'cleaned_reviews'.
df.head(n=5)

Unnamed: 0,reviews,stars,date,country,verified,cleaned_reviews
0,Not Verified | Very good flight following an ...,9.0,20th January 2025,United Kingdom,False,good flight following equally good flight rome...
1,Not Verified | An hour's delay due to late ar...,7.0,19th January 2025,United Kingdom,False,hour delay due late arrival incoming aircraft ...
2,✅ Trip Verified | I booked through BA becaus...,1.0,15th January 2025,United Kingdom,True,booked ba loganair representative manchester a...
3,✅ Trip Verified | British airways lost bags ...,1.0,9th January 2025,United States,True,british airway lost bag lhr found sent cologne...
4,✅ Trip Verified | The check in process and rew...,1.0,5th January 2025,Netherlands,True,check process reward loyalty program mess neve...


Cleaning the 'date' column

In [12]:
# Drop the ordinal suffix ("20th January 2025" -> "20 January 2025") using the
# vectorised string accessor. Unlike apply + re.sub, it passes missing values
# through instead of raising, and does not rely on `re` from an earlier cell.
df['date'] = df['date'].str.replace(r'(\d+)(st|nd|rd|th)', r'\1', regex=True)
# Parse the suffix-free strings into datetime64 values.
df['date'] = pd.to_datetime(df['date'])

In [13]:
# Preview the first five rows to check the parsed 'date' column.
df.head(n=5)

Unnamed: 0,reviews,stars,date,country,verified,cleaned_reviews
0,Not Verified | Very good flight following an ...,9.0,2025-01-20,United Kingdom,False,good flight following equally good flight rome...
1,Not Verified | An hour's delay due to late ar...,7.0,2025-01-19,United Kingdom,False,hour delay due late arrival incoming aircraft ...
2,✅ Trip Verified | I booked through BA becaus...,1.0,2025-01-15,United Kingdom,True,booked ba loganair representative manchester a...
3,✅ Trip Verified | British airways lost bags ...,1.0,2025-01-09,United States,True,british airway lost bag lhr found sent cologne...
4,✅ Trip Verified | The check in process and rew...,1.0,2025-01-05,Netherlands,True,check process reward loyalty program mess neve...


Cleaning the 'stars' column

In [14]:
# List the distinct rating values (including any missing marker) in 'stars'.
star_ratings = df['stars']
star_ratings.unique()

array([ 9.,  7.,  1.,  5.,  2.,  8.,  4., 10.,  3.,  6., nan])

In [15]:
# Remove, in place, every row that has a missing value in ANY column
# (not only 'stars'); the defaults are spelled out to make that explicit.
df.dropna(axis=0, how='any', inplace=True)

In [16]:
# Ratings are whole numbers; convert the float column to integers.
stars_as_int = df['stars'].astype(int)
df['stars'] = stars_as_int

In [17]:
# Count how many reviews gave each star rating, most frequent first.
df.value_counts(subset='stars')

stars
1     892
2     410
3     400
8     338
10    279
7     274
9     263
5     244
4     231
6     166
Name: count, dtype: int64

In [18]:
# reset_index returns a NEW frame; assign it back so the index gaps left by the
# dropped rows are actually removed from df (previously the result was only
# displayed and then discarded). drop=True avoids keeping the old index as a column.
df = df.reset_index(drop=True)
df

Unnamed: 0,reviews,stars,date,country,verified,cleaned_reviews
0,Not Verified | Very good flight following an ...,9,2025-01-20,United Kingdom,False,good flight following equally good flight rome...
1,Not Verified | An hour's delay due to late ar...,7,2025-01-19,United Kingdom,False,hour delay due late arrival incoming aircraft ...
2,✅ Trip Verified | I booked through BA becaus...,1,2025-01-15,United Kingdom,True,booked ba loganair representative manchester a...
3,✅ Trip Verified | British airways lost bags ...,1,2025-01-09,United States,True,british airway lost bag lhr found sent cologne...
4,✅ Trip Verified | The check in process and rew...,1,2025-01-05,Netherlands,True,check process reward loyalty program mess neve...
...,...,...,...,...,...,...
3492,52b on upper deck to LAX and 51b back from LAX...,5,2014-11-20,United Kingdom,False,b upper deck lax b back lax lhr food flight ok...
3493,Just completed a return trip to Hong Kong on t...,8,2014-11-20,United Kingdom,False,completed return trip hong kong magnificent ai...
3494,I travel to and from Singapore on BA in Club w...,7,2014-11-20,United Kingdom,False,travel singapore ba club world month first tim...
3495,First time with BA (a code share flight for JA...,2,2014-11-20,Australia,False,first time ba code share flight jal travelled ...


In [19]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
cleaned_reviews_path = "british_airways_reviews_cleaned.csv"
df.to_csv(cleaned_reviews_path, index=False)