In [1]:
from bs4 import BeautifulSoup
import lxml
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

**DATA CLEANING**

In [12]:
df = pd.read_csv("BA_reviews.csv")

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3500 entries, ✅ Trip Verified |  Came from Glasgow to London and took connecting flight American Airlines from London to North Carolina. It's my third day here in North Carolina and still waiting for my luggage. They didn't even bother to transfer the luggage to connecting flight. Luggage are still in Glasgow.  to ✅ Trip Verified |  Back in December my family and I as we were getting onto the plane were refused. Even though we had boarding passes and seats allocated and had booked 6 months in advance we were refused. It turns out that someone had not turned up to work and they couldn't let all the passengers on the plane. So quite randomly (not) 9 people were selected and turned away. We were selected as we had no check in luggage and it was easier to boot us off despite them not knowing my circumstances, if I had connecting flights in London etc. I sought compensation from BA (which I was told I could do) and what an absolute nightmare this

In [13]:
df.head()

Unnamed: 0,reviews,date,country,stars
0,✅ Trip Verified | Came from Glasgow to London...,30th June 2023,United States,5/10
1,✅ Trip Verified | My flight on on 12 May 2023...,29th June 2023,United Arab Emirates,1/10
2,Not Verified | Cairo is a 5 hour flight and B...,29th June 2023,United Kingdom,1/10
3,✅ Trip Verified | After travelling London to ...,27th June 2023,United Kingdom,2/10
4,✅ Trip Verified | My luggage was mis-tagged i...,27th June 2023,United States,1/10


In [14]:
df['verified'] = df.reviews.str.contains("Trip Verified")
df['verified']

0        True
1        True
2       False
3        True
4        True
        ...  
3495    False
3496     True
3497    False
3498     True
3499     True
Name: verified, Length: 3500, dtype: bool

**Cleaning Reviews**
We will extract the column of reviews into a separate dataframe and clean it for semantic analysis

In [15]:
#for lemmatization of words we will use nltk library
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [16]:
reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)

In [17]:
df['corpus'] = corpus

In [18]:
df.head()

Unnamed: 0,reviews,date,country,stars,verified,corpus
0,✅ Trip Verified | Came from Glasgow to London...,30th June 2023,United States,5/10,True,came glasgow london took connecting flight ame...
1,✅ Trip Verified | My flight on on 12 May 2023...,29th June 2023,United Arab Emirates,1/10,True,flight may got delayed hour minute resulted ca...
2,Not Verified | Cairo is a 5 hour flight and B...,29th June 2023,United Kingdom,1/10,False,verified cairo hour flight ba considers short ...
3,✅ Trip Verified | After travelling London to ...,27th June 2023,United Kingdom,2/10,True,travelling london madrid british airway iberia...
4,✅ Trip Verified | My luggage was mis-tagged i...,27th June 2023,United States,1/10,True,luggage mi tagged dallas way cairo via london ...


In [20]:
df.date = pd.to_datetime(df.date)

In [21]:
df.isnull().value_counts()

reviews  date   country  stars  verified  corpus
False    False  False    False  False     False     3500
dtype: int64

In [22]:
df.country.isnull().value_counts()

False    3500
Name: country, dtype: int64

Clean stars

In [24]:
df.stars.unique()

array(['5/10', '1/10', '2/10', '4/10', '7/10', '3/10', '9/10', '10/10',
       '8/10'], dtype=object)

In [25]:
# Extract numerical values and convert to integers
df['stars'] = df['stars'].str.split('/').str[0].astype(int)

# Print the modified DataFrame
print(df)

                                                reviews       date  \
0     ✅ Trip Verified |  Came from Glasgow to London... 2023-06-30   
1     ✅ Trip Verified |  My flight on on 12 May 2023... 2023-06-29   
2     Not Verified |  Cairo is a 5 hour flight and B... 2023-06-29   
3     ✅ Trip Verified |  After travelling London to ... 2023-06-27   
4     ✅ Trip Verified |  My luggage was mis-tagged i... 2023-06-27   
...                                                 ...        ...   
3495  Not Verified |  This was literally one of the ... 2023-03-13   
3496  ✅ Trip Verified |  The usual shambolic unfoldi... 2023-03-12   
3497  Not Verified |  Lost my case and took 6 weeks ... 2023-03-12   
3498  ✅ Trip Verified |  The incoming and outgoing f... 2023-03-10   
3499  ✅ Trip Verified |  Back in December my family ... 2023-03-10   

                   country  stars  verified  \
0            United States      5      True   
1     United Arab Emirates      1      True   
2           United

In [27]:
df.stars.value_counts()

1     1733
2      589
3      381
4      174
10     174
5      139
7      138
9      138
8       34
Name: stars, dtype: int64

In [31]:
import os

# Path to save the CSV file
path = r'C:\Users\Dell\Desktop\Data Science\Projects\British airlines DA'

# Create the directory if it doesn't exist
if not os.path.exists(path):
    os.makedirs(path)

# Save DataFrame as CSV
df.to_csv(os.path.join(path, 'cleaned_BA_reviews.csv'), index=False)