In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import plotly.express as px

In [2]:
df= pd.read_csv('british_airways_reviews.csv')

In [5]:
df.head()

Unnamed: 0,review_title,review_content,stars,review_date,country,route,seat_type,date_flown
0,“A very poor experience”,"✅Trip Verified| I had visa issues, and hence...",1.0,5th November 2024,India,Mumbai to London,Premium Economy,November 2024
1,"""food and beverages being targeted""",✅Trip Verified| Singapore to Heathrow with B...,6.0,5th November 2024,United Kingdom,Singapore to London,Business Class,November 2024
2,"""never fly with them again""",✅Trip Verified| I recently travelled from Mu...,1.0,3rd November 2024,United Kingdom,Munich to London Heathrow,Economy Class,October 2024
3,"""still have not heard any updates""",Not Verified| I paid for seats 80 A and B on ...,3.0,3rd November 2024,United States,Heathrow to Boston,Premium Economy,September 2024
4,"""cabin crew were nice""","Not Verified| The flight wasn’t that bad, alth...",7.0,3rd November 2024,United Kingdom,Los Angeles to London Heathrow,Economy Class,November 2024


In [6]:
df.shape

(3600, 8)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600 entries, 0 to 3599
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   review_title    3600 non-null   object 
 1   review_content  3600 non-null   object 
 2   stars           3597 non-null   float64
 3   review_date     3600 non-null   object 
 4   country         3598 non-null   object 
 5   route           3110 non-null   object 
 6   seat_type       3598 non-null   object 
 7   date_flown      3108 non-null   object 
dtypes: float64(1), object(7)
memory usage: 225.1+ KB


In [8]:
# Rename a column in-place
# df.rename(columns={'stars': 'rating', 'review_content':'reviews' }, inplace=True)


In [9]:
# Creating columns of verified and not verified reviewers
df['verified'] = df.review_content.str.contains("Trip Verified")


In [10]:
df.head()

Unnamed: 0,review_title,review_content,stars,review_date,country,route,seat_type,date_flown,verified
0,“A very poor experience”,"✅Trip Verified| I had visa issues, and hence...",1.0,5th November 2024,India,Mumbai to London,Premium Economy,November 2024,True
1,"""food and beverages being targeted""",✅Trip Verified| Singapore to Heathrow with B...,6.0,5th November 2024,United Kingdom,Singapore to London,Business Class,November 2024,True
2,"""never fly with them again""",✅Trip Verified| I recently travelled from Mu...,1.0,3rd November 2024,United Kingdom,Munich to London Heathrow,Economy Class,October 2024,True
3,"""still have not heard any updates""",Not Verified| I paid for seats 80 A and B on ...,3.0,3rd November 2024,United States,Heathrow to Boston,Premium Economy,September 2024,False
4,"""cabin crew were nice""","Not Verified| The flight wasn’t that bad, alth...",7.0,3rd November 2024,United Kingdom,Los Angeles to London Heathrow,Economy Class,November 2024,False


In [12]:
print('verified:',df['verified'].sum())
print('not-verified:', len(df)- df['verified'].sum())

verified: 1283
not-verified: 2317


In [14]:
# converting written date formate into number formate

# Ensure suffixes (st, nd, rd, th) are removed from 'review_date'
df['review_date'] = df['review_date'].str.replace(r'(\d+)(st|nd|rd|th)', r'\1', regex=True)

# Convert 'review_date' to datetime, with error handling
df['review_date'] = pd.to_datetime(df['review_date'], errors='coerce', dayfirst=True)


In [16]:
df.review_date.head()

0   2024-11-05
1   2024-11-05
2   2024-11-03
3   2024-11-03
4   2024-11-03
Name: review_date, dtype: datetime64[ns]

In [17]:
df.isnull().sum()

review_title        0
review_content      0
stars               3
review_date         0
country             2
route             490
seat_type           2
date_flown        492
verified            0
dtype: int64

In [18]:
# Filling missing values in 'stars' with the mode (most common rating)
df['stars'].fillna(df['stars'].mode()[0], inplace=True)

# Drop rows where 'country' column has null values
df = df.dropna(subset=['country'])

# Filling missing values in 'seat_type' with the mode (most common seat type)
df['seat_type'].fillna(df['seat_type'].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['stars'].fillna(df['stars'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['seat_type'].fillna(df['seat_type'].mode()[0], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https:/

In [21]:
df.isnull().sum()

review_title        0
review_content      0
stars               0
review_date         0
country             0
route             488
seat_type           0
date_flown        490
verified            0
dtype: int64

In [22]:
df.shape

(3598, 9)

### Text Preprocessing for Reviews
### In this step, we will preprocess the review_content data by removing specific strings and cleaning the text. The goal is to prepare the data for semantic analysis by normalizing the text through the following processes.

In [23]:
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# required nltk resources download
import nltk
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')

# Initialize the lemmatizer
lemma = WordNetLemmatizer()

# Extract the reviews column from the DataFrame
reviews_data = df['review_content'].str.strip("✅ Trip Verified |")

# Create an empty list to collect cleaned data corpus
corpus = []

# Loop through each review, clean it, and add it to the corpus
for rev in reviews_data:
    # Remove non-alphabetic characters
    rev = re.sub('[^a-zA-Z]', ' ', rev)
    # Convert to lowercase
    rev = rev.lower()
    # Split into words
    rev = rev.split()
    # Remove stopwords and lemmatize the words
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    # Join the cleaned words back into a single string
    rev = " ".join(rev)
    # Append the cleaned review to the corpus
    corpus.append(rev)

# Optional: If you want to add the cleaned reviews back to the DataFrame
df['cleaned_reviews'] = corpus

# Display the cleaned reviews
print(df[['review_content', 'cleaned_reviews']].head())


                                      review_content  \
0  ✅Trip Verified|   I had visa issues, and hence...   
1  ✅Trip Verified|   Singapore to Heathrow with B...   
2  ✅Trip Verified|   I recently travelled from Mu...   
3  Not Verified|  I paid for seats 80 A and B on ...   
4  Not Verified| The flight wasn’t that bad, alth...   

                                     cleaned_reviews  
0  visa issue hence debarred flying ground staff ...  
1  singapore heathrow ba two choice route economy...  
2  recently travelled munich london british airwa...  
3  verified paid seat b flight heathrow boston pa...  
4  verified flight bad although inflight entertai...  


In [24]:
df.head()

Unnamed: 0,review_title,review_content,stars,review_date,country,route,seat_type,date_flown,verified,cleaned_reviews
0,“A very poor experience”,"✅Trip Verified| I had visa issues, and hence...",1.0,2024-11-05,India,Mumbai to London,Premium Economy,November 2024,True,visa issue hence debarred flying ground staff ...
1,"""food and beverages being targeted""",✅Trip Verified| Singapore to Heathrow with B...,6.0,2024-11-05,United Kingdom,Singapore to London,Business Class,November 2024,True,singapore heathrow ba two choice route economy...
2,"""never fly with them again""",✅Trip Verified| I recently travelled from Mu...,1.0,2024-11-03,United Kingdom,Munich to London Heathrow,Economy Class,October 2024,True,recently travelled munich london british airwa...
3,"""still have not heard any updates""",Not Verified| I paid for seats 80 A and B on ...,3.0,2024-11-03,United States,Heathrow to Boston,Premium Economy,September 2024,False,verified paid seat b flight heathrow boston pa...
4,"""cabin crew were nice""","Not Verified| The flight wasn’t that bad, alth...",7.0,2024-11-03,United Kingdom,Los Angeles to London Heathrow,Economy Class,November 2024,False,verified flight bad although inflight entertai...


In [29]:
print(df.columns)

Index(['review_title', 'review_content', 'stars', 'review_date', 'country',
       'route', 'seat_type', 'date_flown', 'verified', 'cleaned_reviews'],
      dtype='object')


In [30]:
df.dtypes

review_title               object
review_content             object
stars                     float64
review_date        datetime64[ns]
country                    object
route                      object
seat_type                  object
date_flown                 object
verified                     bool
cleaned_reviews            object
dtype: object

In [31]:
## check null values for all
print(df.isnull().value_counts())

review_title  review_content  stars  review_date  country  route  seat_type  date_flown  verified  cleaned_reviews
False         False           False  False        False    False  False      False       False     False              3102
                                                           True   False      True        False     False               482
                                                           False  False      True        False     False                 8
                                                           True   False      False       False     False                 6
Name: count, dtype: int64


In [32]:
df.stars.value_counts()

stars
1.0     905
2.0     416
3.0     411
8.0     354
10.0    296
9.0     290
7.0     283
4.0     245
5.0     219
6.0     179
Name: count, dtype: int64

## Check anomolies

In [34]:
# Check for unique values in categorical columns
print("Unique values in 'stars':", df['stars'].unique())
print("Unique values in 'country':", df['country'].unique())
print("Unique values in 'seat_type':", df['seat_type'].unique())

Unique values in 'stars': [ 1.  6.  3.  7.  9.  5.  2.  8.  4. 10.]
Unique values in 'country': ['India' 'United Kingdom' 'United States' 'Canada' 'Switzerland'
 'South Africa' 'Germany' 'Morocco' 'Australia' 'Hong Kong' 'Lebanon'
 'Italy' 'New Zealand' 'Austria' 'Denmark' 'Singapore' 'Netherlands'
 'Belgium' 'Nigeria' 'Japan' 'Ukraine' 'Ireland' 'Spain' 'China' 'Ecuador'
 'Romania' 'France' 'Kuwait' 'Iceland' 'Poland' 'Qatar' 'Greece' 'Senegal'
 'United Arab Emirates' 'Cyprus' 'Chile' 'Sweden' 'Czech Republic'
 'Malaysia' 'Ghana' 'Bermuda' 'Botswana' 'Brazil' 'Panama'
 'Russian Federation' 'Philippines' 'Bulgaria' 'Thailand' 'Argentina'
 'Mexico' 'Saint Kitts and Nevis' 'Vietnam' 'Norway' 'Jordan' 'Taiwan'
 'Slovakia' 'Israel' 'South Korea' 'Saudi Arabia' 'Hungary' 'Portugal'
 'Cayman Islands' 'Costa Rica' 'Egypt' 'Laos' 'Turkey' 'Indonesia'
 'Bahrain' 'Dominican Republic' 'Luxembourg' 'Finland' 'Trinidad & Tobago']
Unique values in 'seat_type': ['Premium Economy' 'Business Class' 'Ec

In [36]:
# All clear now make clean British Airlines Dataset by selecting columns for further analysis

In [37]:
df.columns

Index(['review_title', 'review_content', 'stars', 'review_date', 'country',
       'route', 'seat_type', 'date_flown', 'verified', 'cleaned_reviews'],
      dtype='object')

In [39]:
# Selecting important columns for analysis
selected = ['review_title', 'cleaned_reviews', 'stars', 'review_date', 'country','seat_type','verified']
df_cleaned = df[selected]

# Display the first few rows of the new DataFrame
df_cleaned.head()


Unnamed: 0,review_title,cleaned_reviews,stars,review_date,country,seat_type,verified
0,“A very poor experience”,visa issue hence debarred flying ground staff ...,1.0,2024-11-05,India,Premium Economy,True
1,"""food and beverages being targeted""",singapore heathrow ba two choice route economy...,6.0,2024-11-05,United Kingdom,Business Class,True
2,"""never fly with them again""",recently travelled munich london british airwa...,1.0,2024-11-03,United Kingdom,Economy Class,True
3,"""still have not heard any updates""",verified paid seat b flight heathrow boston pa...,3.0,2024-11-03,United States,Premium Economy,False
4,"""cabin crew were nice""",verified flight bad although inflight entertai...,7.0,2024-11-03,United Kingdom,Economy Class,False


In [42]:
df_cleaned.shape

(3598, 7)

In [43]:
df_cleaned.to_csv("BA_cleaned_data.csv", index=False, encoding="utf-8")