# Data Cleaning

The data has now been extracted from the website, but it is not yet clean or ready to be analyzed. The reviews first have to be cleaned of punctuation, spelling variations and other non-alphabetic characters.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re

In [2]:
#create a dataframe from csv file

cwd = os.getcwd()
reviews_csv = f"{cwd}/BA_reviews.csv"
#use the first CSV column as the DataFrame index
df = pd.read_csv(reviews_csv, index_col=0)

In [6]:
#preview the first rows of the dataframe
df.head()

Unnamed: 0,reviews,stars,date,country,verified
0,✅ Trip Verified | First our morning flight wa...,5,28th February 2023,Canada,True
1,✅ Trip Verified | Although it was a bit uncom...,1,27th February 2023,United Kingdom,True
2,✅ Trip Verified | Boarding was decently organ...,8,27th February 2023,Belgium,True
3,✅ Trip Verified | Boarding on time and departu...,6,27th February 2023,Belgium,True
4,✅ Trip Verified | My original flight was canc...,7,26th February 2023,United Kingdom,True


We will also create a column that indicates whether the review is marked "Trip Verified".

In [4]:
#flag each review whose text contains the "Trip Verified" marker
df["verified"] = df["reviews"].str.contains("Trip Verified")

We will extract the reviews column into a separate Series and clean it for semantic analysis.

In [22]:
#for lemmatization of words we will use the library nltk
#(the NLTK "stopwords" and "wordnet" corpora must already be available, e.g. via nltk.download)
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
lemma = WordNetLemmatizer()

#build the stopword set once instead of rebuilding it for every word of every review
stop_words = set(stopwords.words('english'))

#remove only the leading "✅ Trip Verified |" marker.
#str.strip("...") treats its argument as a set of characters, so it would also eat
#matching letters from both ends of the review text itself (e.g. "The ..." -> "he ...").
reviews_data = df.reviews.str.replace(r"^\s*✅?\s*Trip Verified\s*\|\s*", "", regex=True)

#create an empty list to collect cleaned data corpus
corpus = []

#loop through each review: remove non-letters, lower-case it, drop stopwords,
#lemmatize the remaining words, join them back together and add the result to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]', ' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in stop_words]
    rev = " ".join(rev)
    corpus.append(rev)
    

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [23]:
#add the cleaned corpus to the original dataframe as a new "corpus" column
#(corpus holds one cleaned string per review, in the same order as the reviews column)

df['corpus'] = corpus

In [27]:
#preview the first rows, now including the corpus column
df.head()

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | First our morning flight wa...,5,28th February 2023,Canada,True,first morning flight cancelled moved afternoon...
1,✅ Trip Verified | Although it was a bit uncom...,1,27th February 2023,United Kingdom,True,although bit uncomfortable flight economy flig...
2,✅ Trip Verified | Boarding was decently organ...,8,27th February 2023,Belgium,True,boarding decently organised still rather stran...
3,✅ Trip Verified | Boarding on time and departu...,6,27th February 2023,Belgium,True,boarding time departure time flight london hea...
4,✅ Trip Verified | My original flight was canc...,7,26th February 2023,United Kingdom,True,original flight cancelled week actual flight a...


In [26]:
#inspect the dtype of every column to see which ones still need converting
df.dtypes

reviews     object
stars        int64
date        object
country     object
verified      bool
corpus      object
dtype: object

# Cleaning/Formatting the Date


In [28]:
#parse the textual review dates into pandas datetime values

df["date"] = pd.to_datetime(df["date"])
df["date"].head()

0   2023-02-28
1   2023-02-27
2   2023-02-27
3   2023-02-27
4   2023-02-26
Name: date, dtype: datetime64[ns]

# Cleaning ratings

In [29]:
#list the distinct rating values present in the stars column

df["stars"].unique()

array([ 5,  1,  8,  6,  7,  4,  9, 10,  2,  3], dtype=int64)

In [31]:
#if the star values had been read in as strings with stray whitespace (e.g. "\n\t"), strip it with:
# df.stars = df.stars.str.strip("\n\t\n\t")

In [32]:
#count how many reviews gave each rating
df["stars"].value_counts()

1     1149
10     387
5      376
2      373
9      329
3      322
8      289
4      205
7      166
6      144
Name: stars, dtype: int64

In [33]:
#if any value needs to be dropped, such as the string "None", do:
  #df.drop(df[df.stars == "None"].index, axis=0, inplace = True)

    #then check the unique values again

In [34]:
#re-check the distinct rating values
df["stars"].unique()

array([ 5,  1,  8,  6,  7,  4,  9, 10,  2,  3], dtype=int64)


# Check for null values

In [35]:
#count the rows for each combination of per-column null flags
df.isna().value_counts()

reviews  stars  date   country  verified  corpus
False    False  False  False    False     False     3740
dtype: int64

In [36]:
#count missing versus present values in the country column
df["country"].isna().value_counts()

False    3740
Name: country, dtype: int64

In [38]:
#if we have missing values, drop the rows where the country value is null
#(comparing the boolean isnull() mask to the string "None" is always False,
# so that form could never drop anything; dropna on the column does the job)

df.dropna(subset=["country"], inplace=True)

In [39]:
#dimensions of the dataframe as (rows, columns)
df.shape

(3740, 6)

In [40]:
#resetting the index
#reset_index returns a new dataframe, so assign it back; otherwise df keeps its old index
df = df.reset_index(drop=True)
df

Unnamed: 0,reviews,stars,date,country,verified,corpus
0,✅ Trip Verified | First our morning flight wa...,5,2023-02-28,Canada,True,first morning flight cancelled moved afternoon...
1,✅ Trip Verified | Although it was a bit uncom...,1,2023-02-27,United Kingdom,True,although bit uncomfortable flight economy flig...
2,✅ Trip Verified | Boarding was decently organ...,8,2023-02-27,Belgium,True,boarding decently organised still rather stran...
3,✅ Trip Verified | Boarding on time and departu...,6,2023-02-27,Belgium,True,boarding time departure time flight london hea...
4,✅ Trip Verified | My original flight was canc...,7,2023-02-26,United Kingdom,True,original flight cancelled week actual flight a...
...,...,...,...,...,...,...
3735,✅ Trip Verified | Flying last Sunday from Gene...,6,2019-02-03,France,True,flying last sunday geneva piladelphia via lond...
3736,✅ Trip Verified | London Heathrow to Paris CD...,1,2019-02-03,United States,True,london heathrow paris cdg quick easy flight pa...
3737,✅ Trip Verified | Johannesburg to London. At ...,1,2019-02-03,United Kingdom,True,johannesburg london johannesburg used ba galle...
3738,✅ Trip Verified | Edinburgh to Bangalore via H...,5,2019-02-02,India,True,edinburgh bangalore via heathrow rude staff po...


In [41]:
# Write the cleaned dataframe to a csv file in the current working directory
import os

cwd = os.getcwd()
cleaned_csv = f"{cwd}/cleaned-BA_reviews.csv"
df.to_csv(cleaned_csv)