# Enhancing Airline Service Through Automated Sentiment Analysis of Customer Reviews



### Import Libraries

In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### Data Preparation (Loading CSV)

In [84]:
# Load the raw review dataset from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="data.csv")

In [85]:
# Print the per-column summary (non-null counts and dtypes) of the raw frame,
# then show its (rows, columns) tuple as the cell's result.
data.info()
data.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23171 entries, 0 to 23170
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              23171 non-null  int64  
 1   Airline Name            23171 non-null  object 
 2   Overall_Rating          23171 non-null  object 
 3   Review_Title            23171 non-null  object 
 4   Review Date             23171 non-null  object 
 5   Verified                23171 non-null  bool   
 6   Review                  23171 non-null  object 
 7   Aircraft                7129 non-null   object 
 8   Type Of Traveller       19433 non-null  object 
 9   Seat Type               22075 non-null  object 
 10  Route                   19343 non-null  object 
 11  Date Flown              19417 non-null  object 
 12  Seat Comfort            19016 non-null  float64
 13  Cabin Staff Service     18911 non-null  float64
 14  Food & Beverages        14500 non-null

(23171, 20)

In [86]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Airline Name,Overall_Rating,Review_Title,Review Date,Verified,Review,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Ground Service,Inflight Entertainment,Wifi & Connectivity,Value For Money,Recommended
0,0,AB Aviation,9,"""pretty decent airline""",11th November 2019,True,Moroni to Moheli. Turned out to be a pretty ...,,Solo Leisure,Economy Class,Moroni to Moheli,November 2019,4.0,5.0,4.0,4.0,,,3.0,yes
1,1,AB Aviation,1,"""Not a good airline""",25th June 2019,True,Moroni to Anjouan. It is a very small airline...,E120,Solo Leisure,Economy Class,Moroni to Anjouan,June 2019,2.0,2.0,1.0,1.0,,,2.0,no
2,2,AB Aviation,1,"""flight was fortunately short""",25th June 2019,True,Anjouan to Dzaoudzi. A very small airline an...,Embraer E120,Solo Leisure,Economy Class,Anjouan to Dzaoudzi,June 2019,2.0,1.0,1.0,1.0,,,2.0,no
3,3,Adria Airways,1,"""I will never fly again with Adria""",28th September 2019,False,Please do a favor yourself and do not fly wi...,,Solo Leisure,Economy Class,Frankfurt to Pristina,September 2019,1.0,1.0,,1.0,,,1.0,no
4,4,Adria Airways,1,"""it ruined our last days of holidays""",24th September 2019,True,Do not book a flight with this airline! My fr...,,Couple Leisure,Economy Class,Sofia to Amsterdam via Ljubljana,September 2019,1.0,1.0,1.0,1.0,1.0,1.0,1.0,no


### Data Cleaning

In [87]:
# Columns retained for the sentiment analysis; every other column of `data` is dropped.
columns_to_keep = [
    'Airline Name', 'Overall_Rating', 'Review_Title', 'Review Date',
    'Recommended', 'Review', 'Type Of Traveller', 'Seat Type'
]

# .copy() yields an independent frame, so later column assignments on
# df_cleaned do not write back into `data`.
df_cleaned = data[columns_to_keep].copy()

# info() prints its summary and returns None, so it is called on its own line
# instead of being packed into a tuple (which displayed "(None, ...)" and
# forced a plain-text rendering of the frame).
df_cleaned.info()

# Last expression of the cell: rendered as a rich table.
df_cleaned.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23171 entries, 0 to 23170
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Airline Name       23171 non-null  object
 1   Overall_Rating     23171 non-null  object
 2   Review_Title       23171 non-null  object
 3   Review Date        23171 non-null  object
 4   Recommended        23171 non-null  object
 5   Review             23171 non-null  object
 6   Type Of Traveller  19433 non-null  object
 7   Seat Type          22075 non-null  object
dtypes: object(8)
memory usage: 1.4+ MB


(None,
     Airline Name Overall_Rating                           Review_Title  \
 0    AB Aviation              9                "pretty decent airline"   
 1    AB Aviation              1                   "Not a good airline"   
 2    AB Aviation              1         "flight was fortunately short"   
 3  Adria Airways              1    "I will never fly again with Adria"   
 4  Adria Airways              1  "it ruined our last days of holidays"   
 
            Review Date Recommended  \
 0   11th November 2019         yes   
 1       25th June 2019          no   
 2       25th June 2019          no   
 3  28th September 2019          no   
 4  24th September 2019          no   
 
                                               Review Type Of Traveller  \
 0    Moroni to Moheli. Turned out to be a pretty ...      Solo Leisure   
 1   Moroni to Anjouan. It is a very small airline...      Solo Leisure   
 2    Anjouan to Dzaoudzi. A very small airline an...      Solo Leisure   
 3   

In [88]:
# Step 1: list the distinct raw rating labels so non-numeric entries are visible.
unique_ratings = df_cleaned['Overall_Rating'].unique()
print(unique_ratings)

# Step 2: cast the column to numbers; errors='coerce' turns any label that
# cannot be parsed into NaN instead of raising.
df_cleaned['Overall_Rating'] = pd.to_numeric(
    df_cleaned['Overall_Rating'],
    errors='coerce',
)

# Step 3: count the NaN ratings present after the conversion.
df_cleaned['Overall_Rating'].isna().sum()


['9' '1' '8' '2' '3' '5' '6' '7' '4' 'n']


842

In [89]:
# Remove rows whose 'Overall_Rating' is missing (NaN).
df_cleaned = df_cleaned.dropna(subset=['Overall_Rating'])

# Show the remaining shape and the per-column summary.
# info() prints the summary itself and returns None, so it must not be wrapped
# in print() -- doing so appends a stray "None" line to the output.
print(df_cleaned.shape)
df_cleaned.info()

(22329, 8)
<class 'pandas.core.frame.DataFrame'>
Index: 22329 entries, 0 to 23170
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Airline Name       22329 non-null  object 
 1   Overall_Rating     22329 non-null  float64
 2   Review_Title       22329 non-null  object 
 3   Review Date        22329 non-null  object 
 4   Recommended        22329 non-null  object 
 5   Review             22329 non-null  object 
 6   Type Of Traveller  19433 non-null  object 
 7   Seat Type          21761 non-null  object 
dtypes: float64(1), object(7)
memory usage: 1.5+ MB
None


In [90]:
# Encode 'Recommended' as binary: 1 when the value is "yes" (case-insensitive),
# 0 for every other value. Vectorized string ops replace the row-wise lambda.
df_cleaned['Recommended'] = (
    df_cleaned['Recommended'].str.lower().eq('yes').astype('int64')
)

# Fill missing categorical values with the placeholder 'Unknown'.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is chained assignment, which pandas deprecates and
# which does not update the parent frame under Copy-on-Write.
df_cleaned['Type Of Traveller'] = df_cleaned['Type Of Traveller'].fillna('Unknown')
df_cleaned['Seat Type'] = df_cleaned['Seat Type'].fillna('Unknown')

# info() prints its summary and returns None, so its result is not stored.
df_cleaned.info()

# Keep the preview under its existing name and show it as the cell's result.
df_cleaned_head_final = df_cleaned.head()
df_cleaned_head_final


<class 'pandas.core.frame.DataFrame'>
Index: 22329 entries, 0 to 23170
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Airline Name       22329 non-null  object 
 1   Overall_Rating     22329 non-null  float64
 2   Review_Title       22329 non-null  object 
 3   Review Date        22329 non-null  object 
 4   Recommended        22329 non-null  int64  
 5   Review             22329 non-null  object 
 6   Type Of Traveller  22329 non-null  object 
 7   Seat Type          22329 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.5+ MB


(None,
     Airline Name  Overall_Rating                           Review_Title  \
 0    AB Aviation             9.0                "pretty decent airline"   
 1    AB Aviation             1.0                   "Not a good airline"   
 2    AB Aviation             1.0         "flight was fortunately short"   
 3  Adria Airways             1.0    "I will never fly again with Adria"   
 4  Adria Airways             1.0  "it ruined our last days of holidays"   
 
            Review Date  Recommended  \
 0   11th November 2019            1   
 1       25th June 2019            0   
 2       25th June 2019            0   
 3  28th September 2019            0   
 4  24th September 2019            0   
 
                                               Review Type Of Traveller  \
 0    Moroni to Moheli. Turned out to be a pretty ...      Solo Leisure   
 1   Moroni to Anjouan. It is a very small airline...      Solo Leisure   
 2    Anjouan to Dzaoudzi. A very small airline an...      Solo Leis

In [94]:
# Number of distinct values in 'Review Date' (a single scalar rather than a
# dump of the whole column).
df_cleaned['Review Date'].nunique()

4214

### Exploratory Data Analysis

#### Statistical Summary

In [48]:
# A bare expression that is not the last line of a cell is evaluated and then
# discarded, so the shape is printed explicitly.
print(df_cleaned.shape)

# Per-column count of missing values (the cell's displayed result).
df_cleaned.isnull().sum()

Airline Name            0
Overall_Rating          0
Review_Title            0
Review Date             0
Recommended             0
Review                  0
Type Of Traveller    2896
Seat Type             568
dtype: int64

The unique number of data values are Unnamed: 0                23171
Airline Name                497
Overall_Rating               10
Review_Title              17219
Review Date                4557
Verified                      2
Review                    23046
Aircraft                   1048
Type Of Traveller             4
Seat Type                     4
Route                     13607
Date Flown                  109
Seat Comfort                  6
Cabin Staff Service           6
Food & Beverages              6
Ground Service                5
Inflight Entertainment        6
Wifi & Connectivity           6
Value For Money               6
Recommended                   2
dtype: int64


#### Class Distribution

#### Distribution of Features

#### Correlation Matrix


#### Pairplot of Features
