In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [2]:
# Importing data
df_satisf = pd.read_csv(r"C:\Users\joaoa\Desktop\Ironhack\Labs\lab-predictions-logistic-regression\Invistico_Airline.csv")
df_satisf

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,satisfied,disloyal Customer,29,Personal Travel,Eco,1731,5,5,5,3,...,2,2,3,3,4,4,4,2,0,0.0
129876,dissatisfied,disloyal Customer,63,Personal Travel,Business,2087,2,3,2,4,...,1,3,2,3,3,1,2,1,174,172.0
129877,dissatisfied,disloyal Customer,69,Personal Travel,Eco,2320,3,0,3,3,...,2,4,4,3,4,2,3,2,155,163.0
129878,dissatisfied,disloyal Customer,66,Personal Travel,Eco,2450,3,2,3,2,...,2,3,3,2,3,2,1,2,193,205.0


In [3]:
# Dropping null values
df_satisf = df_satisf.dropna().reset_index(drop=True)

In [4]:
# Transforming satisfaction column in satisfied==1 and dissatisfied==0
df_satisf.loc[df_satisf["satisfaction"] == "satisfied", "satisfaction"] = 1
df_satisf.loc[df_satisf["satisfaction"] == "dissatisfied", "satisfaction"] = 0
df_satisf["satisfaction"] = df_satisf["satisfaction"].astype("int")
df_satisf

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,1,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,1,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,1,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,1,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,1,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129482,1,disloyal Customer,29,Personal Travel,Eco,1731,5,5,5,3,...,2,2,3,3,4,4,4,2,0,0.0
129483,0,disloyal Customer,63,Personal Travel,Business,2087,2,3,2,4,...,1,3,2,3,3,1,2,1,174,172.0
129484,0,disloyal Customer,69,Personal Travel,Eco,2320,3,0,3,3,...,2,4,4,3,4,2,3,2,155,163.0
129485,0,disloyal Customer,66,Personal Travel,Eco,2450,3,2,3,2,...,2,3,3,2,3,2,1,2,193,205.0


In [5]:
# Creating a new dataframe with numerical values
df_num = df_satisf.select_dtypes(include=np.number)
df_num

Unnamed: 0,satisfaction,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,1,65,265,0,0,0,2,2,4,2,3,3,0,3,5,3,2,0,0.0
1,1,47,2464,0,0,0,3,0,2,2,3,4,4,4,2,3,2,310,305.0
2,1,15,2138,0,0,0,3,2,0,2,2,3,3,4,4,4,2,0,0.0
3,1,60,623,0,0,0,3,3,4,3,1,1,0,1,4,1,3,0,0.0
4,1,70,354,0,0,0,3,4,3,4,2,2,0,2,4,2,5,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129482,1,29,1731,5,5,5,3,2,5,2,2,3,3,4,4,4,2,0,0.0
129483,0,63,2087,2,3,2,4,2,1,1,3,2,3,3,1,2,1,174,172.0
129484,0,69,2320,3,0,3,3,3,2,2,4,4,3,4,2,3,2,155,163.0
129485,0,66,2450,3,2,3,2,3,2,2,3,3,2,3,2,1,2,193,205.0


In [6]:
# Creating a new dataframe with categorical values and applying "One hot encoding" process
df_categ = df_satisf.select_dtypes(include = object)
dummies = pd.get_dummies(df_categ, drop_first = False)
dummies

Unnamed: 0,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus
0,1,0,0,1,0,1,0
1,1,0,0,1,1,0,0
2,1,0,0,1,0,1,0
3,1,0,0,1,0,1,0
4,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...
129482,0,1,0,1,0,1,0
129483,0,1,0,1,1,0,0
129484,0,1,0,1,0,1,0
129485,0,1,0,1,0,1,0


In [7]:
# Concatenating numerical and categorical dataframes
df_concat = pd.concat([df_num, dummies], axis=1, ignore_index=False)
df_concat

Unnamed: 0,satisfaction,Age,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,Inflight wifi service,Inflight entertainment,Online support,...,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes,Customer Type_Loyal Customer,Customer Type_disloyal Customer,Type of Travel_Business travel,Type of Travel_Personal Travel,Class_Business,Class_Eco,Class_Eco Plus
0,1,65,265,0,0,0,2,2,4,2,...,2,0,0.0,1,0,0,1,0,1,0
1,1,47,2464,0,0,0,3,0,2,2,...,2,310,305.0,1,0,0,1,1,0,0
2,1,15,2138,0,0,0,3,2,0,2,...,2,0,0.0,1,0,0,1,0,1,0
3,1,60,623,0,0,0,3,3,4,3,...,3,0,0.0,1,0,0,1,0,1,0
4,1,70,354,0,0,0,3,4,3,4,...,5,0,0.0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129482,1,29,1731,5,5,5,3,2,5,2,...,2,0,0.0,0,1,0,1,0,1,0
129483,0,63,2087,2,3,2,4,2,1,1,...,1,174,172.0,0,1,0,1,1,0,0
129484,0,69,2320,3,0,3,3,3,2,2,...,2,155,163.0,0,1,0,1,0,1,0
129485,0,66,2450,3,2,3,2,3,2,2,...,2,193,205.0,0,1,0,1,0,1,0


In [8]:
# Standardizing columns names
cols = []
for i in range(len(df_concat.columns)):
    cols.append(df_concat.columns[i].lower().replace(" ","_"))
df_concat.columns = cols
df_concat

Unnamed: 0,satisfaction,age,flight_distance,seat_comfort,departure/arrival_time_convenient,food_and_drink,gate_location,inflight_wifi_service,inflight_entertainment,online_support,...,online_boarding,departure_delay_in_minutes,arrival_delay_in_minutes,customer_type_loyal_customer,customer_type_disloyal_customer,type_of_travel_business_travel,type_of_travel_personal_travel,class_business,class_eco,class_eco_plus
0,1,65,265,0,0,0,2,2,4,2,...,2,0,0.0,1,0,0,1,0,1,0
1,1,47,2464,0,0,0,3,0,2,2,...,2,310,305.0,1,0,0,1,1,0,0
2,1,15,2138,0,0,0,3,2,0,2,...,2,0,0.0,1,0,0,1,0,1,0
3,1,60,623,0,0,0,3,3,4,3,...,3,0,0.0,1,0,0,1,0,1,0
4,1,70,354,0,0,0,3,4,3,4,...,5,0,0.0,1,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129482,1,29,1731,5,5,5,3,2,5,2,...,2,0,0.0,0,1,0,1,0,1,0
129483,0,63,2087,2,3,2,4,2,1,1,...,1,174,172.0,0,1,0,1,1,0,0
129484,0,69,2320,3,0,3,3,3,2,2,...,2,155,163.0,0,1,0,1,0,1,0
129485,0,66,2450,3,2,3,2,3,2,2,...,2,193,205.0,0,1,0,1,0,1,0


In [9]:
# Checking balance between 0 and 1
df_concat.groupby("satisfaction").count()

Unnamed: 0_level_0,age,flight_distance,seat_comfort,departure/arrival_time_convenient,food_and_drink,gate_location,inflight_wifi_service,inflight_entertainment,online_support,ease_of_online_booking,...,online_boarding,departure_delay_in_minutes,arrival_delay_in_minutes,customer_type_loyal_customer,customer_type_disloyal_customer,type_of_travel_business_travel,type_of_travel_personal_travel,class_business,class_eco,class_eco_plus
satisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,...,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605
1,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,...,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882


In [10]:
# X-y split
X = df_concat.drop("satisfaction", axis = 1)
y = df_concat["satisfaction"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

# Creating the model
LR = LogisticRegression()
LR.fit(X_train, y_train)

# Computing indicatores
print("Accuracy:", LR.score(X_test, y_test))
pred = LR.predict(X_test)
print("Precision:", precision_score(y_test, pred))
print("Recall:", recall_score(y_test, pred))
print("F1:", f1_score(y_test, pred))

Accuracy: 0.7596339485674569
Precision: 0.7722985157442086
Recall: 0.7910066065212759
F1: 0.7815406211616073


In [11]:
# Classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.74      0.72      0.73     11821
           1       0.77      0.79      0.78     14077

    accuracy                           0.76     25898
   macro avg       0.76      0.76      0.76     25898
weighted avg       0.76      0.76      0.76     25898



In [12]:
# Confusion matrix
confusion_matrix(y_test, pred)

array([[ 8538,  3283],
       [ 2942, 11135]], dtype=int64)