In [1]:
#importing libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split

from sklearn.metrics import precision_score # This is the precision
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score # f1
from sklearn.metrics import classification_report # This is the classification matrix

from sklearn.metrics import confusion_matrix

from sklearn.utils import resample

from imblearn.over_sampling import SMOTE

In [2]:
#importing csv
airline = pd.read_csv(r"C:\Users\pedro\Desktop\Ironhack\Invistico_Airline.csv")
airline.head(5)

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0


In [4]:
def clean_and_process(df):
    cols = []
    for i in range(len(df.columns)):
        cols.append(df.columns[i].lower().replace(' ','_'))
    df.columns = cols    
    #creating dataframes for numerical and categorical variables
    df_numerical = df.select_dtypes(include=[np.number])
    df_categoricals = df.select_dtypes(['object'])
    #normalizing data
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    normalized_data = scaler.fit_transform(df_numerical)
    normalized_data = pd.DataFrame(normalized_data, columns=df_numerical.columns)
    dummy_data = pd.get_dummies(df_categoricals)
    #Concating DataFrames
    df_data = pd.concat([df_numerical,dummy_data],axis=1)
    return df_data

In [5]:
#standardizing headers and normalization
airline=clean_and_process(airline)

In [16]:
airline['satisfaction_satisfied']

0         1
1         1
2         1
3         1
4         1
         ..
129875    1
129876    0
129877    0
129878    0
129879    0
Name: satisfaction_satisfied, Length: 129880, dtype: uint8

In [25]:
#counting satisfied and unsatisfied
airline['satisfaction_satisfied'].groupby(airline['satisfaction_satisfied']).count()

satisfaction_satisfied
0    58793
1    71087
Name: satisfaction_satisfied, dtype: int64

In [7]:
#checking NA's
def check_NA(df):
    print(df.isna().sum().sum())
    print("\n",df.isna().sum() / len(df))

In [8]:
#No NA's
check_NA(airline)

393

 age                                  0.000000
flight_distance                      0.000000
seat_comfort                         0.000000
departure/arrival_time_convenient    0.000000
food_and_drink                       0.000000
gate_location                        0.000000
inflight_wifi_service                0.000000
inflight_entertainment               0.000000
online_support                       0.000000
ease_of_online_booking               0.000000
on-board_service                     0.000000
leg_room_service                     0.000000
baggage_handling                     0.000000
checkin_service                      0.000000
cleanliness                          0.000000
online_boarding                      0.000000
departure_delay_in_minutes           0.000000
arrival_delay_in_minutes             0.003026
satisfaction_dissatisfied            0.000000
satisfaction_satisfied               0.000000
customer_type_Loyal Customer         0.000000
customer_type_disloyal Custo

In [9]:
airline=airline.dropna()

In [10]:
#Creating a logistic regression model
X = airline.drop('satisfaction_satisfied', axis = 1)
y = airline['satisfaction_satisfied']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42, test_size = 0.2)

LR = LogisticRegression()
LR.fit(X_train, y_train)
pred = LR.predict(X_test)

print ("Accuracy is:",round(LR.score(X_test, y_test),3))    #accuracy
print("Precision is: ", round(precision_score(y_test, pred),3))
print("Recall is: ", round(recall_score(y_test, pred),3))
print("F1 is: ", round(f1_score(y_test, pred),3))
print(classification_report(y_test, pred))
print(confusion_matrix(y_test, pred))

Accuracy is: 0.886
Precision is:  0.89
Recall is:  0.902
F1 is:  0.896
              precision    recall  f1-score   support

           0       0.88      0.87      0.87     11821
           1       0.89      0.90      0.90     14077

    accuracy                           0.89     25898
   macro avg       0.89      0.88      0.89     25898
weighted avg       0.89      0.89      0.89     25898

[[10249  1572]
 [ 1376 12701]]
