In this lab, you will be using the Sakila database of movie rentals.

In order to optimize our inventory, we would like to know which films will be rented next month and we are asked to create a model to predict it.

Instructions

- Read the data into a Pandas dataframe.
- Analyze extracted features and transform them. You may need to encode some categorical variables, or scale numerical variables.
- Create a logistic regression model to predict this variable from the cleaned data.
- Evaluate the results.

In [1]:
import imblearn
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline


# importing the lib to use the log reg
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the data into a Pandas dataframe

df = pd.read_csv("/Users/giulianamiranda/Downloads/Invistico_Airline.csv", sep = ',')
df

Unnamed: 0,satisfaction,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,Food and drink,Gate location,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,satisfied,disloyal Customer,29,Personal Travel,Eco,1731,5,5,5,3,...,2,2,3,3,4,4,4,2,0,0.0
129876,dissatisfied,disloyal Customer,63,Personal Travel,Business,2087,2,3,2,4,...,1,3,2,3,3,1,2,1,174,172.0
129877,dissatisfied,disloyal Customer,69,Personal Travel,Eco,2320,3,0,3,3,...,2,4,4,3,4,2,3,2,155,163.0
129878,dissatisfied,disloyal Customer,66,Personal Travel,Eco,2450,3,2,3,2,...,2,3,3,2,3,2,1,2,193,205.0


In [3]:
# Analyze extracted features and transform them. 
# You may need to encode some categorical variables, or scale numerical variables


# I'm first cleaning the data and taking a closer look at it, including a scan for NaN and duplicated values

cols = []

for a in range(len(df.columns)):
    cols.append(df.columns[a].lower().replace(' ', '_'))
    
df.columns = cols

df.columns


Index(['satisfaction', 'customer_type', 'age', 'type_of_travel', 'class',
       'flight_distance', 'seat_comfort', 'departure/arrival_time_convenient',
       'food_and_drink', 'gate_location', 'inflight_wifi_service',
       'inflight_entertainment', 'online_support', 'ease_of_online_booking',
       'on-board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'cleanliness', 'online_boarding',
       'departure_delay_in_minutes', 'arrival_delay_in_minutes'],
      dtype='object')

In [4]:
df.head(50)

Unnamed: 0,satisfaction,customer_type,age,type_of_travel,class,flight_distance,seat_comfort,departure/arrival_time_convenient,food_and_drink,gate_location,...,online_support,ease_of_online_booking,on-board_service,leg_room_service,baggage_handling,checkin_service,cleanliness,online_boarding,departure_delay_in_minutes,arrival_delay_in_minutes
0,satisfied,Loyal Customer,65,Personal Travel,Eco,265,0,0,0,2,...,2,3,3,0,3,5,3,2,0,0.0
1,satisfied,Loyal Customer,47,Personal Travel,Business,2464,0,0,0,3,...,2,3,4,4,4,2,3,2,310,305.0
2,satisfied,Loyal Customer,15,Personal Travel,Eco,2138,0,0,0,3,...,2,2,3,3,4,4,4,2,0,0.0
3,satisfied,Loyal Customer,60,Personal Travel,Eco,623,0,0,0,3,...,3,1,1,0,1,4,1,3,0,0.0
4,satisfied,Loyal Customer,70,Personal Travel,Eco,354,0,0,0,3,...,4,2,2,0,2,4,2,5,0,0.0
5,satisfied,Loyal Customer,30,Personal Travel,Eco,1894,0,0,0,3,...,2,2,5,4,5,5,4,2,0,0.0
6,satisfied,Loyal Customer,66,Personal Travel,Eco,227,0,0,0,3,...,5,5,5,0,5,5,5,3,17,15.0
7,satisfied,Loyal Customer,10,Personal Travel,Eco,1812,0,0,0,3,...,2,2,3,3,4,5,4,2,0,0.0
8,satisfied,Loyal Customer,56,Personal Travel,Business,73,0,0,0,3,...,5,4,4,0,1,5,4,4,0,0.0
9,satisfied,Loyal Customer,22,Personal Travel,Eco,1556,0,0,0,3,...,2,2,2,4,5,3,4,2,30,26.0


In [5]:
df.isnull().sum()

# The only column with NaN values is the 'arrival_delay_in_minutes'

satisfaction                           0
customer_type                          0
age                                    0
type_of_travel                         0
class                                  0
flight_distance                        0
seat_comfort                           0
departure/arrival_time_convenient      0
food_and_drink                         0
gate_location                          0
inflight_wifi_service                  0
inflight_entertainment                 0
online_support                         0
ease_of_online_booking                 0
on-board_service                       0
leg_room_service                       0
baggage_handling                       0
checkin_service                        0
cleanliness                            0
online_boarding                        0
departure_delay_in_minutes             0
arrival_delay_in_minutes             393
dtype: int64

In [6]:
#Verifying if the NaN values correspond with the customers who didn't experience a delay on their departure

nan_arrival = df['arrival_delay_in_minutes'].isna()

departure_zero_delay = df['departure_delay_in_minutes'] == 0
correspondence = nan_arrival  == departure_zero_delay

print("The number of correspondes is: ", correspondence.sum())



The number of correspondes is:  56425


In [7]:
# I decided to replace the NaN for zeros. And I'm also encoding the custumer satisfaction column

df['arrival_delay_in_minutes'] = df['arrival_delay_in_minutes'].fillna(0)


In [8]:
df.dtypes

satisfaction                          object
customer_type                         object
age                                    int64
type_of_travel                        object
class                                 object
flight_distance                        int64
seat_comfort                           int64
departure/arrival_time_convenient      int64
food_and_drink                         int64
gate_location                          int64
inflight_wifi_service                  int64
inflight_entertainment                 int64
online_support                         int64
ease_of_online_booking                 int64
on-board_service                       int64
leg_room_service                       int64
baggage_handling                       int64
checkin_service                        int64
cleanliness                            int64
online_boarding                        int64
departure_delay_in_minutes             int64
arrival_delay_in_minutes             float64
dtype: obj

In [9]:
df.duplicated().any

<bound method NDFrame._add_numeric_operations.<locals>.any of 0         False
1         False
2         False
3         False
4         False
          ...  
129875    False
129876    False
129877    False
129878    False
129879    False
Length: 129880, dtype: bool>

In [10]:
# I want to predict customers satisfaction based on other variables.
# For this, I´m choosing only a few columns 

In [11]:
satisfaction_num = []

for x in df['satisfaction']:
    if x == 'satisfied':
        satisfaction_num.append(1)
    else:
        satisfaction_num.append(0)


df['satisfaction_num'] = satisfaction_num
df.columns


Index(['satisfaction', 'customer_type', 'age', 'type_of_travel', 'class',
       'flight_distance', 'seat_comfort', 'departure/arrival_time_convenient',
       'food_and_drink', 'gate_location', 'inflight_wifi_service',
       'inflight_entertainment', 'online_support', 'ease_of_online_booking',
       'on-board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'cleanliness', 'online_boarding',
       'departure_delay_in_minutes', 'arrival_delay_in_minutes',
       'satisfaction_num'],
      dtype='object')

In [12]:
# Select categorical and numerical columns

cat = df.select_dtypes(include=[object])
num = df.select_dtypes(include=[np.number])

In [13]:
num

Unnamed: 0,age,flight_distance,seat_comfort,departure/arrival_time_convenient,food_and_drink,gate_location,inflight_wifi_service,inflight_entertainment,online_support,ease_of_online_booking,on-board_service,leg_room_service,baggage_handling,checkin_service,cleanliness,online_boarding,departure_delay_in_minutes,arrival_delay_in_minutes,satisfaction_num
0,65,265,0,0,0,2,2,4,2,3,3,0,3,5,3,2,0,0.0,1
1,47,2464,0,0,0,3,0,2,2,3,4,4,4,2,3,2,310,305.0,1
2,15,2138,0,0,0,3,2,0,2,2,3,3,4,4,4,2,0,0.0,1
3,60,623,0,0,0,3,3,4,3,1,1,0,1,4,1,3,0,0.0,1
4,70,354,0,0,0,3,4,3,4,2,2,0,2,4,2,5,0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,29,1731,5,5,5,3,2,5,2,2,3,3,4,4,4,2,0,0.0,1
129876,63,2087,2,3,2,4,2,1,1,3,2,3,3,1,2,1,174,172.0,0
129877,69,2320,3,0,3,3,3,2,2,4,4,3,4,2,3,2,155,163.0,0
129878,66,2450,3,2,3,2,3,2,2,3,3,2,3,2,1,2,193,205.0,0


In [14]:
num.groupby('satisfaction_num').count()

Unnamed: 0_level_0,age,flight_distance,seat_comfort,departure/arrival_time_convenient,food_and_drink,gate_location,inflight_wifi_service,inflight_entertainment,online_support,ease_of_online_booking,on-board_service,leg_room_service,baggage_handling,checkin_service,cleanliness,online_boarding,departure_delay_in_minutes,arrival_delay_in_minutes
satisfaction_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793
1,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087


In [15]:
x = num.drop('satisfaction_num', axis = 1)
y = num['satisfaction_num']

from sklearn.model_selection import train_test_split

x_train, x_test,y_train, y_test = train_test_split(x, y, random_state = 42, test_size = 0.2)

LR = LogisticRegression() 
LR.fit(x_train, y_train)

LR.score(x_test, y_test) #This is accuracy

# accuracy, how many times you predicted correct divided# by the total amount
# evaluating the model 
# inbalanced dataset


0.7401447489990761

In [16]:
# evaluating the results


from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report #this is the confusion matrix

# here we want to maximize the recall 

pred = LR.predict(x_test) #Is predicting 
pred

print("Precision is: ", precision_score(y_test, pred))
print("Precision is: ", recall_score(y_test, pred))
print("Precision is: ", f1_score(y_test, pred))

print(classification_report(y_test, pred))

Precision is:  0.7304804346498993
Precision is:  0.8367247045661143
Precision is:  0.7800013036959781
              precision    recall  f1-score   support

           0       0.76      0.62      0.68     11675
           1       0.73      0.84      0.78     14301

    accuracy                           0.74     25976
   macro avg       0.74      0.73      0.73     25976
weighted avg       0.74      0.74      0.74     25976

