1. Create a query or queries to extract the information you think may be relevant for building the prediction model. It should include some film features and some rental features.

2. Read the data into a Pandas dataframe.

3. Analyze extracted features and transform them. You may need to encode some categorical variables, or scale numerical variables.

4. Create a query to get the list of films and a boolean indicating if it was rented last month. This would be our target variable.

5. Create a logistic regression model to predict this variable from the cleaned data.

6. Evaluate the results.

In [28]:
# Importing libraries

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn import linear_model
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from statsmodels.formula.api import ols

import math

pd.set_option('display.max_rows', 25)  # Display all rows
pd.set_option('display.max_columns', 50)  # Display all columns
pd.set_option('display.width', 100)  # Set the display width to fit the entire DataFrame


from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

Getting the data

In [2]:
#importing file from csv

inv_air = pd.read_csv(r"C:\Users\ssara\OneDrive\Ambiente de Trabalho\Class 1\4. Labs\11. NOV lab-predictions-logistic-regression\Invistico_Airline.csv")

Cleaning/Wrangling/EDA

In [3]:
# Headers and space Cleaning

inv_air.columns = inv_air.columns.str.lower().str.replace(' ', '_')

In [4]:
# Analyzing the dataset

num_rows, num_columns = inv_air.shape
print(f"The dataset has {num_rows} rows and {num_columns} columns.")

inv_air.isna().sum() 

The dataset has 129880 rows and 22 columns.


satisfaction                           0
customer_type                          0
age                                    0
type_of_travel                         0
class                                  0
flight_distance                        0
seat_comfort                           0
departure/arrival_time_convenient      0
food_and_drink                         0
gate_location                          0
inflight_wifi_service                  0
inflight_entertainment                 0
online_support                         0
ease_of_online_booking                 0
on-board_service                       0
leg_room_service                       0
baggage_handling                       0
checkin_service                        0
cleanliness                            0
online_boarding                        0
departure_delay_in_minutes             0
arrival_delay_in_minutes             393
dtype: int64

In [5]:
inv_air.groupby('satisfaction').count()

Unnamed: 0_level_0,customer_type,age,type_of_travel,class,flight_distance,seat_comfort,departure/arrival_time_convenient,food_and_drink,gate_location,inflight_wifi_service,inflight_entertainment,online_support,ease_of_online_booking,on-board_service,leg_room_service,baggage_handling,checkin_service,cleanliness,online_boarding,departure_delay_in_minutes,arrival_delay_in_minutes
satisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
dissatisfied,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58793,58605
satisfied,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,71087,70882


In [6]:
inv_air.dropna(inplace=True)
num_rows, num_columns = inv_air.shape
print(f"The dataset has {num_rows} rows and {num_columns} columns.")

The dataset has 129487 rows and 22 columns.


In [7]:
replacement_dict = {"satisfied": 1, "dissatisfied": 0}
inv_air["satisfaction"] = inv_air["satisfaction"].replace(replacement_dict)

In [8]:
inv_air.groupby('satisfaction').count()

Unnamed: 0_level_0,customer_type,age,type_of_travel,class,flight_distance,seat_comfort,departure/arrival_time_convenient,food_and_drink,gate_location,inflight_wifi_service,inflight_entertainment,online_support,ease_of_online_booking,on-board_service,leg_room_service,baggage_handling,checkin_service,cleanliness,online_boarding,departure_delay_in_minutes,arrival_delay_in_minutes
satisfaction,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605,58605
1,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882,70882


In [9]:
inv_air.columns

Index(['satisfaction', 'customer_type', 'age', 'type_of_travel', 'class', 'flight_distance',
       'seat_comfort', 'departure/arrival_time_convenient', 'food_and_drink', 'gate_location',
       'inflight_wifi_service', 'inflight_entertainment', 'online_support',
       'ease_of_online_booking', 'on-board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'cleanliness', 'online_boarding', 'departure_delay_in_minutes',
       'arrival_delay_in_minutes'],
      dtype='object')

In [10]:
inv_air.dtypes

satisfaction                           int64
customer_type                         object
age                                    int64
type_of_travel                        object
class                                 object
flight_distance                        int64
seat_comfort                           int64
departure/arrival_time_convenient      int64
food_and_drink                         int64
gate_location                          int64
inflight_wifi_service                  int64
inflight_entertainment                 int64
online_support                         int64
ease_of_online_booking                 int64
on-board_service                       int64
leg_room_service                       int64
baggage_handling                       int64
checkin_service                        int64
cleanliness                            int64
online_boarding                        int64
departure_delay_in_minutes             int64
arrival_delay_in_minutes             float64
dtype: obj

In [11]:
inv_air['satisfaction']

0         1
1         1
2         1
3         1
4         1
         ..
129875    1
129876    0
129877    0
129878    0
129879    0
Name: satisfaction, Length: 129487, dtype: int64

In [12]:
categorical = inv_air.select_dtypes(exclude= np.number)
numerical = inv_air.select_dtypes(include= np.number)

In [13]:
categorical = ["customer_type", "type_of_travel", "class"]

# Performing the one-hot encoding
categorical = pd.get_dummies(inv_air, columns=categorical, drop_first=True)

In [21]:
categorical_2 = categorical.drop('satisfaction', axis=1, inplace=False)

In [20]:
display(numerical)

Unnamed: 0,satisfaction,age,flight_distance,seat_comfort,departure/arrival_time_convenient,food_and_drink,gate_location,inflight_wifi_service,inflight_entertainment,online_support,ease_of_online_booking,on-board_service,leg_room_service,baggage_handling,checkin_service,cleanliness,online_boarding,departure_delay_in_minutes,arrival_delay_in_minutes
0,1,65,265,0,0,0,2,2,4,2,3,3,0,3,5,3,2,0,0.0
1,1,47,2464,0,0,0,3,0,2,2,3,4,4,4,2,3,2,310,305.0
2,1,15,2138,0,0,0,3,2,0,2,2,3,3,4,4,4,2,0,0.0
3,1,60,623,0,0,0,3,3,4,3,1,1,0,1,4,1,3,0,0.0
4,1,70,354,0,0,0,3,4,3,4,2,2,0,2,4,2,5,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,1,29,1731,5,5,5,3,2,5,2,2,3,3,4,4,4,2,0,0.0
129876,0,63,2087,2,3,2,4,2,1,1,3,2,3,3,1,2,1,174,172.0
129877,0,69,2320,3,0,3,3,3,2,2,4,4,3,4,2,3,2,155,163.0
129878,0,66,2450,3,2,3,2,3,2,2,3,3,2,3,2,1,2,193,205.0


In [22]:
#Concating the data

inv_air_treated = pd.concat([numerical, categorical_2], axis=1)
inv_air_treated

Unnamed: 0,satisfaction,age,flight_distance,seat_comfort,departure/arrival_time_convenient,food_and_drink,gate_location,inflight_wifi_service,inflight_entertainment,online_support,ease_of_online_booking,on-board_service,leg_room_service,baggage_handling,checkin_service,cleanliness,online_boarding,departure_delay_in_minutes,arrival_delay_in_minutes,age.1,flight_distance.1,seat_comfort.1,departure/arrival_time_convenient.1,food_and_drink.1,gate_location.1,inflight_wifi_service.1,inflight_entertainment.1,online_support.1,ease_of_online_booking.1,on-board_service.1,leg_room_service.1,baggage_handling.1,checkin_service.1,cleanliness.1,online_boarding.1,departure_delay_in_minutes.1,arrival_delay_in_minutes.1,customer_type_disloyal Customer,type_of_travel_Personal Travel,class_Eco,class_Eco Plus
0,1,65,265,0,0,0,2,2,4,2,3,3,0,3,5,3,2,0,0.0,65,265,0,0,0,2,2,4,2,3,3,0,3,5,3,2,0,0.0,0,1,1,0
1,1,47,2464,0,0,0,3,0,2,2,3,4,4,4,2,3,2,310,305.0,47,2464,0,0,0,3,0,2,2,3,4,4,4,2,3,2,310,305.0,0,1,0,0
2,1,15,2138,0,0,0,3,2,0,2,2,3,3,4,4,4,2,0,0.0,15,2138,0,0,0,3,2,0,2,2,3,3,4,4,4,2,0,0.0,0,1,1,0
3,1,60,623,0,0,0,3,3,4,3,1,1,0,1,4,1,3,0,0.0,60,623,0,0,0,3,3,4,3,1,1,0,1,4,1,3,0,0.0,0,1,1,0
4,1,70,354,0,0,0,3,4,3,4,2,2,0,2,4,2,5,0,0.0,70,354,0,0,0,3,4,3,4,2,2,0,2,4,2,5,0,0.0,0,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,1,29,1731,5,5,5,3,2,5,2,2,3,3,4,4,4,2,0,0.0,29,1731,5,5,5,3,2,5,2,2,3,3,4,4,4,2,0,0.0,1,1,1,0
129876,0,63,2087,2,3,2,4,2,1,1,3,2,3,3,1,2,1,174,172.0,63,2087,2,3,2,4,2,1,1,3,2,3,3,1,2,1,174,172.0,1,1,0,0
129877,0,69,2320,3,0,3,3,3,2,2,4,4,3,4,2,3,2,155,163.0,69,2320,3,0,3,3,3,2,2,4,4,3,4,2,3,2,155,163.0,1,1,1,0
129878,0,66,2450,3,2,3,2,3,2,2,3,3,2,3,2,1,2,193,205.0,66,2450,3,2,3,2,3,2,2,3,3,2,3,2,1,2,193,205.0,1,1,1,0


In [23]:
inv_air_treated.reset_index(inplace=True)

In [24]:
inv_air_treated.columns

Index(['index', 'satisfaction', 'age', 'flight_distance', 'seat_comfort',
       'departure/arrival_time_convenient', 'food_and_drink', 'gate_location',
       'inflight_wifi_service', 'inflight_entertainment', 'online_support',
       'ease_of_online_booking', 'on-board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'cleanliness', 'online_boarding', 'departure_delay_in_minutes',
       'arrival_delay_in_minutes', 'age', 'flight_distance', 'seat_comfort',
       'departure/arrival_time_convenient', 'food_and_drink', 'gate_location',
       'inflight_wifi_service', 'inflight_entertainment', 'online_support',
       'ease_of_online_booking', 'on-board_service', 'leg_room_service', 'baggage_handling',
       'checkin_service', 'cleanliness', 'online_boarding', 'departure_delay_in_minutes',
       'arrival_delay_in_minutes', 'customer_type_disloyal Customer',
       'type_of_travel_Personal Travel', 'class_Eco', 'class_Eco Plus'],
      dtype='object')

In [25]:
y = inv_air_treated['satisfaction']
y

0         1
1         1
2         1
3         1
4         1
         ..
129482    1
129483    0
129484    0
129485    0
129486    0
Name: satisfaction, Length: 129487, dtype: int64

In [26]:
# Building the model

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

X = inv_air_treated.drop('satisfaction',axis = 1)

y = inv_air_treated['satisfaction']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

LR = LogisticRegression()
LR.fit(X_train, y_train)

LR.score(X_test, y_test)

0.6966947254614256

In [29]:
print("Accuracy:", LR.score(X_test, y_test))
pred = LR.predict(X_test)
print("Precision:", precision_score(y_test, pred))
print("Recall:", recall_score(y_test, pred))
print("F1:", f1_score(y_test, pred))

Accuracy: 0.6966947254614256
Precision: 0.6848924283846428
Recall: 0.8186403352987142
F1: 0.7458175581658739


In [30]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.72      0.55      0.62     11821
           1       0.68      0.82      0.75     14077

    accuracy                           0.70     25898
   macro avg       0.70      0.69      0.68     25898
weighted avg       0.70      0.70      0.69     25898



In [31]:
confusion_matrix(y_test, pred)

array([[ 6519,  5302],
       [ 2553, 11524]], dtype=int64)