## Import

In [1]:
# Importing the libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pickle

In [2]:
# Importing the dataset
# The CSV is fetched over the network from a GitHub raw URL on every run, so
# this cell needs internet access.
# NOTE(review): the file name suggests the data was already pre-processed
# ("treated") upstream — confirm what that treatment covers.

url = 'https://raw.githubusercontent.com/liliansom/ML_CustomerSatisfaction/main/data/Invistico_Airline_treated.csv'
data = pd.read_csv(url)

## Model 1

In [3]:
# Selecting the variables that will be used in this model.

# Label column. Despite its name, `previsor` holds the target that is passed
# as `y` to the split, not a predictor.
previsor = data['satisfaction']

# Six-column feature subset for model 1.
new_data = data[[
    'Inflight entertainment',
    'Ease of Online booking',
    'On-board service',
    'Online boarding',
    'Leg room service',
    'Customer Type',
]]

# `caracteristicas` is a second name for the same frame; it is the one the
# train/test split consumes.
caracteristicas = new_data

In [5]:
# Splitting the dataset: 80% for train and 20% for test.
# random_state pins the shuffle so the split, and every metric computed from
# it, is identical after Restart Kernel -> Run All.

X_train, X_test, y_train, y_test = train_test_split(
    caracteristicas,
    previsor,
    test_size=0.20,
    shuffle=True,
    random_state=42,
)

In [6]:
# Instantiating the model
# No arguments are passed, so every hyperparameter is left at the library default.

f_log = LogisticRegression()

In [7]:
# Fit model 1 on the training split only; the test split is not seen here.
f_log.fit(X_train, y_train)

In [8]:
# Predicting with the model
# Class predictions for the held-out test rows; the bare name on the last line
# displays the array as the cell output.
previsoes = f_log.predict(X_test)
previsoes

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

In [9]:
# Predictions on the training rows themselves.
# NOTE(review): nothing in the visible notebook reads y_previsto_train;
# presumably it was meant for a train-vs-test comparison — confirm or remove.
y_previsto_train = f_log.predict(X_train)

## Analysis of the Model

### Confusion Matrix
A confusion matrix is a performance measurement tool used in machine learning to evaluate the accuracy of a classification model. It is a table that visualizes the performance of a model by comparing the actual labels of a dataset with the predicted labels generated by the model.

The confusion matrix consists of four key components:

1) True Positive (TP): The number of observations that are correctly predicted as positive or belonging to the positive class.

2) True Negative (TN): The number of observations that are correctly predicted as negative or belonging to the negative class.

3) False Positive (FP): The number of observations that are incorrectly predicted as positive when they actually belong to the negative class. Also known as a Type I error.

4) False Negative (FN): The number of observations that are incorrectly predicted as negative when they actually belong to the positive class. Also known as a Type II error.

### Classification Report
A classification report is a summary of the performance metrics for a classification model. It provides a per-class evaluation of the model's predictive ability, whether the problem is binary (as in this notebook) or multi-class.

The classification report typically includes the following metrics for each class:

Precision: It measures the proportion of true positive predictions among all positive predictions. It indicates how reliable the model's positive predictions are for a given class: of everything predicted as that class, how much really belongs to it.

Recall: Also known as sensitivity or true positive rate, it measures the proportion of true positive predictions among all actual positive instances. It indicates the model's ability to correctly identify positive instances for a given class.

F1-score: The F1-score is the harmonic mean of precision and recall. It provides a balanced measure of the model's performance, taking into account both precision and recall. It is often used as a single metric to evaluate the model's overall performance.

Support: Support refers to the number of occurrences of each class in the test dataset. It provides insights into the distribution of instances across different classes.

The classification report helps in understanding the strengths and weaknesses of the model for each class and can be useful in making decisions regarding model optimization or class-specific performance improvements.

In [10]:
# Confusion matrix for model 1 on the test split.
# Rows are the actual classes and columns the predicted classes, ordered 0 then 1.
confusion_matrix(y_test, previsoes)

array([[ 9134,  2694],
       [ 2419, 11729]], dtype=int64)

In [11]:
# Per-class precision, recall, F1-score and support for model 1 on the test split.
print(classification_report(y_test, previsoes))
# recall = hit rate: the share of each class's actual rows that the model gets right

              precision    recall  f1-score   support

           0       0.79      0.77      0.78     11828
           1       0.81      0.83      0.82     14148

    accuracy                           0.80     25976
   macro avg       0.80      0.80      0.80     25976
weighted avg       0.80      0.80      0.80     25976



## Model 2

In [12]:
# Selecting the variables that will be used in this model.
# Fourteen columns: the six used by model 1 plus eight more.
new_data1 = data[[
    'Customer Type',
    'Gender',
    'Class',
    'Seat comfort',
    'Inflight wifi service',
    'Inflight entertainment',
    'Online support',
    'Ease of Online booking',
    'On-board service',
    'Leg room service',
    'Baggage handling',
    'Checkin service',
    'Cleanliness',
    'Online boarding',
]]

In [13]:
# Label column for model 2 (passed as `y` to the split).
previsor1 = data['satisfaction']

# Feature frame for model 2; a second name for `new_data1`.
caracteristicas1 = new_data1

In [14]:
# Splitting the dataset: 80% for train and 20% for test.
# random_state pins the shuffle so the split, and every metric computed from
# it, is identical after Restart Kernel -> Run All.
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    caracteristicas1,
    previsor1,
    test_size=0.20,
    shuffle=True,
    random_state=42,
)

In [15]:
# Instantiating the model
# No arguments are passed, so every hyperparameter is left at the library default.
f_log1 = LogisticRegression()

In [16]:
# Fit model 2 on its training split only; the test split is not seen here.
f_log1.fit(X_train1, y_train1)

In [17]:
# Predicting with the model
# Class predictions for the held-out test rows; the bare name on the last line
# displays the array as the cell output.
previsoes1 = f_log1.predict(X_test1)
previsoes1

array([0, 0, 0, ..., 1, 1, 0], dtype=int64)

In [18]:
# Predictions on the training rows themselves.
# NOTE(review): nothing in the visible notebook reads y_previsto_train1;
# presumably it was meant for a train-vs-test comparison — confirm or remove.
y_previsto_train1 = f_log1.predict(X_train1)

## Analysing the Model

### Confusion Matrix


In [19]:
# Confusion matrix for model 2 on the test split.
# Rows are the actual classes and columns the predicted classes, ordered 0 then 1.
confusion_matrix(y_test1, previsoes1)

array([[ 9337,  2370],
       [ 2162, 12107]], dtype=int64)

### Classification Report

In [20]:
# Per-class precision, recall, F1-score and support for model 2 on the test split.
print(classification_report(y_test1, previsoes1))
# recall = hit rate: the share of each class's actual rows that the model gets right

              precision    recall  f1-score   support

           0       0.81      0.80      0.80     11707
           1       0.84      0.85      0.84     14269

    accuracy                           0.83     25976
   macro avg       0.82      0.82      0.82     25976
weighted avg       0.83      0.83      0.83     25976



# NOTE(review): this cell has no execution marker and repeats the Model 2
# column selection verbatim. It rebinds `new_data1` to an equivalent frame and
# nothing after it reads that name — looks like a leftover; confirm and delete.
new_data1 = data[['Customer Type', 'Gender', 'Class', 'Seat comfort',
       'Inflight wifi service', 'Inflight entertainment', 'Online support',
       'Ease of Online booking', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Cleanliness', 'Online boarding']]

## Saving the model to test it

In [22]:
# Persist the fitted model 2 estimator to 'satisfaction_model.pkl' in the
# current working directory.
# The context manager guarantees the file handle is closed (and the pickle
# fully flushed to disk) even if pickle.dump raises.
# Only unpickle this file from a trusted source: pickle.load can execute
# arbitrary code.
with open('satisfaction_model.pkl', 'wb') as model_file:
    pickle.dump(f_log1, model_file)