## Import

In [42]:
# Importing the libraries

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import pickle
import xgboost as xgb

ModuleNotFoundError: No module named 'xgboost'

In [2]:
# Importing the dataset

url = 'https://raw.githubusercontent.com/liliansom/ML_CustomerSatisfaction/main/data/Invistico_Airline_treated.csv'
data = pd.read_csv(url)

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,satisfaction,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Seat comfort,Departure/Arrival time convenient,...,Online support,Ease of Online booking,On-board service,Leg room service,Baggage handling,Checkin service,Cleanliness,Online boarding,Departure Delay in Minutes,Arrival Delay in Minutes
0,0,1,0,0,65,1,1,265,0,0,...,2,3,3,0,3,5,3,2,0,0.0
1,1,1,1,0,47,1,0,2464,0,0,...,2,3,4,4,4,2,3,2,310,305.0
2,2,1,0,0,15,1,1,2138,0,0,...,2,2,3,3,4,4,4,2,0,0.0
3,3,1,0,0,60,1,1,623,0,0,...,3,1,1,0,1,4,1,3,0,0.0
4,4,1,0,0,70,1,1,354,0,0,...,4,2,2,0,2,4,2,5,0,0.0


## Model 1

In [4]:
# Selecting the variables that will be used at this model
new_data = data[['Inflight entertainment', 
                'Ease of Online booking', 
                'On-board service', 
                'Online support', 
                'Leg room service', 
                'Online boarding',
                'Checkin service',
                'Baggage handling',
                'Cleanliness',
                'Seat comfort',
                'Inflight wifi service',
                 'Gender',
                 'Class',
                 'Customer Type'
                ]]

caracteristicas = new_data
previsor = data['satisfaction']

In [5]:
# Spliting the dataset: 80% for train and 10% for test

X_train, X_test, y_train, y_test = train_test_split(caracteristicas, previsor, test_size=0.10, shuffle=True)

In [6]:
# Instantiating the model

f_log = LogisticRegression()

In [7]:
f_log.fit(X_train, y_train)

In [8]:
# Predicting with the model
previsoes = f_log.predict(X_test)
previsoes

array([0, 1, 1, ..., 0, 1, 1], dtype=int64)

In [9]:
y_previsto_train = f_log.predict(X_train)

## Analysis of the Model

### Confusion Matrix
A confusion matrix is a performance measurement tool used in machine learning to evaluate the accuracy of a classification model. It is a table that visualizes the performance of a model by comparing the actual labels of a dataset with the predicted labels generated by the model.

The confusion matrix consists of four key components:

1) True Positive (TP): The number of observations that are correctly predicted as positive or belonging to the positive class.

2) True Negative (TN): The number of observations that are correctly predicted as negative or belonging to the negative class.

3) False Positive (FP): The number of observations that are incorrectly predicted as positive when they actually belong to the negative class. Also known as a Type I error.

4) False Negative (FN): The number of observations that are incorrectly predicted as negative when they actually belong to the positive class. Also known as a Type II error.

### Classification Report
A classification report is a summary of the performance metrics for a classification model. It provides a comprehensive evaluation of the model's predictive ability for each class in a multi-class classification problem.

The classification report typically includes the following metrics for each class:

Precision: It measures the proportion of true positive predictions among all positive predictions. It indicates how well the model correctly identifies positive instances for a given class.

Recall: Also known as sensitivity or true positive rate, it measures the proportion of true positive predictions among all actual positive instances. It indicates the model's ability to correctly identify positive instances for a given class.

F1-score: The F1-score is the harmonic mean of precision and recall. It provides a balanced measure of the model's performance, taking into account both precision and recall. It is often used as a single metric to evaluate the model's overall performance.

Support: Support refers to the number of occurrences of each class in the test dataset. It provides insights into the distribution of instances across different classes.

The classification report helps in understanding the strengths and weaknesses of the model for each class and can be useful in making decisions regarding model optimization or class-specific performance improvements.

In [10]:
confusion_matrix(y_test, previsoes)

array([[4729, 1153],
       [1056, 6050]], dtype=int64)

In [11]:
print(classification_report(y_test, previsoes))
# recall = acertividade

              precision    recall  f1-score   support

           0       0.82      0.80      0.81      5882
           1       0.84      0.85      0.85      7106

    accuracy                           0.83     12988
   macro avg       0.83      0.83      0.83     12988
weighted avg       0.83      0.83      0.83     12988



## Model 2 - Balanced dataset

In [16]:
# Identifique a coluna específica em que você deseja igualar os valores 0 e 1
coluna_alvo = 'satisfaction'

# Separe as linhas com valor 0 e 1 em DataFrames diferentes
dados_0 = data[data[coluna_alvo] == 0]
dados_1 = data[data[coluna_alvo] == 1]

# Obtenha o número mínimo de linhas entre os dois DataFrames
minimo_linhas = min(len(dados_0), len(dados_1))

# Amostra aleatória das linhas com valor 0
dados_0_subamostrados = dados_0.sample(n=minimo_linhas, random_state=42)

# Amostra aleatória das linhas com valor 1
dados_1_subamostrados = dados_1.sample(n=minimo_linhas, random_state=42)

# Concatene os DataFrames subamostrados de volta em um único DataFrame
data_balanceado = pd.concat([dados_0_subamostrados, dados_1_subamostrados])

# Embaralhe as linhas do DataFrame resultante
data_bal = data_balanceado.sample(frac=1, random_state=42)


In [17]:
data_bal['satisfaction'].value_counts()

1    58793
0    58793
Name: satisfaction, dtype: int64

In [26]:
# Selecting the variables that will be used at this model
new_data1 = data_bal[['Inflight entertainment', 
                'Ease of Online booking', 
                'On-board service', 
                'Online support', 
                'Leg room service', 
                'Online boarding',
                'Checkin service',
                'Baggage handling',
                'Cleanliness',
                'Seat comfort',
                'Inflight wifi service',
                 'Gender',
                 'Class',
                 'Customer Type',
                 
                ]]

caracteristicas1 = new_data1
previsor1 = data_bal['satisfaction']

In [27]:
# Spliting the dataset: 80% for train and 20% for test
X_train1, X_test1, y_train1, y_test1 = train_test_split(caracteristicas1, previsor1, test_size=0.10, shuffle=True)

In [28]:
# Instantiating the model
f_log1 = LogisticRegression()

In [29]:
f_log1.fit(X_train1, y_train1)

In [30]:
# Predicting with the model
previsoes1 = f_log1.predict(X_test1)
previsoes1

array([0, 1, 0, ..., 0, 0, 1], dtype=int64)

In [31]:
y_previsto_train1 = f_log1.predict(X_train1)

## Analysing the Model

### Confusion Matrix


In [32]:
confusion_matrix(y_test1, previsoes1)

array([[5037,  948],
       [1026, 4748]], dtype=int64)

### Classification Report

In [33]:
print(classification_report(y_test1, previsoes1))
# recall = acertividade

              precision    recall  f1-score   support

           0       0.83      0.84      0.84      5985
           1       0.83      0.82      0.83      5774

    accuracy                           0.83     11759
   macro avg       0.83      0.83      0.83     11759
weighted avg       0.83      0.83      0.83     11759



new_data1 = data[['Customer Type', 'Gender', 'Class', 'Seat comfort',
       'Inflight wifi service', 'Inflight entertainment', 'Online support',
       'Ease of Online booking', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Cleanliness', 'Online boarding']]

# Model 3 - More variables

In [35]:
# Selecting the variables that will be used at this model
new_data2 = data_bal[['Inflight entertainment', 
                'Ease of Online booking', 
                'On-board service', 
                'Online support', 
                'Leg room service', 
                'Online boarding',
                'Checkin service',
                'Baggage handling',
                'Cleanliness',
                'Seat comfort',
                'Inflight wifi service',
                 'Gender',
                 'Class',
                 'Customer Type'                 
                ]]

caracteristicas2 = new_data2
previsor2 = data_bal['satisfaction']

In [37]:
# Spliting the dataset: 80% for train and 20% for test
X_train2, X_test2, y_train2, y_test2 = train_test_split(caracteristicas2, previsor2, test_size=0.10, shuffle=True)

In [38]:
# Criar a matriz de dados específica do XGBoost
dados_treino = xgb.DMatrix(X_treino, label=y_treino)

In [39]:
# Definir os parâmetros do modelo
parametros = {
    'max_depth': 3,
    'eta': 0.1,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss'
}

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
# Treinar o modelo
modelo = xgb.train(parametros, dados_treino, num_boost_round=100)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Age
- Arrival Delay in Minutes
- Departure Delay in Minutes
- Departure/Arrival time convenient
- Flight Distance
- ...


In [41]:
# Prever os rótulos do conjunto de teste
dados_teste = xgb.DMatrix(X_teste)
predicoes = modelo.predict(dados_teste)

In [None]:
# Arredondar as predições para obter rótulos binários
predicoes_binarias = [round(valor) for valor in predicoes]

# Analysing the Model

In [None]:
# Avaliar a precisão do modelo
precisao = accuracy_score(y_teste, predicoes_binarias)
print("Precisão: %.2f%%" % (precisao * 100.0))

## Saving the model to test it

In [None]:
#pickle.dump(f_log1, open('satisfaction_model.pkl', 'wb'))