# Imports

In [1]:
import pandas   as pd
import numpy    as np

from matplotlib         import pyplot       as plt
from sklearn            import metrics      as mt
from sklearn            import linear_model as lm
from sklearn            import model_selection  as ms

# Load Dataset

In [2]:
#Lendo arquivo CSV de Treino
x_train = pd.read_csv('../../dataset/class/X_training.csv')
y_train = pd.read_csv('../../dataset/class/y_training.csv')

#Lendo arquivo CSV de teste
x_test = pd.read_csv('../../dataset/class/X_test.csv')
y_test = pd.read_csv('../../dataset/class/y_test.csv')

#Lendo arquivo CSV de Validação
x_val = pd.read_csv('../../dataset/class/X_validation.csv')
y_val = pd.read_csv('../../dataset/class/y_validation.csv')

In [3]:
#Seleção de Features
features = [  'customer_type', 'age', 'class', 'flight_distance',
              'inflight_wifi_service', 'departure_arrival_time_convenient',
              'ease_of_online_booking', 'gate_location', 'food_and_drink',
              'online_boarding', 'seat_comfort', 'inflight_entertainment',
              'on_board_service', 'leg_room_service', 'baggage_handling',
              'checkin_service', 'inflight_service', 'cleanliness',
              'departure_delay_in_minutes', 'arrival_delay_in_minutes',
              'gender_Female', 'gender_Male', 'type_of_travel_business_travel',
              'type_of_travel_personal_travel']

#Tratamento dos dados
x_train = x_train.loc[:,features]
y_train = y_train.values.ravel()

x_val = x_val.loc[:,features]
y_val = y_val.values.ravel()

x_test = x_test.loc[:,features]
y_test = y_test.values.ravel()

# Model Training - LogisticRegression (Training Data)

## Definindo os melhores parametros da LogisticRegression

In [4]:
# Definindo os hiperparâmetros para o GridSearch
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear', 'saga','newton-cholesky'],
    'max_iter': [1000]
}
model = lm.LogisticRegression()

# Configuração do GridSearchCV
grid = ms.GridSearchCV(estimator=model, 
                       param_grid=param_grid, 
                       cv=5, 
                       scoring='accuracy')

# Ajuste aos dados
grid.fit(x_train, y_train)

# Melhores parâmetros
print("Melhores parâmetros:", grid.best_params_)

Melhores parâmetros: {'C': 1, 'max_iter': 1000, 'solver': 'saga'}


## Model Training

In [5]:
#define
model = lm.LogisticRegression(C=grid.best_params_['C'],
                              solver=grid.best_params_['solver'],
                              max_iter=grid.best_params_['max_iter'])

#fit
model.fit(x_train,y_train)

yhat_train = model.predict(x_train)

#performance
acc_train = mt.accuracy_score(yhat_train,y_train)
print(f'Accuracy: {acc_train}')

prec_train = mt.precision_score(yhat_train,y_train)
print(f'Precision: {prec_train}')

recall_train = mt.recall_score(yhat_train,y_train)
print(f'Recall: {prec_train}')

f1_train = mt.f1_score(yhat_train,y_train)
print(f'F1-Score: {prec_train}')

Accuracy: 0.8752120250982556
Precision: 0.8363561155657375
Recall: 0.8363561155657375
F1-Score: 0.8363561155657375


# Model Training - LogisticRegression (Validation Data)

In [6]:
#define
model = lm.LogisticRegression(C=grid.best_params_['C'],
                              solver=grid.best_params_['solver'],
                              max_iter=grid.best_params_['max_iter'])

#fit
model.fit(x_train,y_train)

yhat_val = model.predict(x_val)

#performance
acc_val = mt.accuracy_score(yhat_val,y_val)
print(f'Accuracy: {acc_val}')

prec_val = mt.precision_score(yhat_val,y_val)
print(f'Precision: {prec_val}')

recall_val = mt.recall_score(yhat_val,y_val)
print(f'Recall: {prec_val}')

f1_val = mt.f1_score(yhat_val,y_val)
print(f'F1-Score: {prec_val}')

Accuracy: 0.8739019916985746
Precision: 0.8351770732793823
Recall: 0.8351770732793823
F1-Score: 0.8351770732793823


# Model Training - LogisticRegression (Test Data)

In [7]:
#define
model = lm.LogisticRegression(C=grid.best_params_['C'],
                              solver=grid.best_params_['solver'],
                              max_iter=grid.best_params_['max_iter'])

#fit
model.fit(np.concatenate((x_train,x_val)),
          np.concatenate((y_train,y_val)))

yhat_test = model.predict(x_test)

#performance
acc_test = mt.accuracy_score(yhat_test,y_test)
print(f'Accuracy: {acc_test}')

prec_test = mt.precision_score(yhat_test,y_test)
print(f'Precision: {prec_test}')

recall_test = mt.recall_score(yhat_test,y_test)
print(f'Recall: {prec_test}')

f1_test = mt.f1_score(yhat_test,y_test)
print(f'F1-Score: {prec_test}')

Accuracy: 0.8713938129996525
Precision: 0.8329080510338759
Recall: 0.8329080510338759
F1-Score: 0.8329080510338759




# Save Results

In [8]:
train_metrics = {
    "Algorithm": "LogisticRegression Classifier",
    "Accuracy": np.round(acc_train, 3),
    "Precision": np.round(prec_train, 3),
    "Recall": np.round(recall_train, 3),
    "F1": np.round(f1_train, 3)
}
validation_metrics = {
    "Algorithm": "LogisticRegression Classifier",
    "Accuracy": np.round(acc_val, 3),
    "Precision": np.round(prec_val, 3),
    "Recall": np.round(recall_val, 3),
    "F1": np.round(f1_val, 3)
}
test_metrics = {
    "Algorithm": "LogisticRegression Classifier",
    "Accuracy": np.round(acc_test, 3),
    "Precision": np.round(prec_test, 3),
    "Recall": np.round(recall_test, 3),
    "F1": np.round(f1_test, 3)
}

pd.DataFrame(train_metrics, index=[0]).to_csv(
    "./clas_train_metrics.csv", mode="a", header=False
)
pd.DataFrame(validation_metrics, index=[0]).to_csv(
    "./clas_validation_metrics.csv", mode="a", header=False
)
pd.DataFrame(test_metrics, index=[0]).to_csv(
    "./clas_test_metrics.csv", mode="a", header=False)