# Imports

In [1]:
import pandas   as pd
import numpy    as np

from matplotlib         import pyplot           as plt
from sklearn            import metrics          as mt
from sklearn            import ensemble         as en
from sklearn            import model_selection  as ms

# Load Dataset

In [2]:
# Training split: feature table and label table
x_train, y_train = (
    pd.read_csv('../../dataset/class/X_training.csv'),
    pd.read_csv('../../dataset/class/y_training.csv'),
)

# Test split: feature table and label table
x_test, y_test = (
    pd.read_csv('../../dataset/class/X_test.csv'),
    pd.read_csv('../../dataset/class/y_test.csv'),
)

# Validation split: feature table and label table
x_val, y_val = (
    pd.read_csv('../../dataset/class/X_validation.csv'),
    pd.read_csv('../../dataset/class/y_validation.csv'),
)

In [3]:
# Feature selection: columns of the feature tables that are fed to the model
features = [  'customer_type', 'age', 'class', 'flight_distance',
              'inflight_wifi_service', 'departure_arrival_time_convenient',
              'ease_of_online_booking', 'gate_location', 'food_and_drink',
              'online_boarding', 'seat_comfort', 'inflight_entertainment',
              'on_board_service', 'leg_room_service', 'baggage_handling',
              'checkin_service', 'inflight_service', 'cleanliness',
              'departure_delay_in_minutes', 'arrival_delay_in_minutes',
              'gender_Female', 'gender_Male', 'type_of_travel_business_travel',
              'type_of_travel_personal_travel']

# Keep only the selected columns, in the order listed above, for every split
x_train = x_train.loc[:, features]
x_val = x_val.loc[:, features]
x_test = x_test.loc[:, features]

# Flatten the label tables into 1-D arrays, the shape scikit-learn expects for y.
# np.ravel accepts both a DataFrame and an ndarray, so this cell can be re-run
# safely; the previous `.values.ravel()` raised AttributeError on a second run
# because the names had already been rebound to ndarrays.
# NOTE(review): presumably each label file holds a single column — confirm.
y_train = np.ravel(y_train)
y_val = np.ravel(y_val)
y_test = np.ravel(y_test)

# Model Training - RandomForestClassifier (Training Data)

## Definindo os melhores parâmetros da RandomForest

In [4]:
# Hyperparameter grid explored by the search (4 x 5 = 20 combinations)
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30, 50]
}

# GridSearchCV configuration.
# The estimator is seeded with the same random_state used by the models trained
# later in this notebook, so the selected hyperparameters are reproducible
# across "Restart & Run All" instead of varying with the forest's randomness.
grid_search = ms.GridSearchCV(
    estimator = en.RandomForestClassifier(random_state=0),
    param_grid = param_grid,
    cv = 5,  # 5-fold cross-validation
    scoring = 'accuracy',  # Metric used to rank the candidates
    n_jobs = -1  # Use all available cores
)

# Run the search on the training split only
grid_search.fit(x_train, y_train)

# Best hyperparameter combination found
print("Melhores parâmetros:", grid_search.best_params_)



Melhores parâmetros: {'max_depth': 30, 'n_estimators': 500}


## Model Training

In [5]:
# Define the model with the hyperparameters selected by the grid search
model = en.RandomForestClassifier(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_depth=grid_search.best_params_['max_depth'],
    random_state=0,
)

# Fit on the training split and predict that same split
model.fit(x_train, y_train)
yhat_train = model.predict(x_train)

# Performance on the training data.
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them
# reversed, as before, swaps the reported precision and recall
# (accuracy and binary F1 are symmetric and were unaffected).
acc_train = mt.accuracy_score(y_train, yhat_train)
prec_train = mt.precision_score(y_train, yhat_train)
recall_train = mt.recall_score(y_train, yhat_train)
f1_train = mt.f1_score(y_train, yhat_train)

# Model Training - RandomForestClassifier (Validation Data)

In [6]:
# Define the model with the hyperparameters selected by the grid search
model = en.RandomForestClassifier(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_depth=grid_search.best_params_['max_depth'],
    random_state=0,
)

# Fit on the training split, then predict the held-out validation split
model.fit(x_train, y_train)
yhat_val = model.predict(x_val)

# Performance on the validation data.
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them
# reversed, as before, swaps the reported precision and recall
# (accuracy and binary F1 are symmetric and were unaffected).
acc_val = mt.accuracy_score(y_val, yhat_val)
prec_val = mt.precision_score(y_val, yhat_val)
recall_val = mt.recall_score(y_val, yhat_val)
f1_val = mt.f1_score(y_val, yhat_val)

# Model Training - RandomForestClassifier (Test Data)

In [7]:
# Define the model with the hyperparameters selected by the grid search
model = en.RandomForestClassifier(
    n_estimators=grid_search.best_params_['n_estimators'],
    max_depth=grid_search.best_params_['max_depth'],
    random_state=0,
)

# Fit on training + validation data together, then predict the test split.
# The feature tables are stacked with pd.concat rather than np.concatenate so
# the column names are kept: fitting on a bare ndarray and then predicting on a
# DataFrame makes scikit-learn warn that the model was fitted without feature
# names.
model.fit(
    pd.concat([x_train, x_val], ignore_index=True),
    np.concatenate((y_train, y_val)),
)
yhat_test = model.predict(x_test)

# Performance on the test data.
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them
# reversed, as before, swaps the reported precision and recall
# (accuracy and binary F1 are symmetric and were unaffected).
acc_test = mt.accuracy_score(y_test, yhat_test)
prec_test = mt.precision_score(y_test, yhat_test)
recall_test = mt.recall_score(y_test, yhat_test)
f1_test = mt.f1_score(y_test, yhat_test)



# Save Results

In [8]:
# Column layout shared by the three result rows
metric_columns = ("Algorithm", "Accuracy", "Precision", "Recall", "F1")
algorithm_label = "Random Forest Classifier"

# One row per data split; every score is rounded to 3 decimal places
train_metrics = dict(zip(
    metric_columns,
    (algorithm_label,
     *[np.round(score, 3) for score in (acc_train, prec_train, recall_train, f1_train)]),
))
validation_metrics = dict(zip(
    metric_columns,
    (algorithm_label,
     *[np.round(score, 3) for score in (acc_val, prec_val, recall_val, f1_val)]),
))
test_metrics = dict(zip(
    metric_columns,
    (algorithm_label,
     *[np.round(score, 3) for score in (acc_test, prec_test, recall_test, f1_test)]),
))

# Append each row to its CSV file. mode="a" adds to whatever the file already
# contains (so re-running this cell appends another row) and header=False
# writes no column names; `index` is left at its default.
for metrics_row, csv_path in (
    (train_metrics, "./clas_train_metrics.csv"),
    (validation_metrics, "./clas_validation_metrics.csv"),
    (test_metrics, "./clas_test_metrics.csv"),
):
    pd.DataFrame(metrics_row, index=[0]).to_csv(csv_path, mode="a", header=False)