# Model training and hyperparameter tuning

O objetivo deste notebook é realizar testes de treino em diferentes modelos e diferentes hiperparâmetros

<u>Output</u>: Artefatos do modelo treinado

In [1]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, log_loss, make_scorer, confusion_matrix, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import numpy as np

## 1. Load prepare and split data

In [2]:
df = pd.read_csv('../data/train_test/full_modeling_dataset.csv', dtype = {'emissor_cartao': 'str'})

In [3]:
df.head(3)

Unnamed: 0,Valor,CBK,periodo_do_dia,dia_da_semana,emissor_cartao
0,36.54,0,madrugada,Sexta-feira,36518
1,36.54,0,madrugada,Sexta-feira,36518
2,69.0,0,madrugada,Sexta-feira,53211


In [4]:
df.dtypes

Valor             float64
CBK                 int64
periodo_do_dia     object
dia_da_semana      object
emissor_cartao     object
dtype: object

#### 1.1 Data split

In [5]:
X = df.drop(columns='CBK').copy()
y = df['CBK'].copy()

In [6]:
# Train test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train val
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2


In [7]:
print(f'Train data points:{X_train.shape[0]}')
print(f'Test data points:{X_test.shape[0]}')
print(f'Validation data points:{X_val.shape[0]}')

Train data points:6602
Test data points:2201
Validation data points:2201


# 2. Model experimenting

Utils

In [8]:
def evaluate_model(model, X_val, y_val):
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred, average='macro')
    recall = recall_score(y_val, y_pred, average='macro')
    f1 = f1_score(y_val, y_pred, average='macro')
    return accuracy, precision, recall, f1

## 2.1 Primeiro experimento: Dados desbalanceados, varios modelos

#### 2.1.1 Transformacao dos dados

In [9]:
numeric_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(exclude=[np.number]).columns.tolist()

numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)
X_test_transformed = preprocessor.transform(X_test)

# Para lgb com suporte a categ[oricas]
X_train_lgb = X_train.copy()
X_val_lgb = X_val.copy()
X_test_lgb = X_test.copy()

for col in categorical_features:
    X_train_lgb[col] = X_train_lgb[col].astype('category')
    X_val_lgb[col] = X_val_lgb[col].astype('category')
    X_test_lgb[col] = X_test_lgb[col].astype('category')

In [10]:
param_grid_lr = {'C': [0.1, 1, 10]}
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_depth': [3, 6, 9]}
param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
param_grid_xgb = {'n_estimators': [50, 100, 200], 'max_depth': [3, 6, 9], 'learning_rate': [0.01, 0.1, 0.2]}

param_grids = {
    'Logistic Regression': (LogisticRegression(), param_grid_lr),
    'Random Forest': (RandomForestClassifier(), param_grid_rf),
    'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), param_grid_xgb),
}

LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
best_models = {}
for name, (model, param_grid) in param_grids.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring=LogLoss)
    grid_search.fit(X_train_transformed, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"{name} best params: {grid_search.best_params_}")



Logistic Regression best params: {'C': 10}
Random Forest best params: {'max_depth': 9, 'n_estimators': 100}
XGBoost best params: {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 200}


In [11]:
for name, model in best_models.items():
    y_pred = model.predict(X_val_transformed)
    report = classification_report(y_val, y_pred)
    print(f"{name}")
    print(report)

Logistic Regression
              precision    recall  f1-score   support

           0       0.96      0.99      0.97      2087
           1       0.59      0.15      0.24       114

    accuracy                           0.95      2201
   macro avg       0.77      0.57      0.61      2201
weighted avg       0.94      0.95      0.94      2201

Random Forest
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      2087
           1       0.00      0.00      0.00       114

    accuracy                           0.95      2201
   macro avg       0.47      0.50      0.49      2201
weighted avg       0.90      0.95      0.92      2201

XGBoost
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      2087
           1       0.82      0.41      0.55       114

    accuracy                           0.97      2201
   macro avg       0.90      0.70      0.77      2201
weighted avg       0.96      0.9

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
param_grid_lgb = {
    'n_estimators': [50, 100, 200],
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 6, 9]
}

# Perform grid search with cross-validation
grid_search = GridSearchCV(LGBMClassifier(verbosity=-1), param_grid_lgb, cv=5, scoring=LogLoss)
grid_search.fit(X_train_lgb, y_train, categorical_feature=categorical_features)
best_model = grid_search.best_estimator_

KeyboardInterrupt: 

In [None]:
grid_search.best_params_

In [None]:
best_model

In [None]:
y_pred_lgb = best_model.predict(X_val_lgb)
report = classification_report(y_val, y_pred_lgb)
print(f"LGB Classifier")
print(report)

In [None]:
cm = confusion_matrix(y_val, y_pred_lgb, labels=best_model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

## 2.1 Segundo experimento: Dados balanceados, lightgbm

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [None]:
X_full_transformed = preprocessor.transform(X)

In [None]:

print("Original class distribution:", Counter(y))

# SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_full_transformed, y)

# RandomUnderSampler
undersample = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled_under, y_resampled_under = undersample.fit_resample(X_full_transformed, y)

# Class distribution after resampling
print("Class distribution after SMOTE:", Counter(y_resampled_smote))
print("Class distribution after RandomUnderSampler:", Counter(y_resampled_under))

In [None]:
y_train

# Todo: Anomaly unsupervised

In [None]:
import sys
sys.path.append('..')
from src.data.make_dataset import data_preparation
from src.features.build_features import cbk_feature_engineering

In [None]:
unsupervised_test = pd.read_excel('../data/raw/Missão_Stone_-_Dados_de_trx_(3).xlsx', engine='openpyxl', sheet_name='Aba 2')

In [None]:
def cbk_data_prep(df, test=False):
    df = data_preparation(df, test)
    df = cbk_feature_engineering(df)
    return df

processed_test = cbk_data_prep(test, test=True)