In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from skopt import BayesSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import logging
import warnings
warnings.filterwarnings("ignore")

In [54]:
data = pd.read_csv('../../data/preprocessed/preprocessed_data_combined.csv')

features = ['personal_loan', 'securities_account', 'cd_account', 'online',
            'cat__age_bracket_name_Baby boomers', 'cat__age_bracket_name_Generation X',
            'cat__age_bracket_name_Generation Z', 'cat__age_bracket_name_Millennials',
            'cat__education_ensino_medio', 'cat__education_ensino_superior',
            'cat__education_pos_graduacao']

X, y = data[features], data['credit_card']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42
)

rus = RandomUnderSampler(random_state=42)

X_res, y_res = rus.fit_resample(X_train, y_train)

In [57]:
def rand_search_cv(model, param_grid):
        rand_search = RandomizedSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1, scoring='precision')
        rand_search.fit(X_res, y_res)

        best_model = rand_search.best_estimator_
        predictions = best_model.predict(X_test)
        best_params = rand_search.best_params_
        best_score = rand_search.best_score_

        accuracy = accuracy_score(predictions, y_test)
        recall = recall_score(predictions, y_test)
        precision = precision_score(predictions, y_test)
        f1 = f1_score(predictions, y_test)

        print('Best Model: {}'.format(best_model))
        print('Best Params: {}'.format(best_params))
        print('Best Score: {}'.format(best_score))

        print('Acurácia: {}'.format(accuracy))
        print('Revocação: {}'.format(recall))
        print('Precisão: {}'.format(precision))
        print('F1 Score: {}'.format(f1))

        return


In [58]:
def grid_search_cv(model, param_grid):
        grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1, scoring='precision')
        grid_search.fit(X_res, y_res)

        best_model = grid_search.best_estimator_
        predictions = best_model.predict(X_test)
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        accuracy = accuracy_score(predictions, y_test)
        recall = recall_score(predictions, y_test)
        precision = precision_score(predictions, y_test)
        f1 = f1_score(predictions, y_test)

        print('Best Model: {}'.format(best_model))
        print('Best Params: {}'.format(best_params))
        print('Best Score: {}'.format(best_score))

        print('Acurácia: {}'.format(accuracy))
        print('Revocação: {}'.format(recall))
        print('Precisão: {}'.format(precision))
        print('F1 Score: {}'.format(f1))

        return

In [59]:
def bayesian_search_cv(model, param_grid):
        bayesian_search = BayesSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1, scoring='precision')
        bayesian_search.fit(X_res, y_res)

        best_model = bayesian_search.best_estimator_
        predictions = best_model.predict(X_test)
        best_params = bayesian_search.best_params_
        best_score = bayesian_search.best_score_

        accuracy = accuracy_score(predictions, y_test)
        recall = recall_score(predictions, y_test)
        precision = precision_score(predictions, y_test)
        f1 = f1_score(predictions, y_test)

        print('Best Model: {}'.format(best_model))
        print('Best Params: {}'.format(best_params))
        print('Best Score: {}'.format(best_score))

        print('Acurácia: {}'.format(accuracy))
        print('Revocação: {}'.format(recall))
        print('Precisão: {}'.format(precision))
        print('F1 Score: {}'.format(f1))

        return

In [61]:
# DecisionTreeClassifier
decision_tree_params = {
    "criterion": ["gini", "entropy", "log_loss"],  # Critério para medir qualidade do split
    "splitter": ["best", "random"],  # Estratégia para dividir o nó
    "max_depth": [None, 10, 20, 30, 50],  # Profundidade máxima da árvore
    "min_samples_split": [2, 5, 10],  # Mínimo de amostras para dividir um nó
    "min_samples_leaf": [1, 2, 5],  # Mínimo de amostras em uma folha
    "max_features": [None, "sqrt", "log2"],  # Número de features a considerar em cada split
    "class_weight": [None, "balanced"]  # Balanceamento entre classes
}

# KNeighborsClassifier
knn_params = {
    "n_neighbors": [3, 5, 10, 15],  # Número de vizinhos
    "weights": ["uniform", "distance"],  # Peso dos vizinhos
    "metric": ["minkowski", "euclidean", "manhattan"],  # Métrica de distância
    "p": [1, 2]  # Potência para Minkowski (1=Manhattan, 2=Euclidean)
}

# RandomForestClassifier
random_forest_params = {
    "n_estimators": [100, 200, 500],  # Número de árvores
    "criterion": ["gini", "entropy", "log_loss"],  # Critério de divisão
    "max_depth": [None, 10, 20, 30, 50],  # Profundidade máxima
    "min_samples_split": [2, 5, 10],  # Mínimo de amostras para dividir um nó
    "min_samples_leaf": [1, 2, 5],  # Mínimo de amostras em uma folha
    "max_features": [None, "sqrt", "log2"],  # Número de features para cada split
    "bootstrap": [True, False],  # Uso de amostras de bootstrapping
    "class_weight": [None, "balanced", "balanced_subsample"]  # Balanceamento entre classes
}

# GradientBoostingClassifier
gradient_boosting_params = {
    "n_estimators": [100, 200, 500],  # Número de estágios de boosting
    "learning_rate": [0.01, 0.1, 0.2],  # Taxa de aprendizado
    "max_depth": [3, 5, 10],  # Profundidade máxima das árvores
    "min_samples_split": [2, 5, 10],  # Mínimo de amostras para dividir um nó
    "min_samples_leaf": [1, 2, 5],  # Mínimo de amostras em uma folha
    "subsample": [0.8, 1.0],  # Fração de amostras usadas para treinar cada base learner
    "max_features": [None, "sqrt", "log2"]  # Número de features por split
}

# AdaBoostClassifier
adaboost_params = {
    "n_estimators": [50, 100, 200],  # Número de estimadores
    "learning_rate": [0.01, 0.1, 1.0],  # Taxa de aprendizado
    "algorithm": ["SAMME", "SAMME.R"],  # Algoritmo de boosting
    "base_estimator": [None]  # Normalmente DecisionTreeClassifier
}

# XGBClassifier
xgb_params = {
    "n_estimators": [100, 200, 500],  # Número de árvores
    "learning_rate": [0.01, 0.05, 0.1],  # Taxa de aprendizado
    "max_depth": [3, 5, 10],  # Profundidade máxima da árvore
    "min_child_weight": [1, 3, 5],  # Peso mínimo de uma folha
    "gamma": [0, 0.1, 0.2],  # Redução mínima na perda para split
    "subsample": [0.8, 1.0],  # Fração de amostras usadas
    "colsample_bytree": [0.8, 1.0],  # Fração de features usadas por árvore
    "reg_alpha": [0, 0.1, 1],  # Regularização L1
    "reg_lambda": [1, 10, 100],  # Regularização L2
    "scale_pos_weight": [1, 10, 50]  # Balanceamento de classes desbalanceadas
}

# XGBRFClassifier
xgbrf_params = {
    "n_estimators": [100, 200, 500],  # Número de árvores
    "learning_rate": [0.01, 0.05, 0.1],  # Taxa de aprendizado
    "max_depth": [3, 5, 10],  # Profundidade máxima da árvore
    "min_child_weight": [1, 3, 5],  # Peso mínimo de uma folha
    "gamma": [0, 0.1, 0.2],  # Redução mínima na perda para split
    "subsample": [0.8, 1.0],  # Fração de amostras usadas
    "colsample_bytree": [0.8, 1.0],  # Fração de features usadas por árvore
    "reg_alpha": [0, 0.1, 1],  # Regularização L1
    "reg_lambda": [1, 10, 100],  # Regularização L2
    "scale_pos_weight": [1, 10, 50],  # Balanceamento de classes desbalanceadas
    "base_score": [0.5, 0.6, 0.7]  # Previsão inicial
}


#### Modelos a serem treinados

- DecisionTreeClassifier
- KNeighborsClassifier
- RandomForestClassifier
- GradientBoostingClassifier
- AdaBoostClassifier
- XGBClassifier
- XGBRFClassifier

##### DecisionTreeClassifier

In [62]:
rand_search_cv(DecisionTreeClassifier(), decision_tree_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Model: DecisionTreeClassifier(max_depth=30, max_features='log2', min_samples_leaf=2,
                       min_samples_split=5)
Best Params: {'splitter': 'best', 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'log2', 'max_depth': 30, 'criterion': 'gini', 'class_weight': None}
Best Score: 0.6148474291188755
Acurácia: 0.6432947583390062
Revocação: 0.391304347826087
Precisão: 0.34851936218678814
F1 Score: 0.3686746987951807


In [63]:
grid_search_cv(DecisionTreeClassifier(), decision_tree_params)

Fitting 5 folds for each of 1620 candidates, totalling 8100 fits
Best Model: DecisionTreeClassifier(criterion='log_loss', max_features='sqrt',
                       min_samples_leaf=5, min_samples_split=10,
                       splitter='random')
Best Params: {'class_weight': None, 'criterion': 'log_loss', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 10, 'splitter': 'random'}
Best Score: 0.6312165072343745
Acurácia: 0.65214431586113
Revocação: 0.4032258064516129
Precisão: 0.3416856492027335
F1 Score: 0.36991368680641185


In [64]:
bayesian_search_cv(DecisionTreeClassifier(), decision_tree_params)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

##### KNN

In [65]:
rand_search_cv(KNeighborsClassifier(), knn_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Model: KNeighborsClassifier(n_neighbors=3)
Best Params: {'weights': 'uniform', 'p': 2, 'n_neighbors': 3, 'metric': 'minkowski'}
Best Score: 0.8644924812030077
Acurácia: 0.7256637168141593
Revocação: 0.7307692307692307
Precisão: 0.12984054669703873
F1 Score: 0.2205029013539652


In [66]:
grid_search_cv(KNeighborsClassifier(), knn_params)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Model: KNeighborsClassifier(n_neighbors=3, p=1)
Best Params: {'metric': 'minkowski', 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}
Best Score: 0.8644924812030077
Acurácia: 0.7256637168141593
Revocação: 0.7307692307692307
Precisão: 0.12984054669703873
F1 Score: 0.2205029013539652


In [67]:
bayesian_search_cv(KNeighborsClassifier(), knn_params)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

##### RandomForest

In [68]:
rand_search_cv(RandomForestClassifier(), random_forest_params)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Model: RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       criterion='entropy', max_depth=20, min_samples_split=5,
                       n_estimators=200)
Best Params: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 20, 'criterion': 'entropy', 'class_weight': 'balanced', 'bootstrap': False}
Best Score: 0.6086397199423803
Acurácia: 0.6466984343090538
Revocação: 0.398989898989899
Precisão: 0.35990888382687924
F1 Score: 0.3784431137724551


In [69]:
grid_search_cv(RandomForestClassifier(), random_forest_params)


Fitting 5 folds for each of 7290 candidates, totalling 36450 fits


KeyboardInterrupt: 

In [None]:
bayesian_search_cv(RandomForestClassifier(), random_forest_params)


##### GradientBoosting

In [None]:
rand_search_cv(GradientBoostingClassifier(), gradient_boosting_params)


In [None]:
grid_search_cv(GradientBoostingClassifier(), gradient_boosting_params)


In [None]:
bayesian_search_cv(GradientBoostingClassifier(), gradient_boosting_params)


##### AdaBoost

In [None]:
rand_search_cv(AdaBoostClassifier(), adaboost_params)


In [None]:
grid_search_cv(AdaBoostClassifier(), adaboost_params)


In [None]:
bayesian_search_cv(AdaBoostClassifier(), adaboost_params)


##### XGBClassfier

In [None]:
rand_search_cv(XGBClassifier(), xgb_params)

In [None]:
grid_search_cv(XGBClassifier(), xgb_params)

In [None]:
bayesian_search_cv(XGBClassifier(), xgb_params)

##### XGBRFClassifier

In [None]:
rand_search_cv(XGBRFClassifier, xgbrf_params)

In [None]:
grid_search_cv(XGBRFClassifier, xgbrf_params)

In [None]:
bayesian_search_cv(XGBRFClassifier, xgbrf_params)