In [1]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, ParameterGrid
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE

import pickle
import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm

from useful.constants import SEED, N_ITER, TRAINVAL_SPLITS
from useful.training_tools import ModelTraining, ModelMetrics

warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', None)

In [2]:
# Carrega dados

with open('../data/processed/german-credit-data.pkl', 'rb') as file: 
    df = pickle.load(file)
    
df = pd.DataFrame(df)

# LOUCURAS

df['age_group'] = pd.cut(df['age'], 
                           bins=[0, 25, 40, 60, 100], 
                           labels=['Young Adult', 'Adult', 'Middle-Aged', 'Senior'])

df['amount_group'] = pd.cut(df['credit_amount'], 
                            bins=[0, 2500, 5000, 10000, 20000], 
                            labels=['Low', 'Medium', 'High', 'Very High'])

df['monthly_payment_ratio'] = df['credit_amount'] / df['duration']

df['debt_to_age_ratio'] = df['credit_amount'] / df['age']


df = pd.get_dummies(df, columns=['checking_status', 'credit_history', 'purpose', 'savings_status',
                                    'employment', 'personal_status', 'other_parties', 'property_magnitude',
                                    'other_payment_plans', 'housing', 'job', 'foreign_worker', 'monthly_payment_ratio',
                                    'debt_to_age_ratio', 'age_group', 'amount_group'],
                      drop_first=True)

In [3]:
# Carrega dados
df = df.copy()
X = df.drop(columns=['Target'])
y = LabelEncoder().fit_transform(df['Target'].values)
y = pd.Series(LabelEncoder().fit_transform(df['Target'].values))


# Divide treino/teste
train_idx, test_idx = next(StratifiedKFold(n_splits=TRAINVAL_SPLITS, shuffle=True, random_state=SEED).split(X, y))
X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# Separa colunas
cat_cols = X.select_dtypes(include='category').columns.tolist()
num_cols = X.select_dtypes(include='number').columns.tolist()

In [4]:
y.value_counts()

1    700
0    300
Name: count, dtype: int64

In [5]:
# Preprocessamento
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols)
])

In [6]:
# Modelos para comparar
models = {
    'Logistic Regression': LogisticRegression(random_state=SEED, max_iter=1000, class_weight='balanced'),
    'Random Forest': RandomForestClassifier(random_state=SEED, class_weight='balanced'),
    'XGBClassifier': XGBClassifier(random_state=SEED, is_unbalance=True),
    'SVM (RBF Kernel)': SVC(random_state=SEED),
    'KNN': KNeighborsClassifier()
}

lr_param_grid = {
    'classifier__solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
    'classifier__penalty': ['l2', 'l1', 'elasticnet', None],
    'classifier__l1_ratio': [None, 0.25, 0.5, 0.75],
    'classifier__C': [1, 0.1, 0.01, 10]
}

lr_valid_params = []
for param in ParameterGrid(param_grid=lr_param_grid):
    if param['classifier__solver'] in ['lbfgs', 'newton-cg', 'sag'] and param['classifier__penalty'] in ['l1', 'elasticnet'] or \
       param['classifier__penalty'] in ['l1', 'l2', None] and param['classifier__l1_ratio'] is not None or \
       param['classifier__penalty'] == 'elasticnet' and param['classifier__l1_ratio'] is None:
        continue

    if param['classifier__penalty'] is None:
        param.pop('classifier__l1_ratio')
        param.pop('classifier__C')

    param = {k: [v] for k, v in param.items()}
    lr_valid_params.append(param)
    
svm_param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001, 0.0001],
    'classifier__kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
    'classifier__degree': [2, 3, 4],
    'classifier__coef0': [0.0, 0.1, 0.5]
}

# SVM param grid - valid params

svm_valid_params = []
for param in ParameterGrid(svm_param_grid):
    kernel = param['classifier__kernel']
    if kernel == 'linear' and ('classifier__degree' in param or 'classifier__coef0' in param):
        continue
    if kernel == 'rbf' and ('classifier__degree' in param or 'classifier__coef0' in param):
        continue
    if kernel in ['poly', 'sigmoid']:
        pass  # degree and coef0 are valid
    else:
        param.pop('classifier__degree', None)
        param.pop('classifier__coef0', None)
    param = {k: [v] for k, v in param.items()}
    svm_valid_params.append(param)

# Param grids
param_grids = {
    'Logistic Regression': lr_valid_params,
    'Random Forest': {
        'classifier__n_estimators': [25, 50],
        'classifier__max_depth': [None, 3],
        'classifier__min_samples_split': [2, 5, 10],
        'classifier__min_samples_leaf': [1, 3, 5,10],
        'classifier__bootstrap': [True, False],
        'classifier__max_features': ['sqrt'],
        'classifier__criterion': ['entropy']
    },
    'XGBClassifier': {
        'oversampling__k_neighbors': [3, 5, 7],
        'oversampling__sampling_strategy': ['minority', 'not majority'],
        'classifier__n_estimators': [100, 300, 500],
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 5, 7],
        'classifier__learning_rate': [0.05, 0.1],
        'classifier__scale_pos_weight': [1, y_train.value_counts()[0] / y_train.value_counts()[1]]
    },
    'SVM (RBF Kernel)': svm_valid_params,
    'KNN': {
        'oversampling__k_neighbors': [3, 5, 7],
        'oversampling__sampling_strategy': ['minority', 'not majority'],
        'classifier__n_neighbors': [3, 5, 7],
        'classifier__weights': ['uniform', 'distance'],
        'classifier__metric': ['euclidean', 'manhattan']
    }
}

In [7]:

lr = ModelTraining(
    'Logistic Regression',
    models['Logistic Regression'],
    param_grids['Logistic Regression'],
    preprocessor,
    X,
    y
)

lr_results, lr_mean_scores, lr_best_model = lr.run(X_train, X_test, y_train, y_test)

Logistic Regression: 100%|██████████| 48/48 [00:28<00:00,  1.68it/s]


In [8]:
lr_mean_scores

Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Model,Class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Logistic Regression,0,0.38,0.55,0.45
Logistic Regression,1,0.76,0.62,0.68


In [9]:
rfc = ModelTraining(
    'Random Forest',
    models['Random Forest'],
    param_grids['Random Forest'],
    preprocessor,
    X,
    y
)


rfc_results, rfc_mean_scores, rfc_best_model = rfc.run(X_train, X_test, y_train, y_test)

In [10]:
#pos label
rfc_mean_scores

Unnamed: 0_level_0,Unnamed: 1_level_0,Precision,Recall,F1 Score
Model,Class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Random Forest,0,0.41,0.37,0.39
Random Forest,1,0.74,0.77,0.76


In [11]:
xgb = ModelTraining(
    'XGBClassifier',
    models['XGBClassifier'],
    param_grids['XGBClassifier'],
    preprocessor,
    X,
    y,
    resampler=SMOTE(random_state=SEED)
)

xgb_results, xgb_mean_scores, xgb_best_model = xgb.run(X_train, X_test, y_train, y_test)

ValueError: 
All the 1440 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1440 fits failed with the following error:
Traceback (most recent call last):
  File "c:\projects\statistics-main-folder\german-credit\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\statistics-main-folder\german-credit\.venv\Lib\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\projects\statistics-main-folder\german-credit\.venv\Lib\site-packages\imblearn\pipeline.py", line 526, in fit
    self._final_estimator.fit(Xt, yt, **last_step_params["fit"])
    ~~~~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\projects\statistics-main-folder\german-credit\.venv\Lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "c:\projects\statistics-main-folder\german-credit\.venv\Lib\site-packages\xgboost\sklearn.py", line 1640, in fit
    raise ValueError(
    ...<2 lines>...
    )
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got ['0' '1']


In [None]:
xgb_mean_scores

In [None]:
knn = ModelTraining(
    'KNN',
    models['KNN'],
    param_grids['KNN'],
    preprocessor,
    X,
    y,
    resampler=SMOTE(random_state=SEED)
)

knn_results, knn_mean_scores, knn_best_model = knn.run(X_train, X_test, y_train, y_test)

In [None]:
knn_mean_scores

In [None]:
# Crie uma instância de ModelMetrics com o melhor modelo
model_metrics_xgb = ModelMetrics(xgb_best_model, X_train, X_test, y_train, y_test)

# Plote a matriz de confusão
cm_fig_xgb = model_metrics_xgb.plot_confusion_matrix()
cm_fig_xgb.show()

gb_best_model