# Comparación entre Modelos de Clasificación

In [15]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from src import soporte_comparacion_modelos as sup_mod

import shap

import time

import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


---
---

## Importamos los datos

In [16]:
# Load the pre-encoded dataset from the project's data folder.
# NOTE(review): pickle files can execute code when loaded; only read trusted files.
df = pd.read_pickle(filepath_or_buffer='datos/encoded_financial_data.pkl')

# Display one random row as a quick sanity check of the loaded columns.
df.sample(n=1)

Unnamed: 0,is_fraudulent,card_type_American Express,card_type_Discover,card_type_MasterCard,card_type_Visa,location_City-1,location_City-10,location_City-11,location_City-12,location_City-13,...,location_City-48,location_City-49,location_City-5,location_City-50,location_City-6,location_City-7,location_City-8,location_City-9,purchase_category,customer_age
5298,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.900628,0.826923


## Separamos nuestras variables X e y.

In [17]:
# Target column first, then every remaining column as the predictor matrix.
y = df['is_fraudulent']
X = df.drop(labels='is_fraudulent', axis='columns')

## Definimos las combinaciones de parámetros que les proporcionaremos a los diferentes modelos.

In [18]:
# Hyperparameter grids for logistic regression. One dict per penalty/solver
# pairing, so that each penalty is only combined with the solver listed next
# to it.
parametros_logistic = [
    # L1 (lasso-style) regularisation.
    {'penalty': ['l1'], 'solver': ['saga'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'max_iter': [10000]},
    # L2 (ridge-style) regularisation.
    {'penalty': ['l2'], 'solver': ['liblinear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'max_iter': [10000]},
    # Elastic net: mix of L1 and L2 controlled by l1_ratio.
    {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'max_iter': [10000]},
    # Unpenalised baseline. Use None rather than the string 'none': the string
    # form was deprecated in scikit-learn 1.2 and removed in 1.4, where it makes
    # every fit of this candidate fail (and the failure is hidden because the
    # notebook silences warnings). Requires scikit-learn >= 1.2.
    {'penalty': [None], 'solver': ['lbfgs'], 'max_iter': [10000]},
]

# Search space for the decision tree: 5 * 6 * 6 * 8 = 1440 combinations.
parametros_dt = dict(
    max_depth=[4, 10, 14, 25, 40],
    min_samples_split=[6, 8, 10, 15, 20, 25],
    min_samples_leaf=[6, 8, 10, 15, 20, 25],
    max_leaf_nodes=[10, 15, 20, 25, 40, 60, 90, 110],
)

# Search space for the random forest: 4 * 3 * 4 * 5 * 4 = 960 combinations.
parametros_rf = dict(
    n_estimators=[10, 20, 30, 40],
    max_depth=[5, 10, 15],
    min_samples_split=[8, 10, 15, 20],
    min_samples_leaf=[6, 8, 10, 15, 20],
    max_leaf_nodes=[10, 15, 20, 25],
)

# Search space for gradient boosting: 2 * 3 * 3 * 3 * 3 = 162 combinations.
parametros_gb = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5, 7],
    min_samples_split=[3, 4, 7],
    min_samples_leaf=[3, 4, 7],
    max_leaf_nodes=[6, 10, 12],
)

# Search space for XGBoost: 2 * 3 * 3 * 3 * 2 * 2 = 216 combinations.
parametros_xgb = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
    min_child_weight=[1, 3, 5],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
)

## Declaramos nuestra clase modelo la cual nos hará el trabajo más sencillo

In [19]:
# Build the project's ClassificationModel helper from the predictors and target.
# random_state=42 is fixed so repeated runs are comparable.
# NOTE(review): the class lives in src/soporte_comparacion_modelos; whether it
# splits train/test internally is not visible from this notebook — confirm there.
model = sup_mod.ClassificationModel(X, y, random_state=42)

## Ahora entrenamos a cada uno de los modelos

In [20]:
# Train the logistic-regression candidates, selecting on recall, and report
# how long the call took.
# NOTE(review): model.train is defined in src/soporte_comparacion_modelos;
# presumably it runs a cross-validated search and returns the best estimator — confirm.
print("Training Regresión Logística\n")
# perf_counter is a monotonic clock, so the elapsed time is not affected by
# system clock adjustments the way time.time() can be.
start_time = time.perf_counter()
logistic = model.train("logistic", params=parametros_logistic, scoring="recall", verbose=1)
print()
print("Time:", time.perf_counter() - start_time)
print()
print(logistic)
print("----"*20)

Training Regresión Logística

Fitting 5 folds for each of 43 candidates, totalling 215 fits

Time: 12.488847732543945

LogisticRegression(C=0.01, max_iter=10000, penalty='l1', random_state=42,
                   solver='saga')
--------------------------------------------------------------------------------


In [21]:
# Train the decision-tree candidates, selecting on recall, and report how
# long the call took.
# NOTE(review): model.train is defined in src/soporte_comparacion_modelos;
# presumably it runs a cross-validated search and returns the best estimator — confirm.
print("Training Árbol de Decisión\n")
# perf_counter is a monotonic clock, so the elapsed time is not affected by
# system clock adjustments the way time.time() can be.
start_time = time.perf_counter()
decisiontree = model.train("decision_tree", params=parametros_dt, scoring="recall", verbose=1)
print()
print("Time:", time.perf_counter() - start_time)
print()
print(decisiontree)
print("----"*20)

Training Árbol de Decisión

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits

Time: 51.45868968963623

DecisionTreeClassifier(max_depth=10, max_leaf_nodes=10, min_samples_leaf=6,
                       min_samples_split=6, random_state=42)
--------------------------------------------------------------------------------


In [None]:
# Train the random-forest candidates, selecting on recall, and report how
# long the call took.
# NOTE(review): model.train is defined in src/soporte_comparacion_modelos;
# presumably it runs a cross-validated search and returns the best estimator — confirm.
print("Training Random Forest\n")
# perf_counter is a monotonic clock, so the elapsed time is not affected by
# system clock adjustments the way time.time() can be.
start_time = time.perf_counter()
randomforest = model.train("random_forest", params=parametros_rf, scoring="recall", verbose=1)
print()
print("Time:", time.perf_counter() - start_time)
print()
print(randomforest)
print("----"*20)

Training Random Forest

Fitting 5 folds for each of 960 candidates, totalling 4800 fits


In [None]:
# Train the gradient-boosting candidates, selecting on recall, and report
# how long the call took.
# NOTE(review): model.train is defined in src/soporte_comparacion_modelos;
# presumably it runs a cross-validated search and returns the best estimator — confirm.
print("Training Gradient Boosting\n")
# perf_counter is a monotonic clock, so the elapsed time is not affected by
# system clock adjustments the way time.time() can be.
start_time = time.perf_counter()
gradientboost = model.train("gradient_boosting", params=parametros_gb, scoring="recall", verbose=1)
print()
print("Time:", time.perf_counter() - start_time)
print()
print(gradientboost)
print("----"*20)

In [None]:
# Train the XGBoost candidates, selecting on recall, and report how long the
# call took.
# NOTE(review): model.train is defined in src/soporte_comparacion_modelos;
# presumably it runs a cross-validated search and returns the best estimator — confirm.
print("Training XGBoost\n")
# perf_counter is a monotonic clock, so the elapsed time is not affected by
# system clock adjustments the way time.time() can be.
start_time = time.perf_counter()
xgboost = model.train("xgboost", params=parametros_xgb, scoring="recall", verbose=1)
print()
print("Time:", time.perf_counter() - start_time)
print()
print(xgboost)

In [None]:
# Draw the confusion matrices via the project helper.
# NOTE(review): presumably one matrix per model trained above — confirm in
# src/soporte_comparacion_modelos.
model.plot_confusion_matrix()

In [None]:
# Draw the ROC curves via the project helper.
# NOTE(review): presumably one curve per model trained above — confirm in
# src/soporte_comparacion_modelos.
model.plot_roc_curves()


In [None]:
# Gather the metrics table of every entry in model.resultados into a single
# frame, tagging each table with the key (model name) it came from.
metricas_por_modelo = []
for nombre_modelo, resultado in model.resultados.items():
    # Work on a copy so the "method" column is not written back into the
    # metrics stored on the model object.
    df_it = resultado["metrics"].copy()
    df_it["method"] = nombre_modelo
    metricas_por_modelo.append(df_it)

# Concatenate once instead of growing a frame inside the loop. pd.concat
# raises on an empty list, so fall back to an empty frame when nothing has
# been trained yet.
df_metrics = pd.concat(metricas_por_modelo) if metricas_por_modelo else pd.DataFrame()

# Move the index into a regular column and colour each row with the project's
# styling function; the Styler is the cell's displayed output.
df_resultados_colores = (
    df_metrics
    .reset_index()
    .style.apply(sup_mod.color_filas_por_modelo, axis=1)
)
df_resultados_colores