In [1]:
import pandas as pd
import numpy as np


In [2]:
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer

In [3]:
from sklearn.tree import DecisionTreeClassifier

In [4]:
from preprocessing import traer_datasets
from preprocessing import separar_dataset
from preprocessing import feature_engineering_general

In [5]:
from funciones_auxiliares import encontrar_hiperparametros_RGSCV

### Obtención de los datos y preprocesamiento

Traemos datasets y particionamos

In [6]:
df, df_sin_target, solo_target = traer_datasets()

X_train, X_test, y_train, y_test = separar_dataset(df_sin_target, solo_target)
X_train.is_copy=False
X_test.is_copy=False
y_train.is_copy=False
y_train.is_copy=False

y_train.set_index('id', inplace=True)
y_train.sort_values(by=['id'], inplace=True, ascending=True)

y_test.set_index('id', inplace=True)
y_test.sort_values(by=['id'], inplace=True, ascending=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train.sort_values(by=['id'], inplace=True, ascending=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test.sort_values(by=['id'], inplace=True, ascending=True)


Aplicamos metodo de feature engineering general

In [7]:
X_train, X_test = feature_engineering_general(X_train, X_test)

### Búsqueda de Hiperparámetros

Buscamos hiperparámetros con RandomGridSearch, pues GridSearch iterativamente es bastante lento.

In [8]:
params = {'criterion': ['gini','entropy'], 'min_samples_leaf':np.arange(1,16),'max_depth': np.arange(1,30)}
hiperparametros = encontrar_hiperparametros_RGSCV(DecisionTreeClassifier(), params=params, X=X_train, y=y_train)

In [9]:
criterio_elegido = hiperparametros['criterion']
profundidad_elegida = hiperparametros['max_depth']
min_leaf_elegido = hiperparametros['min_samples_leaf']
print(f'Mejor criterio: {criterio_elegido}')
print(f'Mejor profundidad máxima: {profundidad_elegida}')
print(f'Mejor mínima cantidad de instancias por hoja: {min_leaf_elegido}')

Mejor criterio: entropy
Mejor profundidad máxima: 8
Mejor mínima cantidad de instancias por hoja: 11


### Entrenando el Modelo con CrossValidation

Procedemos a testear con kfolds, stratificados pues nuestro dataset es desbalanceado. Además, usamos los hiperparámetros encontrados previamente.

In [10]:
kf = StratifiedKFold(n_splits=5)
for fold_idx, (train_index, test_index) in enumerate(kf.split(X_train, y_train)):
    clf = DecisionTreeClassifier(max_depth=profundidad_elegida, min_samples_leaf=min_leaf_elegido, criterion=criterio_elegido, random_state=117)
    clf.fit(X_train.iloc[train_index], y_train.iloc[train_index])
    print ('Reporte para el FOLD ' + str(fold_idx))
    print(classification_report(y_train.iloc[test_index], clf.predict(X_train.iloc[test_index])))

Reporte para el FOLD 0
              precision    recall  f1-score   support

          no       0.86      0.95      0.90     15891
          si       0.73      0.45      0.55      4581

    accuracy                           0.84     20472
   macro avg       0.79      0.70      0.73     20472
weighted avg       0.83      0.84      0.82     20472

Reporte para el FOLD 1
              precision    recall  f1-score   support

          no       0.87      0.90      0.89     15891
          si       0.61      0.54      0.57      4580

    accuracy                           0.82     20471
   macro avg       0.74      0.72      0.73     20471
weighted avg       0.81      0.82      0.81     20471

Reporte para el FOLD 2
              precision    recall  f1-score   support

          no       0.86      0.93      0.90     15891
          si       0.68      0.49      0.57      4580

    accuracy                           0.83     20471
   macro avg       0.77      0.71      0.73     20471
weigh