## <a id = '0'> Índice </a>

* [**Entorno**](#1)  
   * [Librerías](#1d1)  
   * [Funciones](#1d2)  
   * [Constantes](#1d3)

* [**Lectura de datos**](#2)


## <a id = '1'> Entorno </a>
[índice](#0)

### <a id = '1d1'> Librerías </a>

In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.preprocessing import  LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, ParameterGrid

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss, classification_report, make_scorer, confusion_matrix
from scipy.stats import ks_2samp

import matplotlib.pyplot as plt

import math
import itertools

import joblib
# from config import data_folder

In [7]:
os.chdir("../")

In [None]:
MODEL_PATH = "output/models/" 

### <a id = '1d2'> Funciones </a>

In [7]:
from src.utils import *

## <a id = '2'> Lectura de datos </a>
[índice](#0)

In [10]:
train_data = pd.read_csv("output/features/transform/train_features_pca.csv")
test_data = pd.read_csv("output/features/transform/test_features_pca.csv")

In [32]:
#Columnas que no vamos a usar en el modelado
skip_columns = ["patient", "label"]
le = LabelEncoder()

#Generamos las características y la variable objetivo 
X_train = train_data.drop(columns = skip_columns)
y_train = le.fit_transform(train_data["label"])

X_test = test_data.drop(columns = skip_columns)
y_test = le.fit_transform(test_data["label"])


In [None]:
recall_macro = make_scorer(recall_score, average='macro') #Todas las clases tienen el mismo peso
f1_score_macro = make_scorer(f1_score, average='macro') #Todas las clases tienen el mismo peso

## Logistic Regression

In [91]:
# Estimador
lr = LogisticRegression(penalty='l2', 
                        C=1e5, 
                        solver='lbfgs', 
                        multi_class='multinomial',
                        random_state = 42)

# Parámetros
params_lr = {
    'penalty': ['l2'],
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
}
       
# Grid Search
grid_lr = GridSearchCV(estimator=lr,
                       param_grid=params_lr,
                       scoring=recall_macro,
                       cv=5,

                       verbose=1,
                       n_jobs=-1)

In [92]:
grid_lr.fit(X_train, y_train)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [93]:
#Pronóstico
y_pred_grid_lr = grid_lr.predict(X_test)
y_pred_prob_grid_lr = grid_lr.predict_proba(X_test)

In [138]:
joblib.dump(grid_lr, MODEL_PATH + "LR/logistic_regression_model.pkl")

LR_preds = pd.DataFrame({
    "patient" : test_data["patient"],
    "label" : test_data["label"],
    "y_true" : y_test,
    "pred" : y_pred_grid_lr,
})

LR_probas = pd.DataFrame(y_pred_prob_grid_lr, columns=[f"proba_clase_{c}" for c in grid_lr.classes_])
LR_results = pd.concat([LR_preds, LR_probas], axis=1)
LR_results.to_csv(MODEL_PATH + "LR/predictions.csv", index=False)


In [94]:
table_metrics(y_test,y_pred_grid_lr, y_pred_prob_grid_lr)

Unnamed: 0,metric,value
0,accuracy,0.527083
1,precision_weighted,0.528874
2,recall_weighted,0.527083
3,f1_weighted,0.527638
4,roc_auc_ovr,0.790475
5,log_loss,1.068772
6,gini_normalized,0.580949
7,ks_test_clase_0,0.408333
8,ks_test_clase_1,0.525
9,ks_test_clase_2,0.388889


In [103]:
confusion_matrix(y_test, y_pred_grid_lr)

array([[58, 20, 17, 25],
       [20, 68, 25,  7],
       [18, 17, 59, 26],
       [23,  6, 23, 68]])

In [95]:
print(classification_report(y_test, y_pred_grid_lr))

              precision    recall  f1-score   support

           0       0.49      0.48      0.49       120
           1       0.61      0.57      0.59       120
           2       0.48      0.49      0.48       120
           3       0.54      0.57      0.55       120

    accuracy                           0.53       480
   macro avg       0.53      0.53      0.53       480
weighted avg       0.53      0.53      0.53       480



In [154]:
genera_metricas_markdown(y_test,y_pred_grid_lr, y_pred_prob_grid_lr)

|    | metric             |    value |
|---:|:-------------------|---------:|
|  0 | accuracy           | 0.527083 |
|  1 | precision_weighted | 0.528874 |
|  2 | recall_weighted    | 0.527083 |
|  3 | f1_weighted        | 0.527638 |
|  4 | roc_auc_ovr        | 0.790475 |
|  5 | log_loss           | 1.06877  |
|  6 | gini_normalized    | 0.580949 |
|  7 | ks_test_clase_0    | 0.408333 |
|  8 | ks_test_clase_1    | 0.525    |
|  9 | ks_test_clase_2    | 0.388889 |
| 10 | ks_test_clase_3    | 0.494444 |


|              |   precision |   recall |   f1-score |    support |
|:-------------|------------:|---------:|-----------:|-----------:|
| 0            |    0.487395 | 0.483333 |   0.485356 | 120        |
| 1            |    0.612613 | 0.566667 |   0.588745 | 120        |
| 2            |    0.475806 | 0.491667 |   0.483607 | 120        |
| 3            |    0.539683 | 0.566667 |   0.552846 | 120        |
| accuracy     |    0.527083 | 0.527083 |   0.527083 |   0.527083 |
| macro avg    

In [157]:
cm_lr = confusion_matrix(y_test, y_pred_grid_lr)
df_cm_lr = pd.DataFrame(cm_lr,
                         index = [f"Real {label}" for label in grid_lr.classes_],
                         columns = [f"Pred {label}" for label in grid_lr.classes_])
print(df_cm_lr.to_markdown())


|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |       58 |       20 |       17 |       25 |
| Real 1 |       20 |       68 |       25 |        7 |
| Real 2 |       18 |       17 |       59 |       26 |
| Real 3 |       23 |        6 |       23 |       68 |


## Random Forest Classifier

In [98]:
#Estimador
rfc = RandomForestClassifier(random_state = 42, 
                             n_jobs = -1, 
                             bootstrap = True)

#Parámetros
params_rfc = {'n_estimators': [100, 350, 500],
             'max_features': ['log2', 'sqrt'],
             'max_depth': [5, 10, 20],
             'min_samples_split': [2, 10, 30],
             'min_samples_leaf': [2, 10, 30]}

#Grid Search
grid_rfc = GridSearchCV(estimator=rfc,
                       param_grid=params_rfc,
                       scoring=recall_macro,
                       cv=5,
                       verbose=1,
                       n_jobs=-1)

In [99]:
grid_rfc.fit(X_train, y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


In [110]:
#Pronóstico
y_pred_grid_rfc = grid_rfc.predict(X_test)
y_pred_prob_grid_rfc = grid_rfc.predict_proba(X_test)

In [139]:
joblib.dump(grid_rfc, MODEL_PATH + "RFC/random_forest_model.pkl")

RFC_preds = pd.DataFrame({
    "patient" : test_data["patient"],
    "label" : test_data["label"],
    "y_true" : y_test,
    "pred" : y_pred_grid_rfc,
})

RFC_probas = pd.DataFrame(y_pred_prob_grid_rfc, columns=[f"proba_clase_{c}" for c in grid_rfc.classes_])
RFC_results = pd.concat([RFC_preds, RFC_probas], axis=1)
RFC_results.to_csv(MODEL_PATH + "RFC/predictions.csv", index=False)


In [120]:
table_metrics(y_test,y_pred_grid_rfc, y_pred_prob_grid_rfc)

Unnamed: 0,metric,value
0,accuracy,0.483333
1,precision_weighted,0.481121
2,recall_weighted,0.483333
3,f1_weighted,0.48103
4,roc_auc_ovr,0.751476
5,log_loss,1.241261
6,gini_normalized,0.502951
7,ks_test_clase_0,0.302778
8,ks_test_clase_1,0.522222
9,ks_test_clase_2,0.327778


In [159]:
confusion_matrix(y_test, y_pred_grid_rfc)

array([[48, 22, 20, 30],
       [23, 70, 21,  6],
       [25, 21, 45, 29],
       [27,  9, 15, 69]])

In [113]:
print(classification_report(y_test, y_pred_grid_rfc))

              precision    recall  f1-score   support

           0       0.39      0.40      0.40       120
           1       0.57      0.58      0.58       120
           2       0.45      0.38      0.41       120
           3       0.51      0.57      0.54       120

    accuracy                           0.48       480
   macro avg       0.48      0.48      0.48       480
weighted avg       0.48      0.48      0.48       480



In [153]:
genera_metricas_markdown(y_test,y_pred_grid_rfc, y_pred_prob_grid_rfc)

|    | metric             |    value |
|---:|:-------------------|---------:|
|  0 | accuracy           | 0.483333 |
|  1 | precision_weighted | 0.481121 |
|  2 | recall_weighted    | 0.483333 |
|  3 | f1_weighted        | 0.48103  |
|  4 | roc_auc_ovr        | 0.751476 |
|  5 | log_loss           | 1.24126  |
|  6 | gini_normalized    | 0.502951 |
|  7 | ks_test_clase_0    | 0.302778 |
|  8 | ks_test_clase_1    | 0.522222 |
|  9 | ks_test_clase_2    | 0.327778 |
| 10 | ks_test_clase_3    | 0.45     |


|              |   precision |   recall |   f1-score |    support |
|:-------------|------------:|---------:|-----------:|-----------:|
| 0            |    0.390244 | 0.4      |   0.395062 | 120        |
| 1            |    0.57377  | 0.583333 |   0.578512 | 120        |
| 2            |    0.445545 | 0.375    |   0.40724  | 120        |
| 3            |    0.514925 | 0.575    |   0.543307 | 120        |
| accuracy     |    0.483333 | 0.483333 |   0.483333 |   0.483333 |
| macro avg    

In [160]:
cm_rfc = confusion_matrix(y_test, y_pred_grid_rfc)
df_cm_rfc = pd.DataFrame(cm_rfc,
                         index = [f"Real {label}" for label in grid_rfc.classes_],
                         columns = [f"Pred {label}" for label in grid_rfc.classes_])
print(df_cm_rfc.to_markdown())


|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |       48 |       22 |       20 |       30 |
| Real 1 |       23 |       70 |       21 |        6 |
| Real 2 |       25 |       21 |       45 |       29 |
| Real 3 |       27 |        9 |       15 |       69 |


# Gradient Boosting Classifier

In [108]:
#Estimador
gb = GradientBoostingClassifier(learning_rate=0.05, 
                                subsample=0.5, 
                                max_depth=6, 
                                n_estimators=10,
                                 random_state=42,
                                )

#Parámetros
params_gb = {'n_estimators': [1,10,100], 
             'learning_rate' : [0.01,0.05,0.1],
             'subsample' : [0.1,0.5,1.0], 
             'max_depth': [5,10,20],
             'min_samples_split': [2, 10, 30],
             'min_samples_leaf': [2, 10, 30],
             'max_features': ['log2', 'sqrt']}

#Grid Search
grid_gb = GridSearchCV(estimator=gb,
                       param_grid=params_gb,
                       scoring=recall_macro,
                       cv = 5,
                       verbose=1,
                       n_jobs=-1)

In [109]:
#Entrenamiento
grid_gb.fit(X_train, y_train)

Fitting 5 folds for each of 1458 candidates, totalling 7290 fits


In [114]:
#Predicciones
y_pred_grid_gb = grid_gb.predict(X_test)
y_pred_prob_grid_gb = grid_gb.predict_proba(X_test)

In [140]:
joblib.dump(grid_rfc, MODEL_PATH + "GB/gradient_boosting_model.pkl")

GB_preds = pd.DataFrame({
    "patient" : test_data["patient"],
    "label" : test_data["label"],
    "y_true" : y_test,
    "pred" : y_pred_grid_gb,
})

GB_probas = pd.DataFrame(y_pred_prob_grid_gb, columns=[f"proba_clase_{c}" for c in grid_gb.classes_])
GB_results = pd.concat([GB_preds, GB_probas], axis=1)
GB_results.to_csv(MODEL_PATH + "GB/predictions.csv", index=False)


In [None]:
table_metrics(y_test,y_pred_grid_gb, y_pred_prob_grid_gb)

Unnamed: 0,metric,value
0,accuracy,0.5
1,precision_weighted,0.501884
2,recall_weighted,0.5
3,f1_weighted,0.500224
4,roc_auc_ovr,0.751817
5,log_loss,1.257561
6,gini_normalized,0.503634
7,ks_test_clase_0,0.35
8,ks_test_clase_1,0.480556
9,ks_test_clase_2,0.344444


In [116]:
confusion_matrix(y_test, y_pred_grid_gb)

array([[57, 20, 20, 23],
       [26, 65, 24,  5],
       [21, 19, 50, 30],
       [28,  7, 17, 68]])

In [146]:
print(classification_report(y_test, y_pred_grid_gb))

              precision    recall  f1-score   support

           0       0.43      0.47      0.45       120
           1       0.59      0.54      0.56       120
           2       0.45      0.42      0.43       120
           3       0.54      0.57      0.55       120

    accuracy                           0.50       480
   macro avg       0.50      0.50      0.50       480
weighted avg       0.50      0.50      0.50       480



In [152]:
genera_metricas_markdown(y_test,y_pred_grid_gb, y_pred_prob_grid_gb)

|    | metric             |    value |
|---:|:-------------------|---------:|
|  0 | accuracy           | 0.5      |
|  1 | precision_weighted | 0.501884 |
|  2 | recall_weighted    | 0.5      |
|  3 | f1_weighted        | 0.500224 |
|  4 | roc_auc_ovr        | 0.751817 |
|  5 | log_loss           | 1.25756  |
|  6 | gini_normalized    | 0.503634 |
|  7 | ks_test_clase_0    | 0.35     |
|  8 | ks_test_clase_1    | 0.480556 |
|  9 | ks_test_clase_2    | 0.344444 |
| 10 | ks_test_clase_3    | 0.480556 |


|              |   precision |   recall |   f1-score |   support |
|:-------------|------------:|---------:|-----------:|----------:|
| 0            |    0.431818 | 0.475    |   0.452381 |     120   |
| 1            |    0.585586 | 0.541667 |   0.562771 |     120   |
| 2            |    0.45045  | 0.416667 |   0.4329   |     120   |
| 3            |    0.539683 | 0.566667 |   0.552846 |     120   |
| accuracy     |    0.5      | 0.5      |   0.5      |       0.5 |
| macro avg    |    0.

In [158]:
cm_gb = confusion_matrix(y_test, y_pred_grid_gb)
df_cm_gb = pd.DataFrame(cm_gb,
                         index = [f"Real {label}" for label in grid_gb.classes_],
                         columns = [f"Pred {label}" for label in grid_gb.classes_])
print(df_cm_gb.to_markdown())

|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |       57 |       20 |       20 |       23 |
| Real 1 |       26 |       65 |       24 |        5 |
| Real 2 |       21 |       19 |       50 |       30 |
| Real 3 |       28 |        7 |       17 |       68 |


## XGB Classifier

In [118]:
#Transformación de datos para entrenar modelo
xgb_dmatrix = xgb.DMatrix(data=X_train, label=y_train)
xgb_dmatrix_test = xgb.DMatrix(data=X_test, label=y_test)

#Parámetros
params_gbm = {'learning_rate': [0.01, 0.1, 0.5],
              'n_estimators' : [100, 350, 500],
              'subsample' : [0.3, 0.5, 0.9 ],
              'max_depth': [5, 10, 20],
              'min_child_weight': [1, 5, 10],
              'colsample_bytree': [0.3, 0.5, 0.9],
}

#Estimador
gbm = xgb.XGBClassifier()

#Grid Search
grid_gbm = GridSearchCV(estimator=gbm,
                       param_grid=params_gbm,
                       scoring=recall_macro,
                       cv=5,
                       verbose=1,
                       n_jobs=-1)

In [119]:
grid_gbm.fit(X_train, y_train)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits




KeyboardInterrupt: 