## <a id = '0'> Índice </a>

* [**Entorno**](#1)  
   * [Librerías](#1d1)  
   * [Funciones](#1d2)  
   * [Constantes](#1d3)

* [**Lectura de datos**](#2)


## <a id = '1'> Entorno </a>
[índice](#0)

### <a id = '1d1'> Librerías </a>

In [1]:
import os
import pandas as pd

from sklearn.preprocessing import  LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import recall_score, f1_score, classification_report, make_scorer, confusion_matrix

import joblib
# from config import data_folder

In [2]:
os.chdir("../")

In [3]:
MODEL_PATH = "output/models/V4/" 

In [4]:
folders = ["LR", "GB", "GNB", "RFC", "XGB"]

# Crear las carpetas dentro de MODEL_PATH
for folder in folders:
    os.makedirs(os.path.join(MODEL_PATH, folder), exist_ok=True)

### <a id = '1d2'> Funciones </a>

In [5]:
from src.utils import table_metrics, params_to_markdown, genera_metricas_markdown, get_metrics_mode, table_metrics_clase
from src.TicToc import TicToc
tt = TicToc()

## <a id = '2'> Lectura de datos </a>
[índice](#0)

In [6]:
train_data = pd.read_csv("output/chunk_data/chunk_5/pre_model/train_data.csv")
test_data = pd.read_csv("output/chunk_data/chunk_5/pre_model/test_data.csv")
val_data = pd.read_csv("output/chunk_data/chunk_5/pre_model/val_data.csv")

In [7]:
#Columnas que no vamos a usar en el modelado
skip_columns = ["patient_id", "label", "chunk"]
le = LabelEncoder()

#Generamos las características y la variable objetivo 
X_train = train_data.drop(columns = skip_columns)
y_train = le.fit_transform(train_data["label"])
train_preds = train_data[["patient_id", "label"]]
train_preds["y_true"] = y_train

X_test = test_data.drop(columns = skip_columns)
y_test = le.fit_transform(test_data["label"])
test_preds = test_data[["patient_id", "label"]]
test_preds["y_true"] = y_test

X_val = val_data.drop(columns = skip_columns)
y_val = le.fit_transform(val_data["label"])
val_preds = val_data[["patient_id", "label"]]
val_preds["y_true"] = y_val


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_preds["y_true"] = y_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_preds["y_true"] = y_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_preds["y_true"] = y_val


In [8]:
recall_macro = make_scorer(recall_score, average='macro') #Todas las clases tienen el mismo peso
f1_score_macro = make_scorer(f1_score, average='macro') #Todas las clases tienen el mismo peso

## Logistic Regression

In [9]:
# Estimador
lr = LogisticRegression(penalty='l2', 
                        C=1e5, 
                        solver='lbfgs', 
                        random_state = 42)

# Parámetros
params_lr = {
    'penalty': ['l2'],
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
}
       
# Grid Search
grid_lr = GridSearchCV(estimator=lr,
                       param_grid=params_lr,
                       scoring='accuracy',
                       cv=5,
                       verbose=1,
                       n_jobs=-1)

In [10]:
tt.tic()
grid_lr.fit(X_train, y_train)
tt.toc()

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Elapsed time: 8.838327 seconds


8.838327169418335

In [11]:
#Pronóstico
y_pred_grid_lr = grid_lr.best_estimator_.predict(X_test)
y_pred_prob_grid_lr = grid_lr.best_estimator_.predict_proba(X_test)

In [12]:
test_preds_LR = test_preds.copy()
test_preds_LR["pred"] = y_pred_grid_lr
print(get_metrics_mode(test_preds_LR))

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.266667 |
| precision_weighted | 0.241029 |
| recall_weighted    | 0.266667 |
| f1_weighted        | 0.21731  |


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
grid_lr.best_params_

{'C': 0.0001, 'penalty': 'l2'}

In [14]:
print(params_to_markdown(grid_lr.best_params_))

| parámetro   | valor   |
|:------------|:--------|
| C           | 0.0001  |
| penalty     | l2      |


In [15]:
grid_lr.best_score_

0.2627929597035664

In [16]:
grid_lr.best_estimator_

In [17]:
joblib.dump(grid_lr, MODEL_PATH + "LR/logistic_regression_model.pkl")

LR_preds = pd.DataFrame({
    "patient_id" : test_data["patient_id"],
    "label" : test_data["label"],
    "y_true" : y_test,
    "pred" : y_pred_grid_lr,
})

LR_probas = pd.DataFrame(y_pred_prob_grid_lr, columns=[f"proba_clase_{c}" for c in grid_lr.classes_])
LR_results = pd.concat([LR_preds, LR_probas], axis=1)
LR_results.to_csv(MODEL_PATH + "LR/predictions.csv", index=False)


In [18]:
table_metrics(y_test,y_pred_grid_lr, y_pred_prob_grid_lr)

Unnamed: 0,metric,value
0,accuracy,0.258167
1,precision_weighted,0.257688
2,recall_weighted,0.258167
3,f1_weighted,0.232356
4,roc_auc_ovr,0.509649
5,log_loss,1.386275
6,gini_normalized,0.019299
7,ks_test_clase_0,0.062778
8,ks_test_clase_1,0.075389
9,ks_test_clase_2,0.017944


In [19]:
f1_score(y_test, y_pred_grid_lr, average='macro'),

(0.23235600922923122,)

In [20]:
confusion_matrix(y_test, y_pred_grid_lr)

array([[1622, 1865,  223, 2290],
       [1243, 1946,  210, 2601],
       [1586, 1730,  208, 2476],
       [1248, 2124,  208, 2420]])

In [21]:
print(classification_report(y_test, y_pred_grid_lr))

              precision    recall  f1-score   support

           0       0.28      0.27      0.28      6000
           1       0.25      0.32      0.28      6000
           2       0.24      0.03      0.06      6000
           3       0.25      0.40      0.31      6000

    accuracy                           0.26     24000
   macro avg       0.26      0.26      0.23     24000
weighted avg       0.26      0.26      0.23     24000



In [22]:
genera_metricas_markdown(y_test,y_pred_grid_lr, y_pred_prob_grid_lr)

| metric             |     value |
|:-------------------|----------:|
| accuracy           | 0.258167  |
| precision_weighted | 0.257688  |
| recall_weighted    | 0.258167  |
| f1_weighted        | 0.232356  |
| roc_auc_ovr        | 0.509649  |
| log_loss           | 1.38628   |
| gini_normalized    | 0.0192987 |
| ks_test_clase_0    | 0.0627778 |
| ks_test_clase_1    | 0.0753889 |
| ks_test_clase_2    | 0.0179444 |
| ks_test_clase_3    | 0.0317778 |


|              |   precision |    recall |   f1-score |      support |
|:-------------|------------:|----------:|-----------:|-------------:|
| 0            |    0.284611 | 0.270333  |  0.277289  |  6000        |
| 1            |    0.253881 | 0.324333  |  0.284815  |  6000        |
| 2            |    0.244994 | 0.0346667 |  0.0607388 |  6000        |
| 3            |    0.247267 | 0.403333  |  0.306581  |  6000        |
| accuracy     |    0.258167 | 0.258167  |  0.258167  |     0.258167 |
| macro avg    |    0.257688 | 0.258167  |  0.

In [23]:
cm_lr = confusion_matrix(y_test, y_pred_grid_lr)
df_cm_lr = pd.DataFrame(cm_lr,
                         index = [f"Real {label}" for label in grid_lr.classes_],
                         columns = [f"Pred {label}" for label in grid_lr.classes_])
print(df_cm_lr.to_markdown())


|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |     1622 |     1865 |      223 |     2290 |
| Real 1 |     1243 |     1946 |      210 |     2601 |
| Real 2 |     1586 |     1730 |      208 |     2476 |
| Real 3 |     1248 |     2124 |      208 |     2420 |


## Random Forest Classifier

In [11]:
#Estimador
rfc = RandomForestClassifier(random_state = 42, 
                             n_jobs = -1, 
                             bootstrap = True)

#Parámetros
params_rfc = {'n_estimators': [100, 350],
             'max_features': ['sqrt'],
             'max_depth': [10, 20],
             'min_samples_split': [2, 10],
             'min_samples_leaf': [2, 10]}

#Grid Search
grid_rfc = GridSearchCV(estimator=rfc,
                       param_grid=params_rfc,
                       scoring='accuracy',
                       cv=5,
                       verbose=1,
                       n_jobs=-1)

In [12]:
tt.tic()
grid_rfc.fit(X_train, y_train)
tt.toc()

Fitting 5 folds for each of 16 candidates, totalling 80 fits




Elapsed time: 3740.929950 seconds


3740.9299499988556

In [13]:
grid_rfc.best_params_

{'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_estimators': 350}

In [14]:
print(params_to_markdown(grid_rfc.best_params_))

| parámetro         | valor   |
|:------------------|:--------|
| max_depth         | 20      |
| max_features      | sqrt    |
| min_samples_leaf  | 2       |
| min_samples_split | 2       |
| n_estimators      | 350     |


In [15]:
grid_rfc.best_score_

0.4037054191755442

In [16]:
grid_rfc.best_estimator_

In [17]:
#Pronóstico
y_pred_grid_rfc = grid_rfc.predict(X_test)
y_pred_prob_grid_rfc = grid_rfc.predict_proba(X_test)

In [18]:
test_preds_RFC = test_preds.copy()
test_preds_RFC["pred"] = y_pred_grid_rfc
print(get_metrics_mode(test_preds_RFC))

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.508333 |
| precision_weighted | 0.492711 |
| recall_weighted    | 0.508333 |
| f1_weighted        | 0.492884 |


In [19]:
joblib.dump(grid_rfc, MODEL_PATH + "RFC/random_forest_model.pkl")

RFC_preds = pd.DataFrame({
    "patient_id" : test_data["patient_id"],
    "label" : test_data["label"],
    "y_true" : y_test,
    "pred" : y_pred_grid_rfc,
})

RFC_probas = pd.DataFrame(y_pred_prob_grid_rfc, columns=[f"proba_clase_{c}" for c in grid_rfc.classes_])
RFC_results = pd.concat([RFC_preds, RFC_probas], axis=1)
RFC_results.to_csv(MODEL_PATH + "RFC/predictions.csv", index=False)


In [20]:
table_metrics(y_test,y_pred_grid_rfc, y_pred_prob_grid_rfc)

Unnamed: 0,metric,value
0,accuracy,0.390792
1,precision_weighted,0.382425
2,recall_weighted,0.390792
3,f1_weighted,0.384946
4,roc_auc_ovr,0.659797
5,log_loss,1.271188
6,gini_normalized,0.319593
7,ks_test_clase_0,0.153889
8,ks_test_clase_1,0.395667
9,ks_test_clase_2,0.1075


In [21]:
confusion_matrix(y_test, y_pred_grid_rfc)

array([[1751, 1495, 1245, 1509],
       [ 815, 3406, 1087,  692],
       [1366, 1214, 1560, 1860],
       [1161,  721, 1456, 2662]])

In [22]:
print(classification_report(y_test, y_pred_grid_rfc))

              precision    recall  f1-score   support

           0       0.34      0.29      0.32      6000
           1       0.50      0.57      0.53      6000
           2       0.29      0.26      0.27      6000
           3       0.40      0.44      0.42      6000

    accuracy                           0.39     24000
   macro avg       0.38      0.39      0.38     24000
weighted avg       0.38      0.39      0.38     24000



In [23]:
genera_metricas_markdown(y_test,y_pred_grid_rfc, y_pred_prob_grid_rfc)

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.390792 |
| precision_weighted | 0.382425 |
| recall_weighted    | 0.390792 |
| f1_weighted        | 0.384946 |
| roc_auc_ovr        | 0.659797 |
| log_loss           | 1.27119  |
| gini_normalized    | 0.319593 |
| ks_test_clase_0    | 0.153889 |
| ks_test_clase_1    | 0.395667 |
| ks_test_clase_2    | 0.1075   |
| ks_test_clase_3    | 0.285111 |


|              |   precision |   recall |   f1-score |      support |
|:-------------|------------:|---------:|-----------:|-------------:|
| 0            |    0.343805 | 0.291833 |   0.315695 |  6000        |
| 1            |    0.498245 | 0.567667 |   0.530695 |  6000        |
| 2            |    0.291698 | 0.26     |   0.274938 |  6000        |
| 3            |    0.395954 | 0.443667 |   0.418455 |  6000        |
| accuracy     |    0.390792 | 0.390792 |   0.390792 |     0.390792 |
| macro avg    |    0.382425 | 0.390792 |   0.384946 | 24000      

In [24]:
cm_rfc = confusion_matrix(y_test, y_pred_grid_rfc)
df_cm_rfc = pd.DataFrame(cm_rfc,
                         index = [f"Real {label}" for label in grid_rfc.classes_],
                         columns = [f"Pred {label}" for label in grid_rfc.classes_])
print(df_cm_rfc.to_markdown())


|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |     1751 |     1495 |     1245 |     1509 |
| Real 1 |      815 |     3406 |     1087 |      692 |
| Real 2 |     1366 |     1214 |     1560 |     1860 |
| Real 3 |     1161 |      721 |     1456 |     2662 |


# Gradient Boosting Classifier

In [13]:
#Estimador
gb = GradientBoostingClassifier(learning_rate=0.05, 
                                subsample=0.5, 
                                max_depth=6, 
                                n_estimators=10,
                                 random_state=42,
                                )

#Parámetros
params_gb = {'n_estimators': [100], 
             'learning_rate' : [0.1],
             'subsample' : [1], 
             'max_depth': [5],
             'min_samples_split': [2, 10],
            #  'min_samples_leaf': [2, 10],
             'max_features': ['log2']}

#Grid Search
grid_gb = GridSearchCV(estimator=gb,
                       param_grid=params_gb,
                       scoring=recall_macro,
                       cv = 5,
                       verbose=3,
                       n_jobs=-1)

In [14]:
#Entrenamiento
tt.tic()
grid_gb.fit(X_train, y_train)
tt.toc()

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 3/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=10, n_estimators=100, subsample=1;, score=0.367 total time= 4.3min
[CV 2/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=2, n_estimators=100, subsample=1;, score=0.398 total time= 4.3min
[CV 2/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=10, n_estimators=100, subsample=1;, score=0.399 total time= 4.3min
[CV 1/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=10, n_estimators=100, subsample=1;, score=0.406 total time= 4.3min
[CV 5/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=2, n_estimators=100, subsample=1;, score=0.392 total time= 4.3min
[CV 1/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=2, n_estimators=100, subsample=1;, score=0.406 total time= 4.3min
[CV 3/5] END learning_rate=0.1, max_depth=5, max_

627.2744576931

In [None]:
#Entrenamiento
tt.tic()
grid_gb.fit(X_train, y_train)
tt.toc()
#928m 58.4s

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 3/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=2, n_estimators=10, subsample=0.8;, score=0.345 total time=  24.3s
[CV 1/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=2, n_estimators=10, subsample=0.8;, score=0.379 total time=  24.4s
[CV 4/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=2, n_estimators=10, subsample=0.8;, score=0.370 total time=  24.4s
[CV 2/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=2, n_estimators=10, subsample=0.8;, score=0.375 total time=  24.3s
[CV 5/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=2, n_estimators=10, subsample=0.8;, score=0.371 total time=  24.7s
[CV 1/5] END learning_rate=0.1, max_depth=5, max_features=log2, min_samples_split=10, n_estimators=10, subsample=0.8;, score=0.377 total time=  23.5s
[CV 2/5] END learning_rate=0.1, max_depth=5, 



[CV 5/5] END learning_rate=0.1, max_depth=20, max_features=log2, min_samples_split=2, n_estimators=10, subsample=0.8;, score=0.383 total time=67.0min
[CV 3/5] END learning_rate=0.1, max_depth=20, max_features=log2, min_samples_split=2, n_estimators=10, subsample=0.8;, score=0.356 total time=68.0min
[CV 4/5] END learning_rate=0.1, max_depth=20, max_features=log2, min_samples_split=2, n_estimators=10, subsample=0.8;, score=0.379 total time=70.2min
[CV 2/5] END learning_rate=0.1, max_depth=20, max_features=log2, min_samples_split=10, n_estimators=10, subsample=0.8;, score=0.388 total time=28.0min
[CV 1/5] END learning_rate=0.1, max_depth=20, max_features=log2, min_samples_split=10, n_estimators=10, subsample=0.8;, score=0.391 total time=28.9min
[CV 3/5] END learning_rate=0.1, max_depth=20, max_features=log2, min_samples_split=10, n_estimators=10, subsample=0.8;, score=0.361 total time=29.1min
[CV 4/5] END learning_rate=0.1, max_depth=20, max_features=log2, min_samples_split=10, n_estimato

KeyboardInterrupt: 

In [15]:
grid_gb.best_params_

{'learning_rate': 0.1,
 'max_depth': 5,
 'max_features': 'log2',
 'min_samples_split': 10,
 'n_estimators': 100,
 'subsample': 1}

In [16]:
print(params_to_markdown(grid_gb.best_params_))

| parámetro         | valor   |
|:------------------|:--------|
| learning_rate     | 0.1     |
| max_depth         | 5       |
| max_features      | log2    |
| min_samples_split | 10      |
| n_estimators      | 100     |
| subsample         | 1       |


In [17]:
grid_gb.best_score_

0.3910741685563114

In [18]:
grid_gb.best_estimator_

In [19]:
#Predicciones
y_pred_grid_gb = grid_gb.predict(X_test)
y_pred_prob_grid_gb = grid_gb.predict_proba(X_test)

In [20]:
test_preds_GB = test_preds.copy()
test_preds_GB["pred"] = y_pred_grid_gb
print(get_metrics_mode(test_preds_GB))

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.491667 |
| precision_weighted | 0.468082 |
| recall_weighted    | 0.491667 |
| f1_weighted        | 0.46567  |


In [21]:
joblib.dump(grid_gb, MODEL_PATH + "GB/gradient_boosting_model.pkl")

GB_preds = pd.DataFrame({
    "patient_id" : test_data["patient_id"],
    "label" : test_data["label"],
    "y_true" : y_test,
    "pred" : y_pred_grid_gb,
})

GB_probas = pd.DataFrame(y_pred_prob_grid_gb, columns=[f"proba_clase_{c}" for c in grid_gb.classes_])
GB_results = pd.concat([GB_preds, GB_probas], axis=1)
GB_results.to_csv(MODEL_PATH + "GB/predictions.csv", index=False)


In [22]:
table_metrics(y_test,y_pred_grid_gb, y_pred_prob_grid_gb)

Unnamed: 0,metric,value
0,accuracy,0.37875
1,precision_weighted,0.366769
2,recall_weighted,0.37875
3,f1_weighted,0.368525
4,roc_auc_ovr,0.642532
5,log_loss,1.293906
6,gini_normalized,0.285063
7,ks_test_clase_0,0.153389
8,ks_test_clase_1,0.356167
9,ks_test_clase_2,0.085778


In [23]:
confusion_matrix(y_test, y_pred_grid_gb)

array([[1737, 1605, 1038, 1620],
       [ 843, 3358,  825,  974],
       [1320, 1426, 1239, 2015],
       [1159,  815, 1270, 2756]])

In [24]:
print(classification_report(y_test, y_pred_grid_gb))

              precision    recall  f1-score   support

           0       0.34      0.29      0.31      6000
           1       0.47      0.56      0.51      6000
           2       0.28      0.21      0.24      6000
           3       0.37      0.46      0.41      6000

    accuracy                           0.38     24000
   macro avg       0.37      0.38      0.37     24000
weighted avg       0.37      0.38      0.37     24000



In [25]:
genera_metricas_markdown(y_test,y_pred_grid_gb, y_pred_prob_grid_gb)

| metric             |     value |
|:-------------------|----------:|
| accuracy           | 0.37875   |
| precision_weighted | 0.366769  |
| recall_weighted    | 0.37875   |
| f1_weighted        | 0.368525  |
| roc_auc_ovr        | 0.642532  |
| log_loss           | 1.29391   |
| gini_normalized    | 0.285063  |
| ks_test_clase_0    | 0.153389  |
| ks_test_clase_1    | 0.356167  |
| ks_test_clase_2    | 0.0857778 |
| ks_test_clase_3    | 0.2645    |


|              |   precision |   recall |   f1-score |     support |
|:-------------|------------:|---------:|-----------:|------------:|
| 0            |    0.343348 | 0.2895   |   0.314133 |  6000       |
| 1            |    0.46613  | 0.559667 |   0.508634 |  6000       |
| 2            |    0.283394 | 0.2065   |   0.238912 |  6000       |
| 3            |    0.374202 | 0.459333 |   0.412421 |  6000       |
| accuracy     |    0.37875  | 0.37875  |   0.37875  |     0.37875 |
| macro avg    |    0.366769 | 0.37875  |   0.368525 | 24000

In [26]:
cm_gb = confusion_matrix(y_test, y_pred_grid_gb)
df_cm_gb = pd.DataFrame(cm_gb,
                         index = [f"Real {label}" for label in grid_gb.classes_],
                         columns = [f"Pred {label}" for label in grid_gb.classes_])
print(df_cm_gb.to_markdown())

|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |     1737 |     1605 |     1038 |     1620 |
| Real 1 |      843 |     3358 |      825 |      974 |
| Real 2 |     1320 |     1426 |     1239 |     2015 |
| Real 3 |     1159 |      815 |     1270 |     2756 |


## Naive Bayes

In [27]:
gnb = GaussianNB()

params_gnb = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
}
#Grid Search
grid_gnb = GridSearchCV(estimator=gnb,
                       param_grid=params_gnb,
                       scoring=recall_macro,
                       cv=5,
                       verbose=1,
                       n_jobs=-1)

In [28]:
#Entrenamiento
tt.tic()
grid_gnb.fit(X_train, y_train)
tt.toc()

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Elapsed time: 4.368601 seconds


4.368600845336914

In [29]:
grid_gnb.best_params_

{'var_smoothing': 1e-09}

In [30]:
print(params_to_markdown(grid_gnb.best_params_))

| parámetro     |   valor |
|:--------------|--------:|
| var_smoothing |   1e-09 |


In [31]:
grid_gnb.best_score_

0.2718141577338006

In [32]:
#Predicciones
y_pred_grid_gnb = grid_gnb.predict(X_test)
y_pred_prob_grid_gnb = grid_gnb.predict_proba(X_test)

In [33]:
test_preds_GNB = test_preds.copy()
test_preds_GNB["pred"] = y_pred_grid_gnb
print(get_metrics_mode(test_preds_GNB))

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.25     |
| precision_weighted | 0.14693  |
| recall_weighted    | 0.25     |
| f1_weighted        | 0.115846 |


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
joblib.dump(grid_gnb, MODEL_PATH + "GNB/Gaussian_Naive_Bayes_model.pkl")

GNB_preds = pd.DataFrame({
    "patient_id" : test_data["patient_id"],
    "label" : test_data["label"],
    "y_true" : y_test,
    "pred" : y_pred_grid_gnb,
})

GNB_probas = pd.DataFrame(y_pred_prob_grid_gnb, columns=[f"proba_clase_{c}" for c in grid_gnb.classes_])
GNB_results = pd.concat([GNB_preds, GNB_probas], axis=1)
GNB_results.to_csv(MODEL_PATH + "GNB/predictions.csv", index=False)


In [35]:
table_metrics(y_test,y_pred_grid_gnb, y_pred_prob_grid_gnb)

Unnamed: 0,metric,value
0,accuracy,0.268083
1,precision_weighted,0.280652
2,recall_weighted,0.268083
3,f1_weighted,0.188427
4,roc_auc_ovr,0.527915
5,log_loss,5.011683
6,gini_normalized,0.055829
7,ks_test_clase_0,0.070111
8,ks_test_clase_1,0.107111
9,ks_test_clase_2,0.065111


In [36]:
confusion_matrix(y_test, y_pred_grid_gnb)

array([[ 168, 4564,  475,  793],
       [  63, 5020,  447,  470],
       [ 158, 4742,  491,  609],
       [ 190, 4724,  331,  755]])

In [37]:
print(classification_report(y_test, y_pred_grid_gnb))

              precision    recall  f1-score   support

           0       0.29      0.03      0.05      6000
           1       0.26      0.84      0.40      6000
           2       0.28      0.08      0.13      6000
           3       0.29      0.13      0.18      6000

    accuracy                           0.27     24000
   macro avg       0.28      0.27      0.19     24000
weighted avg       0.28      0.27      0.19     24000



In [38]:
genera_metricas_markdown(y_test,y_pred_grid_gnb, y_pred_prob_grid_gnb)

| metric             |     value |
|:-------------------|----------:|
| accuracy           | 0.268083  |
| precision_weighted | 0.280652  |
| recall_weighted    | 0.268083  |
| f1_weighted        | 0.188427  |
| roc_auc_ovr        | 0.527915  |
| log_loss           | 5.01168   |
| gini_normalized    | 0.0558292 |
| ks_test_clase_0    | 0.0701111 |
| ks_test_clase_1    | 0.107111  |
| ks_test_clase_2    | 0.0651111 |
| ks_test_clase_3    | 0.0232778 |


|              |   precision |    recall |   f1-score |      support |
|:-------------|------------:|----------:|-----------:|-------------:|
| 0            |    0.290155 | 0.028     |  0.0510716 |  6000        |
| 1            |    0.263517 | 0.836667  |  0.400798  |  6000        |
| 2            |    0.281537 | 0.0818333 |  0.126808  |  6000        |
| 3            |    0.2874   | 0.125833  |  0.175032  |  6000        |
| accuracy     |    0.268083 | 0.268083  |  0.268083  |     0.268083 |
| macro avg    |    0.280652 | 0.268083  |  0.

In [39]:
cm_gnb = confusion_matrix(y_test, y_pred_grid_gnb)
df_cm_gnb = pd.DataFrame(cm_gnb,
                         index = [f"Real {label}" for label in grid_gnb.classes_],
                         columns = [f"Pred {label}" for label in grid_gnb.classes_])
print(df_cm_gnb.to_markdown())

|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |      168 |     4564 |      475 |      793 |
| Real 1 |       63 |     5020 |      447 |      470 |
| Real 2 |      158 |     4742 |      491 |      609 |
| Real 3 |      190 |     4724 |      331 |      755 |


## XGB Classifier

In [40]:
xgb_base = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=4000,
    # eval_metric="merror",
    eval_metric="auc",
    objective="multi:softprob",
    early_stopping_rounds=500,
    
)

In [42]:
tt.tic()
xgb_base.fit(X_train, y_train,
             eval_set=[(X_test, y_test)],             
             verbose = 1
             )

# xgb_n_estimator = xgb_base.best_iteration
tt.toc()

[0]	validation_0-auc:0.61255
[1]	validation_0-auc:0.61707
[2]	validation_0-auc:0.62200
[3]	validation_0-auc:0.62491
[4]	validation_0-auc:0.62840
[5]	validation_0-auc:0.63026
[6]	validation_0-auc:0.63132
[7]	validation_0-auc:0.63275
[8]	validation_0-auc:0.63376
[9]	validation_0-auc:0.63448
[10]	validation_0-auc:0.63556
[11]	validation_0-auc:0.63601
[12]	validation_0-auc:0.63773
[13]	validation_0-auc:0.63788
[14]	validation_0-auc:0.63959
[15]	validation_0-auc:0.64036
[16]	validation_0-auc:0.64088
[17]	validation_0-auc:0.64159
[18]	validation_0-auc:0.64219
[19]	validation_0-auc:0.64277
[20]	validation_0-auc:0.64387
[21]	validation_0-auc:0.64433
[22]	validation_0-auc:0.64504
[23]	validation_0-auc:0.64563
[24]	validation_0-auc:0.64587
[25]	validation_0-auc:0.64651
[26]	validation_0-auc:0.64660
[27]	validation_0-auc:0.64685
[28]	validation_0-auc:0.64757
[29]	validation_0-auc:0.64767
[30]	validation_0-auc:0.64747
[31]	validation_0-auc:0.64783
[32]	validation_0-auc:0.64824
[33]	validation_0-au

95.43612003326416

In [43]:
#Predicciones
y_pred_grid_xgb_b = xgb_base.predict(X_val)
y_pred_prob_grid_xgb_b = xgb_base.predict_proba(X_val)

In [44]:
table_metrics(y_val,y_pred_grid_xgb_b, y_pred_prob_grid_xgb_b)

Unnamed: 0,metric,value
0,accuracy,0.408223
1,precision_weighted,0.398195
2,recall_weighted,0.408223
3,f1_weighted,0.398398
4,roc_auc_ovr,0.667674
5,log_loss,1.30001
6,gini_normalized,0.335349
7,ks_test_clase_0,0.145802
8,ks_test_clase_1,0.4105
9,ks_test_clase_2,0.150387


In [45]:
y_pred_prob_grid_xgb_b

array([[0.63901174, 0.00176058, 0.06784602, 0.29138172],
       [0.3603599 , 0.03100612, 0.40182474, 0.20680922],
       [0.51238877, 0.04070616, 0.34007436, 0.10683068],
       ...,
       [0.13718277, 0.08874112, 0.5559282 , 0.21814796],
       [0.19448136, 0.10870648, 0.4922283 , 0.20458385],
       [0.17673934, 0.11790288, 0.3212633 , 0.38409445]], dtype=float32)

In [46]:
val_preds["y_true"] = y_val
val_preds["pred"] = y_pred_grid_xgb_b

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_preds["y_true"] = y_val
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_preds["pred"] = y_pred_grid_xgb_b


In [47]:
print(get_metrics_mode(val_preds))

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.512397 |
| precision_weighted | 0.509052 |
| recall_weighted    | 0.512397 |
| f1_weighted        | 0.472916 |


In [48]:

mode_pred = val_preds.groupby(["patient_id", "y_true"])["pred"].agg(lambda x: pd.Series.mode(x)[0]).reset_index()
mode_pred

Unnamed: 0,patient_id,y_true,pred
0,patient_1004,2,2
1,patient_10070,3,3
2,patient_1042,0,0
3,patient_1074,2,0
4,patient_130,0,1
...,...,...,...
116,patient_917,2,0
117,patient_9313,3,3
118,patient_9443,3,3
119,patient_951,3,3


In [49]:
table_metrics_clase(mode_pred["y_true"], mode_pred["pred"])

Unnamed: 0,metric,value
0,accuracy,0.512397
1,precision_weighted,0.509052
2,recall_weighted,0.512397
3,f1_weighted,0.472916


In [50]:
val_data.groupby(["patient_id", "label"]).size()

patient_id     label  
patient_1004   sttc       200
patient_10070  sttc_mi    200
patient_1042   mi         200
patient_1074   sttc       200
patient_130    mi         200
                         ... 
patient_917    sttc       200
patient_9313   sttc_mi    200
patient_9443   sttc_mi    200
patient_951    sttc_mi    200
patient_961    sttc       200
Length: 121, dtype: int64

In [51]:
genera_metricas_markdown(y_val,y_pred_grid_xgb_b, y_pred_prob_grid_xgb_b)

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.408223 |
| precision_weighted | 0.398195 |
| recall_weighted    | 0.408223 |
| f1_weighted        | 0.398398 |
| roc_auc_ovr        | 0.667674 |
| log_loss           | 1.30001  |
| gini_normalized    | 0.335349 |
| ks_test_clase_0    | 0.145802 |
| ks_test_clase_1    | 0.4105   |
| ks_test_clase_2    | 0.150387 |
| ks_test_clase_3    | 0.289844 |


|              |   precision |   recall |   f1-score |      support |
|:-------------|------------:|---------:|-----------:|-------------:|
| 0            |    0.342726 | 0.249333 |   0.288664 |  6000        |
| 1            |    0.498062 | 0.578167 |   0.535133 |  6000        |
| 2            |    0.350038 | 0.295161 |   0.320266 |  6200        |
| 3            |    0.403559 | 0.514    |   0.452133 |  6000        |
| accuracy     |    0.408223 | 0.408223 |   0.408223 |     0.408223 |
| macro avg    |    0.398596 | 0.409165 |   0.399049 | 24200      

In [52]:
cm_xgb_base = confusion_matrix(y_val, y_pred_grid_xgb_b)
df_cm_xgb_base = pd.DataFrame(cm_xgb_base,
                         index = [f"Real {label}" for label in xgb_base.classes_],
                         columns = [f"Pred {label}" for label in xgb_base.classes_])
print(df_cm_xgb_base.to_markdown())

|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |     1496 |     1430 |     1266 |     1808 |
| Real 1 |      664 |     3469 |     1136 |      731 |
| Real 2 |     1231 |     1120 |     1830 |     2019 |
| Real 3 |      974 |      946 |      996 |     3084 |


In [53]:
print(params_to_markdown(xgb_base.get_params()))

| parámetro             | valor          |
|:----------------------|:---------------|
| objective             | multi:softprob |
| early_stopping_rounds | 500            |
| enable_categorical    | False          |
| eval_metric           | auc            |
| learning_rate         | 0.1            |
| missing               | nan            |
| n_estimators          | 4000           |


In [57]:
val_data

Unnamed: 0,patient_id,label,chunk,I_1,I_2,I_3,I_4,I_5,II_1,II_2,...,V5_1,V5_2,V5_3,V5_4,V5_5,V6_1,V6_2,V6_3,V6_4,V6_5
0,patient_1450,mi,chunk_1,0.820,0.666,0.368,0.019,-0.168,-0.906,-0.729,...,-0.718,-0.567,-0.222,0.066,0.242,-0.954,-0.821,-0.509,-0.244,-0.084
1,patient_1450,mi,chunk_2,-0.208,-0.216,-0.209,-0.214,-0.214,0.160,0.163,...,0.326,0.337,0.355,0.362,0.365,-0.016,0.015,0.032,0.029,0.044
2,patient_1450,mi,chunk_3,-0.211,-0.228,-0.222,-0.224,-0.217,0.208,0.220,...,0.372,0.370,0.379,0.386,0.393,0.054,0.056,0.061,0.074,0.090
3,patient_1450,mi,chunk_4,-0.210,-0.232,-0.237,-0.234,-0.247,0.273,0.281,...,0.407,0.414,0.435,0.475,0.510,0.093,0.103,0.115,0.148,0.192
4,patient_1450,mi,chunk_5,-0.254,-0.202,-0.182,-0.199,-0.183,0.401,0.440,...,0.535,0.563,0.562,0.544,0.529,0.216,0.236,0.235,0.224,0.217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24195,patient_168,other,chunk_196,-0.087,-0.076,-0.077,-0.105,-0.102,-0.035,-0.035,...,-0.150,-0.157,-0.153,-0.150,-0.148,0.065,0.060,0.060,0.059,0.054
24196,patient_168,other,chunk_197,-0.095,-0.093,-0.086,-0.079,-0.082,-0.045,-0.045,...,-0.144,-0.149,-0.151,-0.150,-0.150,0.056,0.052,0.049,0.051,0.046
24197,patient_168,other,chunk_198,-0.087,-0.077,-0.073,-0.070,-0.074,-0.036,-0.032,...,-0.150,-0.150,-0.151,-0.150,-0.145,0.045,0.045,0.040,0.040,0.039
24198,patient_168,other,chunk_199,-0.083,-0.074,-0.077,-0.076,-0.061,-0.046,-0.040,...,-0.143,-0.137,-0.136,-0.127,-0.112,0.035,0.034,0.040,0.046,0.056


In [58]:
joblib.dump(grid_gnb, MODEL_PATH + "XGB/XGB_base_model.pkl")

XGB_base_preds = pd.DataFrame({
    "patient_id" : val_data["patient_id"],
    "label" : val_data["label"],
    "y_true" : y_val,
    "pred" : val_preds["pred"],
})

XGB_base_probas = pd.DataFrame(val_preds["pred"], columns=[f"proba_clase_{c}" for c in xgb_base.classes_])
XGB_base_results = pd.concat([XGB_base_preds, XGB_base_probas], axis=1)
XGB_base_results.to_csv(MODEL_PATH + "XGB/predictions_base.csv", index=False)