## <a id = '0'> Índice </a>

* [**Entorno**](#1)  
   * [Librerías](#1d1)  
   * [Funciones](#1d2)  
   * [Constantes](#1d3)

* [**Lectura de datos**](#2)


## <a id = '1'> Entorno </a>
[índice](#0)

### <a id = '1d1'> Librerías </a>

In [1]:
import os
import pandas as pd

from sklearn.preprocessing import  LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import recall_score, f1_score, classification_report, make_scorer, confusion_matrix

import joblib
# from config import data_folder

In [2]:
os.chdir("../")

In [3]:
MODEL_PATH = "output/models/V5/" 

In [4]:
folders = ["LR", "GB", "GNB", "RFC", "XGB"]

# Crear las carpetas dentro de MODEL_PATH
for folder in folders:
    os.makedirs(os.path.join(MODEL_PATH, folder), exist_ok=True)

### <a id = '1d2'> Funciones </a>

In [5]:
from src.utils import table_metrics, params_to_markdown, genera_metricas_markdown, get_metrics_mode

In [6]:
from src.TicToc import TicToc
tt = TicToc()

## <a id = '2'> Lectura de datos </a>
[índice](#0)

In [7]:
train_data = pd.read_csv("output/chunk_data/chunk_100/transform/train_features_pca.csv")
test_data = pd.read_csv("output/chunk_data/chunk_100/transform/test_features_pca.csv")
val_data = pd.read_csv("output/chunk_data/chunk_100/transform/val_features_pca.csv")

In [8]:
train_data.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20,PC21,PC22,PC23,PC24,PC25,PC26,PC27,PC28,PC29,PC30,PC31,PC32,PC33,PC34,PC35,PC36,PC37,PC38,PC39,PC40,...,PC173,PC174,PC175,PC176,PC177,PC178,PC179,PC180,PC181,PC182,PC183,PC184,PC185,PC186,PC187,PC188,PC189,PC190,PC191,PC192,PC193,PC194,PC195,PC196,PC197,PC198,PC199,PC200,PC201,PC202,PC203,PC204,PC205,PC206,PC207,PC208,PC209,patient_id,label,chunk
0,10.148276,-1.40223,0.153016,3.813868,2.250402,-2.019743,-7.350832,0.630218,0.27116,2.658599,-2.141872,7.057871,-1.649907,-2.788314,5.676169,0.039248,-1.928396,-2.321654,3.952954,0.331818,-2.383477,-0.712138,-0.036235,-4.390949,2.672222,1.158139,0.252262,0.621585,1.730759,-3.234782,-2.156805,-1.823658,0.556581,2.159809,1.24364,-0.655469,-0.238544,-2.302586,-1.079131,1.191304,...,-1.492365,0.439138,-0.740307,1.945167,-0.614293,-0.457568,-1.357542,-0.648893,-1.988111,1.039707,0.232446,0.524833,0.758554,0.828379,-0.908278,-0.207626,0.744383,-1.010319,0.368094,0.357309,0.666896,0.822895,0.348808,-0.003874,0.769284,-0.030587,-0.676205,-0.386461,-0.927155,0.063207,0.496542,-0.287975,0.321459,1.467755,0.041362,-0.092136,-1.339324,patient_2415,mi,chunk_1
1,4.873685,-1.461977,-2.058691,4.553419,2.935823,-1.138437,-9.595811,1.357203,-0.472607,3.578254,-1.03031,3.109858,-0.885181,3.253007,4.126568,-1.385853,-1.607381,0.319206,1.616469,-2.058686,-0.294959,-0.877783,-0.309718,-7.930002,-0.41225,1.82572,2.08336,1.070589,0.499507,-4.554582,-3.443378,-1.676603,-0.531183,1.242262,-0.468809,-0.020872,-1.789332,-2.062501,-1.177218,0.751684,...,-0.629291,0.66396,-1.169181,1.798722,-0.390367,-1.565237,-1.509858,-1.41608,-0.186446,0.053241,0.219231,-0.209449,-1.280882,-0.231139,-0.043334,-1.741421,0.53597,-0.059562,1.145145,0.907467,-0.355314,-1.248045,-0.265758,0.574446,0.193662,0.651722,-1.09722,0.351268,-0.229389,-0.851568,0.849078,0.113429,-0.87421,0.055748,0.340438,0.766049,-0.743386,patient_2415,mi,chunk_2
2,1.998927,-3.802342,-0.592979,1.90137,4.419999,2.422615,-9.007257,0.775835,0.159821,-0.84785,1.757799,3.543745,1.89681,1.165673,3.538612,0.358387,-1.6386,1.330554,1.339022,-0.731971,0.084149,-0.896483,-0.071354,-5.737998,2.615555,1.048662,1.483151,-1.182454,1.97148,-2.356402,-3.146314,-1.554848,-0.637629,3.636393,0.427627,0.771211,-2.089188,-0.605176,-0.874739,0.322919,...,-0.004274,0.480277,-0.679322,0.563043,-1.331406,-0.666544,-1.392999,-0.183538,-0.11691,0.992789,1.203849,-0.365046,-0.322859,0.198757,-1.470981,-0.659867,-0.935897,-1.318334,0.582393,-0.127576,0.009138,-0.291834,-1.169602,0.196443,0.01579,0.294788,-0.359305,0.028365,-0.237452,-0.293907,1.087799,-1.360083,-0.480918,-0.305267,0.684268,1.480274,0.273683,patient_2415,mi,chunk_3
3,5.53832,0.15352,-3.554823,0.266149,3.783472,0.041783,-2.39658,0.788168,1.702126,-0.460574,-1.527564,6.548032,3.804456,-2.876596,0.636407,1.363994,-2.484851,-3.971149,2.773054,-1.692976,-0.270864,-1.116901,1.521629,-4.504526,4.035549,-0.446027,0.74541,0.999437,1.006302,-2.682152,-2.062914,-0.817249,-0.475916,2.496263,2.862525,1.934082,1.710814,-3.711843,-2.318868,-3.502922,...,0.08092,1.391779,-1.70525,1.095878,-0.044347,-1.071598,-0.664306,0.653375,-1.229281,0.457194,0.314625,-1.629657,0.768755,-0.074172,-1.661767,-0.936656,-0.150647,0.06123,1.07607,0.002966,1.340098,-0.650224,-0.447057,0.448374,0.868794,-0.270561,-2.100368,0.484121,-0.33546,-0.822153,0.381775,-1.033509,-0.128976,-0.449143,-0.03323,1.638922,0.393439,patient_2415,mi,chunk_4
4,6.602051,-0.410189,-2.279475,1.474503,3.340888,2.278934,-6.314006,2.789341,0.245165,1.044686,-0.779924,5.628798,0.820595,-0.861285,3.932437,1.145331,-3.118806,-0.811243,1.10838,0.960424,1.28687,-2.227382,0.162098,-4.490962,2.396699,0.080832,-0.610122,-1.045246,0.128221,-3.588759,-3.623985,-1.440632,-0.661872,1.619195,1.354921,1.320794,-0.281195,-1.537361,-0.521036,0.310191,...,-0.68983,0.906636,-1.805045,1.325876,-0.518347,-0.796667,-0.408704,-0.215765,-0.425125,0.532956,0.39183,0.02754,0.33168,0.057826,-0.93623,-0.609598,-0.629675,-0.705543,0.594026,0.378067,0.952553,-0.274359,0.140547,-0.26723,0.673836,-0.544581,-0.106491,-0.451732,-0.34868,-0.619559,0.485155,-0.951435,0.436977,-0.181716,0.621003,0.74037,0.03005,patient_2415,mi,chunk_5


In [9]:
#Columnas que no vamos a usar en el modelado
skip_columns = ["patient_id", "label", "chunk"]
le = LabelEncoder()

#Generamos las características y la variable objetivo 
X_train = train_data.drop(columns = skip_columns)
y_train = le.fit_transform(train_data["label"])
train_preds = train_data[["patient_id", "label"]]
train_preds["y_true"] = y_train

X_test = test_data.drop(columns = skip_columns)
y_test = le.fit_transform(test_data["label"])
test_preds = test_data[["patient_id", "label"]]
test_preds["y_true"] = y_test

X_val = val_data.drop(columns = skip_columns)
y_val = le.fit_transform(val_data["label"])
val_preds = val_data[["patient_id", "label"]]
val_preds["y_true"] = y_val


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_preds["y_true"] = y_train
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_preds["y_true"] = y_test
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_preds["y_true"] = y_val


In [10]:
recall_macro = make_scorer(recall_score, average='macro') #Todas las clases tienen el mismo peso
f1_score_macro = make_scorer(f1_score, average='macro') #Todas las clases tienen el mismo peso

## Logistic Regression

In [11]:
# Estimador
lr = LogisticRegression(penalty='l2', 
                        C=1e5, 
                        solver='lbfgs', 
                        random_state = 42)

# Parámetros
params_lr = {
    'penalty': ['l2'],
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
}
       
# Grid Search
grid_lr = GridSearchCV(estimator=lr,
                       param_grid=params_lr,
                       scoring='accuracy',
                       cv=5,
                       verbose=1,
                       n_jobs=-1)

In [12]:
tt.tic()
grid_lr.fit(X_train, y_train)
tt.toc()

Fitting 5 folds for each of 7 candidates, totalling 35 fits


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Elapsed time: 8.020092 seconds


8.020092010498047

In [13]:
#Pronóstico
y_pred_grid_lr = grid_lr.best_estimator_.predict(X_test)
y_pred_prob_grid_lr = grid_lr.best_estimator_.predict_proba(X_test)

In [14]:
test_preds_LR = test_preds.copy()
test_preds_LR["pred"] = y_pred_grid_lr
print(get_metrics_mode(test_preds_LR))

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.554167 |
| precision_weighted | 0.561159 |
| recall_weighted    | 0.554167 |
| f1_weighted        | 0.555268 |


In [15]:
grid_lr.best_params_

{'C': 0.0001, 'penalty': 'l2'}

In [16]:
print(params_to_markdown(grid_lr.best_params_))

| parámetro   | valor   |
|:------------|:--------|
| C           | 0.0001  |
| penalty     | l2      |


In [17]:
grid_lr.best_score_

0.5622199062011465

In [18]:
grid_lr.best_estimator_

In [19]:
joblib.dump(grid_lr, MODEL_PATH + "LR/logistic_regression_model.pkl")

LR_preds = pd.DataFrame({
    "patient_id" : test_data["patient_id"],
    "label" : test_data["label"],
    "y_true" : y_test,
    "pred" : y_pred_grid_lr,
})

LR_probas = pd.DataFrame(y_pred_prob_grid_lr, columns=[f"proba_clase_{c}" for c in grid_lr.classes_])
LR_results = pd.concat([LR_preds, LR_probas], axis=1)
LR_results.to_csv(MODEL_PATH + "LR/predictions.csv", index=False)


In [20]:
table_metrics(y_test,y_pred_grid_lr, y_pred_prob_grid_lr)

Unnamed: 0,metric,value
0,accuracy,0.519167
1,precision_weighted,0.526183
2,recall_weighted,0.519167
3,f1_weighted,0.521181
4,roc_auc_ovr,0.783896
5,log_loss,1.090416
6,gini_normalized,0.567791
7,ks_test_clase_0,0.401667
8,ks_test_clase_1,0.548889
9,ks_test_clase_2,0.402222


In [21]:
table_metrics(y_test,y_pred_grid_lr, y_pred_prob_grid_lr)

Unnamed: 0,metric,value
0,accuracy,0.519167
1,precision_weighted,0.526183
2,recall_weighted,0.519167
3,f1_weighted,0.521181
4,roc_auc_ovr,0.783896
5,log_loss,1.090416
6,gini_normalized,0.567791
7,ks_test_clase_0,0.401667
8,ks_test_clase_1,0.548889
9,ks_test_clase_2,0.402222


In [22]:
f1_score(y_test, y_pred_grid_lr, average='macro'),

(0.521181026913814,)

In [23]:
confusion_matrix(y_test, y_pred_grid_lr)

array([[296,  66,  89, 149],
       [123, 328, 108,  41],
       [ 67,  83, 312, 138],
       [137,  24, 129, 310]])

In [24]:
print(classification_report(y_test, y_pred_grid_lr))

              precision    recall  f1-score   support

           0       0.48      0.49      0.48       600
           1       0.65      0.55      0.60       600
           2       0.49      0.52      0.50       600
           3       0.49      0.52      0.50       600

    accuracy                           0.52      2400
   macro avg       0.53      0.52      0.52      2400
weighted avg       0.53      0.52      0.52      2400



In [25]:
genera_metricas_markdown(y_test,y_pred_grid_lr, y_pred_prob_grid_lr)

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.519167 |
| precision_weighted | 0.526183 |
| recall_weighted    | 0.519167 |
| f1_weighted        | 0.521181 |
| roc_auc_ovr        | 0.783896 |
| log_loss           | 1.09042  |
| gini_normalized    | 0.567791 |
| ks_test_clase_0    | 0.401667 |
| ks_test_clase_1    | 0.548889 |
| ks_test_clase_2    | 0.402222 |
| ks_test_clase_3    | 0.441111 |


|              |   precision |   recall |   f1-score |     support |
|:-------------|------------:|---------:|-----------:|------------:|
| 0            |    0.47512  | 0.493333 |   0.484056 |  600        |
| 1            |    0.654691 | 0.546667 |   0.595822 |  600        |
| 2            |    0.489028 | 0.52     |   0.504039 |  600        |
| 3            |    0.485893 | 0.516667 |   0.500808 |  600        |
| accuracy     |    0.519167 | 0.519167 |   0.519167 |    0.519167 |
| macro avg    |    0.526183 | 0.519167 |   0.521181 | 2400        |
| we

In [26]:
cm_lr = confusion_matrix(y_test, y_pred_grid_lr)
df_cm_lr = pd.DataFrame(cm_lr,
                         index = [f"Real {label}" for label in grid_lr.classes_],
                         columns = [f"Pred {label}" for label in grid_lr.classes_])
print(df_cm_lr.to_markdown())


|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |      296 |       66 |       89 |      149 |
| Real 1 |      123 |      328 |      108 |       41 |
| Real 2 |       67 |       83 |      312 |      138 |
| Real 3 |      137 |       24 |      129 |      310 |


## Random Forest Classifier

In [27]:
#Estimador
rfc = RandomForestClassifier(random_state = 42, 
                             n_jobs = -1, 
                             bootstrap = True)

#Parámetros
params_rfc = {'n_estimators': [100, 350, 500],
             'max_features': [ 'sqrt'],
             'max_depth': [5, 10, 20],
             'min_samples_split': [2, 10, 30],
             'min_samples_leaf': [2, 10, 30]}

#Grid Search
grid_rfc = GridSearchCV(estimator=rfc,
                       param_grid=params_rfc,
                       scoring=recall_macro,
                       cv=5,
                       verbose=1,
                       n_jobs=-1)

In [28]:
tt.tic()
grid_rfc.fit(X_train, y_train)
tt.toc()
#37m 48.7s

Fitting 5 folds for each of 81 candidates, totalling 405 fits




Elapsed time: 2191.364112 seconds


2191.3641119003296

In [29]:
grid_rfc.best_params_

{'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 500}

In [30]:
print(params_to_markdown(grid_rfc.best_params_))

| parámetro         | valor   |
|:------------------|:--------|
| max_depth         | 20      |
| max_features      | sqrt    |
| min_samples_leaf  | 2       |
| min_samples_split | 10      |
| n_estimators      | 500     |


In [31]:
grid_rfc.best_score_

0.5483854166666667

In [32]:
grid_rfc.best_estimator_

In [33]:
#Pronóstico
y_pred_grid_rfc = grid_rfc.predict(X_test)
y_pred_prob_grid_rfc = grid_rfc.predict_proba(X_test)

In [34]:
test_preds_RFC = test_preds.copy()
test_preds_RFC["pred"] = y_pred_grid_rfc
print(get_metrics_mode(test_preds_RFC))

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.545833 |
| precision_weighted | 0.557403 |
| recall_weighted    | 0.545833 |
| f1_weighted        | 0.548086 |


In [35]:
joblib.dump(grid_rfc, MODEL_PATH + "RFC/random_forest_model.pkl")

RFC_preds = pd.DataFrame({
    "patient_id" : test_data["patient_id"],
    "label" : test_data["label"],
    "y_true" : y_test,
    "pred" : y_pred_grid_rfc,
})

RFC_probas = pd.DataFrame(y_pred_prob_grid_rfc, columns=[f"proba_clase_{c}" for c in grid_rfc.classes_])
RFC_results = pd.concat([RFC_preds, RFC_probas], axis=1)
RFC_results.to_csv(MODEL_PATH + "RFC/predictions.csv", index=False)


In [36]:
table_metrics(y_test,y_pred_grid_rfc, y_pred_prob_grid_rfc)

Unnamed: 0,metric,value
0,accuracy,0.52875
1,precision_weighted,0.538321
2,recall_weighted,0.52875
3,f1_weighted,0.531489
4,roc_auc_ovr,0.780963
5,log_loss,1.163508
6,gini_normalized,0.561925
7,ks_test_clase_0,0.376111
8,ks_test_clase_1,0.537778
9,ks_test_clase_2,0.382222


In [37]:
confusion_matrix(y_test, y_pred_grid_rfc)

array([[319,  62,  93, 126],
       [119, 338, 110,  33],
       [109,  75, 295, 121],
       [141,  20, 122, 317]])

In [38]:
print(classification_report(y_test, y_pred_grid_rfc))

              precision    recall  f1-score   support

           0       0.46      0.53      0.50       600
           1       0.68      0.56      0.62       600
           2       0.48      0.49      0.48       600
           3       0.53      0.53      0.53       600

    accuracy                           0.53      2400
   macro avg       0.54      0.53      0.53      2400
weighted avg       0.54      0.53      0.53      2400



In [39]:
genera_metricas_markdown(y_test,y_pred_grid_rfc, y_pred_prob_grid_rfc)

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.52875  |
| precision_weighted | 0.538321 |
| recall_weighted    | 0.52875  |
| f1_weighted        | 0.531489 |
| roc_auc_ovr        | 0.780963 |
| log_loss           | 1.16351  |
| gini_normalized    | 0.561925 |
| ks_test_clase_0    | 0.376111 |
| ks_test_clase_1    | 0.537778 |
| ks_test_clase_2    | 0.382222 |
| ks_test_clase_3    | 0.473889 |


|              |   precision |   recall |   f1-score |    support |
|:-------------|------------:|---------:|-----------:|-----------:|
| 0            |    0.463663 | 0.531667 |   0.495342 |  600       |
| 1            |    0.682828 | 0.563333 |   0.617352 |  600       |
| 2            |    0.475806 | 0.491667 |   0.483607 |  600       |
| 3            |    0.530988 | 0.528333 |   0.529657 |  600       |
| accuracy     |    0.52875  | 0.52875  |   0.52875  |    0.52875 |
| macro avg    |    0.538321 | 0.52875  |   0.531489 | 2400       |
| weighted a

In [40]:
cm_rfc = confusion_matrix(y_test, y_pred_grid_rfc)
df_cm_rfc = pd.DataFrame(cm_rfc,
                         index = [f"Real {label}" for label in grid_rfc.classes_],
                         columns = [f"Pred {label}" for label in grid_rfc.classes_])
print(df_cm_rfc.to_markdown())


|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |      319 |       62 |       93 |      126 |
| Real 1 |      119 |      338 |      110 |       33 |
| Real 2 |      109 |       75 |      295 |      121 |
| Real 3 |      141 |       20 |      122 |      317 |


# Gradient Boosting Classifier

In [41]:
#Estimador
gb = GradientBoostingClassifier(learning_rate=0.05, 
                                subsample=0.5, 
                                max_depth=6, 
                                n_estimators=10,
                                 random_state=42,
                                )

#Parámetros
params_gb = {'n_estimators': [10,100], 
             'learning_rate' : [0.01,0.1],
             'subsample' : [0.5,1.0], 
             'max_depth': [5,10,20],
             'min_samples_split': [2, 10],
             'min_samples_leaf': [10, 30],
             'max_features': [ 'sqrt']}

#Grid Search
grid_gb = GridSearchCV(estimator=gb,
                       param_grid=params_gb,
                       scoring=recall_macro,
                       cv = 5,
                       verbose=1,
                       n_jobs=-1)

In [42]:
#Entrenamiento
tt.tic()
grid_gb.fit(X_train, y_train)
tt.toc()
#38m 33.4s

Fitting 5 folds for each of 96 candidates, totalling 480 fits
Elapsed time: 2417.115517 seconds


2417.1155169010162

In [43]:
grid_gb.best_params_

{'learning_rate': 0.01,
 'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_leaf': 30,
 'min_samples_split': 2,
 'n_estimators': 100,
 'subsample': 1.0}

In [44]:
print(params_to_markdown(grid_gb.best_params_))

| parámetro         | valor   |
|:------------------|:--------|
| learning_rate     | 0.01    |
| max_depth         | 20      |
| max_features      | sqrt    |
| min_samples_leaf  | 30      |
| min_samples_split | 2       |
| n_estimators      | 100     |
| subsample         | 1.0     |


In [45]:
grid_gb.best_score_

0.5492677887961029

In [46]:
grid_gb.best_estimator_

In [47]:
#Predicciones
y_pred_grid_gb = grid_gb.predict(X_test)
y_pred_prob_grid_gb = grid_gb.predict_proba(X_test)

In [48]:
test_preds_GB = test_preds.copy()
test_preds_GB["pred"] = y_pred_grid_gb
print(get_metrics_mode(test_preds_GB))

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.554167 |
| precision_weighted | 0.566695 |
| recall_weighted    | 0.554167 |
| f1_weighted        | 0.557091 |


In [49]:
joblib.dump(grid_rfc, MODEL_PATH + "GB/gradient_boosting_model.pkl")

GB_preds = pd.DataFrame({
    "patient_id" : test_data["patient_id"],
    "label" : test_data["label"],
    "y_true" : y_test,
    "pred" : y_pred_grid_gb,
})

GB_probas = pd.DataFrame(y_pred_prob_grid_gb, columns=[f"proba_clase_{c}" for c in grid_gb.classes_])
GB_results = pd.concat([GB_preds, GB_probas], axis=1)
GB_results.to_csv(MODEL_PATH + "GB/predictions.csv", index=False)


In [50]:
table_metrics(y_test,y_pred_grid_gb, y_pred_prob_grid_gb)

Unnamed: 0,metric,value
0,accuracy,0.530833
1,precision_weighted,0.541456
2,recall_weighted,0.530833
3,f1_weighted,0.533487
4,roc_auc_ovr,0.789092
5,log_loss,1.190363
6,gini_normalized,0.578184
7,ks_test_clase_0,0.373333
8,ks_test_clase_1,0.550556
9,ks_test_clase_2,0.415


In [51]:
confusion_matrix(y_test, y_pred_grid_gb)

array([[325,  67,  88, 120],
       [117, 328, 117,  38],
       [ 97,  71, 309, 123],
       [160,  15, 113, 312]])

In [52]:
print(classification_report(y_test, y_pred_grid_gb))

              precision    recall  f1-score   support

           0       0.46      0.54      0.50       600
           1       0.68      0.55      0.61       600
           2       0.49      0.52      0.50       600
           3       0.53      0.52      0.52       600

    accuracy                           0.53      2400
   macro avg       0.54      0.53      0.53      2400
weighted avg       0.54      0.53      0.53      2400



In [53]:
genera_metricas_markdown(y_test,y_pred_grid_gb, y_pred_prob_grid_gb)

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.530833 |
| precision_weighted | 0.541456 |
| recall_weighted    | 0.530833 |
| f1_weighted        | 0.533487 |
| roc_auc_ovr        | 0.789092 |
| log_loss           | 1.19036  |
| gini_normalized    | 0.578184 |
| ks_test_clase_0    | 0.373333 |
| ks_test_clase_1    | 0.550556 |
| ks_test_clase_2    | 0.415    |
| ks_test_clase_3    | 0.485556 |


|              |   precision |   recall |   f1-score |     support |
|:-------------|------------:|---------:|-----------:|------------:|
| 0            |    0.46495  | 0.541667 |   0.500385 |  600        |
| 1            |    0.681913 | 0.546667 |   0.606846 |  600        |
| 2            |    0.492823 | 0.515    |   0.503667 |  600        |
| 3            |    0.526138 | 0.52     |   0.523051 |  600        |
| accuracy     |    0.530833 | 0.530833 |   0.530833 |    0.530833 |
| macro avg    |    0.541456 | 0.530833 |   0.533487 | 2400        |
| we

In [54]:
cm_gb = confusion_matrix(y_test, y_pred_grid_gb)
df_cm_gb = pd.DataFrame(cm_gb,
                         index = [f"Real {label}" for label in grid_gb.classes_],
                         columns = [f"Pred {label}" for label in grid_gb.classes_])
print(df_cm_gb.to_markdown())

|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |      325 |       67 |       88 |      120 |
| Real 1 |      117 |      328 |      117 |       38 |
| Real 2 |       97 |       71 |      309 |      123 |
| Real 3 |      160 |       15 |      113 |      312 |


## Naive Bayes

In [55]:
gnb = GaussianNB()

params_gnb = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
}
#Grid Search
grid_gnb = GridSearchCV(estimator=gnb,
                       param_grid=params_gnb,
                       scoring=recall_macro,
                       cv=5,
                       verbose=1,
                       n_jobs=-1)

In [56]:
#Entrenamiento
tt.tic()
grid_gnb.fit(X_train, y_train)
tt.toc()

Fitting 5 folds for each of 7 candidates, totalling 35 fits
Elapsed time: 0.610569 seconds


0.6105692386627197

In [57]:
grid_gnb.best_params_

{'var_smoothing': 1e-05}

In [58]:
print(params_to_markdown(grid_gnb.best_params_))

| parámetro     |   valor |
|:--------------|--------:|
| var_smoothing |   1e-05 |


In [59]:
grid_gnb.best_score_

0.38713324199721644

In [60]:
#Predicciones
y_pred_grid_gnb = grid_gnb.predict(X_test)
y_pred_prob_grid_gnb = grid_gnb.predict_proba(X_test)

In [61]:
test_preds_GNB = test_preds.copy()
test_preds_GNB["pred"] = y_pred_grid_gnb
print(get_metrics_mode(test_preds_GNB))

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.383333 |
| precision_weighted | 0.420908 |
| recall_weighted    | 0.383333 |
| f1_weighted        | 0.364589 |


In [62]:
joblib.dump(grid_gnb, MODEL_PATH + "GNB/Gaussian_Naive_Bayes_model.pkl")

GNB_preds = pd.DataFrame({
    "patient_id" : test_data["patient_id"],
    "label" : test_data["label"],
    "y_true" : y_test,
    "pred" : y_pred_grid_gnb,
})

GNB_probas = pd.DataFrame(y_pred_prob_grid_gnb, columns=[f"proba_clase_{c}" for c in grid_gnb.classes_])
GNB_results = pd.concat([GNB_preds, GNB_probas], axis=1)
GNB_results.to_csv(MODEL_PATH + "GNB/predictions.csv", index=False)


In [63]:
table_metrics(y_test,y_pred_grid_gnb, y_pred_prob_grid_gnb)

Unnamed: 0,metric,value
0,accuracy,0.360417
1,precision_weighted,0.375583
2,recall_weighted,0.360417
3,f1_weighted,0.342724
4,roc_auc_ovr,0.649275
5,log_loss,5.066784
6,gini_normalized,0.29855
7,ks_test_clase_0,0.23
8,ks_test_clase_1,0.313889
9,ks_test_clase_2,0.187222


In [64]:
confusion_matrix(y_test, y_pred_grid_gnb)

array([[167, 266,  90,  77],
       [100, 407,  35,  58],
       [ 47, 359, 130,  64],
       [ 94, 181, 164, 161]])

In [65]:
print(classification_report(y_test, y_pred_grid_gnb))

              precision    recall  f1-score   support

           0       0.41      0.28      0.33       600
           1       0.34      0.68      0.45       600
           2       0.31      0.22      0.26       600
           3       0.45      0.27      0.34       600

    accuracy                           0.36      2400
   macro avg       0.38      0.36      0.34      2400
weighted avg       0.38      0.36      0.34      2400



In [66]:
genera_metricas_markdown(y_test,y_pred_grid_gnb, y_pred_prob_grid_gnb)

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.360417 |
| precision_weighted | 0.375583 |
| recall_weighted    | 0.360417 |
| f1_weighted        | 0.342724 |
| roc_auc_ovr        | 0.649275 |
| log_loss           | 5.06678  |
| gini_normalized    | 0.29855  |
| ks_test_clase_0    | 0.23     |
| ks_test_clase_1    | 0.313889 |
| ks_test_clase_2    | 0.187222 |
| ks_test_clase_3    | 0.257778 |


|              |   precision |   recall |   f1-score |     support |
|:-------------|------------:|---------:|-----------:|------------:|
| 0            |    0.409314 | 0.278333 |   0.331349 |  600        |
| 1            |    0.335532 | 0.678333 |   0.44898  |  600        |
| 2            |    0.310263 | 0.216667 |   0.255152 |  600        |
| 3            |    0.447222 | 0.268333 |   0.335417 |  600        |
| accuracy     |    0.360417 | 0.360417 |   0.360417 |    0.360417 |
| macro avg    |    0.375583 | 0.360417 |   0.342724 | 2400        |
| we

In [67]:
cm_gnb = confusion_matrix(y_test, y_pred_grid_gnb)
df_cm_gnb = pd.DataFrame(cm_gnb,
                         index = [f"Real {label}" for label in grid_gnb.classes_],
                         columns = [f"Pred {label}" for label in grid_gnb.classes_])
print(df_cm_gnb.to_markdown())

|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |      167 |      266 |       90 |       77 |
| Real 1 |      100 |      407 |       35 |       58 |
| Real 2 |       47 |      359 |      130 |       64 |
| Real 3 |       94 |      181 |      164 |      161 |


## XGB Classifier

In [68]:
xgb_base = xgb.XGBClassifier(
    learning_rate=0.1,
    n_estimators=10000,
    # eval_metric="merror",
    eval_metric="auc",
    objective="multi:softprob",
    early_stopping_rounds=1000,
    
)

In [70]:
#Predicciones
y_pred_grid_xgb_b = xgb_base.predict(X_val)
y_pred_prob_grid_xgb_b = xgb_base.predict_proba(X_val)

In [71]:
test_preds_XGB = val_preds.copy()
test_preds_XGB["pred"] = y_pred_grid_xgb_b
print(get_metrics_mode(test_preds_XGB))

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.593361 |
| precision_weighted | 0.594469 |
| recall_weighted    | 0.593361 |
| f1_weighted        | 0.593719 |


In [72]:
table_metrics(y_val,y_pred_grid_xgb_b, y_pred_prob_grid_xgb_b)

Unnamed: 0,metric,value
0,accuracy,0.573444
1,precision_weighted,0.573925
2,recall_weighted,0.573444
3,f1_weighted,0.573673
4,roc_auc_ovr,0.824416
5,log_loss,1.280273
6,gini_normalized,0.648832
7,ks_test_clase_0,0.459024
8,ks_test_clase_1,0.611123
9,ks_test_clase_2,0.447823


In [73]:
table_metrics(y_val,y_pred_grid_xgb_b, y_pred_prob_grid_xgb_b)

Unnamed: 0,metric,value
0,accuracy,0.573444
1,precision_weighted,0.573925
2,recall_weighted,0.573444
3,f1_weighted,0.573673
4,roc_auc_ovr,0.824416
5,log_loss,1.280273
6,gini_normalized,0.648832
7,ks_test_clase_0,0.459024
8,ks_test_clase_1,0.611123
9,ks_test_clase_2,0.447823


In [74]:
joblib.dump(xgb_base, MODEL_PATH + "XGB/XGB_base_model.pkl")

XGB_base_preds = pd.DataFrame({
    "patient_id" : val_data["patient_id"],
    "label" : val_data["label"],
    "y_true" : y_val,
    "pred" : y_pred_grid_xgb_b,
})

XGB_base_probas = pd.DataFrame(y_pred_prob_grid_xgb_b, columns=[f"proba_clase_{c}" for c in xgb_base.classes_])
XGB_base_results = pd.concat([XGB_base_preds, XGB_base_probas], axis=1)
XGB_base_results.to_csv(MODEL_PATH + "XGB/predictions_base.csv", index=False)


In [75]:
genera_metricas_markdown(y_val,y_pred_grid_xgb_b, y_pred_prob_grid_xgb_b)

| metric             |    value |
|:-------------------|---------:|
| accuracy           | 0.573444 |
| precision_weighted | 0.573925 |
| recall_weighted    | 0.573444 |
| f1_weighted        | 0.573673 |
| roc_auc_ovr        | 0.824416 |
| log_loss           | 1.28027  |
| gini_normalized    | 0.648832 |
| ks_test_clase_0    | 0.459024 |
| ks_test_clase_1    | 0.611123 |
| ks_test_clase_2    | 0.447823 |
| ks_test_clase_3    | 0.500985 |


|              |   precision |   recall |   f1-score |     support |
|:-------------|------------:|---------:|-----------:|------------:|
| 0            |    0.516447 | 0.523333 |   0.519868 |  600        |
| 1            |    0.688552 | 0.681667 |   0.685092 |  600        |
| 2            |    0.534314 | 0.536066 |   0.535188 |  610        |
| 3            |    0.557047 | 0.553333 |   0.555184 |  600        |
| accuracy     |    0.573444 | 0.573444 |   0.573444 |    0.573444 |
| macro avg    |    0.57409  | 0.5736   |   0.573833 | 2410        |
| we

In [76]:
cm_xgb_base = confusion_matrix(y_val, y_pred_grid_xgb_b)
df_cm_xgb_base = pd.DataFrame(cm_xgb_base,
                         index = [f"Real {label}" for label in xgb_base.classes_],
                         columns = [f"Pred {label}" for label in xgb_base.classes_])
print(df_cm_xgb_base.to_markdown())

|        |   Pred 0 |   Pred 1 |   Pred 2 |   Pred 3 |
|:-------|---------:|---------:|---------:|---------:|
| Real 0 |      314 |      107 |       64 |      115 |
| Real 1 |       89 |      409 |       85 |       17 |
| Real 2 |       90 |       61 |      327 |      132 |
| Real 3 |      115 |       17 |      136 |      332 |


In [77]:
print(params_to_markdown(xgb_base.get_params()))

| parámetro             | valor          |
|:----------------------|:---------------|
| objective             | multi:softprob |
| early_stopping_rounds | 1000           |
| enable_categorical    | False          |
| eval_metric           | auc            |
| learning_rate         | 0.1            |
| missing               | nan            |
| n_estimators          | 10000          |
