## Carga de datos

In [1]:
# Código obtenido desde https://archive.ics.uci.edu/dataset/579/myocardial+infarction+complications

import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo 
  
# Download the UCI "Myocardial infarction complications" dataset (repository id 579).
myocardial_infarction_complications = fetch_ucirepo(id=579)

# Features and targets (pandas DataFrames, per the upstream snippet).
X, y = (
    myocardial_infarction_complications.data.features,
    myocardial_infarction_complications.data.targets,
)

# Metadata 
# print(myocardial_infarction_complications.metadata) 

# Información de variables 
# print(myocardial_infarction_complications.variables) 

## Ingeniería de variables

In [2]:
# Build the model inputs and the binary target.
# Missing values are replaced by the sentinel -1; fillna returns a new frame,
# so the original X is left untouched.
X_1 = X.fillna(-1)

# Target is True whenever LET_IS differs from 0.
# NOTE(review): presumably 0 encodes "no lethal outcome" — confirm against the dataset docs.
y_1 = y['LET_IS'].ne(0)

In [3]:
# Feature selection: fit a single decision tree (default settings, fixed seed)
# on every feature and record the importance the tree assigns to each one.
# NOTE(review): the tree is fitted on the full dataset, before any cross-validation
# split, so the CV metrics computed later may be optimistic — confirm this is acceptable.

from sklearn.tree import DecisionTreeClassifier

# fit() returns the estimator itself, so construction and fitting can be chained.
feature_selection_model = DecisionTreeClassifier(random_state=123).fit(X_1, y_1)

# One row per feature: its column name and its importance in the fitted tree.
feature_importances = pd.DataFrame(
    {
        'Feature': X_1.columns,
        'Importance': feature_selection_model.feature_importances_,
    }
)

In [4]:
# Features whose importance in the selection tree is exactly 0 are discarded;
# features with non-zero importance are kept.
features_to_delete = feature_importances.query('Importance==0')['Feature'].values

# Reassign instead of dropping in place, and ignore columns that are already gone,
# so re-running this cell on an already reduced frame does not raise KeyError.
X_1 = X_1.drop(columns=features_to_delete, errors='ignore')
print(f'Se ha reducido el número de variables de {X.shape[1]} a {X_1.shape[1]}.')

Se ha reducido el número de variables de 111 a 58.


In [5]:
# Adaptado de https://github.com/optuna/optuna-examples/blob/main/sklearn/sklearn_simple.py

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: mean 5-fold weighted F1 of a random forest on (X_1, y_1).

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the classifier name and the random-forest hyperparameters
        ("rf_max_depth", "rf_n_estimators", "rf_min_samples_leaf", "rf_criterion").

    Returns
    -------
    float
        Mean of the per-fold ``f1_weighted`` scores (the study maximizes it).
    """
    classifier_name = trial.suggest_categorical("classifier", ["RandomForest"])
    if classifier_name == "RandomForest":
        # The dict entries are evaluated top to bottom, so the suggest_* calls
        # happen in the same order as the hyperparameters are listed here.
        rf_params = {
            "max_depth": trial.suggest_int("rf_max_depth", 3, 15, log=False),
            "n_estimators": trial.suggest_int("rf_n_estimators", 20, 500, log=True),
            "min_samples_leaf": trial.suggest_int("rf_min_samples_leaf", 3, 100, log=True),
            "criterion": trial.suggest_categorical("rf_criterion", ["entropy","gini","log_loss"]),
        }
        classifier_obj = RandomForestClassifier(random_state=123, n_jobs=-1, **rf_params)

    # Reads the notebook-level X_1 / y_1 (reduced features and binary target).
    fold_scores = cross_val_score(
        classifier_obj, X_1, y_1, n_jobs=-1, cv=5, scoring='f1_weighted'
    )
    return fold_scores.mean()


# Seed the sampler so the hyperparameter search (and every downstream cell that
# reads study.best_trial) gives the same result after Restart & Run All.
# Without an explicit sampler the study samples from an unseeded RNG.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=123),
)
study.optimize(objective, n_trials=40)
print(study.best_trial)

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-06-17 17:55:19,722] A new study created in memory with name: no-name-3fcaf9aa-381e-4026-aaec-fb0e16b85121
[I 2024-06-17 17:55:22,795] Trial 0 finished with value: 0.767786055252772 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 6, 'rf_n_estimators': 217, 'rf_min_samples_leaf': 84, 'rf_criterion': 'gini'}. Best is trial 0 with value: 0.767786055252772.
[I 2024-06-17 17:55:25,081] Trial 1 finished with value: 0.8950814719085767 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 5, 'rf_n_estimators': 306, 'rf_min_samples_leaf': 38, 'rf_criterion': 'log_loss'}. Best is trial 1 with value: 0.8950814719085767.
[I 2024-06-17 17:55:26,644] Trial 2 finished with value: 0.8950814719085767 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 14, 'rf_n_estimators': 72, 'rf_min_samples_leaf': 26, 'rf_criterion': 'gini'}. Best is trial 1 with value: 0.8950814719085767.
[I 2024-06-17 17:55:26,889] Trial 3 

FrozenTrial(number=25, state=1, values=[0.9046656463038818], datetime_start=datetime.datetime(2024, 6, 17, 17, 55, 31, 888193), datetime_complete=datetime.datetime(2024, 6, 17, 17, 55, 32, 32909), params={'classifier': 'RandomForest', 'rf_max_depth': 10, 'rf_n_estimators': 37, 'rf_min_samples_leaf': 3, 'rf_criterion': 'log_loss'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest',)), 'rf_max_depth': IntDistribution(high=15, log=False, low=3, step=1), 'rf_n_estimators': IntDistribution(high=500, log=True, low=20, step=1), 'rf_min_samples_leaf': IntDistribution(high=100, log=True, low=3, step=1), 'rf_criterion': CategoricalDistribution(choices=('entropy', 'gini', 'log_loss'))}, trial_id=25, value=None)


In [6]:
# Hyperparameters of the best trial found by the study.
study.best_params

{'classifier': 'RandomForest',
 'rf_max_depth': 10,
 'rf_n_estimators': 37,
 'rf_min_samples_leaf': 3,
 'rf_criterion': 'log_loss'}

In [7]:
# Shape of the feature matrix after feature selection: (rows, columns).
X_1.shape

(1700, 58)

## Validación cruzada

In [15]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.ensemble import RandomForestClassifier

N_FOLDS = 5

# NOTE(review): `kf` and `y_hat_rf` are not used by the scoring below, which runs
# 10-fold CV rather than N_FOLDS folds. They are kept unchanged in case other cells
# rely on them — confirm whether they should be wired in or removed.
kf = KFold(n_splits=N_FOLDS, random_state=123, shuffle=True)
y_hat_rf = np.zeros(len(X_1))

# Map the tuned hyperparameters (stored with an "rf_" prefix in the study)
# to the keyword arguments expected by RandomForestClassifier.
params = {
    'max_depth': study.best_trial.params['rf_max_depth'],
    'n_estimators': study.best_trial.params['rf_n_estimators'],
    'min_samples_leaf': study.best_trial.params['rf_min_samples_leaf'],
    'criterion': study.best_trial.params['rf_criterion'],
    'n_jobs': -1,
    'random_state': 123,
}

model = RandomForestClassifier(**params)
# Fit on the full dataset so the fitted estimator can be stored as an artifact.
model.fit(X_1, y_1)

# Score every metric in a single cross-validation pass: each fold is fitted once
# and evaluated with all scorers, instead of refitting the same folds per metric.
# cross_validate works on clones, so the fitted `model` above is left untouched.
# NOTE(review): features and hyperparameters were selected on this same data, so
# these estimates may be optimistic.
cv_results = cross_validate(
    model,
    X_1,
    y_1,
    n_jobs=-1,
    cv=10,
    scoring=['roc_auc', 'f1_weighted', 'precision_weighted', 'recall_weighted', 'accuracy'],
)

# Per-fold score arrays, kept under the names the rest of the notebook reads.
# Note: three of these names coincide with functions in sklearn.metrics.
auc_score = cv_results['test_roc_auc']
wF1_score = cv_results['test_f1_weighted']
precision_score = cv_results['test_precision_weighted']
recall_score = cv_results['test_recall_weighted']
accuracy_score = cv_results['test_accuracy']

# Mean scores next to the reference values quoted from Garces et al.
print(f'F1 ponderado: {np.round(wF1_score.mean(), 3)}. Garces et. al. : 0.900')
print(f'Precisión: {np.round(precision_score.mean(), 3)}. Garces et. al. : 0.899')
print(f'Recall: {np.round(recall_score.mean(), 3)}. Garces et. al. : 0.903')
print(f'Accuracy: {np.round(accuracy_score.mean(), 3)}. Garces et. al. : 0.903')

F1 ponderado: 0.906. Garces et. al. : 0.900
Precisión: 0.92. Garces et. al. : 0.899
Recall: 0.918. Garces et. al. : 0.903
Accuracy: 0.918. Garces et. al. : 0.903


### Opcional: Registro de Experimento y Métricas en MLflow

In [9]:
# Ejecutar el siguiente comando en terminal: mlflow ui

In [16]:
# Log the experiment run to MLflow.
# Adapted from https://mlflow.org/docs/latest/getting-started/intro-quickstart/index.html
import mlflow
from mlflow.models import infer_signature

# Requires a tracking server listening locally (started with `mlflow ui`).
mlflow.set_tracking_uri(uri="http://localhost:5000")

mlflow.set_experiment("Modelo de Prueba")

with mlflow.start_run():
    # Hyperparameters used to build the model
    mlflow.log_params(params)

    # Cross-validated metrics (mean over folds)
    mlflow.log_metric("ROC_AUC", auc_score.mean())
    mlflow.log_metric("Weighted_F1", wF1_score.mean())

    # Tags
    mlflow.set_tag("Random Forest", "Modelo base")

    # Model signature. The sklearn flavor serves the model through predict() by
    # default, so the output schema is inferred from predict() rather than from
    # predict_proba(); otherwise the recorded signature would not describe what
    # the logged model actually returns.
    signature = infer_signature(X_1, model.predict(X_1))

    # Log and register the fitted model
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="modelos_experimento",
        signature=signature,
        input_example=X_1.head(),
        registered_model_name="base-randomforest-tuned",
    )

Registered model 'base-randomforest-tuned' already exists. Creating a new version of this model...
2024/06/17 17:58:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: base-randomforest-tuned, version 3
Created version '3' of model 'base-randomforest-tuned'.
