## Carga de datos

In [54]:
# Código obtenido desde https://archive.ics.uci.edu/dataset/579/myocardial+infarction+complications

import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo 
  
# Extraer dataset
myocardial_infarction_complications = fetch_ucirepo(id=579);

# Data (como DataFrames de Pandas) 
X = myocardial_infarction_complications.data.features 
y = myocardial_infarction_complications.data.targets 

## Ingeniería de variables

In [38]:
# Construcción de variables y target
X_1 = X.copy()

y_1 = y['LET_IS']!=0

In [39]:
# Selección de variables: Se utiliza un Arbol de decisión que busque identificar todas las variables necesarias 
# para identificar completamente cada elemento del target.

from sklearn.tree import DecisionTreeClassifier

feature_selection_model = DecisionTreeClassifier(random_state=123)
feature_selection_model.fit(X_1, y_1)

feature_importances = pd.DataFrame({
    'Feature': X_1.columns,
    'Importance': feature_selection_model.feature_importances_
})

In [40]:
# Las variables que no tienen importancia 0, se descartan.
features_to_delete = feature_importances.query('Importance==0')['Feature'].values
X_1.drop(columns=features_to_delete, inplace=True)
print(f'Se ha reducido el número de variables de {X.shape[1]} a {X_1.shape[1]}.')

Se ha reducido el número de variables de 111 a 48.


In [48]:
# Adaptado de https://github.com/optuna/optuna-examples/blob/main/sklearn/sklearn_simple.py

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):

    classifier_name = trial.suggest_categorical("classifier", ["RandomForest"])
    if classifier_name == "RandomForest":
        rf_max_depth = trial.suggest_int("rf_max_depth", 3, 15, log=False)
        rf_n_estimators = trial.suggest_int("rf_n_estimators", 20, 500, log=True)
        rf_min_samples_leaf = trial.suggest_int("rf_min_samples_leaf", 3, 100, log=True)
        rf_criterion = trial.suggest_categorical("rf_criterion", ["entropy","gini","log_loss"])
        classifier_obj = RandomForestClassifier(
            max_depth=rf_max_depth,
            n_estimators=rf_n_estimators,
            min_samples_leaf=rf_min_samples_leaf,
            criterion=rf_criterion,
            random_state=123,
            n_jobs=-1
        )

    score = cross_val_score(classifier_obj, X_1, y_1, n_jobs=-1, cv=5, scoring='f1_weighted')
    f1_weighted = score.mean()
    return f1_weighted


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)
print(study.best_trial)

[I 2024-06-17 15:03:16,340] A new study created in memory with name: no-name-ad29b486-f8a0-495c-8c31-036201e72390
[I 2024-06-17 15:03:16,835] Trial 0 finished with value: 0.905521420367559 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 13, 'rf_n_estimators': 154, 'rf_min_samples_leaf': 5, 'rf_criterion': 'entropy'}. Best is trial 0 with value: 0.905521420367559.
[I 2024-06-17 15:03:16,955] Trial 1 finished with value: 0.9006321513482256 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 13, 'rf_n_estimators': 24, 'rf_min_samples_leaf': 7, 'rf_criterion': 'entropy'}. Best is trial 0 with value: 0.905521420367559.
[I 2024-06-17 15:03:17,403] Trial 2 finished with value: 0.767786055252772 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 8, 'rf_n_estimators': 190, 'rf_min_samples_leaf': 100, 'rf_criterion': 'log_loss'}. Best is trial 0 with value: 0.905521420367559.
[I 2024-06-17 15:03:17,563] Trial 3 finished with value: 0.8943308340084994 and par

FrozenTrial(number=26, state=1, values=[0.9095447747758685], datetime_start=datetime.datetime(2024, 6, 17, 15, 3, 24, 433202), datetime_complete=datetime.datetime(2024, 6, 17, 15, 3, 24, 636443), params={'classifier': 'RandomForest', 'rf_max_depth': 8, 'rf_n_estimators': 65, 'rf_min_samples_leaf': 3, 'rf_criterion': 'gini'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest',)), 'rf_max_depth': IntDistribution(high=15, log=False, low=3, step=1), 'rf_n_estimators': IntDistribution(high=500, log=True, low=20, step=1), 'rf_min_samples_leaf': IntDistribution(high=100, log=True, low=3, step=1), 'rf_criterion': CategoricalDistribution(choices=('entropy', 'gini', 'log_loss'))}, trial_id=26, value=None)


In [50]:
study.best_trial.params

{'classifier': 'RandomForest',
 'rf_max_depth': 8,
 'rf_n_estimators': 65,
 'rf_min_samples_leaf': 3,
 'rf_criterion': 'gini'}

## Entrenamiento del modelo

In [51]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

N_FOLDS = 5

kf = KFold(n_splits=N_FOLDS, random_state=123, shuffle=True)
y_hat_rf = np.zeros(len(X_1))

params = {
    'max_depth': study.best_trial.params['rf_max_depth'],
    'n_estimators': study.best_trial.params['rf_n_estimators'],
    'min_samples_leaf': study.best_trial.params['rf_min_samples_leaf'],
    'n_jobs': -1,
    'random_state': 123,
    'criterion': 'entropy',
}

for i, (train_index, valid_index) in enumerate(kf.split(X)):
    X_train = X_1.iloc[train_index, :]
    y_train = y_1.iloc[train_index]
    X_valid = X_1.iloc[valid_index, :]
    y_valid = y_1.iloc[valid_index]

    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    y_hat_valid = model.predict_proba(X_valid)[:,1]
    y_hat_rf[valid_index] = y_hat_valid

## Evaluación de resultados

In [52]:
from sklearn.metrics import roc_auc_score, f1_score

auc_score = roc_auc_score(y_1, y_hat_rf)
wF1_score = f1_score(y_1, y_hat_rf>=0.5, average='weighted')

print(f'ROC-AUC : {auc_score}.')
print(f'F1 ponderado: {wF1_score}.')

ROC-AUC : 0.9312553097539372.
F1 ponderado: 0.905658250855685.


### Opcional: Registro de Experimento y Métricas en MLFlow

In [13]:
# Ejecutar el siguiente comando en terminal: mlflow ui

In [53]:
# Registro en MLFlow
# Adaptado de https://mlflow.org/docs/latest/getting-started/intro-quickstart/index.html
import mlflow
from mlflow.models import infer_signature

mlflow.set_tracking_uri(uri="http://localhost:5000")

mlflow.set_experiment("Modelo de Prueba")

with mlflow.start_run():
    # Registro de hiperparámetros
    mlflow.log_params(params)

    # Registro de métricas
    mlflow.log_metric("ROC_AUC", auc_score)
    mlflow.log_metric("Weighted_F1", wF1_score)

    # Etiquetas
    mlflow.set_tag("Random Forest", "Modelo base")

    # Signature del modelo
    signature = infer_signature(X_train, model.predict_proba(X_train)[:,1])

    # Registro del modelo
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="modelos_experimento",
        signature=signature,
        input_example=X_train.head(),
        registered_model_name="base-randomforest-tuned",
    )

Successfully registered model 'base-randomforest-tuned'.
2024/06/17 15:08:29 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: base-randomforest-tuned, version 1
Created version '1' of model 'base-randomforest-tuned'.
