## Carga de datos

In [1]:
!pip3 install -U ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
# Código obtenido desde https://archive.ics.uci.edu/dataset/579/myocardial+infarction+complications

import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Extraer dataset
myocardial_infarction_complications = fetch_ucirepo(id=579);

# Data (como DataFrames de Pandas)
X = myocardial_infarction_complications.data.features
y = myocardial_infarction_complications.data.targets

# Metadata
# print(myocardial_infarction_complications.metadata)

# Información de variables
# print(myocardial_infarction_complications.variables)

## Ingeniería de variables

In [3]:
# Construcción de variables y target
X_1 = X.copy()
X_1 = X_1.fillna(-1)

y_1 = y['LET_IS']!=0

In [4]:
# Selección de variables: Se utiliza un Arbol de decisión que busque identificar todas las variables necesarias
# para identificar completamente cada elemento del target.

from sklearn.tree import DecisionTreeClassifier

feature_selection_model = DecisionTreeClassifier(random_state=123)
feature_selection_model.fit(X_1, y_1)

feature_importances = pd.DataFrame({
    'Feature': X_1.columns,
    'Importance': feature_selection_model.feature_importances_
})

In [5]:
# Las variables que no tienen importancia 0, se descartan.
features_to_delete = feature_importances.query('Importance==0')['Feature'].values
X_1.drop(columns=features_to_delete, inplace=True)
print(f'Se ha reducido el número de variables de {X.shape[1]} a {X_1.shape[1]}.')

Se ha reducido el número de variables de 111 a 58.


In [6]:
pip install optuna

Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


# RandomForest

In [7]:
# Adaptado de https://github.com/optuna/optuna-examples/blob/main/sklearn/sklearn_simple.py

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

def objective(trial):

    classifier_name = trial.suggest_categorical("classifier", ["RandomForest"])
    if classifier_name == "RandomForest":
        rf_max_depth = trial.suggest_int("rf_max_depth", 3, 15, log=False)
        rf_n_estimators = trial.suggest_int("rf_n_estimators", 20, 500, log=True)
        rf_min_samples_leaf = trial.suggest_int("rf_min_samples_leaf", 3, 100, log=True)
        rf_criterion = trial.suggest_categorical("rf_criterion", ["entropy","gini","log_loss"])
        classifier_obj = RandomForestClassifier(
            max_depth=rf_max_depth,
            n_estimators=rf_n_estimators,
            min_samples_leaf=rf_min_samples_leaf,
            criterion=rf_criterion,
            random_state=123,
            n_jobs=-1
        )

    score = cross_val_score(classifier_obj, X_1, y_1, n_jobs=-1, cv=5, scoring='f1_weighted')
    f1_weighted = score.mean()
    return f1_weighted


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)
print(study.best_trial)

[I 2024-06-24 18:15:17,117] A new study created in memory with name: no-name-6a331a89-0ead-4a40-8427-c8517e396a89
[I 2024-06-24 18:15:27,148] Trial 0 finished with value: 0.8955935541501873 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 14, 'rf_n_estimators': 121, 'rf_min_samples_leaf': 20, 'rf_criterion': 'entropy'}. Best is trial 0 with value: 0.8955935541501873.
[I 2024-06-24 18:15:28,985] Trial 1 finished with value: 0.767786055252772 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 12, 'rf_n_estimators': 48, 'rf_min_samples_leaf': 90, 'rf_criterion': 'log_loss'}. Best is trial 0 with value: 0.8955935541501873.
[I 2024-06-24 18:15:36,559] Trial 2 finished with value: 0.8950814719085768 and parameters: {'classifier': 'RandomForest', 'rf_max_depth': 6, 'rf_n_estimators': 234, 'rf_min_samples_leaf': 7, 'rf_criterion': 'log_loss'}. Best is trial 0 with value: 0.8955935541501873.
[I 2024-06-24 18:15:37,905] Trial 3 finished with value: 0.8897918242793359 an

FrozenTrial(number=36, state=TrialState.COMPLETE, values=[0.9048269290984438], datetime_start=datetime.datetime(2024, 6, 24, 18, 16, 34, 419044), datetime_complete=datetime.datetime(2024, 6, 24, 18, 16, 35, 440012), params={'classifier': 'RandomForest', 'rf_max_depth': 13, 'rf_n_estimators': 66, 'rf_min_samples_leaf': 3, 'rf_criterion': 'entropy'}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest',)), 'rf_max_depth': IntDistribution(high=15, log=False, low=3, step=1), 'rf_n_estimators': IntDistribution(high=500, log=True, low=20, step=1), 'rf_min_samples_leaf': IntDistribution(high=100, log=True, low=3, step=1), 'rf_criterion': CategoricalDistribution(choices=('entropy', 'gini', 'log_loss'))}, trial_id=36, value=None)


In [8]:
# Mejores parámetros
study.best_trial.params

{'classifier': 'RandomForest',
 'rf_max_depth': 13,
 'rf_n_estimators': 66,
 'rf_min_samples_leaf': 3,
 'rf_criterion': 'entropy'}

In [9]:
X_1.shape

(1700, 58)

## Validación cruzada

In [10]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

N_FOLDS = 5

kf = KFold(n_splits=N_FOLDS, random_state=123, shuffle=True)
y_hat_rf = np.zeros(len(X_1))

params = {
    'max_depth': study.best_trial.params['rf_max_depth'],
    'n_estimators': study.best_trial.params['rf_n_estimators'],
    'min_samples_leaf': study.best_trial.params['rf_min_samples_leaf'],
    'criterion': study.best_trial.params['rf_criterion'],
    'n_jobs': -1,
    'random_state': 123,
}

model = RandomForestClassifier(**params)
# Se entrena para guardar el artefacto
model.fit(X_1, y_1)

auc_score = cross_val_score(model, X_1, y_1, n_jobs=-1, cv=10, scoring='roc_auc')
wF1_score = cross_val_score(model, X_1, y_1, n_jobs=-1, cv=10, scoring='f1_weighted')
precision_score = cross_val_score(model, X_1, y_1, n_jobs=-1, cv=10, scoring='precision_weighted')
recall_score = cross_val_score(model, X_1, y_1, n_jobs=-1, cv=10, scoring='recall_weighted')
accuracy_score = cross_val_score(model, X_1, y_1, n_jobs=-1, cv=10, scoring='accuracy')

print(f'F1 ponderado: {np.round(wF1_score.mean(), 3)}. Garces et. al. : 0.900')
print(f'Precisión: {np.round(precision_score.mean(), 3)}. Garces et. al. : 0.899')
print(f'Recall: {np.round(recall_score.mean(), 3)}. Garces et. al. : 0.903')
print(f'Accuracy: {np.round(accuracy_score.mean(), 3)}. Garces et. al. : 0.903')

F1 ponderado: 0.906. Garces et. al. : 0.900
Precisión: 0.921. Garces et. al. : 0.899
Recall: 0.919. Garces et. al. : 0.903
Accuracy: 0.919. Garces et. al. : 0.903


# Regresion logistica

In [12]:
# Adaptado de https://github.com/optuna/optuna-examples/blob/main/sklearn/sklearn_simple.py

import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

def objective(trial):

    classifier_name = trial.suggest_categorical("classifier", ["LogisticRegression"])

    if classifier_name == "LogisticRegression":
        r = trial.suggest_float('l1_ratio', 0, 1, log=False)
        c = trial.suggest_float('C', 1e-10, 1000, log=True)
        classifier_obj =  LogisticRegression(max_iter=5000, solver='saga', penalty='elasticnet', l1_ratio=r, C=c)

    score = cross_val_score(classifier_obj, X_1, y_1, n_jobs=-1, cv=5, scoring='f1_weighted')
    f1_weighted = score.mean()
    return f1_weighted


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40)
print(study.best_trial)

[I 2024-06-24 18:20:49,910] A new study created in memory with name: no-name-55e74e55-2417-4d11-b341-6b5d70408d5e
[I 2024-06-24 18:20:53,449] Trial 0 finished with value: 0.767786055252772 and parameters: {'classifier': 'LogisticRegression', 'l1_ratio': 0.09812523882327084, 'C': 3.5411827889726974e-06}. Best is trial 0 with value: 0.767786055252772.
[I 2024-06-24 18:21:35,480] Trial 1 finished with value: 0.870662355870542 and parameters: {'classifier': 'LogisticRegression', 'l1_ratio': 0.8659950064783097, 'C': 440.1684308109714}. Best is trial 1 with value: 0.870662355870542.
[I 2024-06-24 18:21:35,615] Trial 2 finished with value: 0.767786055252772 and parameters: {'classifier': 'LogisticRegression', 'l1_ratio': 0.05170865171025918, 'C': 2.7252933423096006e-09}. Best is trial 1 with value: 0.870662355870542.
[I 2024-06-24 18:22:15,374] Trial 3 finished with value: 0.870662355870542 and parameters: {'classifier': 'LogisticRegression', 'l1_ratio': 0.6523167435173035, 'C': 104.875617529

FrozenTrial(number=31, state=TrialState.COMPLETE, values=[0.8763748364635555], datetime_start=datetime.datetime(2024, 6, 24, 18, 33, 15, 389085), datetime_complete=datetime.datetime(2024, 6, 24, 18, 33, 46, 742519), params={'classifier': 'LogisticRegression', 'l1_ratio': 0.40941406560108734, 'C': 0.15043229093931496}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'classifier': CategoricalDistribution(choices=('LogisticRegression',)), 'l1_ratio': FloatDistribution(high=1.0, log=False, low=0.0, step=None), 'C': FloatDistribution(high=1000.0, log=True, low=1e-10, step=None)}, trial_id=31, value=None)


In [13]:
# metricas
from sklearn.linear_model import LogisticRegression

params = {
    'l1_ratio': study.best_trial.params['l1_ratio'],
    'C': study.best_trial.params['C'],
    'max_iter': 5000,
    'solver': 'saga',
    'penalty': 'elasticnet',
}

model = LogisticRegression(**params)
# Se entrena para guardar el artefacto
model.fit(X_1, y_1)

auc_score = cross_val_score(model, X_1, y_1, n_jobs=-1, cv=10, scoring='roc_auc')
wF1_score = cross_val_score(model, X_1, y_1, n_jobs=-1, cv=10, scoring='f1_weighted')
precision_score = cross_val_score(model, X_1, y_1, n_jobs=-1, cv=10, scoring='precision_weighted')
recall_score = cross_val_score(model, X_1, y_1, n_jobs=-1, cv=10, scoring='recall_weighted')
accuracy_score = cross_val_score(model, X_1, y_1, n_jobs=-1, cv=10, scoring='accuracy')

print(f'F1 ponderado: {np.round(wF1_score.mean(), 3)}. Garces et. al. : 0.900')
print(f'Precisión: {np.round(precision_score.mean(), 3)}. Garces et. al. : 0.899')
print(f'Recall: {np.round(recall_score.mean(), 3)}. Garces et. al. : 0.903')
print(f'Accuracy: {np.round(accuracy_score.mean(), 3)}. Garces et. al. : 0.903')

F1 ponderado: 0.88. Garces et. al. : 0.900
Precisión: 0.893. Garces et. al. : 0.899
Recall: 0.896. Garces et. al. : 0.903
Accuracy: 0.896. Garces et. al. : 0.903


### Opcional: Registro de Experimento y Métricas en MLFlow

In [None]:
# Ejecutar el siguiente comando en terminal: mlflow ui

In [None]:
# Registro en MLFlow
# Adaptado de https://mlflow.org/docs/latest/getting-started/intro-quickstart/index.html
import mlflow
from mlflow.models import infer_signature

mlflow.set_tracking_uri(uri="http://localhost:5000")

mlflow.set_experiment("Modelo de Prueba")

with mlflow.start_run():
    # Registro de hiperparámetros
    mlflow.log_params(params)

    # Registro de métricas
    mlflow.log_metric("ROC_AUC", auc_score.mean())
    mlflow.log_metric("Weighted_F1", wF1_score.mean())

    # Etiquetas
    mlflow.set_tag("Random Forest", "Modelo base")

    # Signature del modelo
    signature = infer_signature(X_1, model.predict_proba(X_1)[:,1])

    # Registro del modelo
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="modelos_experimento",
        signature=signature,
        input_example=X_1.head(),
        registered_model_name="base-randomforest-tuned",
    )

Registered model 'base-randomforest-tuned' already exists. Creating a new version of this model...
2024/06/17 17:58:24 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: base-randomforest-tuned, version 3
Created version '3' of model 'base-randomforest-tuned'.
