In [2]:
# Cargar librerías
import os

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [3]:
# Load the diabetes dataset from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer="data/diabetes.csv")
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Count, for every column, how many entries are exactly zero
df.eq(0).sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [4]:
# Columns in which a zero is treated as a missing measurement.
# Pregnancies is deliberately left out: zero pregnancies is a legitimate value.
# NOTE(review): the zero counts displayed above also report zeros in BMI, which is
# not listed here — confirm whether that omission is intended.
cols_to_clean = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin']

for col in cols_to_clean:
    # Blank out the zeros (they become NaN) so they count as missing data
    with_gaps = df[col].mask(df[col] == 0)
    # Impute the gaps with the column's mode, i.e. its most frequent remaining value
    moda = with_gaps.mode()[0]
    df[col] = with_gaps.fillna(moda)


In [5]:
# Separate the target column ("Outcome") from the feature columns
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# Configure the MLflow tracking URI explicitly so that a fresh kernel logs to the
# tracking server instead of depending on state left over from an earlier session.
# The MLFLOW_TRACKING_URI environment variable takes precedence when it is set;
# otherwise fall back to a server on the local machine, port 5000.
# The loopback address is used because 0.0.0.0 is a bind address, not a portable
# destination for client connections.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000"))

# Select the experiment that all following runs are recorded under.
# Kept as the last expression so the cell displays the returned Experiment.
mlflow.set_experiment(experiment_name="ClasificadorDemoDiabetes")


2025/05/16 21:33:36 INFO mlflow.tracking.fluent: Experiment with name 'ClasificadorDemoDiabetes' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/817168720839594911', creation_time=1747449216958, experiment_id='817168720839594911', last_update_time=1747449216958, lifecycle_stage='active', name='ClasificadorDemoDiabetes', tags={}>

In [8]:
# Training and MLflow tracking: one run per row of the hyper-parameter file.
# Each row of logreg_variaciones_educativas.csv supplies the C, max_iter, solver
# and penalty values passed to LogisticRegression for that run.
data = pd.read_csv("data/logreg_variaciones_educativas.csv", header=0)

# Walk the four parameter columns in lockstep, one tuple per CSV row
param_rows = zip(
    data['logreg_C'].tolist(),
    data['logreg_max_iter'].tolist(),
    data['solver'].tolist(),
    data['penalty'].tolist(),
)

for i, (c_value, max_iter_value, solver_name, penalty_name) in enumerate(param_rows):

    with mlflow.start_run(run_name=f"LR_Run_{i}"):
        # The pipeline standardises the features with StandardScaler before the
        # classifier; only the LogisticRegression parameters change between runs.
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(
                C=c_value,
                max_iter=max_iter_value,
                solver=solver_name,
                penalty=penalty_name,
            )),
        ])

        # Fit on the training split and evaluate on the held-out split
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)

        # Log the parameters this run was actually trained with (the values of the
        # current row), so the tracked run describes the model stored alongside it.
        mlflow.log_param("logreg_C", c_value)
        mlflow.log_param("logreg_max_iter", max_iter_value)
        mlflow.log_param("solver", solver_name)
        mlflow.log_param("penalty", penalty_name)

        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", prec)

        # Store the complete fitted pipeline (scaler + classifier) as a run artifact
        mlflow.sklearn.log_model(pipeline, "modelo_pipeline")

        print(" Modelo registrado en MLflow")
        print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f}")




 Modelo registrado en MLflow
Accuracy: 0.7208 | Precision: 0.6034
🏃 View run LR_Run_0 at: http://0.0.0.0:5000/#/experiments/817168720839594911/runs/1173518eb5ce46a39298ad3da4615d2d
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/817168720839594911




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run LR_Run_1 at: http://0.0.0.0:5000/#/experiments/817168720839594911/runs/096509976c8347cb98180acc5eba00be
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/817168720839594911




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run LR_Run_2 at: http://0.0.0.0:5000/#/experiments/817168720839594911/runs/c385cf72a05747d3bba19b743d083c8c
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/817168720839594911




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run LR_Run_3 at: http://0.0.0.0:5000/#/experiments/817168720839594911/runs/3ca6e450ce624ac28928d15292a445dd
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/817168720839594911




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run LR_Run_4 at: http://0.0.0.0:5000/#/experiments/817168720839594911/runs/ac3d5fa9ff2043ce957f1cce8d651a61
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/817168720839594911




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run LR_Run_5 at: http://0.0.0.0:5000/#/experiments/817168720839594911/runs/480c7b205398477fb46b1a71f96cf751
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/817168720839594911




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run LR_Run_6 at: http://0.0.0.0:5000/#/experiments/817168720839594911/runs/d9b583368d494368a0fb11b84a4a0f25
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/817168720839594911




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run LR_Run_7 at: http://0.0.0.0:5000/#/experiments/817168720839594911/runs/ea497213150d4a40b8da9f88d56f7cf4
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/817168720839594911




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run LR_Run_8 at: http://0.0.0.0:5000/#/experiments/817168720839594911/runs/dd75d3c2c1b84c3fba269444c3f1a725
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/817168720839594911




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run LR_Run_9 at: http://0.0.0.0:5000/#/experiments/817168720839594911/runs/ce694c5e46704d11bd59c1397d2d561b
🧪 View experiment at: http://0.0.0.0:5000/#/experiments/817168720839594911
