In [3]:
# Cargar librerías
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
import mlflow
import mlflow.sklearn


In [4]:
# Leer datos
df = pd.read_csv("./data/diabetes.csv")



In [5]:
# identifica las columnas que tiene valores 0
(df==0).sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [7]:
# Reemplazar ceros en columnas específicas por NaN
cols_to_clean = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin']
# No se consideran algunas, por ejemplo Pregnancies (por qué si es posible que se tengan cero embarazos)

# Reemplazar ceros por NaN para poder tratarlos como datos faltantes
for col in cols_to_clean:
    df[col] = df[col].replace(0, np.nan)

# Reemplazar NaN con la moda (valor más frecuente) de cada columna
for col in cols_to_clean:
    moda = df[col].mode()[0] # determinar la moda de la columna
    df[col] = df[col].fillna(moda)


In [8]:
# Separar datos
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# agregar el traking uri de mlflow
mlflow.set_tracking_uri("http://127.0.0.1:9090")
# agregar set_experiment con nombre: ClasificadorDemoDiabetes
mlflow.set_experiment(experiment_name="ClasificadorDemoDiabetes")


2025/05/16 19:37:37 INFO mlflow.tracking.fluent: Experiment with name 'ClasificadorDemoDiabetes' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1747442257374, experiment_id='1', last_update_time=1747442257374, lifecycle_stage='active', name='ClasificadorDemoDiabetes', tags={}>

In [10]:
# Leer los parámetros del archivo CSV para las variaciones
param_df = pd.read_csv("./data/logreg_variaciones_educativas.csv")

for idx, row in param_df.iterrows():
    print(f"\nIniciando run {row['run_id']} con parámetros:")
    print(f"  C={row['logreg_C']}, max_iter={row['logreg_max_iter']}, solver={row['solver']}, penalty={row['penalty']}")
    with mlflow.start_run(run_name=row['run_id']):
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(
                C=row['logreg_C'],
                max_iter=int(row['logreg_max_iter']),
                solver=row['solver'],
                penalty=row['penalty']
            ))
        ])
        # Entrenar y evaluar
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        # Log de parámetros y métricas
        mlflow.log_param("logreg_C", row['logreg_C'])
        mlflow.log_param("logreg_max_iter", row['logreg_max_iter'])
        mlflow.log_param("solver", row['solver'])
        mlflow.log_param("penalty", row['penalty'])
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", prec)
        mlflow.sklearn.log_model(pipeline, "modelo_pipeline")
        print(f"✅ Run {row['run_id']} registrado en MLflow | Accuracy: {acc:.4f} | Precision: {prec:.4f}")



Iniciando run practica_01 con parámetros:
  C=0.01, max_iter=400, solver=liblinear, penalty=l2
✅ Run practica_01 registrado en MLflow | Accuracy: 0.7208 | Precision: 0.6034

Iniciando run practica_02 con parámetros:
  C=1.12, max_iter=300, solver=lbfgs, penalty=l2




✅ Run practica_02 registrado en MLflow | Accuracy: 0.7532 | Precision: 0.6667

Iniciando run practica_03 con parámetros:
  C=2.23, max_iter=500, solver=liblinear, penalty=l2




✅ Run practica_03 registrado en MLflow | Accuracy: 0.7532 | Precision: 0.6667

Iniciando run practica_04 con parámetros:
  C=3.34, max_iter=200, solver=liblinear, penalty=l2




✅ Run practica_04 registrado en MLflow | Accuracy: 0.7532 | Precision: 0.6667

Iniciando run practica_05 con parámetros:
  C=4.45, max_iter=300, solver=liblinear, penalty=l2




✅ Run practica_05 registrado en MLflow | Accuracy: 0.7532 | Precision: 0.6667

Iniciando run practica_06 con parámetros:
  C=5.5600000000000005, max_iter=500, solver=lbfgs, penalty=l2




✅ Run practica_06 registrado en MLflow | Accuracy: 0.7532 | Precision: 0.6667

Iniciando run practica_07 con parámetros:
  C=6.67, max_iter=300, solver=lbfgs, penalty=l2




✅ Run practica_07 registrado en MLflow | Accuracy: 0.7532 | Precision: 0.6667

Iniciando run practica_08 con parámetros:
  C=7.78, max_iter=500, solver=lbfgs, penalty=l2




✅ Run practica_08 registrado en MLflow | Accuracy: 0.7532 | Precision: 0.6667

Iniciando run practica_09 con parámetros:
  C=8.89, max_iter=400, solver=lbfgs, penalty=l2




✅ Run practica_09 registrado en MLflow | Accuracy: 0.7532 | Precision: 0.6667

Iniciando run practica_10 con parámetros:
  C=10.0, max_iter=200, solver=lbfgs, penalty=l2
✅ Run practica_10 registrado en MLflow | Accuracy: 0.7532 | Precision: 0.6667


