In [1]:
# Cargar librerías
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
import mlflow
import mlflow.sklearn


In [3]:
# Leer datos
df = pd.read_csv("data/diabetes.csv")



In [4]:
# identifica las columnas que tiene valores 0
(df==0).sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [21]:
# Reemplazar ceros en columnas específicas por NaN
cols_to_clean = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin']
# No se consideran algunas, por ejemplo Pregnancies (por qué si es posible que se tengan cero embarazos)

# Reemplazar ceros por NaN para poder tratarlos como datos faltantes
for col in cols_to_clean:
    df[col] = df[col].replace(0, np.nan)

# Reemplazar NaN con la moda (valor más frecuente) de cada columna
for col in cols_to_clean:
    #moda = # buscar el método más común para determinar la moda de una columna
    moda = df[col].mode()
    print(moda)
    df[col] = df[col].fillna(moda)


0     99.0
1    100.0
Name: Glucose, dtype: float64
0    70.0
Name: BloodPressure, dtype: float64
0    32.0
Name: SkinThickness, dtype: float64
0    105.0
Name: Insulin, dtype: float64


In [22]:
# Separar datos
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [23]:
# agregar el traking uri de mlflow
mlflow.set_tracking_uri("http://localhost:9090") # cambiar en función de su servidor
mlflow.set_experiment("ClasificadorDemoDiabetes")
# agregar set_experiment con nombre: ClasificadorDemoDiabetes


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1747441592431, experiment_id='1', last_update_time=1747441592431, lifecycle_stage='active', name='ClasificadorDemoDiabetes', tags={}>

In [29]:
# Entrenamiento y registro con MLflow
C = 1.0
max_iter = 1000
# usar los parámetros del archivo logreg_variaciones_educativas.csv
# para los parámtros C, max_iter, solver y penalty de LogisticRegresion
# se debe genear un run por cada fila del archivo, usando sus parámetros
# Analizar que usar

datos_logreg = pd.read_csv('data/logreg_variaciones_educativas.csv')

datos_logreg

for index,row in datos_logreg.iterrows():
    #print(row['logreg_C'])
    with mlflow.start_run(run_name=f"LR_Run_practica2 {index}"):
        # El pipeline usar StandarScaler para que todos los valore numéricos estén en la misma escala
        # no cambiar dicha línea
        # LogisticRegresion deben variar sus parámetros en cada iteración.
        pipeline = Pipeline([
                ("scaler", StandardScaler()),
                ("clf", LogisticRegression(
                    C=row['logreg_C'],
                    max_iter=row['logreg_max_iter'],
                    solver=row['solver'],
                    penalty=row['penalty']
                ))
            ])
        
        # Entrenar y evaluar
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
    
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
    
        # recuerde que log_param y log_metric debe ir guardando los valores
        # dependiendo del run correspondiente de cada fila
        mlflow.log_param("logreg_C", C)
        mlflow.log_param("logreg_max_iter", max_iter)
        mlflow.log_param("solver", "liblinear")
        mlflow.log_param("penalty", "l2")
        
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", prec)
        
        # Guardar el pipeline completo
        mlflow.sklearn.log_model(pipeline, "modelo_pipeline")
        
        print(" Modelo registrado en MLflow")
        print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f}")

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values