In [21]:
# Cargar librerías
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score
import mlflow
import mlflow.sklearn


In [22]:
# Load the diabetes dataset from a CSV file (path is relative to the working directory)
df = pd.read_csv("data/diabetes.csv")



In [23]:
# Count, per column, how many entries are exactly 0
df.eq(0).sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [24]:
# Show the loaded DataFrame as plain text
print(df)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  

In [None]:
def moda_df_sin_nan(df, col):
    """Return the most frequent non-NaN value of one column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column.
    col : str
        Name of the column to inspect.

    Returns
    -------
    dict
        A one-entry mapping ``{col: mode}``. When the column has no valid
        mode (for example, every value is NaN) the value is ``0``.
    """
    # mode() can return several values on ties; dropna() guards against NaN entries.
    candidatas = df[col].mode().dropna()

    # Take the first candidate, falling back to 0 when there is no valid mode.
    valor = candidatas.iloc[0] if not candidatas.empty else 0

    # The return must live inside the function body; at column 0 it is a SyntaxError.
    return {col: valor}

In [35]:
# Columns in which zeros are replaced (treated as missing data).
# Some columns are left out on purpose, e.g. Pregnancies, where zero is a possible value.
cols_to_clean = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin']

for column in cols_to_clean:
    # Turn zeros into NaN so they are handled as missing data
    with_gaps = df[column].replace(0, np.nan)

    # Most frequent remaining value; fall back to 0 when no usable mode exists
    candidates = with_gaps.mode()
    if candidates.empty or pd.isna(candidates[0]):
        fill_value = 0
    else:
        fill_value = candidates[0]

    # Fill the gaps with the chosen value and write the column back
    df[column] = with_gaps.fillna(fill_value)


In [36]:
# Show the DataFrame as plain text to inspect the imputed columns
print (df)

     Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6    148.0           72.0           35.0    105.0  33.6   
1              1     85.0           66.0           29.0    105.0  26.6   
2              8    183.0           64.0           32.0    105.0  23.3   
3              1     89.0           66.0           23.0     94.0  28.1   
4              0    137.0           40.0           35.0    168.0  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10    101.0           76.0           48.0    180.0  32.9   
764            2    122.0           70.0           27.0    105.0  36.8   
765            5    121.0           72.0           23.0    112.0  26.2   
766            1    126.0           60.0           32.0    105.0  30.1   
767            1     93.0           70.0           31.0    105.0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                  

In [37]:
# Split the frame into the target ("Outcome") and the feature matrix (everything else)
y = df["Outcome"]
X = df.drop(columns=["Outcome"])

# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [38]:
# Set the MLflow tracking URI
mlflow.set_tracking_uri("http://127.0.0.1:9090")
# Select the experiment named ClasificadorDemoDiabetes
mlflow.set_experiment("ClasificadorDemoDiabetes")


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1747442867580, experiment_id='1', last_update_time=1747442867580, lifecycle_stage='active', name='ClasificadorDemoDiabetes', tags={}>

In [39]:
# Load the LogisticRegression parameter variations from a CSV file
df2 = pd.read_csv("data/logreg_variaciones_educativas.csv")


In [40]:
# Show the parameter variations loaded into df2 as plain text
print (df2)

        run_id  logreg_C  logreg_max_iter     solver penalty
0  practica_01      0.01              400  liblinear      l2
1  practica_02      1.12              300      lbfgs      l2
2  practica_03      2.23              500  liblinear      l2
3  practica_04      3.34              200  liblinear      l2
4  practica_05      4.45              300  liblinear      l2
5  practica_06      5.56              500      lbfgs      l2
6  practica_07      6.67              300      lbfgs      l2
7  practica_08      7.78              500      lbfgs      l2
8  practica_09      8.89              400      lbfgs      l2
9  practica_10     10.00              200      lbfgs      l2


In [41]:
# Training and logging with MLflow.
# One MLflow run is created per row of df2; each row supplies the
# C, max_iter, solver and penalty arguments for LogisticRegression.

# NOTE(review): these two values are not used by the loop below (every run takes
# its hyperparameters from df2). They are kept only in case another cell reads
# them -- confirm and remove if nothing does.
C = 1.0
max_iter = 1000

# itertuples() is used instead of iterrows() because iterrows() packs each row
# into a single Series and therefore does not preserve per-column dtypes.
for fila in df2.itertuples(index=False):
    # Cast the numeric hyperparameters explicitly so LogisticRegression always
    # receives a float C and an integer max_iter, whatever dtype the CSV produced.
    hiperparametros = {
        "logreg_C": float(fila.logreg_C),
        "logreg_max_iter": int(fila.logreg_max_iter),
        "solver": fila.solver,
        "penalty": fila.penalty,
    }

    # The run is named after the row's run_id column.
    with mlflow.start_run(run_name=fila.run_id):
        # StandardScaler puts all numeric features on the same scale;
        # only the LogisticRegression arguments vary between runs.
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(
                C=hiperparametros["logreg_C"],
                max_iter=hiperparametros["logreg_max_iter"],
                solver=hiperparametros["solver"],
                penalty=hiperparametros["penalty"],
            )),
        ])

        # Fit on the training split and evaluate on the held-out split
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)

        # Record this run's parameters (same keys as before) and its metrics
        mlflow.log_params(hiperparametros)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", prec)

        # Store the complete fitted pipeline (scaler + classifier)
        mlflow.sklearn.log_model(pipeline, "modelo_pipeline")

        print(" Modelo registrado en MLflow")
        print(f"Accuracy: {acc:.4f} | Precision: {prec:.4f}")




 Modelo registrado en MLflow
Accuracy: 0.7208 | Precision: 0.6034
🏃 View run practica_01 at: http://127.0.0.1:9090/#/experiments/1/runs/220c40e5191c4942ba2703924a509e05
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/1




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run practica_02 at: http://127.0.0.1:9090/#/experiments/1/runs/70aa9a36a8a4448db6ffc51b0872d2be
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/1




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run practica_03 at: http://127.0.0.1:9090/#/experiments/1/runs/793e6c689aa24c4f8d0c82ce02d5d43d
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/1




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run practica_04 at: http://127.0.0.1:9090/#/experiments/1/runs/cee41ba2b1d5455396fda72ce8c91532
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/1




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run practica_05 at: http://127.0.0.1:9090/#/experiments/1/runs/1320564c07cb4a69a03e9a1225b69f02
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/1




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run practica_06 at: http://127.0.0.1:9090/#/experiments/1/runs/83a7788af22f44fba4cf29a97e5a3d74
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/1




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run practica_07 at: http://127.0.0.1:9090/#/experiments/1/runs/8f737f95e14c4b79aa5e077c92b884ad
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/1




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run practica_08 at: http://127.0.0.1:9090/#/experiments/1/runs/e964be185fac417a97362edd1fe1434c
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/1




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run practica_09 at: http://127.0.0.1:9090/#/experiments/1/runs/3952f66fe52747478c4c71ce91d08fd7
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/1




 Modelo registrado en MLflow
Accuracy: 0.7532 | Precision: 0.6667
🏃 View run practica_10 at: http://127.0.0.1:9090/#/experiments/1/runs/057900b62dd24b89a04a8e8af9c89aca
🧪 View experiment at: http://127.0.0.1:9090/#/experiments/1
