# 🧑‍💻 Introducción a MLflow (Parte I): Entrenamiento y Registro de Modelos de Regresión Logística.
Integrantes: Tobías Romero **(2021214011)** y Jenifer Roa **(2022214006)**
---

## 1. Importación de librerías.

In [None]:
import warnings
warnings.filterwarnings('ignore')

import mlflow
from mlflow.tracking import MlflowClient
import mlflow.sklearn
from mlflow.models import infer_signature

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
    roc_curve
)

## 2. Configuración inicial de MLflow / DagsHub.

In [None]:
# Load environment variables (for MLFLOW_TRACKING_URI, user/password, etc.).
import os

from dotenv import load_dotenv

load_dotenv()

# Connect to DagsHub and enable remote MLflow tracking.
import dagshub

dagshub.init(
    repo_owner='jenifer8092',
    repo_name='Laboratorio-MLFLOW',
    mlflow=True,
)

# (Optional) select the experiment; EXPERIMENT_NAME overrides the default name.
experiment_name = os.getenv("EXPERIMENT_NAME", "Logistic_Regression_Classification")
mlflow.set_experiment(experiment_name)

# Echo the effective tracking URI and, when the experiment can be looked up,
# where its artifacts are stored.
tracking_uri = mlflow.get_tracking_uri()
print(f"‚úì Tracking URI: {tracking_uri}")

exp = mlflow.get_experiment_by_name(experiment_name)
if exp:
    print(f"Artifact location: {exp.artifact_location}")

In [None]:
# Free-text note attached to the experiment (stored under "mlflow.note.content").
descripcion = """
Experimento base para clasificaci√≥n con LogisticRegression.
Incluye baseline, m√©tricas (accuracy, F1) y comparaci√≥n con normalizaci√≥n.
"""

# Experiment-level tags applied on creation or refreshed on an existing experiment.
tags_exp = {
    "owner": "Tob√≠as Romero",
    "dataset": "Breast Cancer Wisconsin",
    "curso": "Laboratorio MLflow",
    "model_family": "LogisticRegression",
}

client = MlflowClient()
exp = mlflow.get_experiment_by_name(experiment_name)

# If the experiment exists but is marked as deleted, restore it and re-read it.
if exp:
    if getattr(exp, "lifecycle_stage", None) == "deleted":
        client.restore_experiment(exp.experiment_id)
        exp = mlflow.get_experiment_by_name(experiment_name)

if exp is not None:
    # Existing experiment: (re)apply every tag individually.
    exp_id = exp.experiment_id
    for tag_key, tag_value in tags_exp.items():
        client.set_experiment_tag(exp_id, tag_key, tag_value)
else:
    # No experiment with this name: create it with the tags in one call.
    exp_id = client.create_experiment(experiment_name, tags=tags_exp)

# The description is set in both cases.
client.set_experiment_tag(exp_id, "mlflow.note.content", descripcion)

# Re-fetch the experiment so the printed values reflect what was stored.
exp_actualizado = mlflow.get_experiment(exp_id)
print("‚úì Experimento:", exp_actualizado.name, "| ID:", exp_actualizado.experiment_id)
print("‚úì Tags del experimento:", exp_actualizado.tags)
print("‚úì Descripci√≥n:", exp_actualizado.tags.get("mlflow.note.content", "(sin descripci√≥n)"))

## 3. Carga y exploración inicial de datos.

In [None]:
# Load the scikit-learn breast cancer dataset and wrap it in pandas objects:
# X holds the features (one named column each), y holds the target labels.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name='target')

In [None]:
# Print a short summary of the dataset: size, class names and class balance.
total_rows, total_cols = X.shape
class_counts = y.value_counts()

print("Dataset: Breast Cancer Wisconsin")
print(f"N√∫mero de muestras: {total_rows}")
print(f"N√∫mero de caracter√≠sticas: {total_cols}")
print(f"Clases: {data.target_names}")
print(f"Distribuci√≥n de clases:\n{class_counts}")
print()

In [None]:
# Show the first rows and the descriptive statistics, each followed by a blank line.
preview_sections = (
    ("Primeras filas del dataset:", X.head()),
    ("Estad√≠sticas descriptivas:", X.describe()),
)
for section_title, section_table in preview_sections:
    print(section_title)
    print(section_table)
    print()

## 3.1 División del dataset.

In [None]:
# Hold out 20% of the samples for testing; the split is stratified on y and
# seeded so it is reproducible.
split_options = {
    "test_size": 0.2,
    "random_state": 42,
    "stratify": y,
}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Conjunto de entrenamiento: {len(X_train)} muestras")
print(f"Conjunto de prueba: {len(X_test)} muestras")
print()

## 4. Función auxiliar para evaluar y visualizar.

In [None]:
def evaluate_and_visualize_model(model, X_test, y_test, run_name):
    """Evaluate a fitted classifier and build its diagnostic plots.

    Prints the computed metrics and a classification report, then draws a
    confusion-matrix heatmap next to the ROC curve on a single figure.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_test: Features to evaluate on.
        y_test: True labels matching ``X_test``.
        run_name: Label shown in the printed header and in both plot titles.

    Returns:
        A ``(metrics, fig)`` tuple: ``metrics`` is a dict with the keys
        ``accuracy``, ``precision``, ``recall``, ``f1_score`` and ``roc_auc``;
        ``fig`` is the matplotlib figure holding both plots (not closed or
        saved here).
    """
    predictions = model.predict(X_test)
    # Column 1 of predict_proba is used as the score for ROC AUC and the ROC curve.
    positive_scores = model.predict_proba(X_test)[:, 1]

    # Metrics computed from hard predictions, in the order they are reported.
    label_based_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    metrics = {
        name: scorer(y_test, predictions)
        for name, scorer in label_based_scorers
    }
    metrics['roc_auc'] = roc_auc_score(y_test, positive_scores)

    print(f"\nM√©tricas del modelo ({run_name}):")
    print("-" * 50)
    for name, score in metrics.items():
        print(f"{name}: {score:.4f}")

    # Display names for the two classes, in label order.
    # NOTE(review): assumes label 0 is malignant and 1 is benign — confirm
    # against the dataset's target_names.
    class_labels = ['Maligno', 'Benigno']

    print("\nReporte de Clasificaci√≥n:")
    print(classification_report(y_test, predictions, target_names=class_labels))

    # Left panel: confusion matrix. Right panel: ROC curve.
    figure, (ax_cm, ax_roc) = plt.subplots(1, 2, figsize=(14, 5))

    sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax_cm,
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    ax_cm.set_title(f'Matriz de Confusi√≥n - {run_name}')
    ax_cm.set_ylabel('Valor Real')
    ax_cm.set_xlabel('Predicci√≥n')

    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
    auc_value = metrics["roc_auc"]
    ax_roc.plot(false_pos_rate, true_pos_rate, lw=2, label=f'ROC (AUC = {auc_value:.2f})')
    # Diagonal reference line for a random classifier.
    ax_roc.plot([0, 1], [0, 1], lw=2, linestyle='--', label='Random')
    ax_roc.set_xlim([0.0, 1.0])
    ax_roc.set_ylim([0.0, 1.05])
    ax_roc.set_xlabel('Tasa de Falsos Positivos')
    ax_roc.set_ylabel('Tasa de Verdaderos Positivos')
    ax_roc.set_title(f'Curva ROC - {run_name}')
    ax_roc.legend(loc="lower right")
    ax_roc.grid(alpha=0.3)

    plt.tight_layout()
    return metrics, figure

## 5. Primer experimento.

In [None]:
# Baseline run: StandardScaler + LogisticRegression, with parameters, metrics,
# plots and the fitted pipeline logged to MLflow.
with mlflow.start_run(run_name="logistic_regression_default") as run:
    # Record the train/test splits as MLflow dataset inputs when the installed
    # MLflow version supports it; otherwise fall back to a plain tag.
    try:
        # Attach the target using the shared row index. X_train/y_train (and
        # X_test/y_test) keep the original, shuffled row labels, so resetting
        # the index of y before assigning it would misalign the labels and
        # leave NaNs in the logged dataset.
        train_df = X_train.assign(target=y_train)
        test_df = X_test.assign(target=y_test)
        ds_train = mlflow.data.from_pandas(train_df, source="sklearn.breast_cancer", name="breast_cancer_train_v1")
        ds_test = mlflow.data.from_pandas(test_df, source="sklearn.breast_cancer", name="breast_cancer_test_v1")
    except Exception as e:
        # Deliberately best-effort: dataset tracking is optional for this run.
        mlflow.set_tag("dataset", "Breast Cancer Wisconsin")
        print("Aviso: no se pudo usar mlflow.data; se dej√≥ tag 'dataset'. Error:", e)
    else:
        # Log only datasets built in this run. Checking for the variable names
        # instead would also accept leftovers from a previously executed cell.
        mlflow.log_input(ds_train, context="training")
        mlflow.log_input(ds_test, context="test")

    print(f"\nRun ID: {run.info.run_id}")
    print("Run Name: logistic_regression_default")
    print("\nCreando pipeline de preprocesamiento y modelo...")

    # Hyperparameters of the baseline classifier.
    hyperparameters = {
        'C': 1.0,
        'penalty': 'l2',
        'solver': 'lbfgs',
        'max_iter': 1000,
        'random_state': 42
    }

    # Scaling lives inside the pipeline so it is fitted on training data only
    # and is saved together with the classifier.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(**hyperparameters))
    ])

    print("Entrenando el modelo...")
    pipeline.fit(X_train, y_train)
    print("Modelo entrenado exitosamente")

    metrics, fig = evaluate_and_visualize_model(pipeline, X_test, y_test, "Modelo Default")

    # Log hyperparameters and evaluation metrics.
    print("\nRegistrando hiperpar√°metros y m√©tricas en MLflow...")
    mlflow.log_params(hyperparameters)
    mlflow.log_metrics(metrics)

    # Extra context about the data split.
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("dataset", "sklearn.breast_cancer")
    mlflow.log_param("n_features", X_train.shape[1])
    mlflow.log_param("n_samples_train", X_train.shape[0])
    mlflow.log_param("n_samples_test", X_test.shape[0])

    # Save the figure to the working directory, upload it as an artifact and
    # close it to release the matplotlib resources.
    print("Guardando visualizaciones...")
    fig.savefig("confusion_matrix_roc_default.png", dpi=100, bbox_inches='tight')
    mlflow.log_artifact("confusion_matrix_roc_default.png")
    plt.close(fig)

    # Log the fitted pipeline with an inferred signature and an input example,
    # and register it in the model registry.
    print("Guardando modelo en MLflow...")
    signature = infer_signature(X_train, pipeline.predict(X_train))
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        signature=signature,
        input_example=X_train.iloc[:5],
        registered_model_name="breast_cancer_classifier_v1"
    )

    print("Agregando tags y metadatos...")
    mlflow.set_tags({
        "model_type": "Logistic Regression",
        "framework": "scikit-learn",
        "dataset": "Breast Cancer Wisconsin",
        "preprocessing": "StandardScaler",
        "author": "Tob√≠as Romero",
        "version": "1.0",
        "purpose": "baseline_model"
    })

    # Run description stored under the "mlflow.note.content" tag.
    mlflow.set_tag(
        "mlflow.note.content",
        "Modelo baseline de regresi√≥n log√≠stica con par√°metros por defecto. "
        "Utiliza regularizaci√≥n L2 con C=1.0 y solver lbfgs. "
        "Este modelo sirve como punto de referencia para comparaciones futuras."
    )

    print("\nExperimento 1 completado y registrado en MLflow")
    print(f"Run ID: {run.info.run_id}")

print()

## 5.1 Segundo experimento.

In [None]:
# Second run: same pipeline with stronger regularization (C=0.1) and the SAGA
# solver, logged to MLflow for comparison with the baseline run.
with mlflow.start_run(run_name="logistic_regression_optimized") as run:
    # Record the train/test splits as MLflow dataset inputs when the installed
    # MLflow version supports it; otherwise fall back to a plain tag.
    try:
        # Attach the target using the shared row index. X_train/y_train (and
        # X_test/y_test) keep the original, shuffled row labels, so resetting
        # the index of y before assigning it would misalign the labels and
        # leave NaNs in the logged dataset.
        train_df = X_train.assign(target=y_train)
        test_df = X_test.assign(target=y_test)
        ds_train = mlflow.data.from_pandas(train_df, source="sklearn.breast_cancer", name="breast_cancer_train_v1")
        ds_test = mlflow.data.from_pandas(test_df, source="sklearn.breast_cancer", name="breast_cancer_test_v1")
    except Exception as e:
        # Deliberately best-effort: dataset tracking is optional for this run.
        mlflow.set_tag("dataset", "Breast Cancer Wisconsin")
        print("Aviso: no se pudo usar mlflow.data; se dej√≥ tag 'dataset'. Error:", e)
    else:
        # Log only datasets built in this run. Checking for the variable names
        # instead would also accept ds_train/ds_test left over from an earlier
        # cell, even when building them failed here.
        mlflow.log_input(ds_train, context="training")
        mlflow.log_input(ds_test, context="test")

    print(f"\nRun ID: {run.info.run_id}")
    print("Run Name: logistic_regression_optimized")
    print("\nCreando pipeline...")

    # Hyperparameters of the second model: smaller C (stronger L2 penalty),
    # SAGA solver and a higher iteration cap.
    hyperparameters_v2 = {
        'C': 0.1,
        'penalty': 'l2',
        'solver': 'saga',
        'max_iter': 2000,
        'random_state': 42
    }

    # Scaling lives inside the pipeline so it is fitted on training data only
    # and is saved together with the classifier.
    pipeline_v2 = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(**hyperparameters_v2))
    ])

    print("Entrenando el modelo optimizado...")
    pipeline_v2.fit(X_train, y_train)
    print("Modelo entrenado exitosamente")

    metrics_v2, fig_v2 = evaluate_and_visualize_model(pipeline_v2, X_test, y_test, "Modelo Optimizado")

    # Log hyperparameters and evaluation metrics.
    print("\nRegistrando hiperpar√°metros y m√©tricas en MLflow...")
    mlflow.log_params(hyperparameters_v2)
    mlflow.log_metrics(metrics_v2)

    # Extra context about the data split.
    mlflow.log_param("test_size", 0.2)
    mlflow.log_param("dataset", "sklearn.breast_cancer")
    mlflow.log_param("n_features", X_train.shape[1])
    mlflow.log_param("n_samples_train", X_train.shape[0])
    mlflow.log_param("n_samples_test", X_test.shape[0])

    # Save the figure to the working directory, upload it as an artifact and
    # close it to release the matplotlib resources.
    print("Guardando visualizaciones...")
    fig_v2.savefig("confusion_matrix_roc_optimized.png", dpi=100, bbox_inches='tight')
    mlflow.log_artifact("confusion_matrix_roc_optimized.png")
    plt.close(fig_v2)

    # Log the fitted pipeline with an inferred signature and an input example,
    # and register it in the model registry.
    print("Guardando modelo en MLflow...")
    signature = infer_signature(X_train, pipeline_v2.predict(X_train))
    mlflow.sklearn.log_model(
        sk_model=pipeline_v2,
        artifact_path="model",
        signature=signature,
        input_example=X_train.iloc[:5],
        registered_model_name="breast_cancer_classifier_v2"
    )

    print("Agregando tags y metadatos...")
    # NOTE(review): the "author" value here is shorter than the one used in
    # the "logistic_regression_default" run; unify them if runs are filtered
    # by this tag.
    mlflow.set_tags({
        "model_type": "Logistic Regression",
        "framework": "scikit-learn",
        "dataset": "Breast Cancer Wisconsin",
        "preprocessing": "StandardScaler",
        "author": "Tob√≠as",
        "version": "2.0",
        "purpose": "optimized_model",
        "optimization": "increased_regularization"
    })

    # Run description stored under the "mlflow.note.content" tag.
    mlflow.set_tag(
        "mlflow.note.content",
        "Modelo optimizado con mayor regularizaci√≥n (C=0.1) y solver SAGA. "
        "Se busca reducir el overfitting y mejorar la generalizaci√≥n del modelo. "
        "Los resultados se comparan con el modelo baseline."
    )

    print("\nExperimento 2 completado y registrado en MLflow")
    print(f"Run ID: {run.info.run_id}")

print()