In [13]:
# === 1. Librerías base
import os
import sys
import ast
import json
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# === 2. ML y procesamiento
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, learning_curve
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
from scipy.stats import randint, uniform

# === 3. Modelo principal
from xgboost import XGBClassifier

# === 4. Utilidades adicionales
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.preprocessing import MultiLabelBinarizer
from joblib import dump
from new_or_used import build_dataset  # función provista para cargar datos

# === 5. MLflow
import mlflow
import mlflow.sklearn

# === 6. General configuration
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None



  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Load the preprocessed feature matrices and target files from the Gold folder.
X_train = pd.read_csv("../Data/Gold/X_train_gold.csv")
X_test = pd.read_csv("../Data/Gold/X_test_gold.csv")
y_train = pd.read_csv("../Data/Gold/y_train_gold.csv")
y_test = pd.read_csv("../Data/Gold/y_test_gold.csv")

# The target column may be stored under the header '0' instead of 'condition'.
# Normalise BOTH splits the same way; rename() ignores keys that are absent,
# so this is a no-op for a file that already uses 'condition'.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
y_train = y_train.rename(columns={'0': 'condition'})
y_test = y_test.rename(columns={'0': 'condition'})

# Binary encoding of the target used by the rest of the notebook.
CONDITION_TO_LABEL = {"used": 0, "new": 1}


def _encode_condition(target, split_name):
    """Map the 'condition' column of ``target`` to 0/1 labels.

    Parameters
    ----------
    target : pandas.DataFrame
        Frame holding a 'condition' column with the values "used" / "new".
    split_name : str
        Name of the split, used only in the error message.

    Returns
    -------
    pandas.Series
        The encoded labels ("used" -> 0, "new" -> 1).

    Raises
    ------
    ValueError
        If any value is not a key of ``CONDITION_TO_LABEL``. ``Series.map``
        would otherwise turn such values into NaN silently and the problem
        would only show up later, when metrics are computed.
    """
    encoded = target["condition"].map(CONDITION_TO_LABEL)
    unmapped = encoded.isna()
    if unmapped.any():
        unexpected = sorted(target.loc[unmapped, "condition"].astype(str).unique().tolist())
        raise ValueError(
            f"Valores inesperados en 'condition' ({split_name}): {unexpected}"
        )
    return encoded


y_train_final = _encode_condition(y_train, "y_train")
y_test_final = _encode_condition(y_test, "y_test")

print("Datasets cargados correctamente.")

Datasets cargados correctamente.


In [15]:
import mlflow
import pandas as pd
from mlflow.tracking import MlflowClient

# === Tracking configuration
# The tracking URI is relative to the working directory; adjust it if the
# notebook is run from Notebooks/.
mlflow.set_tracking_uri("file:../Experiments")
client = MlflowClient()

# Name of the experiment whose runs are ranked below.
EXPERIMENT_NAME = "XGBoost_Experiment"

# Resolve the experiment and stop early if it does not exist.
exp = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if exp is None:
    raise ValueError(f"❌ No se encontró el experimento '{EXPERIMENT_NAME}'")

# Fetch only the runs that finished.
runs = client.search_runs(
    experiment_ids=[exp.experiment_id],
    filter_string="attributes.status = 'FINISHED'",
)
if not runs:
    raise ValueError(f"⚠️ No hay runs finalizados en '{EXPERIMENT_NAME}'")


def _run_to_record(mlflow_run):
    """Flatten one MLflow run into a dict with its id and the two metrics of interest.

    A metric that was not logged for the run is reported as None.
    """
    logged_metrics = mlflow_run.data.metrics
    return {
        "run_id": mlflow_run.info.run_id,
        "experiment_name": EXPERIMENT_NAME,
        "accuracy": logged_metrics.get("accuracy"),
        "roc_auc": logged_metrics.get("roc_auc"),
    }


# One row per finished run.
runs_df = pd.DataFrame([_run_to_record(finished_run) for finished_run in runs])

# Best runs first: accuracy is the primary key, roc_auc breaks ties.
df_filtered = runs_df.sort_values(by=["accuracy", "roc_auc"], ascending=[False, False])

# Show the ten best runs.
print("🏆 Top modelos por accuracy + roc_auc:")
print(df_filtered.head(10))

🏆 Top modelos por accuracy + roc_auc:
                             run_id     experiment_name  accuracy   roc_auc
0  4d1ecec5df16441b92ab2c18df16afca  XGBoost_Experiment    0.8892  0.959242
1  ebb835b1a47b4e728bb595b66be22b90  XGBoost_Experiment    0.8892  0.959242


In [21]:
# The first row of the sorted table is the top-ranked run.
best_run_id = df_filtered["run_id"].iloc[0]

# The last URI segment is the artifact path; it must be the exact name that
# was used in log_model.
model_uri = f"runs:/{best_run_id}/xgboost_pipeline_v1"

# Load the model stored under that run.
loaded_model = mlflow.sklearn.load_model(model_uri=model_uri)

# Predict labels for the test features.
y_pred = loaded_model.predict(X_test)

In [31]:
# Encoded test labels as a NumPy array.
y_true = y_test_final.to_numpy()

# Show the first 10 true labels next to the first 10 predictions.
(y_true[:10], y_pred[:10])

(array([0, 1, 1, 0, 1, 1, 0, 1, 1, 0]), array([1, 1, 1, 0, 1, 1, 1, 1, 1, 0]))

In [24]:
# Display the predicted labels; long arrays are shown with a truncated repr.
y_pred

array([1, 1, 1, ..., 1, 0, 0])

In [25]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the test predictions (rows: true class, columns: predicted class).
print("📊 Confusion Matrix:")
conf_mat = confusion_matrix(y_true, y_pred)
print(conf_mat)

# Per-class precision / recall / f1 plus the overall averages.
print("\n📋 Classification Report:")
report_text = classification_report(y_true, y_pred)
print(report_text)

📊 Confusion Matrix:
[[4061  533]
 [ 575 4831]]

📋 Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.88      0.88      4594
           1       0.90      0.89      0.90      5406

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [27]:
from pathlib import Path

from joblib import dump
import mlflow

# Destination of the exported pipeline. Defined once so that the file that is
# written and the path that is reported can never disagree (the message used
# to say "./Models/..." while the file went to "../Models/...").
PRODUCTION_MODEL_PATH = Path("../Models/best_model_production.pkl")

# The first row of the sorted table is the top-ranked run.
best_run_id = df_filtered.iloc[0]["run_id"]
model_uri = f"runs:/{best_run_id}/xgboost_pipeline_v1"

# Load the full pipeline from MLflow.
loaded_model = mlflow.sklearn.load_model(model_uri)

# Make sure the target folder exists; dump() does not create missing directories.
PRODUCTION_MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# Persist the pipeline locally.
dump(loaded_model, PRODUCTION_MODEL_PATH)
print(f"✅ Modelo guardado en {PRODUCTION_MODEL_PATH}")

✅ Modelo guardado en ./Models/best_model_production.pkl
