In [None]:
# ═══════════════════════════════════════════════
# 1. Librerías estándar y configuración general
# ═══════════════════════════════════════════════
import os
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)

# ═══════════════════════════════════════════════
# 2. Preprocesamiento y ML clásico
# ═══════════════════════════════════════════════
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.utils.validation import check_is_fitted
from scipy.stats import loguniform

# ═══════════════════════════════════════════════
# 3. Modelos
# ═══════════════════════════════════════════════
from sklearn.linear_model import LogisticRegression

# ═══════════════════════════════════════════════
# 4. MLflow tracking
# ═══════════════════════════════════════════════
import mlflow
import mlflow.sklearn

# Local file-based tracking store and the experiment that groups every run
# created by this notebook.
TRACKING_URI = "file:../Experiments"
EXPERIMENT_NAME = "LogisticRegression_Experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

# Default run name; cells further down may rebind it before starting a run.
run_name = "LogReg_CV_Sample"

In [8]:
# Load the preprocessed ("gold") train/test splits
X_train = pd.read_csv("../Data/Gold/X_train_gold.csv")
X_test = pd.read_csv("../Data/Gold/X_test_gold.csv")
y_train = pd.read_csv("../Data/Gold/y_train_gold.csv")
y_test = pd.read_csv("../Data/Gold/y_test_gold.csv")

# Rename the test-label column from '0' to 'condition' when present
# (presumably the file was written without a column name — TODO confirm).
# Re-binding instead of inplace=True keeps the statement chainable and
# avoids mutating the freshly loaded frame behind the reader's back.
y_test = y_test.rename(columns={'0': 'condition'})

# Binary target encoding: used -> 0, new -> 1
condition_codes = {"used": 0, "new": 1}
y_train_final = y_train["condition"].map(condition_codes)
y_test_final = y_test["condition"].map(condition_codes)

# Series.map turns every label that is missing from the mapping into NaN.
# Fail here with a clear message instead of later inside fit / metric code.
for split_name, encoded in (("y_train", y_train_final), ("y_test", y_test_final)):
    if encoded.isna().any():
        raise ValueError(
            f"{split_name}: {int(encoded.isna().sum())} label(s) are not one of "
            f"{sorted(condition_codes)}"
        )

# Features and labels are aligned by index further down, so their row counts
# must agree.
if len(X_train) != len(y_train_final) or len(X_test) != len(y_test_final):
    raise ValueError("Feature and label files have different numbers of rows.")

print("Datasets cargados correctamente.")

Datasets cargados correctamente.


In [None]:
# === Custom transformer
class TopCityTransformer(BaseEstimator, TransformerMixin):
    """Collapse a high-cardinality city column into its most frequent values.

    ``fit`` memorises the ``top_n`` most frequent values of ``city_col``.
    ``transform`` adds a ``city_grouped`` column that keeps the original city
    when it is one of the memorised values and is ``'other'`` otherwise, then
    drops ``seller_id`` and the original city column.

    Parameters
    ----------
    city_col : str, default='seller_address.city.name'
        Name of the column holding the city.
    top_n : int, default=20
        Number of most frequent cities to keep as their own category.

    Attributes
    ----------
    top_cities_ : pandas.Index
        Cities kept as-is; set by ``fit``.
    """

    def __init__(self, city_col='seller_address.city.name', top_n=20):
        # Only hyper-parameters are stored here. Learned state (trailing
        # underscore) is created in fit(); defining it in __init__ would make
        # an unfitted instance look fitted to check_is_fitted.
        self.city_col = city_col
        self.top_n = top_n

    def fit(self, X, y=None):
        """Learn the ``top_n`` most frequent cities in ``X[city_col]``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        self.top_cities_ = X[self.city_col].value_counts().nlargest(self.top_n).index
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``city_grouped`` added and the
        ``seller_id`` / city columns removed.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        KeyError
            If ``seller_id`` or ``city_col`` is missing from ``X``.
        """
        check_is_fitted(self, "top_cities_")
        X = X.copy()
        cities = X[self.city_col]
        # Vectorised equivalent of a per-row membership test: values outside
        # the learned top list (missing values included) become 'other'.
        X['city_grouped'] = cities.where(cities.isin(self.top_cities_), 'other')
        return X.drop(columns=['seller_id', self.city_col])


# === Pipeline
# Step 1: group infrequent cities and drop the raw seller id / city columns.
city_transformer = TopCityTransformer()

# Step 2: columns are picked by dtype at fit time. Object columns are one-hot
# encoded, numeric columns are standardised.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            OneHotEncoder(handle_unknown='ignore', sparse_output=False),
            selector(dtype_include=object),
        ),
        ("num", StandardScaler(), selector(dtype_include=np.number)),
    ]
)

pipeline = Pipeline(
    steps=[
        ("city_transform", city_transformer),
        ("preprocessing", preprocessor),
        ("classifier", LogisticRegression(max_iter=1000, random_state=42)),
    ]
)

# === Hyper-parameter search space
param_dist = dict(
    classifier__C=loguniform(0.01, 10),
    classifier__solver=["lbfgs", "saga"],
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    pipeline,
    param_dist,
    n_iter=5,
    scoring="accuracy",
    cv=cv,
    random_state=42,
    verbose=2,
    n_jobs=-1,
)

# === Subsample: the search is fitted on a reproducible 30% of the training rows
X_sample = X_train.sample(frac=0.3, random_state=42)
y_sample = y_train_final.loc[X_sample.index]

# === MLflow run: search, then evaluate the refitted best pipeline on the test set
run_name = "LogisticRegression_CV"

with mlflow.start_run(run_name=run_name):
    search.fit(X_sample, y_sample)
    best_pipeline = search.best_estimator_
    mlflow.log_params(search.best_params_)

    # Hard predictions for accuracy, positive-class probabilities for ROC AUC
    y_pred = best_pipeline.predict(X_test)
    y_prob = best_pipeline.predict_proba(X_test)[:, 1]

    acc, auc = (
        accuracy_score(y_test_final, y_pred),
        roc_auc_score(y_test_final, y_prob),
    )

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", auc)

    mlflow.sklearn.log_model(best_pipeline, "LogisticRegression_CV")

    print(f"✅ Logistic Regression Accuracy: {acc:.4f} | ROC AUC: {auc:.4f}")

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END classifier__C=0.13292918943162169, classifier__solver=lbfgs; total time= 6.5min
[CV] END classifier__C=0.13292918943162169, classifier__solver=lbfgs; total time= 6.9min
[CV] END classifier__C=0.13292918943162169, classifier__solver=lbfgs; total time= 7.1min
[CV] END classifier__C=0.13292918943162169, classifier__solver=lbfgs; total time= 7.1min
[CV] END classifier__C=0.13292918943162169, classifier__solver=lbfgs; total time= 7.2min
[CV] END classifier__C=0.6251373574521749, classifier__solver=lbfgs; total time=14.4min
[CV] END classifier__C=0.21751953118777648, classifier__solver=lbfgs; total time= 9.6min
[CV] END classifier__C=0.6251373574521749, classifier__solver=lbfgs; total time=13.3min
[CV] END classifier__C=0.6251373574521749, classifier__solver=lbfgs; total time=14.0min
[CV] END classifier__C=0.6251373574521749, classifier__solver=lbfgs; total time=13.9min
[CV] END classifier__C=0.6251373574521749, classifier_



✅ Logistic Regression Accuracy: 0.8488 | ROC AUC: 0.9209


In [6]:

# === Group cities: keep the 20 most frequent, everything else becomes 'other'
def _group_top_cities(cities, keep):
    """Return ``cities`` with every value not in ``keep`` replaced by 'other'."""
    # Vectorised membership test instead of a per-row Python lambda
    return cities.where(cities.isin(keep), 'other')


# NOTE: the top-20 list is computed on the full X_train, i.e. before the
# subsampling below and outside the cross-validation folds, and the new column
# is written onto the shared X_train / X_test frames.
top_cities = X_train['seller_address.city.name'].value_counts().nlargest(20).index
X_train['city_grouped'] = _group_top_cities(X_train['seller_address.city.name'], top_cities)
X_test['city_grouped'] = _group_top_cities(X_test['seller_address.city.name'], top_cities)

# === Drop high-cardinality columns
X_train_lr = X_train.drop(columns=['seller_id', 'seller_address.city.name'])
X_test_lr = X_test.drop(columns=['seller_id', 'seller_address.city.name'])

# === Subsample to avoid exhausting RAM
X_sample = X_train_lr.sample(frac=0.3, random_state=42)
y_sample = y_train_final.loc[X_sample.index]

# === Column groups
cat_cols = X_sample.select_dtypes(include='object').columns.tolist()
# 'number' covers every numeric dtype; listing only int64/float64 would let
# the ColumnTransformer silently drop e.g. int32 or float32 columns.
num_cols = X_sample.select_dtypes(include='number').columns.tolist()

# === Preprocessing
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
    ("num", StandardScaler(), num_cols)
])

# === Base pipeline
pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000, random_state=42))
])

# === Hyper-parameter search
param_dist = {
    "classifier__C": loguniform(0.01, 10),
    "classifier__solver": ["lbfgs", "saga"]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=5,
    scoring="accuracy",
    cv=cv,
    random_state=42,
    verbose=2,
    n_jobs=-1
)

# NOTE: `run_name` is not assigned in this cell; it carries over from whichever
# earlier cell set it last.
with mlflow.start_run(run_name=run_name):
    search.fit(X_sample, y_sample)

    best_pipeline = search.best_estimator_
    mlflow.log_params(search.best_params_)

    # Evaluate on the full test set
    y_pred = best_pipeline.predict(X_test_lr)
    y_prob = best_pipeline.predict_proba(X_test_lr)[:, 1]

    acc = accuracy_score(y_test_final, y_pred)
    auc = roc_auc_score(y_test_final, y_prob)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", auc)
    mlflow.sklearn.log_model(best_pipeline, "LogisticRegression_CV")

    print(f"✅ Logistic Regression Accuracy: {acc:.4f} | ROC AUC: {auc:.4f}")

Fitting 5 folds for each of 5 candidates, totalling 25 fits


KeyboardInterrupt: 