In [1]:
# ═══════════════════════════════════════════════
# 1. Librerías estándar y configuración general
# ═══════════════════════════════════════════════


import numpy as np
import pandas as pd


# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

# ═══════════════════════════════════════════════
# 2. Preprocesamiento y ML clásico
# ═══════════════════════════════════════════════
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler
)

from sklearn.metrics import (
    accuracy_score,
    roc_auc_score
)


# ═══════════════════════════════════════════════
# 3. Modelos
# ═══════════════════════════════════════════════
from sklearn.neural_network import MLPClassifier

# ═══════════════════════════════════════════════
# 4. MLflow tracking
# ═══════════════════════════════════════════════
import mlflow
import mlflow.sklearn

# Name given to the MLflow run started by the training cell.
run_name = "NeuralNetwork"

# Record runs in the local file store next to the notebook folder and select
# the experiment to log into (MLflow creates the experiment if it is missing).
mlflow.set_tracking_uri(uri="file:../Experiments")
mlflow.set_experiment(experiment_name="Neural_Network_Experiment")


2025/05/22 15:14:53 INFO mlflow.tracking.fluent: Experiment with name 'Neural_Network_Experiment' does not exist. Creating a new experiment.


In [2]:
# Load the preprocessed ("gold") train/test splits.
X_train = pd.read_csv("../Data/Gold/X_train_gold.csv")
X_test = pd.read_csv("../Data/Gold/X_test_gold.csv")
y_train = pd.read_csv("../Data/Gold/y_train_gold.csv")
y_test = pd.read_csv("../Data/Gold/y_test_gold.csv")

# The test target is renamed from '0' to 'condition' so both target frames
# expose the same column name. Reassigning (instead of inplace=True) keeps the
# cell free of in-place mutation; the rename is a no-op if '0' is absent.
# NOTE(review): presumably y_test_gold.csv was written without a named header — confirm upstream.
y_test = y_test.rename(columns={'0': 'condition'})

# Single source of truth for the binary target encoding.
CONDITION_LABELS = {"used": 0, "new": 1}


def encode_condition(target_frame, split_name):
    """Encode the 'condition' column of `target_frame` as 0 (used) / 1 (new).

    Parameters
    ----------
    target_frame : pandas.DataFrame
        Frame holding a 'condition' column with the raw labels.
    split_name : str
        Name of the split, used only in the error message.

    Returns
    -------
    pandas.Series
        The mapped labels, aligned with `target_frame`'s index.

    Raises
    ------
    ValueError
        If any value (including a missing one) is not a key of
        CONDITION_LABELS. Without this check `.map` would silently emit NaN.
    """
    encoded = target_frame["condition"].map(CONDITION_LABELS)
    unmapped = target_frame.loc[encoded.isna(), "condition"]
    if not unmapped.empty:
        raise ValueError(
            f"{split_name}: unexpected 'condition' values "
            f"{unmapped.unique().tolist()}; expected one of {list(CONDITION_LABELS)}"
        )
    return encoded


y_train_final = encode_condition(y_train, "y_train")
y_test_final = encode_condition(y_test, "y_test")


print("Datasets cargados correctamente.")

Datasets cargados correctamente.


In [3]:

# === Group cities: keep the most frequent training cities, bucket the rest as 'other'.
# The list of frequent cities is computed from the training split only and then
# applied unchanged to the test split.
N_TOP_CITIES = 20
top_cities = X_train['seller_address.city.name'].value_counts().nlargest(N_TOP_CITIES).index


def group_rare_cities(city_names, frequent_cities):
    """Return `city_names` with every value outside `frequent_cities` replaced by 'other'.

    Vectorised equivalent of a per-row membership test. Missing values are not
    members of `frequent_cities` (value_counts drops them by default), so they
    are also mapped to 'other'.
    """
    return city_names.where(city_names.isin(frequent_cities), 'other')


X_train['city_grouped'] = group_rare_cities(X_train['seller_address.city.name'], top_cities)
X_test['city_grouped'] = group_rare_cities(X_test['seller_address.city.name'], top_cities)

# === Drop the high-cardinality columns; these frames are what the model is trained on.
X_train_lr = X_train.drop(columns=['seller_id', 'seller_address.city.name'])
X_test_lr = X_test.drop(columns=['seller_id', 'seller_address.city.name'])

# === Categorical and numeric columns
# NOTE(review): columns whose dtype is neither object nor int64/float64 (e.g. bool)
# land in neither list and are discarded by ColumnTransformer's default
# remainder='drop' — confirm that this is intended.
cat_cols = X_train_lr.select_dtypes(include='object').columns.tolist()
num_cols = X_train_lr.select_dtypes(include=['int64', 'float64']).columns.tolist()

# === Pipeline: one-hot encode categoricals, standardise numerics, then an MLP.
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
    ("num", StandardScaler(), num_cols)
])

# Hyperparameters are kept in one dict so the exact values used are also logged to MLflow.
mlp_params = {"hidden_layer_sizes": (64,), "max_iter": 200, "random_state": 42}

pipeline = Pipeline([
    ("preprocessing", preprocessor),
    ("classifier", MLPClassifier(**mlp_params))
])

with mlflow.start_run(run_name=run_name):
    mlflow.log_params({name: str(value) for name, value in mlp_params.items()})
    mlflow.log_param("n_top_cities", N_TOP_CITIES)

    # Fit and score on the prepared frames (high-cardinality columns removed)
    # rather than on the raw frames, so the model's input schema is explicit
    # instead of relying on the transformer silently dropping extra columns.
    pipeline.fit(X_train_lr, y_train_final)
    y_pred = pipeline.predict(X_test_lr)
    y_prob = pipeline.predict_proba(X_test_lr)[:, 1]  # probability of class 1 ("new")

    acc = accuracy_score(y_test_final, y_pred)
    auc = roc_auc_score(y_test_final, y_prob)

    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("roc_auc", auc)
    mlflow.sklearn.log_model(pipeline, "NeuralNet")

    print(f"NeuralNet => Accuracy: {acc:.4f} | ROC AUC: {auc:.4f}")



NeuralNet => Accuracy: 0.8677 | ROC AUC: 0.9380
