In [1]:
%load_ext autoreload
%autoreload 2


# Modelos lineales

In [None]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

def scale_features(X_train, X_test, numeric_columns, drop_columns=None):
    """
    Standardize the numeric columns of X_train / X_test and keep the
    remaining (categorical) columns untouched.

    The StandardScaler is fitted on X_train only and then applied to X_test.

    Parameters:
    - X_train: training DataFrame
    - X_test: test DataFrame
    - numeric_columns: labels of the numeric columns to scale (any iterable
      of labels: list, tuple, pandas Index, ...)
    - drop_columns: optional column label, or iterable of labels, to discard
      (for example IDs)

    Returns:
    - X_train_scaled: DataFrame with the scaled numeric columns first,
      followed by the untouched remaining columns
    - X_test_scaled: same layout for the test set
    - scaler: the fitted StandardScaler
    """
    # Normalize both arguments to plain lists so that `+` below is a list
    # concatenation (with an Index / ndarray it would be element-wise, and
    # with a tuple or str it would raise TypeError).
    numeric_columns = list(numeric_columns)
    if drop_columns is None:
        drop_columns = []
    elif isinstance(drop_columns, str):
        drop_columns = [drop_columns]
    else:
        drop_columns = list(drop_columns)
    excluded_columns = numeric_columns + drop_columns

    # Split into the numeric part (to scale) and everything else (passed through)
    X_train_num = X_train[numeric_columns]
    X_test_num = X_test[numeric_columns]
    X_train_cat = X_train.drop(columns=excluded_columns)
    X_test_cat = X_test.drop(columns=excluded_columns)

    # Scale: fit on train, reuse the same statistics for test
    scaler = StandardScaler()
    X_train_scaled_num = pd.DataFrame(scaler.fit_transform(X_train_num), columns=numeric_columns, index=X_train.index)
    X_test_scaled_num = pd.DataFrame(scaler.transform(X_test_num), columns=numeric_columns, index=X_test.index)

    # Rebuild the full frames (indexes were preserved above, so concat aligns row by row)
    X_train_scaled = pd.concat([X_train_scaled_num, X_train_cat], axis=1)
    X_test_scaled = pd.concat([X_test_scaled_num, X_test_cat], axis=1)

    return X_train_scaled, X_test_scaled, scaler

In [None]:
# Numeric feature columns to standardize before fitting the models below.
# NOTE(review): 'days_active' is not created by any feature-engineering code
# visible in this notebook — confirm it exists in X_train, otherwise the
# column selection inside scale_features will fail.
cols_numericas = [
    'base_price', 'price', 'initial_quantity', 'sold_quantity',
    'available_quantity', 'price_diff', 'sold_ratio',
    'availability_change', 'days_active'
]

# NOTE(review): X_train / X_test are not defined above this cell; they appear
# to come from build_dataset() further down — confirm the execution order.
X_train_scaled, X_test_scaled, scaler = scale_features(X_train, X_test, numeric_columns=cols_numericas)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Logistic regression is currently disabled:
# logreg = LogisticRegression(random_state=42)
# logreg.fit(X_train_scaled, y_train)

# RBF-kernel support vector classifier, fitted on the scaled training features
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train_scaled, y_train)

# 5-nearest-neighbours classifier, fitted on the same data
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

In [None]:
# Predict on the scaled test features with each fitted model.
# (Logistic regression is disabled: y_pred_logreg = logreg.predict(X_test_scaled))
y_pred_svm = svm.predict(X_test_scaled)
y_pred_knn = knn.predict(X_test_scaled)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

def evaluar(nombre, y_true, y_pred):
    """
    Print accuracy, F1-score and the full classification report for one model.

    Parameters:
    - nombre: display name of the model, used in the header line
    - y_true: ground-truth labels
    - y_pred: labels predicted by the model

    Returns nothing; the results are only printed.
    """
    # Both metrics are computed up front so nothing is printed if either fails
    accuracy_value = accuracy_score(y_true, y_pred)
    f1_value = f1_score(y_true, y_pred)

    print(f"\n🔎 {nombre}")
    print(f"🎯 Accuracy: {accuracy_value:.4f}")
    print(f"📊 F1-score: {f1_value:.4f}")
    print("📋 Classification Report:")
    print(classification_report(y_true, y_pred))

# 5. Report the metrics of every fitted model
# (logistic regression is disabled: evaluar("Logistic Regression", y_test, y_pred_logreg))
for model_name, model_predictions in (("SVM", y_pred_svm), ("KNN", y_pred_knn)):
    evaluar(model_name, y_test, model_predictions)

# Modelos basados en árboles

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def build_dataset():
    """
    Build the train/test feature matrices and the encoded target from the raw
    listings file.

    Reads the JSON Lines file, derives the engineered features, one-hot
    encodes the categorical columns and encodes `condition` as
    0 ('used') / 1 ('new'). The last 10,000 rows are held out as the test set.

    Returns:
    - X_train, y_train: features and encoded target for every row except the
      last 10,000
    - X_test, y_test: features and encoded target for the last 10,000 rows
    - label_encoder: LabelEncoder whose classes_ are ['used', 'new']
    """
    def _state_id(address):
        # Tolerate missing / partial addresses instead of raising KeyError
        state = address.get('state') if isinstance(address, dict) else None
        return state.get('id', 'unknown') if isinstance(state, dict) else 'unknown'

    # Read the JSON Lines file (one JSON object per line). The encoding is
    # explicit so accented titles do not depend on the platform default, and
    # blank lines are skipped instead of crashing json.loads.
    filepath="../data/raw/MLA_100k_checked_v3.jsonlines"
    with open(filepath, encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Convert to DataFrame
    df = pd.DataFrame(data)

    # Seller activity: relative frequency of each seller_id.
    # NOTE(review): computed over all rows, i.e. before the train/test split below.
    df['seller_freq'] = df['seller_id'].map(df['seller_id'].value_counts(normalize=True))

    # Shipping info
    df['is_free_shipping'] = df['shipping'].apply(lambda x: x.get('free_shipping', False) if isinstance(x, dict) else False).astype(int)
    df['shipping_mode'] = df['shipping'].apply(lambda x: x.get('mode', 'unknown') if isinstance(x, dict) else 'unknown')
    df['local_pick_up'] = df['shipping'].apply(lambda x: x.get('local_pick_up', False) if isinstance(x, dict) else False).astype(int)

    # Payment methods: number of non-MercadoPago methods, capped at '4+'.
    # Every level is a string so the column has a single type (mixing ints
    # with '4+' gives an object column with no natural sort order).
    df['payment_method_group'] = df['non_mercado_pago_payment_methods'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda n: str(n) if n <= 3 else '4+')

    # Tags
    df['has_dragged_visits'] = df['tags'].apply(lambda x: 'dragged_visits' in x or 'dragged_bids_and_visits' in x if isinstance(x, list) else False).astype(int)
    df['has_good_thumbnail'] = df['tags'].apply(lambda x: 'good_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['has_poor_thumbnail'] = df['tags'].apply(lambda x: 'poor_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['was_relisted'] = df['tags'].apply(lambda x: 'free_relist' in x if isinstance(x, list) else False).astype(int)

    # Images: picture count bucketed into '0', '1', '2-6', '7+'
    df['picture_group'] = df['pictures'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda x: '0' if x == 0 else ('1' if x == 1 else ('2-6' if x <= 6 else '7+')))

    # Title features
    df['title_length_group'] = df['title'].str.len().apply(lambda x: 'short' if x < 30 else ('medium' if x < 60 else 'long'))
    df['title_contains_new'] = df['title'].str.lower().str.contains("nuevo").fillna(False).astype(int)
    df['title_contains_used'] = df['title'].str.lower().str.contains("usado").fillna(False).astype(int)

    # Location
    df['state_id'] = df['seller_address'].apply(_state_id)

    # Price diff
    df['price_diff'] = df['price'] - df['base_price']

    # Quantities (1e-5 avoids division by zero)
    df['sold_ratio'] = df['sold_quantity'] / (df['sold_quantity'] + df['available_quantity'] + 1e-5)
    df["availability_change"] = df["initial_quantity"] - df["available_quantity"]
    df["stock_used_ratio"] = df["availability_change"] / (df["initial_quantity"] + 1e-5)

    # Times: 1 when last_updated differs from date_created
    df["was_updated"] = (pd.to_datetime(df["last_updated"], errors="coerce") != pd.to_datetime(df["date_created"], errors="coerce")).astype(int)

    # Status
    df["is_active"] = df["status"].apply(lambda x: 1 if x == "active" else 0)

    # Base columns kept for modelling (includes the target `condition`)
    cols_base = [
        "currency_id", "base_price", "price", "listing_type_id",
        "initial_quantity", "sold_quantity", "available_quantity", "condition",
        "seller_freq", "is_free_shipping", "shipping_mode", "local_pick_up",
        "payment_method_group", "has_dragged_visits", "has_good_thumbnail",
        "has_poor_thumbnail", "was_relisted", "picture_group", "title_length_group",
        "title_contains_new", "title_contains_used", "state_id", "price_diff",
        "sold_ratio", "availability_change", "stock_used_ratio", "was_updated",
        "is_active"
        ]

    df_model = df[cols_base].copy()

    # Target
    y = df_model["condition"]

    # Features: everything except the target
    X = df_model.drop(columns=['condition'])

    # One-hot encoding for the categorical columns
    X_encoded = pd.get_dummies(X, drop_first=True)

    # Encode the target. classes_ is assigned by hand (instead of calling fit)
    # to pin the mapping 'used' -> 0, 'new' -> 1.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.array(['used', 'new'])
    y_encoded = label_encoder.transform(y)

    # Manual split: the last 10,000 rows are the test set
    N = -10000
    X_train = X_encoded[:N]
    X_test = X_encoded[N:]
    y_train = y_encoded[:N]
    y_test = y_encoded[N:]

    return X_train, y_train, X_test, y_test, label_encoder


In [None]:
# Build the dataset and bind the splits as notebook-level variables
X_train, y_train, X_test, y_test, label_encoder = build_dataset()

# Report the shape of every split
print("Dimensiones de los dataset:")
for split_label, split_data in (("X_train:", X_train), ("y_train:", y_train), ("X_test:", X_test), ("y_test:", y_test)):
    print(split_label, split_data.shape)


In [None]:
# Bare expression: the notebook renders the training feature matrix as this cell's output.
X_train


In [None]:
# Bare expression: the notebook renders the encoded training target as this cell's output.
y_train


In [None]:
# Decode the integer-encoded training targets back to their string labels.
# (These are the true training labels, not model predictions, so the
# message below says so.)
y_train_labels = label_encoder.inverse_transform(y_train)

print("Etiquetas de entrenamiento (decodificadas):")
print(y_train_labels[:10])


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest (default hyperparameters, fixed seed)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Test-set predictions and metrics, stored in acc_rf / f1_rf
y_pred_rf = rf.predict(X_test)
acc_rf, f1_rf = accuracy_score(y_test, y_pred_rf), f1_score(y_test, y_pred_rf)


In [None]:
from xgboost import XGBClassifier

# Baseline XGBoost classifier (default hyperparameters, fixed seed)
xgb = XGBClassifier(random_state=42, eval_metric='logloss').fit(X_train, y_train)

# Test-set predictions and metrics, stored in acc_xgb / f1_xgb
y_pred_xgb = xgb.predict(X_test)
acc_xgb, f1_xgb = accuracy_score(y_test, y_pred_xgb), f1_score(y_test, y_pred_xgb)


In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# StratifiedKFold keeps the class proportions in every fold; this 5-fold,
# shuffled, fixed-seed splitter is the `cv` passed to the grid searches below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest search space
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 20, 40],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search scored by accuracy, using the shared `cv` splitter and all cores
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_rf.fit(X_train, y_train)

print("Best RF Params:", grid_rf.best_params_)
print("Best RF Accuracy Score:", grid_rf.best_score_)


In [None]:
from xgboost import XGBClassifier

# XGBoost search space
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 12],
    learning_rate=[0.1, 0.3, 0.6],
    subsample=[0.4, 0.8, 1.0],
)

# Exhaustive search scored by accuracy, using the shared `cv` splitter and all cores
grid_xgb = GridSearchCV(
    XGBClassifier(random_state=42, eval_metric='logloss'),
    param_grid_xgb,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_xgb.fit(X_train, y_train)

print("Best XGBoost Params:", grid_xgb.best_params_)


In [None]:
from catboost import CatBoostClassifier

# CatBoost search space
param_grid_cat = dict(
    iterations=[100, 200, 300],
    learning_rate=[0.03, 0.1, 0.5],
    depth=[4, 6, 8],
    l2_leaf_reg=[1, 3, 5],
)

# Exhaustive search scored by accuracy, using the shared `cv` splitter and all cores
grid_cat = GridSearchCV(
    CatBoostClassifier(random_state=42, verbose=0),
    param_grid_cat,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split
grid_cat.fit(X_train, y_train)

# Show the winning hyperparameters
print("Best CatBoost Params:", grid_cat.best_params_)


In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the three tuned models.
# Each entry must be its own (name, estimator) tuple: without the comma after
# the 'xgb' entry Python tries to call that tuple with the 'cat' tuple as
# arguments and raises TypeError.
voting_clf = VotingClassifier(estimators=[
    ('rf', grid_rf.best_estimator_),
    ('xgb', grid_xgb.best_estimator_),
    ('cat', grid_cat.best_estimator_),
], voting='soft')

voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

def evaluar_modelo(nombre, modelo, X_test, y_test, cumple_umbral=0.86):
    """
    Predict with a fitted model on the test set and print its metrics.

    Parameters:
    - nombre: display name of the model
    - modelo: fitted estimator exposing predict()
    - X_test, y_test: test features and true labels
    - cumple_umbral: accuracy threshold; the accuracy line is tagged
      '(Cumple)' when it is reached and '(No cumple)' otherwise

    Returns nothing; the results are only printed.
    """
    predictions = modelo.predict(X_test)
    accuracy_value = accuracy_score(y_test, predictions)
    f1_value = f1_score(y_test, predictions)
    verdict = '(Cumple)' if accuracy_value >= cumple_umbral else '(No cumple)'

    print(f"\n Resultados de {nombre}:")
    print(f"Accuracy: {accuracy_value:.4f} {verdict}")
    print(f"F1-score: {f1_value:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, predictions))


# Baseline models first (fitted on the full X_train), then the best
# estimators found by each grid search, and finally the voting ensemble.
models_to_evaluate = [
    ("Random Forest Simple", rf),
    ("XGBoost Simple", xgb),
    ("Random Forest", grid_rf.best_estimator_),
    ("XGBoost", grid_xgb.best_estimator_),
    ("CatBoost", grid_cat.best_estimator_),
    ("Voting Ensemble", voting_clf),
]
for model_name, fitted_model in models_to_evaluate:
    evaluar_modelo(model_name, fitted_model, X_test, y_test)


In [None]:
import os
import joblib

# Create the output folder if it does not exist.
# These are the tree-based models trained WITHOUT title embeddings, so they
# get their own folder; the embeddings section further down writes the same
# file names to "models/tree-based-embeddings" and would overwrite them.
output_dir = "models/tree-based"
os.makedirs(output_dir, exist_ok=True)

# Save the tuned models and the ensemble
joblib.dump(grid_rf.best_estimator_, os.path.join(output_dir, "random_forest.pkl"))
joblib.dump(grid_xgb.best_estimator_, os.path.join(output_dir, "xgboost.pkl"))
joblib.dump(grid_cat.best_estimator_, os.path.join(output_dir, "catboost.pkl"))
joblib.dump(voting_clf, os.path.join(output_dir, "voting_ensemble.pkl"))

# Save the label encoder
joblib.dump(label_encoder, os.path.join(output_dir, "label_encoder.pkl"))

# Report the real destination folder
print(f"Modelos y encoder guardados exitosamente en '{output_dir}/'")


# Modelos basados en árboles y embeddings

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" 

import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from umap import UMAP

def extract_sbert_features(df, model_name='distiluse-base-multilingual-cased-v2', n_components=50, random_state=42):
    """
    Append UMAP-reduced sentence embeddings of `title` to the DataFrame.

    Parameters:
    - df: DataFrame with a 'title' column
    - model_name: SentenceTransformer model to load
    - n_components: number of UMAP output dimensions
    - random_state: seed passed to UMAP

    Returns a new DataFrame: `df` plus one 'sbert_<i>' column per reduced
    dimension, aligned on df's index.
    """
    model = SentenceTransformer(model_name)

    # Hand the encoder a plain list of strings rather than a pandas Series,
    # so it does not depend on df having a default 0..n-1 index or on every
    # title being a str; missing titles become "".
    titles = df['title'].fillna("").astype(str).tolist()
    embeddings = model.encode(titles, show_progress_bar=True)

    # Reduce dimensionality with UMAP
    reducer = UMAP(n_components=n_components, random_state=random_state)
    reduced_embeddings = reducer.fit_transform(embeddings)

    embeddings_df = pd.DataFrame(
        reduced_embeddings,
        columns=[f"sbert_{i}" for i in range(reduced_embeddings.shape[1])],
        index=df.index,
    )
    df = pd.concat([df, embeddings_df], axis=1)

    print(f"[SBERT] UMAP redujo las dimensiones a {n_components} componentes.")
    return df

def build_dataset():
    """
    Build the train/test feature matrices (engineered features plus reduced
    title embeddings) and the encoded target from the raw listings file.

    Reads the JSON Lines file, derives the engineered features, appends the
    'sbert_*' columns produced by extract_sbert_features, one-hot encodes the
    categorical columns and encodes `condition` as 0 ('used') / 1 ('new').
    The last 10,000 rows are held out as the test set.

    Returns:
    - X_train, y_train: features and encoded target for every row except the
      last 10,000
    - X_test, y_test: features and encoded target for the last 10,000 rows
    - label_encoder: LabelEncoder whose classes_ are ['used', 'new']
    """
    def _state_id(address):
        # Tolerate missing / partial addresses instead of raising KeyError
        state = address.get('state') if isinstance(address, dict) else None
        return state.get('id', 'unknown') if isinstance(state, dict) else 'unknown'

    # Explicit encoding so accented titles do not depend on the platform
    # default; blank lines are skipped instead of crashing json.loads.
    filepath = "../data/raw/MLA_100k_checked_v3.jsonlines"
    with open(filepath, encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)

    # Seller activity: relative frequency of each seller_id.
    # NOTE(review): computed over all rows, i.e. before the train/test split below.
    df['seller_freq'] = df['seller_id'].map(df['seller_id'].value_counts(normalize=True))

    # Shipping info
    df['is_free_shipping'] = df['shipping'].apply(lambda x: x.get('free_shipping', False) if isinstance(x, dict) else False).astype(int)
    df['shipping_mode'] = df['shipping'].apply(lambda x: x.get('mode', 'unknown') if isinstance(x, dict) else 'unknown')
    df['local_pick_up'] = df['shipping'].apply(lambda x: x.get('local_pick_up', False) if isinstance(x, dict) else False).astype(int)

    # Payment methods: number of non-MercadoPago methods, capped at '4+'.
    # Every level is a string so the column has a single type (mixing ints
    # with '4+' gives an object column with no natural sort order).
    df['payment_method_group'] = df['non_mercado_pago_payment_methods'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda n: str(n) if n <= 3 else '4+')

    # Tags
    df['has_dragged_visits'] = df['tags'].apply(lambda x: 'dragged_visits' in x or 'dragged_bids_and_visits' in x if isinstance(x, list) else False).astype(int)
    df['has_good_thumbnail'] = df['tags'].apply(lambda x: 'good_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['has_poor_thumbnail'] = df['tags'].apply(lambda x: 'poor_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['was_relisted'] = df['tags'].apply(lambda x: 'free_relist' in x if isinstance(x, list) else False).astype(int)

    # Images: picture count bucketed into '0', '1', '2-6', '7+'
    df['picture_group'] = df['pictures'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda x: '0' if x == 0 else ('1' if x == 1 else ('2-6' if x <= 6 else '7+')))

    # Title features
    df['title_length_group'] = df['title'].str.len().apply(lambda x: 'short' if x < 30 else ('medium' if x < 60 else 'long'))
    df['title_contains_new'] = df['title'].str.lower().str.contains("nuevo").fillna(False).astype(int)
    df['title_contains_used'] = df['title'].str.lower().str.contains("usado").fillna(False).astype(int)

    # Location
    df['state_id'] = df['seller_address'].apply(_state_id)

    # Price diff
    df['price_diff'] = df['price'] - df['base_price']

    # Quantities (1e-5 avoids division by zero)
    df['sold_ratio'] = df['sold_quantity'] / (df['sold_quantity'] + df['available_quantity'] + 1e-5)
    df['availability_change'] = df['initial_quantity'] - df['available_quantity']
    df['stock_used_ratio'] = df['availability_change'] / (df['initial_quantity'] + 1e-5)

    # Times: 1 when last_updated differs from date_created
    df['was_updated'] = (pd.to_datetime(df['last_updated'], errors='coerce') != pd.to_datetime(df['date_created'], errors='coerce')).astype(int)

    # Status
    df['is_active'] = df['status'].apply(lambda x: 1 if x == 'active' else 0)

    # Vectorize the titles with SBERT, reduced with UMAP.
    # NOTE(review): the reduction is fitted on all rows, before the split below.
    df = extract_sbert_features(df)

    # Base columns kept for modelling (includes the target `condition`)
    cols_base = [
        "currency_id", "base_price", "price", "listing_type_id",
        "initial_quantity", "sold_quantity", "available_quantity", "condition",
        "seller_freq", "is_free_shipping", "shipping_mode", "local_pick_up",
        "payment_method_group", "has_dragged_visits", "has_good_thumbnail",
        "has_poor_thumbnail", "was_relisted", "picture_group", "title_length_group",
        "title_contains_new", "title_contains_used", "state_id", "price_diff",
        "sold_ratio", "availability_change", "stock_used_ratio", "was_updated",
        "is_active"
    ]

    # Add the SBERT columns
    sbert_cols = [col for col in df.columns if col.startswith('sbert_')]
    cols_base += sbert_cols

    df_model = df[cols_base].copy()

    # Target and features
    y = df_model["condition"]
    X = df_model.drop(columns=["condition"])

    # One-hot encoding
    X_encoded = pd.get_dummies(X, drop_first=True)

    # Encode the target. classes_ is assigned by hand (instead of calling fit)
    # to pin the mapping 'used' -> 0, 'new' -> 1.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.array(['used', 'new'])
    y_encoded = label_encoder.transform(y)

    # Manual split: the last 10,000 rows are the test set
    N = -10000
    X_train = X_encoded[:N]
    X_test = X_encoded[N:]
    y_train = y_encoded[:N]
    y_test = y_encoded[N:]

    return X_train, y_train, X_test, y_test, label_encoder


In [None]:
# Rebuild the dataset (now with title embeddings) and rebind the splits
X_train, y_train, X_test, y_test, label_encoder = build_dataset()

# Report the shape of every split
print("Dimensiones de los dataset:")
for split_label, split_data in (("X_train:", X_train), ("y_train:", y_train), ("X_test:", X_test), ("y_test:", y_test)):
    print(split_label, split_data.shape)


In [None]:
# Bare expression: the notebook renders the training feature matrix as this cell's output.
X_train


In [None]:
# Bare expression: the notebook renders the encoded training target as this cell's output.
y_train


In [None]:
# Decode the integer-encoded training targets back to their string labels.
# (These are the true training labels, not model predictions, so the
# message below says so.)
y_train_labels = label_encoder.inverse_transform(y_train)

print("Etiquetas de entrenamiento (decodificadas):")
print(y_train_labels[:10])


In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, f1_score, classification_report

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest (default hyperparameters, fixed seed)
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Test-set predictions and metrics, stored in acc_rf / f1_rf
y_pred_rf = rf.predict(X_test)
acc_rf, f1_rf = accuracy_score(y_test, y_pred_rf), f1_score(y_test, y_pred_rf)


In [None]:
from xgboost import XGBClassifier

# Baseline XGBoost classifier (default hyperparameters, fixed seed)
xgb = XGBClassifier(random_state=42, eval_metric='logloss').fit(X_train, y_train)

# Test-set predictions and metrics, stored in acc_xgb / f1_xgb
y_pred_xgb = xgb.predict(X_test)
acc_xgb, f1_xgb = accuracy_score(y_test, y_pred_xgb), f1_score(y_test, y_pred_xgb)


In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# StratifiedKFold keeps the class proportions in every fold; this 5-fold,
# shuffled, fixed-seed splitter is the `cv` passed to the grid searches below.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest search space
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, 20, 40],
    min_samples_split=[2, 5, 10],
)

# Exhaustive search scored by accuracy, using the shared `cv` splitter and all cores
grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_rf.fit(X_train, y_train)

print("Best RF Params:", grid_rf.best_params_)
print("Best RF Accuracy Score:", grid_rf.best_score_)


In [None]:
from xgboost import XGBClassifier

# XGBoost search space
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 12],
    learning_rate=[0.1, 0.3, 0.6],
    subsample=[0.4, 0.8, 1.0],
)

# Exhaustive search scored by accuracy, using the shared `cv` splitter and all cores
grid_xgb = GridSearchCV(
    XGBClassifier(random_state=42, eval_metric='logloss'),
    param_grid_xgb,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
grid_xgb.fit(X_train, y_train)

print("Best XGBoost Params:", grid_xgb.best_params_)


In [None]:
from catboost import CatBoostClassifier

# CatBoost search space
param_grid_cat = dict(
    iterations=[100, 200, 300],
    learning_rate=[0.03, 0.1, 0.5],
    depth=[4, 6, 8],
    l2_leaf_reg=[1, 3, 5],
)

# Exhaustive search scored by accuracy, using the shared `cv` splitter and all cores
grid_cat = GridSearchCV(
    CatBoostClassifier(random_state=42, verbose=0),
    param_grid_cat,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split
grid_cat.fit(X_train, y_train)

# Show the winning hyperparameters
print("Best CatBoost Params:", grid_cat.best_params_)


In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the three tuned models.
# Each entry must be its own (name, estimator) tuple: without the comma after
# the 'xgb' entry Python tries to call that tuple with the 'cat' tuple as
# arguments and raises TypeError.
voting_clf = VotingClassifier(estimators=[
    ('rf', grid_rf.best_estimator_),
    ('xgb', grid_xgb.best_estimator_),
    ('cat', grid_cat.best_estimator_),
], voting='soft')

voting_clf.fit(X_train, y_train)
y_pred_voting = voting_clf.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

def evaluar_modelo(nombre, modelo, X_test, y_test, cumple_umbral=0.86):
    """
    Predict with a fitted model on the test set and print its metrics.

    Parameters:
    - nombre: display name of the model
    - modelo: fitted estimator exposing predict()
    - X_test, y_test: test features and true labels
    - cumple_umbral: accuracy threshold; the accuracy line is tagged
      '(Cumple)' when it is reached and '(No cumple)' otherwise

    Returns nothing; the results are only printed.
    """
    predictions = modelo.predict(X_test)
    accuracy_value = accuracy_score(y_test, predictions)
    f1_value = f1_score(y_test, predictions)
    verdict = '(Cumple)' if accuracy_value >= cumple_umbral else '(No cumple)'

    print(f"\n Resultados de {nombre}:")
    print(f"Accuracy: {accuracy_value:.4f} {verdict}")
    print(f"F1-score: {f1_value:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, predictions))


# Baseline models first (fitted on the full X_train), then the best
# estimators found by each grid search, and finally the voting ensemble.
models_to_evaluate = [
    ("Random Forest Simple", rf),
    ("XGBoost Simple", xgb),
    ("Random Forest", grid_rf.best_estimator_),
    ("XGBoost", grid_xgb.best_estimator_),
    ("CatBoost", grid_cat.best_estimator_),
    ("Voting Ensemble", voting_clf),
]
for model_name, fitted_model in models_to_evaluate:
    evaluar_modelo(model_name, fitted_model, X_test, y_test)


In [None]:
import os
import joblib

# Create the output folder if it does not exist
output_dir = "models/tree-based-embeddings"
os.makedirs(output_dir, exist_ok=True)

# File name -> object to persist (tuned models, the ensemble, then the encoder)
artifacts = {
    "random_forest.pkl": grid_rf.best_estimator_,
    "xgboost.pkl": grid_xgb.best_estimator_,
    "catboost.pkl": grid_cat.best_estimator_,
    "voting_ensemble.pkl": voting_clf,
    "label_encoder.pkl": label_encoder,
}
for file_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(output_dir, file_name))

print("Modelos y encoder guardados exitosamente en 'tree-based-embeddings/'")


# Redes Neuronales

In [None]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
        
# One-hot encode the integer class labels.
# NOTE(review): this reads the X_train / y_train left in the kernel by the
# previous section — confirm that is the intended input.
y_categorical = to_categorical(y_train)

# Step 3: normalization — the scaler is fitted on the training features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)

# Step 4: split
# NOTE(review): this rebinds X_train, X_test, y_train and y_test, so the cell
# is not idempotent and the previous hold-out X_test / y_test are replaced by
# a 20% split of the training data — confirm this is intended (a later cell
# rebinds all four again from build_dataset()).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_categorical, test_size=0.2, random_state=42, stratify=y_train)


In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

def extract_sbert_features(df, model_name='distiluse-base-multilingual-cased-v2', explained_variance=0.95):
    """
    Append PCA-reduced sentence embeddings of `title` to the DataFrame.

    Parameters:
    - df: DataFrame with a 'title' column
    - model_name: SentenceTransformer model to load
    - explained_variance: value passed to PCA as n_components (the default
      0.95 keeps enough components to explain that fraction of the variance)

    Returns a new DataFrame: `df` plus one 'sbert_<i>' column per retained
    component, aligned on df's index.
    """
    model = SentenceTransformer(model_name)

    # Hand the encoder a plain list of strings rather than a pandas Series,
    # so it does not depend on df having a default 0..n-1 index or on every
    # title being a str; missing titles become "".
    titles = df['title'].fillna("").astype(str).tolist()
    embeddings = model.encode(titles, show_progress_bar=True)

    # Reduce dimensionality with PCA, keeping the requested variance
    pca = PCA(n_components=explained_variance)
    reduced_embeddings = pca.fit_transform(embeddings)

    embeddings_df = pd.DataFrame(
        reduced_embeddings,
        columns=[f"sbert_{i}" for i in range(reduced_embeddings.shape[1])],
        index=df.index,
    )
    df = pd.concat([df, embeddings_df], axis=1)

    print(f"[SBERT] {pca.n_components_} componentes retenidos para explicar {explained_variance*100:.0f}% de la varianza.")
    return df

def build_dataset():
    """
    Build standardized train/test arrays (engineered features plus reduced
    title embeddings) and one-hot targets for the neural network.

    Reads the JSON Lines file, derives the engineered features, appends the
    'sbert_*' columns produced by extract_sbert_features, one-hot encodes the
    categorical columns, standardizes all features and one-hot encodes
    `condition` (index 0 = 'used', index 1 = 'new'). The last 10,000 rows are
    held out as the test set.

    Returns:
    - X_train, y_train: scaled features and one-hot targets for every row
      except the last 10,000
    - X_test, y_test: scaled features and one-hot targets for the last
      10,000 rows
    - label_encoder: LabelEncoder whose classes_ are ['used', 'new']
    """
    def _state_id(address):
        # Tolerate missing / partial addresses instead of raising KeyError
        state = address.get('state') if isinstance(address, dict) else None
        return state.get('id', 'unknown') if isinstance(state, dict) else 'unknown'

    # Explicit encoding so accented titles do not depend on the platform
    # default; blank lines are skipped instead of crashing json.loads.
    filepath = "../data/raw/MLA_100k_checked_v3.jsonlines"
    with open(filepath, encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)

    # Seller activity: relative frequency of each seller_id.
    # NOTE(review): computed over all rows, i.e. before the train/test split below.
    df['seller_freq'] = df['seller_id'].map(df['seller_id'].value_counts(normalize=True))

    # Shipping info
    df['is_free_shipping'] = df['shipping'].apply(lambda x: x.get('free_shipping', False) if isinstance(x, dict) else False).astype(int)
    df['shipping_mode'] = df['shipping'].apply(lambda x: x.get('mode', 'unknown') if isinstance(x, dict) else 'unknown')
    df['local_pick_up'] = df['shipping'].apply(lambda x: x.get('local_pick_up', False) if isinstance(x, dict) else False).astype(int)

    # Payment methods: number of non-MercadoPago methods, capped at '4+'.
    # Every level is a string so the column has a single type (mixing ints
    # with '4+' gives an object column with no natural sort order).
    df['payment_method_group'] = df['non_mercado_pago_payment_methods'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda n: str(n) if n <= 3 else '4+')

    # Tags
    df['has_dragged_visits'] = df['tags'].apply(lambda x: 'dragged_visits' in x or 'dragged_bids_and_visits' in x if isinstance(x, list) else False).astype(int)
    df['has_good_thumbnail'] = df['tags'].apply(lambda x: 'good_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['has_poor_thumbnail'] = df['tags'].apply(lambda x: 'poor_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['was_relisted'] = df['tags'].apply(lambda x: 'free_relist' in x if isinstance(x, list) else False).astype(int)

    # Images: picture count bucketed into '0', '1', '2-6', '7+'
    df['picture_group'] = df['pictures'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda x: '0' if x == 0 else ('1' if x == 1 else ('2-6' if x <= 6 else '7+')))

    # Title features
    df['title_length_group'] = df['title'].str.len().apply(lambda x: 'short' if x < 30 else ('medium' if x < 60 else 'long'))
    df['title_contains_new'] = df['title'].str.lower().str.contains("nuevo").fillna(False).astype(int)
    df['title_contains_used'] = df['title'].str.lower().str.contains("usado").fillna(False).astype(int)

    # Location
    df['state_id'] = df['seller_address'].apply(_state_id)

    # Price diff
    df['price_diff'] = df['price'] - df['base_price']

    # Quantities (1e-5 avoids division by zero)
    df['sold_ratio'] = df['sold_quantity'] / (df['sold_quantity'] + df['available_quantity'] + 1e-5)
    df['availability_change'] = df['initial_quantity'] - df['available_quantity']
    df['stock_used_ratio'] = df['availability_change'] / (df['initial_quantity'] + 1e-5)

    # Times: 1 when last_updated differs from date_created
    df['was_updated'] = (pd.to_datetime(df['last_updated'], errors='coerce') != pd.to_datetime(df['date_created'], errors='coerce')).astype(int)

    # Status
    df['is_active'] = df['status'].apply(lambda x: 1 if x == 'active' else 0)

    # Vectorize the titles with reduced SBERT embeddings.
    # NOTE(review): the reduction is fitted on all rows, before the split below.
    df = extract_sbert_features(df)

    # Base columns kept for modelling (includes the target `condition`)
    cols_base = [
        "currency_id", "base_price", "price", "listing_type_id",
        "initial_quantity", "sold_quantity", "available_quantity", "condition",
        "seller_freq", "is_free_shipping", "shipping_mode", "local_pick_up",
        "payment_method_group", "has_dragged_visits", "has_good_thumbnail",
        "has_poor_thumbnail", "was_relisted", "picture_group", "title_length_group",
        "title_contains_new", "title_contains_used", "state_id", "price_diff",
        "sold_ratio", "availability_change", "stock_used_ratio", "was_updated",
        "is_active"
    ]

    # Add the SBERT columns
    sbert_cols = [col for col in df.columns if col.startswith('sbert_')]
    cols_base += sbert_cols

    df_model = df[cols_base].copy()

    # Target and features
    y = df_model["condition"]
    X = df_model.drop(columns=["condition"])

    # One-hot encoding
    X_encoded = pd.get_dummies(X, drop_first=True)

    # Encode the target. classes_ is assigned by hand (instead of calling fit)
    # to pin the mapping 'used' -> 0, 'new' -> 1.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.array(['used', 'new'])
    y_encoded = label_encoder.transform(y)

    y_categorical = to_categorical(y_encoded)

    # Manual split: the last 10,000 rows are the test set
    N = -10000

    # Fit the scaler on the training rows only and reuse its statistics for
    # the test rows, so the hold-out set does not influence the scaling.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_encoded[:N])
    X_test = scaler.transform(X_encoded[N:])
    y_train = y_categorical[:N]
    y_test = y_categorical[N:]

    return X_train, y_train, X_test, y_test, label_encoder

In [None]:
# Rebuild the dataset for the neural network and rebind the splits
X_train, y_train, X_test, y_test, label_encoder = build_dataset()

# Report the shape of every split
print("Dimensiones de los dataset:")
for split_label, split_data in (("X_train:", X_train), ("y_train:", y_train), ("X_test:", X_test), ("y_test:", y_test)):
    print(split_label, split_data.shape)

In [None]:
import datetime
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

# Stop when val_loss has not improved for 15 epochs and restore the best weights
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)

# One TensorBoard log directory per run, named by timestamp
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Two hidden layers with dropout. The targets are one-hot over two mutually
# exclusive classes, so the output layer is a 2-way softmax (probabilities
# sum to 1) trained with categorical cross-entropy, rather than two
# independent sigmoid units trained with binary cross-entropy.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(2, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()


In [None]:
# Train for up to 100 epochs in batches of 64, holding out 20% of the
# training arrays for validation (validation_split=0.2). The early-stopping
# and TensorBoard callbacks are attached; `history` keeps the object
# returned by fit().
history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    verbose=1,
    callbacks=[early_stopping, tensorboard_callback]
)


In [None]:
# Loss and accuracy on the hold-out set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"✅ Accuracy: {accuracy:.4f}")

# F1-score
from sklearn.metrics import classification_report
y_pred_probs = model.predict(X_test)

# The model has one output per class and y_test is one-hot, so take the most
# probable class on both sides. Thresholding each output at 0.5 could mark
# both classes (or neither) for a row and would be scored as a multilabel
# problem.
y_pred = y_pred_probs.argmax(axis=1)
y_true = y_test.argmax(axis=1)

print(classification_report(y_true, y_pred))


In [None]:
# tensorboard --logdir=logs/fit

In [None]:
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Definir arquitectura
def create_model(learning_rate=0.001, l2_reg=0.01, dropout_rate=0.3, **kwargs):
    """
    Build and compile the binary MLP used by the scikeras wrapper.

    Parameters:
    - learning_rate: Adam learning rate
    - l2_reg: L2 penalty applied to the kernels of both hidden layers
    - dropout_rate: dropout applied after each hidden layer
    - **kwargs: extra arguments supplied by the wrapper; only `meta` is read

    Returns the compiled Sequential model (single sigmoid output,
    binary cross-entropy loss, accuracy metric).
    """
    # Take the input width from the data actually being fitted. The previous
    # global `X_scaled` comes from an earlier preprocessing cell and need not
    # have the same number of columns as the current X_train.
    # `meta` is the metadata dict scikeras hands to model-building functions
    # (see the scikeras docs); fall back to the notebook's X_train otherwise.
    meta = kwargs.get("meta") or {}
    if "n_features_in_" in meta:
        n_features = meta["n_features_in_"]
    else:
        n_features = X_train.shape[1]

    model = Sequential()
    model.add(Dense(64, input_shape=(n_features,), activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(32, activation='relu', kernel_regularizer=l2(l2_reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Scikeras wrapper: 20% of each training fold is held out (validation_split)
# so EarlyStopping can monitor val_loss
model = KerasClassifier(
    model=create_model,
    verbose=0,
    target_type="binary",
    callbacks=[EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)],
    validation_split=0.2,
)

# Hyperparameters ("model__" entries are routed to create_model)
param_grid = {
    "model__learning_rate": [0.001, 0.01],
    "model__l2_reg": [0.001, 0.01],
    "model__dropout_rate": [0.3, 0.5],
    "batch_size": [32, 64],
    "epochs": [15, 25, 50],
}

# build_dataset() returns one-hot targets of shape (n, 2), but create_model
# ends in a single sigmoid unit, which needs 1-D 0/1 labels: collapse the
# one-hot rows to class indices (left untouched if already 1-D).
y_train_binary = y_train.argmax(axis=1) if getattr(y_train, "ndim", 1) == 2 else y_train

# Grid search with GridSearchCV's default cross-validation (no `cv` given),
# on top of the internal Keras validation split used for early stopping
grid = GridSearchCV(model, param_grid, scoring="accuracy", n_jobs=-1)
grid.fit(X_train, y_train_binary)

# Results
print("Best Params:", grid.best_params_)
print("Best Score (Accuracy):", grid.best_score_)


In [None]:
# Evaluate on the test set
def _to_class_indices(values):
    """Collapse one-hot / column-vector labels to a 1-D array of class indices."""
    arr = np.asarray(values)
    if arr.ndim == 2:
        return arr.argmax(axis=1) if arr.shape[1] > 1 else arr.ravel()
    return arr

# y_test from build_dataset() is one-hot, while the metrics below need 1-D
# labels on both sides; predictions are normalized the same way.
y_test_binary = _to_class_indices(y_test)
y_pred_test = _to_class_indices(grid.best_estimator_.predict(X_test))

acc = accuracy_score(y_test_binary, y_pred_test)
f1 = f1_score(y_test_binary, y_pred_test)

print(f"Accuracy en test: {acc:.4f}")
print(f"F1-score en test: {f1:.4f}")
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_test, target_names=label_encoder.classes_))

# Otros

In [None]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def load_data(file_path):
    """
    Read a JSON-lines file (one JSON object per line) into a DataFrame.

    Parameters:
    - file_path: location of the JSON-lines file.

    Returns:
    - DataFrame with one row per line of the file.
    """
    listings = pd.read_json(file_path, lines=True)
    return listings

# Path to the raw listings file, relative to the notebook's working directory.
file_path = "../data/raw/MLA_100k_checked_v3.jsonlines"
df = load_data(file_path)
# Preview the first rows.
df.head()


In [None]:
# Feature engineering for the exploratory plots: adds derived columns to `df` in place.

def _state_id_from_address(address):
    """Return address['state']['id'], or 'unknown' when any level is missing or not a dict."""
    state = address.get('state') if isinstance(address, dict) else None
    return state.get('id', 'unknown') if isinstance(state, dict) else 'unknown'

# Seller activity: share of all listings published by each seller
df['seller_freq'] = df['seller_id'].map(df['seller_id'].value_counts(normalize=True))

# Shipping info (rows whose `shipping` is not a dict get the fallback value)
df['is_free_shipping'] = df['shipping'].apply(lambda x: x.get('free_shipping', False) if isinstance(x, dict) else False).astype(int)
df['shipping_mode'] = df['shipping'].apply(lambda x: x.get('mode', 'unknown') if isinstance(x, dict) else 'unknown')
df['local_pick_up'] = df['shipping'].apply(lambda x: x.get('local_pick_up', False) if isinstance(x, dict) else False).astype(int)

# Payment methods: count of non-MercadoPago methods, capped at '4+'.
# Every bucket is a string so the column has a single, consistent type
# instead of mixing ints with the '4+' label.
df['payment_method_group'] = df['non_mercado_pago_payment_methods'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda n: str(n) if n <= 3 else '4+')

# Tags
df['has_dragged_visits'] = df['tags'].apply(lambda x: 'dragged_visits' in x or 'dragged_bids_and_visits' in x if isinstance(x, list) else False).astype(int)
df['has_good_thumbnail'] = df['tags'].apply(lambda x: 'good_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
df['has_poor_thumbnail'] = df['tags'].apply(lambda x: 'poor_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
df['was_relisted'] = df['tags'].apply(lambda x: 'free_relist' in x if isinstance(x, list) else False).astype(int)

# Images: picture count bucketed into '0', '1', '2-6', '7+'
df['picture_group'] = df['pictures'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda x: '0' if x == 0 else ('1' if x == 1 else ('2-6' if x <= 6 else '7+')))

# Title features. Missing titles are treated as empty strings so they land in
# 'short' (a NaN length would otherwise fall through both comparisons into 'long').
df['title_length_group'] = df['title'].fillna('').str.len().apply(lambda x: 'short' if x < 30 else ('medium' if x < 60 else 'long'))
df['title_contains_new'] = df['title'].fillna('').str.lower().str.contains("nuevo", na=False, regex=False).astype(int)
df['title_contains_used'] = df['title'].fillna('').str.lower().str.contains("usado", na=False, regex=False).astype(int)

# Location: tolerate addresses without a usable 'state' entry instead of raising KeyError
df['state_id'] = df['seller_address'].apply(_state_id_from_address)

# Price diff
df['price_diff'] = df['price'] - df['base_price']

# Quantities (the 1e-5 term avoids division by zero)
df['sold_ratio'] = df['sold_quantity'] / (df['sold_quantity'] + df['available_quantity'] + 1e-5)
df["availability_change"] = df["initial_quantity"] - df["available_quantity"]
df["stock_used_ratio"] = df["availability_change"] / (df["initial_quantity"] + 1e-5)

# Timestamps: 1 when `last_updated` differs from `date_created`
df["was_updated"] = (pd.to_datetime(df["last_updated"], errors="coerce") != pd.to_datetime(df["date_created"], errors="coerce")).astype(int)

# Status
df["is_active"] = df["status"].apply(lambda x: 1 if x == "active" else 0)

In [None]:
# Class balance: number of listings per `condition` value.
plt.figure(figsize=(6, 4))
sns.countplot(data=df, x='condition', palette='viridis')
plt.title('Distribución de condición (nuevo vs usado)', fontsize=14)
plt.xlabel('Condición')
plt.ylabel('Cantidad')
plt.show()

In [None]:
# Price histogram (log-scaled x axis) with a KDE overlay, colored by `condition`.
plt.figure(figsize=(8, 5))
sns.histplot(data=df, x='price', hue='condition', log_scale=True, bins=50, palette='coolwarm', kde=True)
plt.title('Distribución de precios por condición')
plt.xlabel('Precio (escala log)')
plt.ylabel('Frecuencia')
plt.show()

In [None]:
# Box plot of the number of pictures per listing, split by condition.
# The numeric count is derived directly from `pictures`: `picture_group` only holds
# string buckets ('0', '1', '2-6', '7+'), which cannot serve as the value axis of a box plot.
n_pictures = df['pictures'].apply(lambda pics: len(pics) if isinstance(pics, list) else 0)

plt.figure(figsize=(6, 4))
sns.boxplot(x=df['condition'], y=n_pictures, palette='pastel')
plt.title('Cantidad de imágenes por condición')
plt.xlabel('Condición')
plt.ylabel('Número de imágenes')
plt.show()

In [None]:
# Number of titles containing each keyword: the flag columns are 0/1, so the sum is a count.
# (The previous `.rename({0: 'No', 1: 'Sí'})` was a no-op: the summed Series is indexed
# by column name, not by 0/1, and the index is replaced on the next line anyway.)
df_title_flags = df[['title_contains_new', 'title_contains_used']].sum()
df_title_flags.index = ['Contiene \"nuevo\"', 'Contiene \"usado\"']

plt.figure(figsize=(6, 4))
df_title_flags.plot(kind='bar', color=['#66c2a5', '#fc8d62'])
plt.title('Presencia de \"nuevo\" o \"usado\" en títulos')
plt.ylabel('Número de ítems')
plt.xticks(rotation=0)
plt.show()

In [None]:
# Bar chart of the 10 most frequent `state_id` values.
plt.figure(figsize=(10, 4))
top_states = df['state_id'].value_counts().head(10)
sns.barplot(x=top_states.index, y=top_states.values, palette='cubehelix')
plt.title('Top 10 estados con más publicaciones')
plt.xlabel('Estado')
plt.ylabel('Cantidad de ítems')
# Rotate the tick labels by 45 degrees.
plt.xticks(rotation=45)
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Top-10 feature importances of the random forest.
# NOTE(review): `rf` is not defined in this cell; it is presumably a fitted model
# from an earlier cell, trained on the current `X_train` columns — confirm.
feat_importances = pd.Series(rf.feature_importances_, index=X_train.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.title("Top 10 Features - Random Forest")
plt.xlabel("Importancia")
# Invert the y axis so the largest importance is drawn at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Feature importances from XGBoost.
# NOTE(review): `xgb` is not defined in this cell; it is presumably a fitted model
# object from an earlier cell (not the xgboost module) — confirm.
xgb_importances = pd.Series(xgb.feature_importances_, index=X_train.columns)

# Show the 10 most important features
xgb_importances.nlargest(10).plot(kind='barh')
plt.title("Top 10 Features - XGBoost")
plt.xlabel("Importancia")
# Invert the y axis so the largest importance is drawn at the top.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
import joblib

# Load the model and the label encoder from separate files.
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
# NOTE(review): these paths are relative to the working directory ("models/..."), whereas the
# data path elsewhere in the notebook starts with "../" — confirm both resolve as intended.
modelo_produccion = joblib.load("models/best_gb_model.pkl")
encoder = joblib.load("models/label_encoder.pkl")

# Predict on the test features
y_pred = modelo_produccion.predict(X_test)

# Decode the predicted labels back to their original class names
y_pred_labels = encoder.inverse_transform(y_pred)

print("Etiquetas predichas (decodificadas):")
print(y_pred_labels[:10])


In [None]:
import json
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from umap import UMAP

def extract_text_features(df, max_features=100, n_components=20, random_state=42):
    """
    Add title-based text features: cleaned title, TF-IDF, then UMAP reduction.

    Parameters:
    - df: DataFrame with a "title" column.
    - max_features: vocabulary size limit for the TF-IDF vectorizer.
    - n_components: number of UMAP output dimensions.
    - random_state: seed passed to UMAP.

    Returns:
    - A new DataFrame equal to `df` plus a "title_clean" column and the
      reduced columns "tfidf_0" ... "tfidf_{k-1}". The input frame is not modified.

    Requires the NLTK "stopwords" corpus to be available locally.
    NOTE(review): the vectorizer and UMAP are fitted on every row passed in; if the
    caller splits train/test afterwards, the test rows influence the features.
    """
    # Build the stopword set once; the original rebuilt it for every single title.
    spanish_stopwords = set(stopwords.words('spanish'))

    # Clean text: lowercase, strip punctuation, drop stopwords
    def clean_text(text):
        text = re.sub(r"[^\w\s]", "", text.lower())
        tokens = [t for t in text.split() if t not in spanish_stopwords]
        return " ".join(tokens)

    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    df = df.assign(title_clean=df["title"].fillna("").apply(clean_text))

    # TF-IDF
    vectorizer = TfidfVectorizer(max_features=max_features)
    tfidf_matrix = vectorizer.fit_transform(df["title_clean"])

    # Dimensionality reduction with UMAP (named `reducer` to avoid confusion with the package name)
    reducer = UMAP(n_components=n_components, random_state=random_state)
    tfidf_umap = reducer.fit_transform(tfidf_matrix.toarray())

    tfidf_df = pd.DataFrame(
        tfidf_umap,
        columns=[f"tfidf_{i}" for i in range(tfidf_umap.shape[1])],
        index=df.index,
    )
    return pd.concat([df, tfidf_df], axis=1)

def build_dataset(filepath="../data/raw/MLA_100k_checked_v3.jsonlines", test_size=10000):
    """
    Load the raw listings, engineer features and return a positional train/test split.

    This variant represents titles with the TF-IDF + UMAP columns produced by
    `extract_text_features`. The notebook defines `build_dataset` in several cells;
    the definition from the most recently executed cell is the one in effect.

    Parameters:
    - filepath: path to the JSON-lines file (one listing per line).
    - test_size: number of trailing rows held out as the test set.

    Returns:
    - X_train, y_train, X_test, y_test: one-hot encoded features and integer labels;
      the last `test_size` rows form the test set.
    - label_encoder: LabelEncoder whose classes_ are pinned to ['used', 'new'].

    NOTE(review): seller_freq and the text features are computed on all rows before
    the split, so test rows influence training features — confirm this is acceptable.
    """
    def _state_id(address):
        # Tolerate addresses without a usable 'state' entry instead of raising KeyError.
        state = address.get('state') if isinstance(address, dict) else None
        return state.get('id', 'unknown') if isinstance(state, dict) else 'unknown'

    # Explicit UTF-8 so accented text decodes the same on every platform;
    # blank lines are skipped instead of crashing json.loads.
    with open(filepath, encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)

    # Seller activity
    df['seller_freq'] = df['seller_id'].map(df['seller_id'].value_counts(normalize=True))

    # Shipping info
    df['is_free_shipping'] = df['shipping'].apply(lambda x: x.get('free_shipping', False) if isinstance(x, dict) else False).astype(int)
    df['shipping_mode'] = df['shipping'].apply(lambda x: x.get('mode', 'unknown') if isinstance(x, dict) else 'unknown')
    df['local_pick_up'] = df['shipping'].apply(lambda x: x.get('local_pick_up', False) if isinstance(x, dict) else False).astype(int)

    # Payment methods: every bucket is a string so the column has one consistent type
    df['payment_method_group'] = df['non_mercado_pago_payment_methods'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda n: str(n) if n <= 3 else '4+')

    # Tags
    df['has_dragged_visits'] = df['tags'].apply(lambda x: 'dragged_visits' in x or 'dragged_bids_and_visits' in x if isinstance(x, list) else False).astype(int)
    df['has_good_thumbnail'] = df['tags'].apply(lambda x: 'good_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['has_poor_thumbnail'] = df['tags'].apply(lambda x: 'poor_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['was_relisted'] = df['tags'].apply(lambda x: 'free_relist' in x if isinstance(x, list) else False).astype(int)

    # Images
    df['picture_group'] = df['pictures'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda x: '0' if x == 0 else ('1' if x == 1 else ('2-6' if x <= 6 else '7+')))

    # Title features: missing titles count as empty (a NaN length would otherwise be bucketed as 'long')
    title_text = df['title'].fillna('')
    df['title_length_group'] = title_text.str.len().apply(lambda x: 'short' if x < 30 else ('medium' if x < 60 else 'long'))
    df['title_contains_new'] = title_text.str.lower().str.contains("nuevo", na=False, regex=False).astype(int)
    df['title_contains_used'] = title_text.str.lower().str.contains("usado", na=False, regex=False).astype(int)

    # Location
    df['state_id'] = df['seller_address'].apply(_state_id)

    # Price diff
    df['price_diff'] = df['price'] - df['base_price']

    # Quantities (the 1e-5 term avoids division by zero)
    df['sold_ratio'] = df['sold_quantity'] / (df['sold_quantity'] + df['available_quantity'] + 1e-5)
    df["availability_change"] = df["initial_quantity"] - df["available_quantity"]
    df["stock_used_ratio"] = df["availability_change"] / (df["initial_quantity"] + 1e-5)

    # Timestamps
    df["was_updated"] = (pd.to_datetime(df["last_updated"], errors="coerce") != pd.to_datetime(df["date_created"], errors="coerce")).astype(int)

    # Status
    df["is_active"] = df["status"].apply(lambda x: 1 if x == "active" else 0)

    # TF-IDF with text cleaning, reduced with UMAP
    df = extract_text_features(df, max_features=100, n_components=20)

    # Base columns
    cols_base = [
        "currency_id", "base_price", "price", "listing_type_id",
        "initial_quantity", "sold_quantity", "available_quantity", "condition",
        "seller_freq", "is_free_shipping", "shipping_mode", "local_pick_up",
        "payment_method_group", "has_dragged_visits", "has_good_thumbnail",
        "has_poor_thumbnail", "was_relisted", "picture_group", "title_length_group",
        "title_contains_new", "title_contains_used", "state_id", "price_diff",
        "sold_ratio", "availability_change", "stock_used_ratio", "was_updated",
        "is_active"
    ]

    # Append the UMAP columns
    tfidf_cols = [col for col in df.columns if col.startswith("tfidf_")]
    cols_base += tfidf_cols

    df_model = df[cols_base].copy()

    # Target and features
    y = df_model["condition"]
    X = df_model.drop(columns=["condition"])
    X_encoded = pd.get_dummies(X, drop_first=True)

    # Encode the target. classes_ is set by hand (not fitted) to pin the class order.
    # NOTE(review): this relies on LabelEncoder accepting unsorted classes_ — confirm
    # the resulting codes are the intended ones for the installed scikit-learn version.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.array(['used', 'new'])
    y_encoded = label_encoder.transform(y)

    # Positional split: the last `test_size` rows are the test set.
    # An explicit index avoids the `[:-0]` pitfall when test_size is 0.
    split = max(len(X_encoded) - test_size, 0)
    X_train = X_encoded.iloc[:split]
    X_test = X_encoded.iloc[split:]
    y_train = y_encoded[:split]
    y_test = y_encoded[split:]

    return X_train, y_train, X_test, y_test, label_encoder


In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer

def extract_sbert_features(df, model_name='all-MiniLM-L6-v2'):
    """
    Append sentence-embedding columns computed from the listing titles.

    Parameters:
    - df: DataFrame with a "title" column (missing titles are encoded as "").
    - model_name: name of the SentenceTransformer model to load.

    Returns:
    - A new DataFrame equal to `df` plus columns "sbert_0" ... "sbert_{d-1}",
      aligned on df's index.
    """
    model = SentenceTransformer(model_name)
    # Pass a plain list: a pandas Series may be indexed by label rather than by
    # position inside encode(), which breaks when df has a non-default index.
    titles = df['title'].fillna("").tolist()
    embeddings = model.encode(titles, show_progress_bar=True)
    embeddings_df = pd.DataFrame(
        embeddings,
        columns=[f"sbert_{i}" for i in range(embeddings.shape[1])],
        index=df.index,
    )
    return pd.concat([df, embeddings_df], axis=1)

def build_dataset(filepath="../data/raw/MLA_100k_checked_v3.jsonlines", test_size=10000):
    """
    Load the raw listings, engineer features and return a positional train/test split.

    This variant represents titles with the SBERT embedding columns produced by
    `extract_sbert_features`. The notebook defines `build_dataset` in several cells;
    the definition from the most recently executed cell is the one in effect.

    Parameters:
    - filepath: path to the JSON-lines file (one listing per line).
    - test_size: number of trailing rows held out as the test set.

    Returns:
    - X_train, y_train, X_test, y_test: one-hot encoded features and integer labels;
      the last `test_size` rows form the test set.
    - label_encoder: LabelEncoder whose classes_ are pinned to ['used', 'new'].

    NOTE(review): seller_freq is computed on all rows before the split, so test rows
    influence a training feature — confirm this is acceptable.
    """
    def _state_id(address):
        # Tolerate addresses without a usable 'state' entry instead of raising KeyError.
        state = address.get('state') if isinstance(address, dict) else None
        return state.get('id', 'unknown') if isinstance(state, dict) else 'unknown'

    # Explicit UTF-8 so accented text decodes the same on every platform;
    # blank lines are skipped instead of crashing json.loads.
    with open(filepath, encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)

    # Seller activity
    df['seller_freq'] = df['seller_id'].map(df['seller_id'].value_counts(normalize=True))

    # Shipping info
    df['is_free_shipping'] = df['shipping'].apply(lambda x: x.get('free_shipping', False) if isinstance(x, dict) else False).astype(int)
    df['shipping_mode'] = df['shipping'].apply(lambda x: x.get('mode', 'unknown') if isinstance(x, dict) else 'unknown')
    df['local_pick_up'] = df['shipping'].apply(lambda x: x.get('local_pick_up', False) if isinstance(x, dict) else False).astype(int)

    # Payment methods: every bucket is a string so the column has one consistent type
    df['payment_method_group'] = df['non_mercado_pago_payment_methods'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda n: str(n) if n <= 3 else '4+')

    # Tags
    df['has_dragged_visits'] = df['tags'].apply(lambda x: 'dragged_visits' in x or 'dragged_bids_and_visits' in x if isinstance(x, list) else False).astype(int)
    df['has_good_thumbnail'] = df['tags'].apply(lambda x: 'good_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['has_poor_thumbnail'] = df['tags'].apply(lambda x: 'poor_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['was_relisted'] = df['tags'].apply(lambda x: 'free_relist' in x if isinstance(x, list) else False).astype(int)

    # Images
    df['picture_group'] = df['pictures'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda x: '0' if x == 0 else ('1' if x == 1 else ('2-6' if x <= 6 else '7+')))

    # Title features: missing titles count as empty (a NaN length would otherwise be bucketed as 'long')
    title_text = df['title'].fillna('')
    df['title_length_group'] = title_text.str.len().apply(lambda x: 'short' if x < 30 else ('medium' if x < 60 else 'long'))
    df['title_contains_new'] = title_text.str.lower().str.contains("nuevo", na=False, regex=False).astype(int)
    df['title_contains_used'] = title_text.str.lower().str.contains("usado", na=False, regex=False).astype(int)

    # Location
    df['state_id'] = df['seller_address'].apply(_state_id)

    # Price diff
    df['price_diff'] = df['price'] - df['base_price']

    # Quantities (the 1e-5 term avoids division by zero)
    df['sold_ratio'] = df['sold_quantity'] / (df['sold_quantity'] + df['available_quantity'] + 1e-5)
    df['availability_change'] = df['initial_quantity'] - df['available_quantity']
    df['stock_used_ratio'] = df['availability_change'] / (df['initial_quantity'] + 1e-5)

    # Timestamps
    df['was_updated'] = (pd.to_datetime(df['last_updated'], errors='coerce') != pd.to_datetime(df['date_created'], errors='coerce')).astype(int)

    # Status
    df['is_active'] = df['status'].apply(lambda x: 1 if x == 'active' else 0)

    # Vectorize titles with SBERT
    df = extract_sbert_features(df)

    # Base columns
    cols_base = [
        "currency_id", "base_price", "price", "listing_type_id",
        "initial_quantity", "sold_quantity", "available_quantity", "condition",
        "seller_freq", "is_free_shipping", "shipping_mode", "local_pick_up",
        "payment_method_group", "has_dragged_visits", "has_good_thumbnail",
        "has_poor_thumbnail", "was_relisted", "picture_group", "title_length_group",
        "title_contains_new", "title_contains_used", "state_id", "price_diff",
        "sold_ratio", "availability_change", "stock_used_ratio", "was_updated",
        "is_active"
    ]

    # Append the SBERT columns
    sbert_cols = [col for col in df.columns if col.startswith('sbert_')]
    cols_base += sbert_cols

    df_model = df[cols_base].copy()

    # Target
    y = df_model["condition"]
    X = df_model.drop(columns=["condition"])

    # One-hot encoding
    X_encoded = pd.get_dummies(X, drop_first=True)

    # Encode the target. classes_ is set by hand (not fitted) to pin the class order.
    # NOTE(review): this relies on LabelEncoder accepting unsorted classes_ — confirm
    # the resulting codes are the intended ones for the installed scikit-learn version.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.array(['used', 'new'])
    y_encoded = label_encoder.transform(y)

    # Positional split: the last `test_size` rows are the test set.
    # An explicit index avoids the `[:-0]` pitfall when test_size is 0.
    split = max(len(X_encoded) - test_size, 0)
    X_train = X_encoded.iloc[:split]
    X_test = X_encoded.iloc[split:]
    y_train = y_encoded[:split]
    y_test = y_encoded[split:]

    return X_train, y_train, X_test, y_test, label_encoder

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

def extract_sbert_features(df, model_name='distiluse-base-multilingual-cased-v2', explained_variance=0.90):
    """
    Append PCA-reduced sentence-embedding columns computed from the listing titles.

    Parameters:
    - df: DataFrame with a "title" column (missing titles are encoded as "").
    - model_name: name of the SentenceTransformer model to load.
    - explained_variance: value passed to PCA as `n_components`; used here as the
      fraction of variance to retain.

    Returns:
    - A new DataFrame equal to `df` plus columns "sbert_0" ... "sbert_{k-1}",
      aligned on df's index. Also prints how many components were kept.

    NOTE(review): PCA is fitted on every row passed in; if the caller splits
    train/test afterwards, the test rows influence the projection.
    """
    model = SentenceTransformer(model_name)
    # Pass a plain list: a pandas Series may be indexed by label rather than by
    # position inside encode(), which breaks when df has a non-default index.
    titles = df['title'].fillna("").tolist()
    embeddings = model.encode(titles, show_progress_bar=True)

    # Reduce dimensionality with PCA, keeping the requested share of variance
    pca = PCA(n_components=explained_variance)
    reduced_embeddings = pca.fit_transform(embeddings)

    embeddings_df = pd.DataFrame(
        reduced_embeddings,
        columns=[f"sbert_{i}" for i in range(reduced_embeddings.shape[1])],
        index=df.index,
    )
    df = pd.concat([df, embeddings_df], axis=1)

    print(f"[SBERT] {pca.n_components_} componentes retenidos para explicar {explained_variance*100:.0f}% de la varianza.")
    return df

def build_dataset(filepath="../data/raw/MLA_100k_checked_v3.jsonlines", test_size=10000):
    """
    Load the raw listings, engineer features and return a positional train/test split.

    This variant represents titles with the PCA-reduced SBERT columns produced by
    `extract_sbert_features`. The notebook defines `build_dataset` in several cells;
    the definition from the most recently executed cell is the one in effect.

    Parameters:
    - filepath: path to the JSON-lines file (one listing per line).
    - test_size: number of trailing rows held out as the test set.

    Returns:
    - X_train, y_train, X_test, y_test: one-hot encoded features and integer labels;
      the last `test_size` rows form the test set.
    - label_encoder: LabelEncoder whose classes_ are pinned to ['used', 'new'].

    NOTE(review): seller_freq is computed on all rows before the split, so test rows
    influence a training feature — confirm this is acceptable.
    """
    def _state_id(address):
        # Tolerate addresses without a usable 'state' entry instead of raising KeyError.
        state = address.get('state') if isinstance(address, dict) else None
        return state.get('id', 'unknown') if isinstance(state, dict) else 'unknown'

    # Explicit UTF-8 so accented text decodes the same on every platform;
    # blank lines are skipped instead of crashing json.loads.
    with open(filepath, encoding="utf-8") as f:
        data = [json.loads(line) for line in f if line.strip()]

    df = pd.DataFrame(data)

    # Seller activity
    df['seller_freq'] = df['seller_id'].map(df['seller_id'].value_counts(normalize=True))

    # Shipping info
    df['is_free_shipping'] = df['shipping'].apply(lambda x: x.get('free_shipping', False) if isinstance(x, dict) else False).astype(int)
    df['shipping_mode'] = df['shipping'].apply(lambda x: x.get('mode', 'unknown') if isinstance(x, dict) else 'unknown')
    df['local_pick_up'] = df['shipping'].apply(lambda x: x.get('local_pick_up', False) if isinstance(x, dict) else False).astype(int)

    # Payment methods: every bucket is a string so the column has one consistent type
    df['payment_method_group'] = df['non_mercado_pago_payment_methods'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda n: str(n) if n <= 3 else '4+')

    # Tags
    df['has_dragged_visits'] = df['tags'].apply(lambda x: 'dragged_visits' in x or 'dragged_bids_and_visits' in x if isinstance(x, list) else False).astype(int)
    df['has_good_thumbnail'] = df['tags'].apply(lambda x: 'good_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['has_poor_thumbnail'] = df['tags'].apply(lambda x: 'poor_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)
    df['was_relisted'] = df['tags'].apply(lambda x: 'free_relist' in x if isinstance(x, list) else False).astype(int)

    # Images
    df['picture_group'] = df['pictures'].apply(lambda x: len(x) if isinstance(x, list) else 0).apply(lambda x: '0' if x == 0 else ('1' if x == 1 else ('2-6' if x <= 6 else '7+')))

    # Title features: missing titles count as empty (a NaN length would otherwise be bucketed as 'long')
    title_text = df['title'].fillna('')
    df['title_length_group'] = title_text.str.len().apply(lambda x: 'short' if x < 30 else ('medium' if x < 60 else 'long'))
    df['title_contains_new'] = title_text.str.lower().str.contains("nuevo", na=False, regex=False).astype(int)
    df['title_contains_used'] = title_text.str.lower().str.contains("usado", na=False, regex=False).astype(int)

    # Location
    df['state_id'] = df['seller_address'].apply(_state_id)

    # Price diff
    df['price_diff'] = df['price'] - df['base_price']

    # Quantities (the 1e-5 term avoids division by zero)
    df['sold_ratio'] = df['sold_quantity'] / (df['sold_quantity'] + df['available_quantity'] + 1e-5)
    df['availability_change'] = df['initial_quantity'] - df['available_quantity']
    df['stock_used_ratio'] = df['availability_change'] / (df['initial_quantity'] + 1e-5)

    # Timestamps
    df['was_updated'] = (pd.to_datetime(df['last_updated'], errors='coerce') != pd.to_datetime(df['date_created'], errors='coerce')).astype(int)

    # Status
    df['is_active'] = df['status'].apply(lambda x: 1 if x == 'active' else 0)

    # Vectorize titles with (reduced) SBERT embeddings
    df = extract_sbert_features(df)

    # Base columns
    cols_base = [
        "currency_id", "base_price", "price", "listing_type_id",
        "initial_quantity", "sold_quantity", "available_quantity", "condition",
        "seller_freq", "is_free_shipping", "shipping_mode", "local_pick_up",
        "payment_method_group", "has_dragged_visits", "has_good_thumbnail",
        "has_poor_thumbnail", "was_relisted", "picture_group", "title_length_group",
        "title_contains_new", "title_contains_used", "state_id", "price_diff",
        "sold_ratio", "availability_change", "stock_used_ratio", "was_updated",
        "is_active"
    ]

    # Append the SBERT columns
    sbert_cols = [col for col in df.columns if col.startswith('sbert_')]
    cols_base += sbert_cols

    df_model = df[cols_base].copy()

    # Target
    y = df_model["condition"]
    X = df_model.drop(columns=["condition"])

    # One-hot encoding
    X_encoded = pd.get_dummies(X, drop_first=True)

    # Encode the target. classes_ is set by hand (not fitted) to pin the class order.
    # NOTE(review): this relies on LabelEncoder accepting unsorted classes_ — confirm
    # the resulting codes are the intended ones for the installed scikit-learn version.
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.array(['used', 'new'])
    y_encoded = label_encoder.transform(y)

    # Positional split: the last `test_size` rows are the test set.
    # An explicit index avoids the `[:-0]` pitfall when test_size is 0.
    split = max(len(X_encoded) - test_size, 0)
    X_train = X_encoded.iloc[:split]
    X_test = X_encoded.iloc[split:]
    y_train = y_encoded[:split]
    y_test = y_encoded[split:]

    return X_train, y_train, X_test, y_test, label_encoder


In [None]:
# Build the dataset with whichever `build_dataset` definition was executed last,
# then report the shape of each split.
X_train, y_train, X_test, y_test, label_encoder = build_dataset()

print("Dimensiones de los dataset:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)


In [None]:
# Display the training feature matrix.
X_train


In [None]:
# Display the encoded training labels.
y_train


In [None]:
# Decode the integer training labels back to their class names as a sanity check.
# These are ground-truth training labels, not model predictions, so the printed
# header says so.
y_train_labels = label_encoder.inverse_transform(y_train)

print("Etiquetas de entrenamiento (decodificadas):")
print(y_train_labels[:10])
