In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2


# Preparación de los datos

In [None]:
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def _load_jsonlines(filepath):
    """Parse a JSON Lines file into a list of records, skipping blank lines."""
    # Explicit UTF-8: without it open() uses the locale-dependent platform
    # default, which can fail on the non-ASCII characters in listing text.
    with open(filepath, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


def _dict_field(series, key, default):
    """Read `key` from each dict cell; non-dict cells and missing keys give `default`."""
    return series.apply(lambda d: d.get(key, default) if isinstance(d, dict) else default)


def _has_any_tag(series, *tags):
    """Return a 0/1 Series flagging rows whose tag list contains any of `tags`."""
    return series.apply(
        lambda cell: isinstance(cell, list) and any(tag in cell for tag in tags)
    ).astype(int)


def _list_len(cell):
    """Length of a list cell; anything that is not a list counts as empty."""
    return len(cell) if isinstance(cell, list) else 0


def _payment_method_group(methods):
    """Bucket the number of payment methods into '0'..'3' or '4+' (always a string)."""
    n = _list_len(methods)
    return str(n) if n <= 3 else '4+'


def _picture_group(pictures):
    """Bucket the number of pictures into '0', '1', '2-6' or '7+'."""
    n = _list_len(pictures)
    if n <= 1:
        return str(n)
    return '2-6' if n <= 6 else '7+'


def _title_length_group(length):
    """Bucket a title length into 'short' (<30), 'medium' (<60) or 'long'."""
    if length < 30:
        return 'short'
    if length < 60:
        return 'medium'
    return 'long'


def _state_id(address):
    """Return the seller's state id, or 'unknown' if the address/state is missing."""
    if not isinstance(address, dict):
        return 'unknown'
    state = address.get('state')
    if not isinstance(state, dict):
        return 'unknown'
    return state.get('id', 'unknown')


def _add_features(df):
    """Add the engineered feature columns to `df` in place and return it."""
    # Seller activity: share of all listings that belong to each seller.
    # NOTE(review): computed over every row, including rows that later land in
    # the test split -- confirm this leakage is acceptable for the evaluation.
    df['seller_freq'] = df['seller_id'].map(df['seller_id'].value_counts(normalize=True))

    # Shipping info
    df['is_free_shipping'] = _dict_field(df['shipping'], 'free_shipping', False).astype(int)
    df['shipping_mode'] = _dict_field(df['shipping'], 'mode', 'unknown')
    df['local_pick_up'] = _dict_field(df['shipping'], 'local_pick_up', False).astype(int)

    # Payment methods. Kept as strings throughout: a column mixing ints with
    # '4+' is only one-hot encoded when at least one '4+' is present, because
    # get_dummies leaves purely numeric columns untouched.
    df['payment_method_group'] = df['non_mercado_pago_payment_methods'].apply(_payment_method_group)

    # Tags
    df['has_dragged_visits'] = _has_any_tag(df['tags'], 'dragged_visits', 'dragged_bids_and_visits')
    df['has_good_thumbnail'] = _has_any_tag(df['tags'], 'good_quality_thumbnail')
    df['has_poor_thumbnail'] = _has_any_tag(df['tags'], 'poor_quality_thumbnail')
    df['was_relisted'] = _has_any_tag(df['tags'], 'free_relist')

    # Images
    df['picture_group'] = df['pictures'].apply(_picture_group)

    # Title features. A missing title has NaN length, which fails both "<"
    # comparisons and would be labelled 'long'; treat it as length 0 instead.
    title_len = df['title'].str.len().fillna(0)
    df['title_length_group'] = title_len.apply(_title_length_group)
    title_lower = df['title'].str.lower()
    df['title_contains_new'] = title_lower.str.contains("nuevo", regex=False, na=False).astype(int)
    df['title_contains_used'] = title_lower.str.contains("usado", regex=False, na=False).astype(int)

    # Location
    df['state_id'] = df['seller_address'].apply(_state_id)

    # Price diff
    df['price_diff'] = df['price'] - df['base_price']

    # Quantities (the small epsilon avoids division by zero)
    df['sold_ratio'] = df['sold_quantity'] / (df['sold_quantity'] + df['available_quantity'] + 1e-5)
    df["availability_change"] = df["initial_quantity"] - df["available_quantity"]
    df["stock_used_ratio"] = df["availability_change"] / (df["initial_quantity"] + 1e-5)

    # Timestamps: NaT never compares equal, so unparseable dates count as "updated".
    created = pd.to_datetime(df["date_created"], errors="coerce")
    updated = pd.to_datetime(df["last_updated"], errors="coerce")
    df["was_updated"] = (updated != created).astype(int)

    # Status
    df["is_active"] = (df["status"] == "active").astype(int)

    return df


def build_dataset(filepath="../data/raw/MLA_100k_checked_v3.jsonlines", test_size=10000):
    """Load the listings file, engineer features and return a train/test split.

    Parameters
    ----------
    filepath : str
        Path to the JSON Lines file with one listing per line.
    test_size : int
        Number of trailing rows held out as the test set. The split is
        positional (file order), not random. If the file has fewer rows than
        `test_size`, every row goes to the test set.

    Returns
    -------
    X_train, y_train, X_test, y_test, label_encoder
        One-hot encoded feature frames, the integer-encoded targets, and the
        LabelEncoder whose `classes_` are ['used', 'new'].

    Raises
    ------
    ValueError
        If `test_size` is negative, or if `condition` contains a label other
        than 'used' / 'new' (raised by LabelEncoder.transform).
    """
    if test_size < 0:
        raise ValueError("test_size must be non-negative")

    df = _add_features(pd.DataFrame(_load_jsonlines(filepath)))

    # Base columns (features plus the target)
    cols_base = [
        "currency_id", "base_price", "price", "listing_type_id",
        "initial_quantity", "sold_quantity", "available_quantity", "condition",
        "seller_freq", "is_free_shipping", "shipping_mode", "local_pick_up",
        "payment_method_group", "has_dragged_visits", "has_good_thumbnail",
        "has_poor_thumbnail", "was_relisted", "picture_group", "title_length_group",
        "title_contains_new", "title_contains_used", "state_id", "price_diff",
        "sold_ratio", "availability_change", "stock_used_ratio", "was_updated",
        "is_active"
        ]

    df_model = df[cols_base].copy()

    # Target
    y = df_model["condition"]

    # Features: everything except the target
    X = df_model.drop(columns=['condition'])

    # One-hot encode the categorical columns. Encoding before the split keeps
    # the train and test frames on the same set of dummy columns.
    X_encoded = pd.get_dummies(X, drop_first=True)

    # Encode the target. classes_ is set by hand instead of calling fit() so
    # the intended encoding is used -> 0, new -> 1 (fit() would sort the
    # labels alphabetically and put 'new' first).
    label_encoder = LabelEncoder()
    label_encoder.classes_ = np.array(['used', 'new'])
    y_encoded = label_encoder.transform(y)

    # Positional split: the last `test_size` rows form the test set.
    split_at = max(len(X_encoded) - test_size, 0)
    X_train = X_encoded.iloc[:split_at]
    X_test = X_encoded.iloc[split_at:]
    y_train = y_encoded[:split_at]
    y_test = y_encoded[split_at:]

    return X_train, y_train, X_test, y_test, label_encoder


In [None]:
# Build the train/test splits together with the label encoder
X_train, y_train, X_test, y_test, label_encoder = build_dataset()

# Report the shape of every split, one per line
print("Dimensiones de los dataset:")
print("\n".join(
    f"{split_name}: {split.shape}"
    for split_name, split in (
        ("X_train", X_train),
        ("y_train", y_train),
        ("X_test", X_test),
        ("y_test", y_test),
    )
))


In [None]:
# Display the one-hot encoded training features returned by build_dataset()
X_train


In [None]:
# Display the integer-encoded training targets returned by build_dataset()
y_train


In [None]:
# Decode the integer-encoded training targets back to their original string labels
y_train_labels = label_encoder.inverse_transform(y_train)

# These are the ground-truth training labels (no model has produced predictions
# at this point), so the heading names them as training labels.
print("Etiquetas de entrenamiento (decodificadas):")
print(y_train_labels[:10])
