# _**Model Training**_


We import the necessary libraries for the model

In [20]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [21]:
Model = pd.read_json('../data/MLA_100k_Cleaned.jsonlines', lines = True)

We divide the predictor variables into (x) and the objective variable (y)

In [22]:
# Dividir los datos en variables predictoras (X) y la variable objetivo (y)
X = Model[['warranty',
           'seller_contact',
           'base_price',
           'price',
           'buying_mode',
           'last_updated',
           'accepts_mercadopago',
           'original_price',
           'date_created',
           'stop_time',
           'status',
           'initial_quantity',
           'start_time',
           'sold_quantity',
           'available_quantity',
           'local_pickup',
           'shipping_free',
           'shipping_mode',
           'num_shipping_tags',
           'non_mercado_pago_payment_methods',
           'pixeles',
           'pixeles_max']]  

y = Model['condition'].apply(lambda x: 1 if x == 'new' else 0)  # 1 = new, 0 = used

we do the 70/30 split for training and testing

In [23]:

# Convertir columnas categóricas a tipo numérico
X = pd.get_dummies(X, columns=['buying_mode', 'status', 'shipping_mode'], drop_first=True)

# Manejar valores NaN
X.fillna(0, inplace=True)  # Puedes usar 0, la media, o eliminar filas con NaN

# Dividir en conjuntos de entrenamiento y prueba (70/30)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)



We define the models that best suit this case

In [24]:
# Definir los modelos
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}


We train the model and evaluate to see which was the best model

In [25]:
# Entrenar y evaluar los modelos
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    
    print(f'--- Resultados para {model_name} ---')
    print(f'Accuracy: {accuracy}')
    print(f'AUC-ROC: {auc_roc}')
    print('Classification Report:')
    print(classification_report(y_test, y_pred))

--- Resultados para Random Forest ---
Accuracy: 0.8194666666666667
AUC-ROC: 0.8943187735916869
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.85      0.81     13838
           1       0.86      0.79      0.83     16162

    accuracy                           0.82     30000
   macro avg       0.82      0.82      0.82     30000
weighted avg       0.82      0.82      0.82     30000

--- Resultados para Gradient Boosting ---
Accuracy: 0.8160333333333334
AUC-ROC: 0.8872172590253128
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81     13838
           1       0.88      0.77      0.82     16162

    accuracy                           0.82     30000
   macro avg       0.82      0.82      0.82     30000
weighted avg       0.82      0.82      0.82     30000

--- Resultados para K-Nearest Neighbors ---
Accuracy: 0.7154
AUC-ROC: 0.7771098887315575
Classification Re

Parameters: { "use_label_encoder" } are not used.



--- Resultados para XGBoost ---
Accuracy: 0.8269666666666666
AUC-ROC: 0.8997035391355401
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.82     13838
           1       0.88      0.78      0.83     16162

    accuracy                           0.83     30000
   macro avg       0.83      0.83      0.83     30000
weighted avg       0.83      0.83      0.83     30000



In [29]:
from xgboost import XGBClassifier
import pickle  


# Definir el modelo XGBoost
xgboost_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Entrenar el modelo
xgboost_model.fit(X_train, y_train)

# Evaluar el modelo
y_pred = xgboost_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
auc_roc = roc_auc_score(y_test, xgboost_model.predict_proba(X_test)[:, 1])

print('--- Resultados para XGBoost ---')
print(f'Accuracy: {accuracy}')
print(f'AUC-ROC: {auc_roc}')
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Guardar el modelo entrenado en un archivo .pkl
with open('../model/modelo_xgboost.pkl', 'wb') as f:
    pickle.dump(xgboost_model, f)

print("Modelo XGBoost guardado exitosamente en 'modelo_xgboost.pkl'.")


Parameters: { "use_label_encoder" } are not used.



--- Resultados para XGBoost ---
Accuracy: 0.8269666666666666
AUC-ROC: 0.8997035391355401
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.88      0.82     13838
           1       0.88      0.78      0.83     16162

    accuracy                           0.83     30000
   macro avg       0.83      0.83      0.83     30000
weighted avg       0.83      0.83      0.83     30000

Modelo XGBoost guardado exitosamente en 'modelo_xgboost.pkl'.
