In [1]:
import pandas as pd
import numpy as np
from pycaret.classification import *
from sklearn.feature_extraction.text import TfidfVectorizer
from pycaret.classification import setup, compare_models
import pickle

In [2]:
# Cargo datos
file_path = "productos_todas_categorias.csv"
df = pd.read_csv(file_path)

In [3]:
# Cuento categorías antes de filtrar
category_counts = df['category_id'].value_counts()

In [4]:
# Filtro subcategorías con al menos 50 productos
valid_categories = category_counts[category_counts >= 50].index
df = df[df['category_id'].isin(valid_categories)]

In [5]:
# Selecciono solo las columnas necesarias
df = df[['title', 'category_id']]

In [6]:
# Limpio texto
df['title'] = df['title'].str.lower().str.replace(r'[^a-zA-Z0-9 ]', '', regex=True)

In [7]:
# Convierto títulos a vectores TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(df['title']).toarray()

In [8]:
# Imprimo las columnas del vectorizador después del entrenamiento
print("🔹 Columnas en el vectorizador después del entrenamiento:")
print(vectorizer.get_feature_names_out())

🔹 Columnas en el vectorizador después del entrenamiento:
['01' '012' '030' ... 'zsn' 'zte' 'zve10']


In [9]:
# Convierto a DataFrame
X_df = pd.DataFrame(X_tfidf, columns=vectorizer.get_feature_names_out())
X_df['category_id'] = df['category_id'].values  

In [10]:
# imprimo valores de category_id
print(X_df['category_id'].value_counts())

category_id
MCO1196      981
MCO1744      953
MCO1176      798
MCO1442      763
MCO1055      589
            ... 
MCO90075      52
MCO173191     52
MCO180874     52
MCO388859     52
MCO167689     51
Name: count, Length: 107, dtype: int64


In [11]:
# Configuro PyCaret
clf = setup(X_df, target='category_id', session_id=123, train_size=0.8, verbose=False, data_split_stratify=True)

In [12]:
# #Comparo modelos
best_model = compare_models(exclude=['dummy', 'ridge','lr', 'lda','nb', 'svm', 'ada', 'xgboost', 'lightgbm', 'qda','gbc', 'rbfsvm','gpc','mlp','et'])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.8877,0.993,0.8877,0.8941,0.8854,0.8853,0.8854,15.77
dt,Decision Tree Classifier,0.8389,0.9216,0.8389,0.8493,0.8372,0.8355,0.8357,11.852
knn,K Neighbors Classifier,0.6487,0.9137,0.6487,0.8876,0.7103,0.6385,0.6706,9.979


In [13]:
# Imprimo métricas del modelo sin ajustar
print("Métricas del modelo sin ajuste de hiperparámetros:")
metrics_no_tuning = predict_model(best_model)

Métricas del modelo sin ajuste de hiperparámetros:


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.8979,0.9938,0.8979,0.9024,0.8972,0.8957,0.8959


In [14]:
# Finalizo el modelo sin tuning (entrenado con todos los datos)
final_model = finalize_model(best_model)

In [15]:
# Guardo el modelo para futuras predicciones
save_model(final_model, 'modelo_Predicción_Category')

# Guardo el vectorizador entrenado para futuras predicciones
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("Vectorizador TF-IDF guardado correctamente.")

Transformation Pipeline and Model Successfully Saved
✅ Vectorizador TF-IDF guardado correctamente.


In [16]:
# Evaluo el modelo
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…