In [1]:
import pandas as pd
import numpy as np
import json
import pprint
from pymongo import MongoClient

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from stop_words import get_stop_words
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import plotly.graph_objects as go
from sklearn.preprocessing import OneHotEncoder

import nltk
from nltk.stem import SnowballStemmer

# Descargar recursos necesarios de nltk
nltk.download('punkt')



[nltk_data] Downloading package punkt to /Users/javi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
y_train = pd.read_csv("dataset/df_train.csv.gzip")

ParserError: Error tokenizing data. C error: Expected 1 fields in line 4, saw 3


In [None]:
conn = MongoClient()
db = conn.ml

In [None]:
X_train_items = db.X_train
y_train_items = db.y_train
X_test_items = db.X_test
y_test_items = db.y_test


In [None]:
# You can safely assume that `build_dataset` is correctly implemented
def build_dataset():
    data = [json.loads(x) for x in open("MLA_100k.jsonlines")]
    target = lambda x: x.get("condition")
    N = -10000
    X_train = data[:N]
    X_test = data[N:]
    y_train = [target(x) for x in X_train]
    y_test = [target(x) for x in X_test]
    for x in X_test:
        del x["condition"]
    return X_train, y_train, X_test, y_test

In [None]:
X_train_raw, y_train_raw, X_test_raw, y_test_raw  = build_dataset()

In [None]:
X_train_items.insert_many(X_train_raw)
X_test_items.insert_many(X_test_raw)

In [None]:
X_train_collect = X_train_items.find()
X_test_collect = X_test_items.find()

In [None]:
X_train = pd.DataFrame(X_train_collect)
X_test = pd.DataFrame(X_test_collect)

In [None]:
X_train.pop("_id")
X_test.pop("_id")

In [None]:
y_train = pd.DataFrame({"is_new":y_train_raw})
y_test = pd.DataFrame({"is_new":y_test_raw})

In [None]:
df_train = pd.concat([X_train,y_train], axis = 1)
df_test = pd.concat([X_test,y_test], axis = 1)

In [None]:
columns = ["id","title","date_created","base_price","price","category_id","tags","attributes","variations",
           "pictures","seller_id","seller_country","seller_province","seller_city","seller_loyalty","buying_mode",
           "shipping_mode","shipping_admits_pic","shipping_is_free","status","sub_status","warranty",
           "is_new","initial_quantity","sold_quantity","available_quantity"] 

columns = ["id","title","date_created","base_price","price","category_id","tags","attributes","variations",
           "pictures","seller_id","seller_country","seller_province","seller_city","buying_mode",
           "shipping_mode","shipping_admits_pic","shipping_is_free","status","sub_status","warranty",
           "initial_quantity","sold_quantity","available_quantity"] 
#columns = ["date_created","base_price","price","category_id","tags","attributes","variations",
#           "buying_mode","status","sub_status","warranty","initial_quantity","sold_quantity","available_quantity"] 

In [None]:
X_train["shipping_admits_pic"] = X_train["shipping"].apply(lambda x : x.get("local_pick_up"))
X_test["shipping_admits_pic"] = X_test["shipping"].apply(lambda x : x.get("local_pick_up"))

In [None]:
X_train["shipping_admits_pic"].unique()

In [None]:
X_train["shipping_mode"] = X_train["shipping"].apply(lambda x : x.get("mode"))
X_test["shipping_mode"] = X_test["shipping"].apply(lambda x : x.get("mode"))

In [None]:
X_train["shipping_mode"].unique()

In [None]:
X_train["shipping_is_free"] = X_train["shipping"].apply(lambda x : x.get("free_shipping"))
X_test["shipping_is_free"] = X_test["shipping"].apply(lambda x : x.get("free_shipping"))

In [None]:
X_train["shipping_is_free"].unique()

In [None]:
X_train["seller_city"] = X_train["seller_address"].apply(lambda x : x.get("city").get("name") )
X_test["seller_city"] = X_test["seller_address"].apply(lambda x : x.get("city").get("name") )

In [None]:
X_train["seller_city"].unique()

In [None]:
X_train["seller_province"] = X_train["seller_address"].apply(lambda x : x.get("state").get("name") )
X_test["seller_province"] = X_test["seller_address"].apply(lambda x : x.get("state").get("name") )

In [None]:
X_train["seller_province"].unique()

In [None]:
# no se pide la ciudad en el dataset , pero creo que es relevante para tener contexto

X_train["seller_country"] = X_train["seller_address"].apply(lambda x : x.get("country").get("name"))
X_test["seller_country"] = X_test["seller_address"].apply(lambda x : x.get("country").get("name"))

In [None]:
X_train["seller_country"].unique()

In [None]:
X_train = X_train[columns]
X_test = X_test[columns]

In [None]:
X_train["title"] = X_train["title"].str.lower()
X_train["warranty"] = X_train["warranty"].str.lower()

X_test["title"] = X_test["title"].str.lower()
X_test["warranty"] = X_test["warranty"].str.lower()

In [None]:
X_train["warranty"] = np.where(X_train["warranty"].notnull(), X_train["warranty"], "")
X_test["warranty"] = np.where(X_test["warranty"].notnull(), X_test["warranty"], "")

In [None]:
X_test["warranty"]

In [None]:
# Combinar las descripciones de los dos campos en un solo campo
X_train['descripcion_combinada'] = X_train["title"] + ' ' + X_train["warranty"]
X_test['descripcion_combinada'] = X_test["title"] + ' ' + X_test["warranty"]

In [None]:
y_train.value_counts(normalize = True)

In [None]:
df_train["date_created"] = pd.to_datetime(df_train["date_created"])

In [None]:
df_train["date_created"].min(), df_train["date_created"].max()

In [None]:
precio_prom_semanal = df_train.groupby(["is_new",pd.Grouper(key = 'date_created', freq = 'W')]).price.mean().fillna(0).reset_index()

In [None]:


productos_nuevos = precio_prom_semanal[precio_prom_semanal.is_new == "new"]
productos_usados = precio_prom_semanal[precio_prom_semanal.is_new == "used"]

# Create traces
fig = go.Figure()
fig.add_trace(go.Scatter(x=productos_nuevos["date_created"], y=productos_nuevos["price"],
                    mode='lines',
                    name='nuevos'))

fig.add_trace(go.Scatter(x=productos_usados["date_created"], y=productos_usados["price"],
                    mode='lines',
                    name='usados'))

#fig.add_trace(go.Scatter(x=df_grouped["FECHA_VENTA"], y=df_grouped["TICKET_PROMEDIO"],
#                    mode='lines',
#                    name='TICKET_PROMEDIO'))




fig.update_layout(
    title_text= "PRECIO PROMEDIO SEMANAL", # title of plot
    xaxis_tickfont_size=8,
    xaxis_title_text='FECHA', # xaxis label
    yaxis_title_text='PRECIO', # yaxis label

) 



fig.show()

In [None]:
#precio promedio por categoria de productos nuevos y usados
precio_by_category = df_train.groupby(["is_new","category_id"]).price.mean().reset_index()
#precio_by_category.pivot_table()

In [None]:
precio_by_category.is_new.value_counts()

In [None]:
precio_cat = pd.pivot_table(precio_by_category, values='price',index=['category_id'], columns=['is_new'])
precio_cat

In [None]:
precio_cat[(precio_cat.new.notnull()) & (precio_cat.used.notnull())]

In [None]:
X_train["category_id"].nunique()

In [None]:
#hipotesis
#la descripcion debiese reflejar 

In [None]:
# Crear una instancia del vectorizador Bag of Words
stop_words_es = get_stop_words('spanish')
vectorizer = CountVectorizer(stop_words=stop_words_es)

In [None]:
# Agregar stemming a los vectores numéricos
stemmer = SnowballStemmer('spanish')
X_train_stemmed = [[stemmer.stem(word) for word in nltk.word_tokenize(document)] for document in X_train]
X_test_stemmed = [[stemmer.stem(word) for word in nltk.word_tokenize(document)] for document in X_test]

# Volver a transformar los datos en vectores numéricos después del stemming
X_train_vectorized = vectorizer.fit_transform([' '.join(doc) for doc in X_train_stemmed])
X_test_vectorized = vectorizer.transform([' '.join(doc) for doc in X_test_stemmed])

In [None]:
# Transformar las descripciones en vectores numéricos

# Agregar la variable cantidad_disponible al conjunto de características vectorizadas

X_train_vectorized = vectorizer.fit_transform(X_train["descripcion_combinada"])
X_test_vectorized = vectorizer.transform(X_test["descripcion_combinada"])


#from scipy.sparse import hstack
#X_train_vectorized = hstack((X_train_vectorized, X_train['available_quantity'].values.reshape(-1, 1)))
#X_test_vectorized = hstack((X_test_vectorized, X_test['available_quantity'].values.reshape(-1, 1)))

In [None]:
# Codificar las categorías utilizando la codificación one-hot

encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
categories_encoded_train = encoder.fit_transform(X_train["category_id"].values.reshape(-1, 1))
categories_encoded_test = encoder.transform(X_test["category_id"].values.reshape(-1, 1))

In [None]:
X_train_vectorized = hstack((X_train_vectorized, categories_encoded_train))
X_test_vectorized = hstack((X_test_vectorized, categories_encoded_test))

In [None]:
# Crear y entrenar el modelo de clasificación (Naive Bayes)
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

# Realizar predicciones en el conjunto de prueba
y_pred = classifier.predict(X_test_vectorized)

# Evaluar el rendimiento del modelo
accuracy = accuracy_score(y_test, y_pred)
print(f'Precisión del modelo: {accuracy:.2f}')

report = classification_report(y_test, y_pred)
print('Informe de clasificación:')
print(report)


In [None]:
# Generar la matriz de confusión
#conf_matrix = confusion_matrix(y_test, y_pred)
#print('Matriz de Confusión:')
#print(conf_matrix)


In [None]:
out = pd.DataFrame({"y":y_test_raw, "y_pred": y_pred})
conf_matrix = pd.crosstab(out.y, out.y_pred, rownames=['Etiqueta Real'], colnames=['Predicción'], margins=True)
print('Matriz de Confusión:')
print(conf_matrix)