# Proceso a seguir

1. Importar datos de entrenamiento
1. Aumentar para incluir negativos
3. Split train val
4. Preprocesamiento de train
5. Entrenar modelo con datos de train
5. Predicciones con val
6. Evaluación

In [2]:
import pandas as pd
import os

train_file = "../../datasets/customer_purchases/customer_purchases_train.csv"
train_file = os.path.abspath(train_file)
train_df = pd.read_csv(train_file)

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7289 entries, 0 to 7288
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   purchase_id             7289 non-null   int64  
 1   customer_id             7289 non-null   object 
 2   customer_date_of_birth  7289 non-null   object 
 3   customer_gender         5738 non-null   object 
 4   customer_signup_date    7289 non-null   object 
 5   item_id                 7289 non-null   object 
 6   item_title              7289 non-null   object 
 7   item_category           7289 non-null   object 
 8   item_price              7289 non-null   float64
 9   item_img_filename       7289 non-null   object 
 10  item_avg_rating         7244 non-null   float64
 11  item_num_ratings        7289 non-null   int64  
 12  item_release_date       7289 non-null   object 
 13  purchase_timestamp      7289 non-null   object 
 14  customer_item_views     7289 non-null   

In [20]:
from negative_generation import *


negatives = gen_smart_negatives(train_df)

customer_columns = [
    "customer_date_of_birth",
    "customer_gender",
    "customer_signup_date",
]

item_columns = [
    "item_title",
    "item_category",
    "item_price",
    "item_img_filename",
    "item_avg_rating",
    "item_num_ratings",
    "item_release_date",
]

purcharse_columns = [
    "purchase_id",
    "purchase_timestamp",
    "customer_item_views",
    "purchase_item_rating",
    "purchase_device",
]

# Return
# Dataframe con labels 0 y uno y las mismas columnas que train_df
# concatenar vertical los zeros
# shuffle


# Crear índices para búsqueda rápida
customer_info = (
    train_df[["customer_id"] + customer_columns]
    .drop_duplicates(subset=["customer_id"])
    .set_index("customer_id")
    .to_dict(orient="index")
)
item_info = (
    train_df[["item_id"] + item_columns]
    .drop_duplicates(subset=["item_id"])
    .set_index("item_id")
    .to_dict(orient="index")
)
# Enriquecer los negativos con columnas de cliente e ítem
enriched_negatives = []
for _, row in negatives.iterrows():
    cust_data = customer_info.get(row["customer_id"], {})
    item_data = item_info.get(row["item_id"], {})
    enriched_negatives.append({
        "customer_id": row["customer_id"],
        "item_id": row["item_id"],
        **cust_data,
        **item_data,
        "purchase_id": np.nan,
        "purchase_timestamp": np.nan,
        "customer_item_views": np.nan,
        "purchase_item_rating": np.nan,
        "purchase_device": np.nan,
        "label": 0,
    })
neg_df = pd.DataFrame(enriched_negatives)

# Los positivos ya tienen toda la info
pos_df = train_df.copy()
# Concatenar positivos y negativos
final_df = pd.concat([pos_df, neg_df], ignore_index=True)
# Reordenar columnas igual que train_df + label
cols = list(train_df.columns)
final_df = final_df[cols]
final_df = final_df.sample(frac=1).reset_index(drop=True)

final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14578 entries, 0 to 14577
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   purchase_id             7289 non-null   float64
 1   customer_id             14578 non-null  object 
 2   customer_date_of_birth  14578 non-null  object 
 3   customer_gender         11476 non-null  object 
 4   customer_signup_date    14578 non-null  object 
 5   item_id                 14578 non-null  object 
 6   item_title              14578 non-null  object 
 7   item_category           14578 non-null  object 
 8   item_price              14578 non-null  float64
 9   item_img_filename       14578 non-null  object 
 10  item_avg_rating         14500 non-null  float64
 11  item_num_ratings        14578 non-null  int64  
 12  item_release_date       14578 non-null  object 
 13  purchase_timestamp      7289 non-null   object 
 14  customer_item_views     7289 non-null 

In [None]:
from sklearn.model_selection import train_test_split

# Suppose you have your features and labels:
X = complete_df.drop(columns="label")
y = complete_df["label"]

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, 
    test_size=0.2,        
    random_state=42,      # for reproducibility
    stratify=y            
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_val.shape}")

In [None]:
from data_processing import *

# se extraen los atributos del cliente, calculan columnas derivadas y transforman
# primero extraer los atributos del cliente solo de los que son columnas de compra

processed_train = preprocess(X_train, training=True) # se te olvido excluir el label del preprocesamiento
processed_train.info()

save_df(processed_train, "processed_train.csv")

In [None]:
from model import *
#from training import *

# entrenar diferentes modelos