In [1]:
import psutil
import pandas as pd
import numpy as np
from src.utils.utils import get_base_dir
# Project base directory as returned by the project helper; every data path
# read or written in this notebook is built relative to it.
base_dir = get_base_dir()
# Bare last expression so the notebook displays the resolved path.
base_dir

WindowsPath('C:/Users/lauta/Desktop/Lautaro/maestria_ds/labo3/repo-entrega')

### 1. Carga de datos

In [None]:
# Load the sell-in table from the interim data folder (the file name suggests
# it already contains the engineered features -- confirm with the upstream step).
sell_in_path = base_dir / "data/interim/sell_in_completo_con_fe.feather"
sell_in_features = pd.read_feather(sell_in_path)

### 2. Creación del target - división de bases - posprocesamiento

In [3]:
# Response variable: the "tn" column minus the "venta_t-2" column, so the
# target is a difference rather than the raw "tn" level.
sell_in_features["target"] = sell_in_features["tn"].sub(sell_in_features["venta_t-2"])

In [4]:
# Per-row importance weight for training: log(1 + |tn|).
# Note that rows with tn == 0 therefore get a weight of exactly 0.
tn_magnitude = sell_in_features["tn"].abs()
sell_in_features["peso_entrenamiento"] = np.log1p(tn_magnitude)

In [5]:
# Extra feature columns considered for modelling.
# Names that follow a regular pattern are generated; the resulting list has the
# same 61 names, in the same order, as the original hand-written listing.
new_cols = [
    # "venta_t-<k>" columns
    *(f"venta_t-{lag}" for lag in (2, 3, 6, 9, 12, 24)),
    # "<stat>_<window>m" columns, grouped by window
    *(
        f"{stat}_{window}m"
        for window in (3, 6, 9, 12, 24)
        for stat in ("media", "std", "suma", "max", "min")
    ),
    # change / trend columns
    "pct_change_2_4m", "pct_change_4_7m",
    "tendencia_2_4m", "tendencia_4_6m",
    "venta_año_ant",
    # calendar encodings and flags
    "mes_sin", "mes_cos", "trimestre_sin", "trimestre_cos",
    "es_enero", "es_diciembre", "es_verano", "es_invierno",
    # product / customer aggregates
    "producto_media_historica", "producto_volatilidad", "cliente_media_historica",
    # momentum columns
    "momentum_3m", "momentum_6m", "aceleracion",
    # category / brand comparisons
    "categoria_media", "vs_categoria", "marca_media", "vs_marca",
    # purchase-activity columns
    "meses_sin_venta", "frecuencia_compras_12m",
    # external series
    "VarMensualGral", "VarMensualAlim",
    "IndiceMensualGral", "IndiceMensualAlim", "dolar",
]

In [6]:
# Feature lists used for modelling: categorical identifiers/descriptors, the
# numerical columns (extra features plus period fields), and their union.
categorical_features = [
    "product_id", "customer_id",
    "cat1", "cat2", "cat3",
    "brand", "descripcion",
]
numerical_features = [*new_cols, "mes", "año", "trimestre", "a_predecir"]
model_features = [*numerical_features, *categorical_features]

# Cast every categorical feature to pandas' "category" dtype, one column at a
# time (avoids copying the whole frame at once).
for cat_col in categorical_features:
    categorical_column = sell_in_features[cat_col].astype("category")
    sell_in_features[cat_col] = categorical_column

In [7]:
# Build the train, validation and predict partitions from the "periodo" column:
#   train:      periodo < 201907
#   validation: 201907 <= periodo < 202001
#   predict:    periodo == 202002
# Any other period (e.g. 202001) is left out of all three partitions.
periodo = sell_in_features["periodo"]
df_train = sell_in_features.loc[periodo.lt(201907)].copy()
df_validation = sell_in_features.loc[periodo.ge(201907) & periodo.lt(202001)].copy()
df_predict = sell_in_features.loc[periodo.eq(202002)].copy()
# Drop the full frame, and the column handle taken from it, so the memory can
# be reclaimed before modelling.
del sell_in_features, periodo

In [8]:
# Report the RAM currently available to the process, in GiB.
ram_disponible_gb = psutil.virtual_memory().available / (1024**3)
print(f"Memoria RAM disponible: {ram_disponible_gb:.2f} GB")

Memoria RAM disponible: 17.47 GB


In [9]:
# (rows, columns) of each partition, in the order predict, train, validation.
tuple(part.shape for part in (df_predict, df_train, df_validation))

((465660, 79), (15599943, 79), (4277589, 79))

In [10]:
# Number of rows with a missing target, in the order train, validation, predict.
tuple(part["target"].isna().sum() for part in (df_train, df_validation, df_predict))

(np.int64(1319082), np.int64(149269), np.int64(465660))

In [11]:
# Remove rows whose target is NaN from train and validation (df_predict is left
# untouched). Boolean-mask selection followed by reset_index(drop=True) already
# yields a new, independent frame, so the extra .copy() the cell used to perform
# only duplicated these multi-million-row frames in memory.
df_train = df_train[df_train["target"].notna()].reset_index(drop=True)
df_validation = df_validation[df_validation["target"].notna()].reset_index(drop=True)

In [12]:
# Keep only training rows strictly after July 2017 (periodo > 201707, i.e. from
# 201708 onwards). reset_index(drop=True) already returns a new frame, so no
# intermediate .copy() is needed; dropping it avoids one full duplicate of the
# training data in memory.
df_train = df_train[df_train["periodo"] > 201707].reset_index(drop=True)

In [13]:
# (rows, columns) of each partition after the clean-up steps, in the order
# predict, train, validation.
tuple(part.shape for part in (df_predict, df_train, df_validation))

((465660, 79), (12516572, 79), (4128320, 79))

### 3. Check caso

In [14]:
# Spot check: history of product 20001 for customer 10001 in the training set,
# ordered chronologically.
perfil = df_train["product_id"].eq(20001) & df_train["customer_id"].eq(10001)
df_train.loc[perfil].sort_values("periodo")

Unnamed: 0,periodo,product_id,customer_id,tn,fecha,cat1,cat2,cat3,brand,sku_size,...,vs_marca,meses_sin_venta,frecuencia_compras_12m,VarMensualGral,VarMensualAlim,IndiceMensualGral,IndiceMensualAlim,dolar,target,peso_entrenamiento
0,201708,20001,10001,43.3393,2017-08-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,320.014496,7.0,6.0,1.4,2.1,115.3819,114.3467,17.718636,-84.70862,3.791871
1,201709,20001,10001,289.35024,2017-09-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,330.778138,8.0,7.0,1.9,1.8,117.5719,116.4048,17.549524,188.14313,5.671088
2,201710,20001,10001,222.11389,2017-10-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,146.786744,9.0,8.0,1.5,1.5,119.3528,118.1646,17.763333,178.77459,5.407682
3,201711,20001,10001,111.54944,2017-11-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,859.94549,10.0,9.0,1.4,1.2,120.994,119.5688,17.787,-177.8008,4.723393
4,201712,20001,10001,131.2715,2017-12-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,619.444203,11.0,10.0,3.1,0.7,124.7956,120.36,18.045789,-90.84239,4.884857
5,201801,20001,10001,49.61857,2018-01-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,338.72253,11.0,11.0,1.8,2.1,126.9887,122.8752,19.357273,-61.93087,3.924319
6,201802,20001,10001,88.44065,2018-02-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,528.466037,11.0,12.0,2.4,2.2,130.0606,125.5566,20.150556,-42.83085,4.493575
7,201803,20001,10001,214.72336,2018-03-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,219.667734,11.0,12.0,2.3,2.3,133.1054,128.469,20.558,165.10479,5.373997
8,201804,20001,10001,132.83419,2018-04-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,382.32814,11.0,12.0,2.7,1.2,136.7512,129.9559,20.537368,44.39354,4.896602
9,201805,20001,10001,165.73507,2018-05-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,565.000193,11.0,12.0,2.1,3.3,139.5893,134.2019,24.234286,-48.98829,5.116406


In [15]:
# Spot check: the same product/customer pair (20001 / 10001) in the validation
# set, ordered chronologically.
perfil = df_validation["product_id"].eq(20001) & df_validation["customer_id"].eq(10001)
df_validation.loc[perfil].sort_values("periodo")

Unnamed: 0,periodo,product_id,customer_id,tn,fecha,cat1,cat2,cat3,brand,sku_size,...,vs_marca,meses_sin_venta,frecuencia_compras_12m,VarMensualGral,VarMensualAlim,IndiceMensualGral,IndiceMensualAlim,dolar,target,peso_entrenamiento
0,201907,20001,10001,144.78714,2019-07-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,2841.417597,11.0,12.0,2.2,2.3,230.494,232.0295,43.751429,-295.11933,4.982148
1,201908,20001,10001,33.63991,2019-08-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,662.989654,11.0,12.0,4.0,4.5,239.6077,242.5051,54.641429,-32.28445,3.545006
2,201909,20001,10001,109.05244,2019-09-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,1042.743891,11.0,12.0,5.9,5.7,253.7102,256.3757,58.79,-35.7347,4.700957
3,201910,20001,10001,176.0298,2019-10-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,337.12431,11.0,12.0,3.3,2.5,262.0661,262.6603,61.399091,142.38989,5.176318
4,201911,20001,10001,236.65556,2019-11-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,856.082875,11.0,12.0,4.3,5.3,273.2158,276.6339,63.011579,127.60312,5.470822
5,201912,20001,10001,180.21938,2019-12-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,1728.614293,11.0,12.0,3.7,3.1,283.4442,285.297,63.009474,4.18958,5.199708


In [16]:
# Spot check: the same product/customer pair (20001 / 10001) in the predict
# set, ordered chronologically.
perfil = df_predict["product_id"].eq(20001) & df_predict["customer_id"].eq(10001)
df_predict.loc[perfil].sort_values("periodo")

Unnamed: 0,periodo,product_id,customer_id,tn,fecha,cat1,cat2,cat3,brand,sku_size,...,vs_marca,meses_sin_venta,frecuencia_compras_12m,VarMensualGral,VarMensualAlim,IndiceMensualGral,IndiceMensualAlim,dolar,target,peso_entrenamiento
37,202002,20001,10001,,2020-02-01,HC,ROPA LAVADO,Liquido,ARIEL,3000.0,...,1901.95275,11.0,12.0,2.0,2.7,295.666,306.6182,63.056,,


In [18]:
# Report the RAM currently available to the process, in GiB.
ram_disponible_gb = psutil.virtual_memory().available / (1024**3)
print(f"Memoria RAM disponible: {ram_disponible_gb:.2f} GB")

Memoria RAM disponible: 15.44 GB


### 4. Guardar particiones para modelado

In [19]:
# Persist the three partitions for the modelling notebooks.
processed_dir = base_dir / "data/processed"
# Create the output folder if it is missing (e.g. on a fresh checkout);
# to_feather does not create parent directories itself.
processed_dir.mkdir(parents=True, exist_ok=True)

df_train.to_feather(processed_dir / "df_train.feather")
df_validation.to_feather(processed_dir / "df_validation.feather")
# df_predict still carries the row labels of the frame it was sliced from,
# whereas train/validation were re-indexed earlier. Feather is documented as
# requiring a default index, so store it with a fresh 0..n-1 index as well.
df_predict.reset_index(drop=True).to_feather(processed_dir / "df_predict.feather")