In [1]:
import sys
import os

# Añadir la raíz del proyecto (la carpeta donde está tu src/)
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [2]:
from src.feature import crear_target_prod_cust, crear_features_temporales, getYearMonth
import src.config as conf


In [3]:
import pandas as pd
import numpy as np

# 1) Carga de datos
sell_in       = pd.read_csv('../data/sell-in.txt', sep='\t')
prod_vigentes = pd.read_csv('../data/product_id_apredecir201912.txt', sep='\t')  # contiene al menos product_id
productos = pd.read_csv('../data/tb_productos_05262025.txt', sep='\t')  # contiene al menos product_id


In [4]:
# 2) Parseo de 'periodo' a datetime
sell_in['periodo'] = pd.to_datetime(
    sell_in['periodo'].astype(str) + '01',
    format='%Y%m%d'
)

In [5]:
sell_in.shape

(2945818, 7)

In [6]:
# 3) Agregar cust_request_qty, cust_request_tn y tn por (periodo, customer_id, product_id)

sell_in_agg = (
    sell_in
    .groupby(['periodo','customer_id','product_id','plan_precios_cuidados'], as_index=False)
    .agg({
        'tn': 'sum',
        'cust_request_qty': 'sum',
        'cust_request_tn': 'sum'
    })
)

In [7]:
sell_in_agg.shape

(2945818, 7)

In [8]:
sell_in_agg

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,tn,cust_request_qty,cust_request_tn
0,2017-01-01,10001,20001,0,99.43861,11,99.43861
1,2017-01-01,10001,20002,0,87.64856,17,90.13504
2,2017-01-01,10001,20003,0,100.21284,25,100.21284
3,2017-01-01,10001,20004,0,21.73954,13,21.73954
4,2017-01-01,10001,20006,0,29.17196,18,31.36770
...,...,...,...,...,...,...,...
2945813,2019-12-01,10606,20303,0,0.01298,1,0.01298
2945814,2019-12-01,10606,20563,0,0.00442,1,0.00442
2945815,2019-12-01,10606,20962,0,0.00655,1,0.00655
2945816,2019-12-01,10606,20975,0,0.00655,1,0.00655


In [9]:
# 4) Extraer lista única de periodos
unique_periodos = sell_in_agg['periodo'].drop_duplicates()
unique_customers = sell_in_agg['customer_id'].drop_duplicates()
unique_products  = prod_vigentes['product_id'].drop_duplicates()


In [10]:
# 6) Crear todas las combinaciones (periodo × 10001 × 20001)
idx = pd.MultiIndex.from_product(
    [unique_periodos, unique_customers, unique_products],
    names=['periodo','customer_id','product_id']
)
complete_test = idx.to_frame(index=False)

In [11]:
complete_test

Unnamed: 0,periodo,customer_id,product_id
0,2017-01-01,10001,20001
1,2017-01-01,10001,20002
2,2017-01-01,10001,20003
3,2017-01-01,10001,20004
4,2017-01-01,10001,20005
...,...,...,...
16763755,2019-12-01,10582,21263
16763756,2019-12-01,10582,21265
16763757,2019-12-01,10582,21266
16763758,2019-12-01,10582,21267


In [12]:
# 7) Hacer merge con las ventas reales de (10001, 20001) y rellenar NaN → 0

#     Hacemos el merge con complete_test. Después rellenamos NaN->0 en tn.
sell_in_completed = (
    complete_test
    .merge(
        sell_in_agg,  # traemos solo periodo + tn
        on= ['periodo','customer_id', 'product_id'],
        how='left'
    )
    .fillna({
        'tn': 0,
        'cust_request_qty': 0,
        'cust_request_tn': 0,
        'plan_precios_cuidados': 0
    })
)

In [13]:
sell_in_completed


Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,tn,cust_request_qty,cust_request_tn
0,2017-01-01,10001,20001,0.0,99.43861,11.0,99.43861
1,2017-01-01,10001,20002,0.0,87.64856,17.0,90.13504
2,2017-01-01,10001,20003,0.0,100.21284,25.0,100.21284
3,2017-01-01,10001,20004,0.0,21.73954,13.0,21.73954
4,2017-01-01,10001,20005,0.0,0.00000,0.0,0.00000
...,...,...,...,...,...,...,...
16763755,2019-12-01,10582,21263,0.0,0.00000,0.0,0.00000
16763756,2019-12-01,10582,21265,0.0,0.00000,0.0,0.00000
16763757,2019-12-01,10582,21266,0.0,0.00000,0.0,0.00000
16763758,2019-12-01,10582,21267,0.0,0.00000,0.0,0.00000


In [14]:
sell_in_merged_productos = sell_in_completed.merge(productos, on='product_id', how='left')

In [15]:
print(sell_in_merged_productos)

            periodo  customer_id  product_id  plan_precios_cuidados  \
0        2017-01-01        10001       20001                    0.0   
1        2017-01-01        10001       20002                    0.0   
2        2017-01-01        10001       20003                    0.0   
3        2017-01-01        10001       20004                    0.0   
4        2017-01-01        10001       20005                    0.0   
...             ...          ...         ...                    ...   
16763755 2019-12-01        10582       21263                    0.0   
16763756 2019-12-01        10582       21265                    0.0   
16763757 2019-12-01        10582       21266                    0.0   
16763758 2019-12-01        10582       21267                    0.0   
16763759 2019-12-01        10582       21276                    0.0   

                 tn  cust_request_qty  cust_request_tn   cat1         cat2  \
0          99.43861              11.0         99.43861     HC  ROPA L

In [16]:
#contar los registros donde el tn es cero
print("Número de registros con tn = 0:", len(sell_in_merged_productos[sell_in_merged_productos['tn'] == 0]))

Número de registros con tn = 0: 14470279


In [17]:
#9) Guardar a CSV de prueba
sell_in_merged_productos.to_csv('../data/ventas_complete.csv', index=False)
#print("CSV generado: ventas_complete.csv")

In [18]:
sell_in_merged_productos.info()  # Mostrar información del DataFrame final

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16763760 entries, 0 to 16763759
Data columns (total 13 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   periodo                datetime64[ns]
 1   customer_id            int64         
 2   product_id             int64         
 3   plan_precios_cuidados  float64       
 4   tn                     float64       
 5   cust_request_qty       float64       
 6   cust_request_tn        float64       
 7   cat1                   object        
 8   cat2                   object        
 9   cat3                   object        
 10  brand                  object        
 11  sku_size               int64         
 12  descripcion            object        
dtypes: datetime64[ns](1), float64(4), int64(3), object(5)
memory usage: 1.6+ GB


In [19]:
# de sell_in_merged_productos sacar los 15 productos con mayor tn
top_15_products = sell_in_merged_productos.groupby('product_id')['tn'].sum().nlargest(15).reset_index()
top_15_products


Unnamed: 0,product_id,tn
0,20001,50340.39558
1,20002,36337.25439
2,20003,32004.15274
3,20004,24178.15379
4,20005,23191.21852
5,20007,22018.45234
6,20006,21088.76007
7,20008,19948.29352
8,20010,18671.07918
9,20012,17813.59935


In [20]:
sell_in_featured = sell_in_merged_productos.copy()

In [21]:
# agregar un campo llamado productos_estrella que sea 1 si el producto está en top_15_products y 0
sell_in_featured['productos_estrella'] = sell_in_featured['product_id'].isin(top_15_products['product_id']).astype(int)
sell_in_featured.head()  # Mostrar las primeras filas del DataFrame con el nuevo campo

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,tn,cust_request_qty,cust_request_tn,cat1,cat2,cat3,brand,sku_size,descripcion,productos_estrella
0,2017-01-01,10001,20001,0.0,99.43861,11.0,99.43861,HC,ROPA LAVADO,Liquido,ARIEL,3000,genoma,1
1,2017-01-01,10001,20002,0.0,87.64856,17.0,90.13504,HC,ROPA LAVADO,Liquido,LIMPIEX,3000,Maquina 1er lavado,1
2,2017-01-01,10001,20003,0.0,100.21284,25.0,100.21284,FOODS,ADEREZOS,Mayonesa,NATURA,475,Regular sin TACC,1
3,2017-01-01,10001,20004,0.0,21.73954,13.0,21.73954,FOODS,ADEREZOS,Mayonesa,NATURA,240,Regular sin TACC,1
4,2017-01-01,10001,20005,0.0,0.0,0.0,0.0,FOODS,ADEREZOS,Mayonesa,NATURA,120,Regular sin TACC,1


In [22]:
# agregar un campo mes que sea el mes del periodo
sell_in_featured['mes'] = sell_in_featured['periodo'].dt.month
sell_in_featured.head()  # Mostrar las primeras filas del DataFrame con el nuevo campo mes

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,tn,cust_request_qty,cust_request_tn,cat1,cat2,cat3,brand,sku_size,descripcion,productos_estrella,mes
0,2017-01-01,10001,20001,0.0,99.43861,11.0,99.43861,HC,ROPA LAVADO,Liquido,ARIEL,3000,genoma,1,1
1,2017-01-01,10001,20002,0.0,87.64856,17.0,90.13504,HC,ROPA LAVADO,Liquido,LIMPIEX,3000,Maquina 1er lavado,1,1
2,2017-01-01,10001,20003,0.0,100.21284,25.0,100.21284,FOODS,ADEREZOS,Mayonesa,NATURA,475,Regular sin TACC,1,1
3,2017-01-01,10001,20004,0.0,21.73954,13.0,21.73954,FOODS,ADEREZOS,Mayonesa,NATURA,240,Regular sin TACC,1,1
4,2017-01-01,10001,20005,0.0,0.0,0.0,0.0,FOODS,ADEREZOS,Mayonesa,NATURA,120,Regular sin TACC,1,1


In [23]:
# necesito un campo catastrofe que sea 1 para el periodo 2019-08-01 y 0 para el resto
sell_in_featured['catastrofe'] = (sell_in_featured['periodo'] == '2019-08-01').astype(int)
sell_in_featured.head()  # Mostrar las primeras filas del DataFrame con el nuevo campo catastrofe

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,tn,cust_request_qty,cust_request_tn,cat1,cat2,cat3,brand,sku_size,descripcion,productos_estrella,mes,catastrofe
0,2017-01-01,10001,20001,0.0,99.43861,11.0,99.43861,HC,ROPA LAVADO,Liquido,ARIEL,3000,genoma,1,1,0
1,2017-01-01,10001,20002,0.0,87.64856,17.0,90.13504,HC,ROPA LAVADO,Liquido,LIMPIEX,3000,Maquina 1er lavado,1,1,0
2,2017-01-01,10001,20003,0.0,100.21284,25.0,100.21284,FOODS,ADEREZOS,Mayonesa,NATURA,475,Regular sin TACC,1,1,0
3,2017-01-01,10001,20004,0.0,21.73954,13.0,21.73954,FOODS,ADEREZOS,Mayonesa,NATURA,240,Regular sin TACC,1,1,0
4,2017-01-01,10001,20005,0.0,0.0,0.0,0.0,FOODS,ADEREZOS,Mayonesa,NATURA,120,Regular sin TACC,1,1,0


In [24]:
# como se cuantos 1 hay en catastrofe
print("Número de catástrofes (periodo 2019-08-01):", sell_in_featured['catastrofe'].sum())

Número de catástrofes (periodo 2019-08-01): 465660


In [25]:
# generar un campo cliente_estrella con los 15 clientes que mas tn tienen
top_15_customers = sell_in_featured.groupby('customer_id')['tn'].sum().nlargest(15).reset_index()
top_15_customers

Unnamed: 0,customer_id,tn
0,10001,91061.3951
1,10002,66447.48495
2,10003,56972.90867
3,10004,53417.40432
4,10005,45395.67421
5,10006,43193.9767
6,10007,36618.78124
7,10008,36162.11362
8,10009,34468.67226
9,10010,29128.21751


In [26]:
# sumar el campo cliente_estrella al DataFrame sell_in_featured con un valor de 1 si el cliente está en top_15_customers y 0 si no
sell_in_featured['cliente_estrella'] = sell_in_featured['customer_id'].isin(top_15_customers['customer_id']).astype(int)
sell_in_featured.head()  # Mostrar las primeras filas del DataFrame con el nuevo campo cliente_estrella

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,tn,cust_request_qty,cust_request_tn,cat1,cat2,cat3,brand,sku_size,descripcion,productos_estrella,mes,catastrofe,cliente_estrella
0,2017-01-01,10001,20001,0.0,99.43861,11.0,99.43861,HC,ROPA LAVADO,Liquido,ARIEL,3000,genoma,1,1,0,1
1,2017-01-01,10001,20002,0.0,87.64856,17.0,90.13504,HC,ROPA LAVADO,Liquido,LIMPIEX,3000,Maquina 1er lavado,1,1,0,1
2,2017-01-01,10001,20003,0.0,100.21284,25.0,100.21284,FOODS,ADEREZOS,Mayonesa,NATURA,475,Regular sin TACC,1,1,0,1
3,2017-01-01,10001,20004,0.0,21.73954,13.0,21.73954,FOODS,ADEREZOS,Mayonesa,NATURA,240,Regular sin TACC,1,1,0,1
4,2017-01-01,10001,20005,0.0,0.0,0.0,0.0,FOODS,ADEREZOS,Mayonesa,NATURA,120,Regular sin TACC,1,1,0,1


In [27]:
sell_in_featured.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16763760 entries, 0 to 16763759
Data columns (total 17 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   periodo                datetime64[ns]
 1   customer_id            int64         
 2   product_id             int64         
 3   plan_precios_cuidados  float64       
 4   tn                     float64       
 5   cust_request_qty       float64       
 6   cust_request_tn        float64       
 7   cat1                   object        
 8   cat2                   object        
 9   cat3                   object        
 10  brand                  object        
 11  sku_size               int64         
 12  descripcion            object        
 13  productos_estrella     int64         
 14  mes                    int32         
 15  catastrofe             int64         
 16  cliente_estrella       int64         
dtypes: datetime64[ns](1), float64(4), int32(1), int64(6), object(5)

In [28]:
sell_in_featured1 = sell_in_featured.copy()

In [29]:
sell_in_featured1["prod_cust"] = sell_in_featured1.product_id.astype(str) + "-" + sell_in_featured1.customer_id.astype(str)

In [30]:
sell_in_featured1.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,tn,cust_request_qty,cust_request_tn,cat1,cat2,cat3,brand,sku_size,descripcion,productos_estrella,mes,catastrofe,cliente_estrella,prod_cust
0,2017-01-01,10001,20001,0.0,99.43861,11.0,99.43861,HC,ROPA LAVADO,Liquido,ARIEL,3000,genoma,1,1,0,1,20001-10001
1,2017-01-01,10001,20002,0.0,87.64856,17.0,90.13504,HC,ROPA LAVADO,Liquido,LIMPIEX,3000,Maquina 1er lavado,1,1,0,1,20002-10001
2,2017-01-01,10001,20003,0.0,100.21284,25.0,100.21284,FOODS,ADEREZOS,Mayonesa,NATURA,475,Regular sin TACC,1,1,0,1,20003-10001
3,2017-01-01,10001,20004,0.0,21.73954,13.0,21.73954,FOODS,ADEREZOS,Mayonesa,NATURA,240,Regular sin TACC,1,1,0,1,20004-10001
4,2017-01-01,10001,20005,0.0,0.0,0.0,0.0,FOODS,ADEREZOS,Mayonesa,NATURA,120,Regular sin TACC,1,1,0,1,20005-10001


In [31]:
sell_in_featured1 = sell_in_featured1.sort_values(by=['product_id', 'periodo'],ascending=True)

# 2. Agregar los 5 lags de tn para cada producto
for lag in range(1, 36):
    sell_in_featured1[f'tn_lag_{lag}'] = sell_in_featured1.groupby('product_id')['tn'].shift(lag)

# 3. Agregar tn_mas_2 (el valor de tn dos períodos hacia adelante)
sell_in_featured1['tn_mas_2'] = sell_in_featured1.groupby('product_id')['tn'].shift(-2)

In [87]:
#sell_in_featured1

In [32]:
sell_in_featured1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16763760 entries, 0 to 16763759
Data columns (total 54 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   periodo                datetime64[ns]
 1   customer_id            int64         
 2   product_id             int64         
 3   plan_precios_cuidados  float64       
 4   tn                     float64       
 5   cust_request_qty       float64       
 6   cust_request_tn        float64       
 7   cat1                   object        
 8   cat2                   object        
 9   cat3                   object        
 10  brand                  object        
 11  sku_size               int64         
 12  descripcion            object        
 13  productos_estrella     int64         
 14  mes                    int32         
 15  catastrofe             int64         
 16  cliente_estrella       int64         
 17  prod_cust              object        
 18  tn_lag_1               fl

In [33]:
# necesito tomar los datos del archivo ../data/IPC-AR_export.csv
ipc_ar = pd.read_csv('../data/IPC-AR_export.csv', sep=',', encoding='utf-8')
# Convertir el campo 'periodo' al formato '2017-01-01'
ipc_ar['periodo'] = pd.to_datetime(ipc_ar['periodo'], format='%m-%Y').dt.strftime('%Y-%m-01')

# Si querés que sea datetime64 y no string:
ipc_ar['periodo'] = pd.to_datetime(ipc_ar['periodo'])

ipc_ar = ipc_ar.rename(columns={'indice_ipc': 'ipc'})

# Verificá el resultado
print(ipc_ar[['periodo']].head())

primer_valor_real = ipc_ar['var_anual'].dropna().iloc[0]
ipc_ar['var_anual'] = ipc_ar['var_anual'].fillna(primer_valor_real)

ipc_ar

     periodo
0 2017-01-01
1 2017-02-01
2 2017-03-01
3 2017-04-01
4 2017-05-01


Unnamed: 0,periodo,ipc,var_mensual,var_anual
0,2017-01-01,101.5859,1.59,24.8
1,2017-02-01,103.6859,2.07,24.8
2,2017-03-01,106.1476,2.37,24.8
3,2017-04-01,108.9667,2.66,24.8
4,2017-05-01,110.5301,1.43,24.8
5,2017-06-01,111.8477,1.19,24.8
6,2017-07-01,113.7852,1.73,24.8
7,2017-08-01,115.3819,1.4,24.8
8,2017-09-01,117.5719,1.9,24.8
9,2017-10-01,119.3528,1.51,24.8


In [34]:
# merge ipc_ar con sell_in_featured1 by periodo adding for each row a ne field called ipc with the value of indice_ipc
sell_in_featured1 = sell_in_featured1.merge(
    ipc_ar[['periodo', 'ipc', 'var_mensual', 'var_anual']],
    on='periodo',
    how='left'
)

sell_in_featured1.info()  # Mostrar las primeras filas del DataFrame con el nuevo campo ipc

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16763760 entries, 0 to 16763759
Data columns (total 57 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   periodo                datetime64[ns]
 1   customer_id            int64         
 2   product_id             int64         
 3   plan_precios_cuidados  float64       
 4   tn                     float64       
 5   cust_request_qty       float64       
 6   cust_request_tn        float64       
 7   cat1                   object        
 8   cat2                   object        
 9   cat3                   object        
 10  brand                  object        
 11  sku_size               int64         
 12  descripcion            object        
 13  productos_estrella     int64         
 14  mes                    int32         
 15  catastrofe             int64         
 16  cliente_estrella       int64         
 17  prod_cust              object        
 18  tn_lag_1            

In [35]:
# tomo la cotizacion del dolar del archivo ../data/cotizacionDolar.csv
cotizacion_dolar = pd.read_csv('../data/cotizacionDolar.csv', sep=',', encoding='utf-8')
# Paso 1: Convertí la columna fecha a datetime
cotizacion_dolar['fecha'] = pd.to_datetime(cotizacion_dolar['fecha'], format='%m/%d/%Y')

# Paso 2: Si querés verla como string (opcional)
#cotizacion_dolar['fecha'] = cotizacion_dolar['fecha'].dt.strftime('%Y-%m-%d')

cotizacion_dolar['periodo'] = cotizacion_dolar['fecha'].apply(lambda x: getYearMonth(x))
cotizacion_dolar.periodo.unique()


df_dolar_resum = cotizacion_dolar.groupby("periodo").mean(numeric_only=True).reset_index()
df_dolar_resum.rename(columns={"cotizacion_dolar_venta":"promedio_mens_dolar_venta"},inplace=True)
df_dolar_resum.promedio_mens_dolar_venta = round(df_dolar_resum.promedio_mens_dolar_venta,2)
df_dolar_resum['periodo_fecha'] = pd.to_datetime(df_dolar_resum['periodo'], format='%Y%m')
df_dolar_resum['periodo'] = df_dolar_resum['periodo_fecha'].dt.strftime('%Y-%m-01')
df_dolar_resum.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 0 to 81
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   periodo                    82 non-null     object        
 1   promedio_mens_dolar_venta  82 non-null     float64       
 2   periodo_fecha              82 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 2.1+ KB


In [36]:
df_dolar_resum

Unnamed: 0,periodo,promedio_mens_dolar_venta,periodo_fecha
0,2017-01-01,15.91,2017-01-01
1,2017-02-01,15.59,2017-02-01
2,2017-03-01,15.52,2017-03-01
3,2017-04-01,15.36,2017-04-01
4,2017-05-01,15.72,2017-05-01
...,...,...,...
77,2023-06-01,248.78,2023-06-01
78,2023-07-01,266.43,2023-07-01
79,2023-08-01,322.13,2023-08-01
80,2023-09-01,350.00,2023-09-01


In [37]:
# 1. Asegurate que ambos son string y del mismo formato
sell_in_featured1['periodo'] = sell_in_featured1['periodo'].astype(str)
df_dolar_resum['periodo'] = df_dolar_resum['periodo'].astype(str)

# 2. Merge
sell_in_featured1 = sell_in_featured1.merge(
    df_dolar_resum[['periodo', 'promedio_mens_dolar_venta']],
    on='periodo',
    how='left'
)

In [38]:
# de sell_in_featured1 hacer un sample de 1000 filas
sell_in_featured1_sample = sell_in_featured1.sample(n=1000, random_state=42)
sell_in_featured1_sample

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,tn,cust_request_qty,cust_request_tn,cat1,cat2,cat3,...,tn_lag_31,tn_lag_32,tn_lag_33,tn_lag_34,tn_lag_35,tn_mas_2,ipc,var_mensual,var_anual,promedio_mens_dolar_venta
5836941,2018-10-01,10073,20330,0.0,0.0,0.0,0.0,PC,CABELLO,Acond Mujer,...,0.14282,0.21423,0.64270,0.00000,0.00000,0.03246,174.1473,5.39,45.91,37.06
51618,2018-03-01,10317,20003,0.0,0.0,0.0,0.0,FOODS,ADEREZOS,Mayonesa,...,0.00000,0.00000,0.00000,0.00000,0.62244,0.00000,133.1054,2.34,25.40,20.24
4112397,2018-01-01,10297,20235,0.0,0.0,0.0,0.0,PC,DEOS,Aero,...,0.00000,0.00983,0.00000,0.00000,0.00000,0.00000,126.9887,1.76,25.01,19.04
14130221,2018-05-01,10594,20962,0.0,0.0,0.0,0.0,FOODS,OTROS,Galletitas,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,139.5893,2.08,26.29,23.73
5413130,2019-08-01,10135,20307,0.0,0.0,0.0,0.0,PC,DEOS,Aero,...,0.00000,0.00000,0.04226,0.00000,0.01966,0.00983,239.6077,3.95,54.48,52.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15446982,2019-03-01,10223,21109,0.0,0.0,0.0,0.0,PC,DEOS,Barra,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,205.9571,4.68,54.73,41.52
5198580,2019-08-01,10562,20297,0.0,0.0,0.0,0.0,PC,CABELLO,SHAMPOO,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,239.6077,3.95,54.48,52.59
13013347,2018-06-01,10136,20855,0.0,0.0,0.0,0.0,HC,HOGAR,TOILETTE,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,144.8053,3.74,29.47,26.62
8075211,2019-03-01,10201,20483,0.0,0.0,0.0,0.0,PC,CABELLO,SHAMPOO,...,0.00000,0.00000,0.00000,0.00000,0.00000,0.00000,205.9571,4.68,54.73,41.52


In [39]:
sell_in_featured1.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16763760 entries, 0 to 16763759
Data columns (total 58 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   periodo                    object 
 1   customer_id                int64  
 2   product_id                 int64  
 3   plan_precios_cuidados      float64
 4   tn                         float64
 5   cust_request_qty           float64
 6   cust_request_tn            float64
 7   cat1                       object 
 8   cat2                       object 
 9   cat3                       object 
 10  brand                      object 
 11  sku_size                   int64  
 12  descripcion                object 
 13  productos_estrella         int64  
 14  mes                        int32  
 15  catastrofe                 int64  
 16  cliente_estrella           int64  
 17  prod_cust                  object 
 18  tn_lag_1                   float64
 19  tn_lag_2                   float64
 20  

In [40]:
df_temperaturas = pd.read_csv('../data/temperaturaMediaHistoricaCABA.csv', sep=',', encoding='utf-8')
df_temperaturas

Unnamed: 0,periodo_fecha,temp_media,temp_max_media,temp_min_media
0,Jan-17,,30.4,20.9
1,Feb-17,,30.1,20.6
2,Mar-17,,26.9,18.0
3,Apr-17,,23.5,14.3
4,May-17,,19.5,12.3
5,Jun-17,,17.9,9.5
6,Jul-17,,17.0,9.7
7,Aug-17,,19.2,11.1
8,Sep-17,,20.3,11.6
9,Oct-17,,23.0,11.3


In [41]:
# Para 2017 no esta la temperatura media, la aproximo
df_temperaturas.temp_media = np.where(df_temperaturas.temp_media.isna(),(df_temperaturas.temp_max_media+df_temperaturas.temp_min_media)/2,df_temperaturas.temp_media)

In [42]:
df_temperaturas

Unnamed: 0,periodo_fecha,temp_media,temp_max_media,temp_min_media
0,Jan-17,25.65,30.4,20.9
1,Feb-17,25.35,30.1,20.6
2,Mar-17,22.45,26.9,18.0
3,Apr-17,18.9,23.5,14.3
4,May-17,15.9,19.5,12.3
5,Jun-17,13.7,17.9,9.5
6,Jul-17,13.35,17.0,9.7
7,Aug-17,15.15,19.2,11.1
8,Sep-17,15.95,20.3,11.6
9,Oct-17,17.15,23.0,11.3


# DE ACA PARA ABAJO NO ANDA BIEN

In [43]:
# Convertir 'Jan-17' a datetime y luego a formato 'YYYY-MM-01'
df_temperaturas['periodo_fecha'] = pd.to_datetime(df_temperaturas['periodo_fecha'], format='%b-%y')
# Ahora lo llevas a formato 'YYYYMM' (o 'YYYY-MM-01' según quieras mergear)
# df_temperaturas['periodo'] = df_temperaturas['periodo_fecha'].dt.strftime('%Y%m')
df_temperaturas['periodo'] = df_temperaturas['periodo_fecha'].dt.strftime('%Y-%m-01')



In [44]:
df_temperaturas

Unnamed: 0,periodo_fecha,temp_media,temp_max_media,temp_min_media,periodo
0,2017-01-01,25.65,30.4,20.9,2017-01-01
1,2017-02-01,25.35,30.1,20.6,2017-02-01
2,2017-03-01,22.45,26.9,18.0,2017-03-01
3,2017-04-01,18.9,23.5,14.3,2017-04-01
4,2017-05-01,15.9,19.5,12.3,2017-05-01
5,2017-06-01,13.7,17.9,9.5,2017-06-01
6,2017-07-01,13.35,17.0,9.7,2017-07-01
7,2017-08-01,15.15,19.2,11.1,2017-08-01
8,2017-09-01,15.95,20.3,11.6,2017-09-01
9,2017-10-01,17.15,23.0,11.3,2017-10-01


In [45]:
sell_in_featured1.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,tn,cust_request_qty,cust_request_tn,cat1,cat2,cat3,...,tn_lag_31,tn_lag_32,tn_lag_33,tn_lag_34,tn_lag_35,tn_mas_2,ipc,var_mensual,var_anual,promedio_mens_dolar_venta
0,2017-01-01,10001,20001,0.0,99.43861,11.0,99.43861,HC,ROPA LAVADO,Liquido,...,,,,,,143.49426,101.5859,1.59,24.8,15.91
1,2017-01-01,10002,20001,0.0,35.72806,17.0,38.68301,HC,ROPA LAVADO,Liquido,...,,,,,,184.72927,101.5859,1.59,24.8,15.91
2,2017-01-01,10003,20001,0.0,143.49426,17.0,143.49426,HC,ROPA LAVADO,Liquido,...,,,,,,19.08407,101.5859,1.59,24.8,15.91
3,2017-01-01,10004,20001,0.0,184.72927,9.0,184.72927,HC,ROPA LAVADO,Liquido,...,,,,,,43.83179,101.5859,1.59,24.8,15.91
4,2017-01-01,10005,20001,0.0,19.08407,23.0,19.08407,HC,ROPA LAVADO,Liquido,...,,,,,,65.12087,101.5859,1.59,24.8,15.91


In [46]:
# Si querés todas las temperaturas:
sell_in_featured1 = sell_in_featured1.merge(
    df_temperaturas[['periodo', 'temp_media', 'temp_max_media', 'temp_min_media']],
    on='periodo', how='left'
)

sell_in_featured1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16763760 entries, 0 to 16763759
Data columns (total 61 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   periodo                    object 
 1   customer_id                int64  
 2   product_id                 int64  
 3   plan_precios_cuidados      float64
 4   tn                         float64
 5   cust_request_qty           float64
 6   cust_request_tn            float64
 7   cat1                       object 
 8   cat2                       object 
 9   cat3                       object 
 10  brand                      object 
 11  sku_size                   int64  
 12  descripcion                object 
 13  productos_estrella         int64  
 14  mes                        int32  
 15  catastrofe                 int64  
 16  cliente_estrella           int64  
 17  prod_cust                  object 
 18  tn_lag_1                   float64
 19  tn_lag_2                   float64
 20  

In [47]:
# de sell_in_featured1 hacer un sample de 1000 filas
sell_in_featured1_sample = sell_in_featured1.sample(n=1000, random_state=42)
sell_in_featured1_sample

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,tn,cust_request_qty,cust_request_tn,cat1,cat2,cat3,...,tn_lag_34,tn_lag_35,tn_mas_2,ipc,var_mensual,var_anual,promedio_mens_dolar_venta,temp_media,temp_max_media,temp_min_media
5836941,2018-10-01,10073,20330,0.0,0.0,0.0,0.0,PC,CABELLO,Acond Mujer,...,0.00000,0.00000,0.03246,174.1473,5.39,45.91,37.06,18.0,22.4,13.2
51618,2018-03-01,10317,20003,0.0,0.0,0.0,0.0,FOODS,ADEREZOS,Mayonesa,...,0.00000,0.62244,0.00000,133.1054,2.34,25.40,20.24,22.0,27.9,17.1
4112397,2018-01-01,10297,20235,0.0,0.0,0.0,0.0,PC,DEOS,Aero,...,0.00000,0.00000,0.00000,126.9887,1.76,25.01,19.04,25.6,31.4,20.8
14130221,2018-05-01,10594,20962,0.0,0.0,0.0,0.0,FOODS,OTROS,Galletitas,...,0.00000,0.00000,0.00000,139.5893,2.08,26.29,23.73,16.1,20.3,13.2
5413130,2019-08-01,10135,20307,0.0,0.0,0.0,0.0,PC,DEOS,Aero,...,0.00000,0.01966,0.00983,239.6077,3.95,54.48,52.59,12.6,17.1,8.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15446982,2019-03-01,10223,21109,0.0,0.0,0.0,0.0,PC,DEOS,Barra,...,0.00000,0.00000,0.00000,205.9571,4.68,54.73,41.52,21.0,25.6,16.8
5198580,2019-08-01,10562,20297,0.0,0.0,0.0,0.0,PC,CABELLO,SHAMPOO,...,0.00000,0.00000,0.00000,239.6077,3.95,54.48,52.59,12.6,17.1,8.6
13013347,2018-06-01,10136,20855,0.0,0.0,0.0,0.0,HC,HOGAR,TOILETTE,...,0.00000,0.00000,0.00000,144.8053,3.74,29.47,26.62,10.6,15.7,6.9
8075211,2019-03-01,10201,20483,0.0,0.0,0.0,0.0,PC,CABELLO,SHAMPOO,...,0.00000,0.00000,0.00000,205.9571,4.68,54.73,41.52,21.0,25.6,16.8


In [48]:
print(sell_in_featured1.isnull().sum()[sell_in_featured1.isnull().sum() > 0])

tn_lag_1       780
tn_lag_2      1560
tn_lag_3      2340
tn_lag_4      3120
tn_lag_5      3900
tn_lag_6      4680
tn_lag_7      5460
tn_lag_8      6240
tn_lag_9      7020
tn_lag_10     7800
tn_lag_11     8580
tn_lag_12     9360
tn_lag_13    10140
tn_lag_14    10920
tn_lag_15    11700
tn_lag_16    12480
tn_lag_17    13260
tn_lag_18    14040
tn_lag_19    14820
tn_lag_20    15600
tn_lag_21    16380
tn_lag_22    17160
tn_lag_23    17940
tn_lag_24    18720
tn_lag_25    19500
tn_lag_26    20280
tn_lag_27    21060
tn_lag_28    21840
tn_lag_29    22620
tn_lag_30    23400
tn_lag_31    24180
tn_lag_32    24960
tn_lag_33    25740
tn_lag_34    26520
tn_lag_35    27300
tn_mas_2      1560
dtype: int64


In [49]:
#Guardar a CSV de prueba
sell_in_featured1.to_csv('../data/ventas_featurev2.csv', index=False)
print("CSV generado: ventas_featurev2.csv")

CSV generado: ventas_featurev2.csv


In [None]:
# # 1. Filtrá solo filas donde tn > 0 (ventas reales)
# df_con_ventas = sell_in_featured1[sell_in_featured1['tn'] > 0]

# # 2. Agrupá por producto y buscá el primer periodo de venta
# df_primer_mes_prod = df_con_ventas.groupby("product_id").agg({"periodo": "min"}).reset_index()

# # 3. Renombrá la columna para dejar claro qué significa
# df_primer_mes_prod = df_primer_mes_prod.rename(columns={"periodo": "primer_periodo_prod"})

# df_primer_mes_prod


Unnamed: 0,product_id,primer_periodo_prod
0,20001,2017-01-01
1,20002,2017-01-01
2,20003,2017-01-01
3,20004,2017-01-01
4,20005,2017-01-01
...,...,...
775,21263,2018-10-01
776,21265,2019-03-01
777,21266,2019-03-01
778,21267,2019-03-01


In [None]:
#sell_in_featured1 = crear_target_prod_cust(sell_in_featured)

sell_in_featured1 = crear_features_temporales("prod_cust",sell_in_featured, conf.NUM_LAGS_PARAM, conf.FAMILIA_FEATURES_TEMP_PARAM)
# df_clientes = crear_features_temporales("customer_id",df_clientes,config.NUM_LAGS_PARAM,config.FAMILIA_FEATURES_TEMP_PARAM)
# df_cat1 = crear_features_temporales("cat1",df_cat1,config.NUM_LAGS_PARAM,config.FAMILIA_FEATURES_TEMP_PARAM)
# df_cat2 = crear_features_temporales("cat2",df_cat2, config.NUM_LAGS_PARAM,config.FAMILIA_FEATURES_TEMP_PARAM)
# df_cat3 = crear_features_temporales("cat3",df_cat3, config.NUM_LAGS_PARAM,config.FAMILIA_FEATURES_TEMP_PARAM)

# print(len(df_sellout))
# if "customer_id" in config.AMPLIA_FEATURES_PARAM:
#     df_sellout = df_sellout.merge(df_clientes,how="left",on=["customer_id","periodo"])
# if "cat1" in config.AMPLIA_FEATURES_PARAM:
#     df_sellout = df_sellout.merge(df_cat1,how="left",on=["cat1","periodo"])

# if "cat2" in config.AMPLIA_FEATURES_PARAM:
#     df_sellout = df_sellout.merge(df_cat2,how="left",on=["cat2","periodo"])

# if "cat3" in config.AMPLIA_FEATURES_PARAM:
#     df_sellout = df_sellout.merge(df_cat3,how="left",on=["cat3","periodo"])
# print(len(df_sellout))