In [65]:
import pandas as pd

# 1) Load the three tab-separated input files.
# Sell-in transactions (one row per periodo / customer / product in the output below).
sell_in = pd.read_csv(
    filepath_or_buffer='../data/sell-in.txt',
    sep='\t',
)
# Product list used later as the product axis of the grid; must contain product_id.
prod_vigentes = pd.read_csv(
    filepath_or_buffer='../data/product_id_apredecir201912.txt',
    sep='\t',
)
# Product master table, joined on product_id further down; must contain product_id.
productos = pd.read_csv(
    filepath_or_buffer='../data/tb_productos_05262025.txt',
    sep='\t',
)


In [66]:
# 2) Parse 'periodo' into a datetime pinned to the first day of the month.
#    The values are expected to look like YYYYMM (e.g. 201701), since '01' is
#    appended and the result is parsed with '%Y%m%d'.
#
# The column is overwritten in place, so guard on its dtype: re-running this
# cell after the conversion would otherwise append '01' to an already
# formatted date string and fail to parse.
if not pd.api.types.is_datetime64_any_dtype(sell_in['periodo']):
    sell_in['periodo'] = pd.to_datetime(
        sell_in['periodo'].astype(str) + '01',  # 201701 -> '20170101'
        format='%Y%m%d',
    )

In [67]:
# (rows, columns) of the raw sell-in frame, for comparison with the aggregated frame.
sell_in.shape

(2945818, 7)

In [53]:
# 3) Collapse sell-in to one row per (periodo, customer_id, product_id),
#    summing tn, cust_request_qty and cust_request_tn within each group.
#    as_index=False keeps the three keys as ordinary columns.
sell_in_agg = sell_in.groupby(
    ['periodo', 'customer_id', 'product_id'],
    as_index=False,
).agg(
    tn=('tn', 'sum'),
    cust_request_qty=('cust_request_qty', 'sum'),
    cust_request_tn=('cust_request_tn', 'sum'),
)

In [69]:
# (rows, columns) after aggregation; an unchanged row count means the keys were already unique.
sell_in_agg.shape

(2945818, 6)

In [70]:
# Preview of the aggregated frame (pandas truncates the display to head and tail rows).
sell_in_agg

Unnamed: 0,periodo,customer_id,product_id,tn,cust_request_qty,cust_request_tn
0,2017-01-01,10001,20001,99.43861,11,99.43861
1,2017-01-01,10001,20002,87.64856,17,90.13504
2,2017-01-01,10001,20003,100.21284,25,100.21284
3,2017-01-01,10001,20004,21.73954,13,21.73954
4,2017-01-01,10001,20006,29.17196,18,31.36770
...,...,...,...,...,...,...
2945813,2019-12-01,10606,20303,0.01298,1,0.01298
2945814,2019-12-01,10606,20563,0.00442,1,0.00442
2945815,2019-12-01,10606,20962,0.00655,1,0.00655
2945816,2019-12-01,10606,20975,0.00655,1,0.00655


In [71]:
# 4) Distinct values for each axis of the grid built below.
#    Periods and customers are taken from the aggregated sell-in history;
#    products are taken from prod_vigentes, not from the sales themselves.
unique_periodos = sell_in_agg['periodo'].drop_duplicates(keep='first')
unique_customers = sell_in_agg['customer_id'].drop_duplicates(keep='first')
unique_products = prod_vigentes['product_id'].drop_duplicates(keep='first')


In [43]:
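# 5) (Optional, currently disabled) Pin a single customer and product for a quick test.
#    Uncomment both lines to shrink the grid built in the next step.
#unique_customers = pd.Index([10002], name='customer_id')
#unique_products  = pd.Index([20001], name='product_id')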

In [56]:
# 6) Build the Cartesian product of the three axes: one entry per
#    (periodo, customer_id, product_id) combination.
idx = pd.MultiIndex.from_product(
    iterables=[unique_periodos, unique_customers, unique_products],
    names=['periodo', 'customer_id', 'product_id'],
)
# Turn the index levels into ordinary columns; index=False gives a fresh RangeIndex.
complete_test = idx.to_frame(index=False)

In [57]:
# Preview of the bare grid (key columns only, before sales are joined in).
complete_test

Unnamed: 0,periodo,customer_id,product_id
0,2017-01-01,10001,20001
1,2017-01-01,10001,20002
2,2017-01-01,10001,20003
3,2017-01-01,10001,20004
4,2017-01-01,10001,20005
...,...,...,...
16763755,2019-12-01,10582,21263
16763756,2019-12-01,10582,21265
16763757,2019-12-01,10582,21266
16763758,2019-12-01,10582,21267


In [58]:
# 7) Left-join the observed sales onto the full grid and zero-fill the gaps.
#
# Only the three key columns of `complete_test` are fed into the merge. On a
# first run that is the whole frame, so the result is unchanged. On a re-run
# it discards the measure columns (and anything else) added previously, so
# pandas never has to disambiguate overlapping columns with `_x` / `_y`
# suffixes, which the fillna below would not match.
key_cols = ['periodo', 'customer_id', 'product_id']
measure_cols = ['tn', 'cust_request_qty', 'cust_request_tn']

complete_test = (
    complete_test[key_cols]
    .merge(sell_in_agg, on=key_cols, how='left')
    # Grid rows with no matching sale come back as NaN; treat them as zero.
    .fillna({col: 0 for col in measure_cols})
)

In [59]:
# Preview of the grid after joining sales; combinations without sales show 0.
complete_test


Unnamed: 0,periodo,customer_id,product_id,tn,cust_request_qty,cust_request_tn
0,2017-01-01,10001,20001,99.43861,11.0,99.43861
1,2017-01-01,10001,20002,87.64856,17.0,90.13504
2,2017-01-01,10001,20003,100.21284,25.0,100.21284
3,2017-01-01,10001,20004,21.73954,13.0,21.73954
4,2017-01-01,10001,20005,0.00000,0.0,0.00000
...,...,...,...,...,...,...
16763755,2019-12-01,10582,21263,0.00000,0.0,0.00000
16763756,2019-12-01,10582,21265,0.00000,0.0,0.00000
16763757,2019-12-01,10582,21266,0.00000,0.0,0.00000
16763758,2019-12-01,10582,21267,0.00000,0.0,0.00000


In [60]:
# 8) Attach the product master columns to every grid row (left join on product_id).
# NOTE(review): this reassigns complete_test from itself, so re-running this cell
# on its own merges the product columns a second time — re-run from step 7.
complete_test = complete_test.merge(
    right=productos,
    how='left',
    on='product_id',
)

In [62]:
# Plain-text dump of the enriched grid (pandas truncates rows and wraps wide columns).
print(complete_test)

            periodo  customer_id  product_id         tn  cust_request_qty  \
0        2017-01-01        10001       20001   99.43861              11.0   
1        2017-01-01        10001       20002   87.64856              17.0   
2        2017-01-01        10001       20003  100.21284              25.0   
3        2017-01-01        10001       20004   21.73954              13.0   
4        2017-01-01        10001       20005    0.00000               0.0   
...             ...          ...         ...        ...               ...   
16763755 2019-12-01        10582       21263    0.00000               0.0   
16763756 2019-12-01        10582       21265    0.00000               0.0   
16763757 2019-12-01        10582       21266    0.00000               0.0   
16763758 2019-12-01        10582       21267    0.00000               0.0   
16763759 2019-12-01        10582       21276    0.00000               0.0   

          cust_request_tn   cat1         cat2              cat3    brand  \

In [50]:
# Count the grid rows whose tn is zero. Summing the boolean mask gives the
# same number as len() of the filtered frame, without materialising a
# filtered copy of the whole frame just to measure its length.
print("Número de registros con tn = 0:", int((complete_test['tn'] == 0).sum()))

Número de registros con tn = 0: 14470279


In [63]:
# 9) Write the completed grid to a CSV file in the working directory
#    (index=False omits the row index), then confirm on stdout.
complete_test.to_csv(
    path_or_buf='ventas_complete.csv',
    index=False,
)
print("CSV generado: ventas_complete.csv")

CSV generado: ventas_complete.csv


In [2]:
# Show column dtypes, non-null counts and memory usage of the final DataFrame.
# NOTE(review): the recorded output below (21,492 rows x 4 columns, execution
# count In [2]) does not match the multi-million-row frame built above; it looks
# like it came from a different run — re-execute the notebook top to bottom.
complete_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21492 entries, 0 to 21491
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   periodo      21492 non-null  datetime64[ns]
 1   customer_id  21492 non-null  int64         
 2   product_id   21492 non-null  int64         
 3   tn           21492 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 671.8 KB
