In [1]:
# --- Standard library ---
import logging
import warnings
from datetime import date

# --- Third-party ---
import numpy as np
import pandas as pd
from tqdm import tqdm

# --- Notebook-wide configuration ---
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation/FutureWarnings.
warnings.filterwarnings('ignore')

# Only show WARNING and above from the "prophet" logger.
logging.getLogger("prophet").setLevel(logging.WARNING)

In [4]:
# Load the raw sales dataset from parquet and preview the first rows.
raw_sales_path = './datasets/raw/master_salesforescast_curated_v2.parquet'
df = pd.read_parquet(raw_sales_path)
df.head(5)

Unnamed: 0,product_id,forescast_key,store_id,date,quantity,discount_for_event,price_taxes_excluded
0,230617.7818.TS,250348434AF5,1012300,2018-01-02,1.0,0.0,25201.679688
1,226267.2001.160CM,240BK3OZ8,1010100,2018-01-02,1.0,0.0,14277.30957
2,207172.2088.146CM,240AQ2OA6,1014200,2018-01-02,3.0,0.0,3357.139893
3,127039.1000.MD05,248BE2QK1MD05,1010100,2018-01-02,1.0,0.0,18478.990234
4,207172.2088.146CM,240AQ2OA6,1011000,2018-01-02,1.5,0.0,3357.139893


In [5]:
# Rename the forecast key column to the name used by the rest of the notebook.
df = df.rename(columns={'forescast_key': 'combination'})

# Cast quantity to integer.
# NOTE(review): astype(int) truncates toward zero, so fractional quantities
# lose their decimal part (e.g. 1.5 becomes 1) -- confirm this is intended.
quantity_as_int = df['quantity'].astype(int)
df['quantity'] = quantity_as_int

In [7]:
# Load the campaign calendar from the 'fechas' sheet of the events workbook.
campaigns = pd.read_excel('./datasets/raw/b_eventos.xlsx', sheet_name='fechas')

# Parse the 'fecha' column as datetime.
campaigns = campaigns.assign(fecha=pd.to_datetime(campaigns['fecha']))

# Relabel the columns positionally; this assumes the sheet has exactly two
# columns, in the order (date, campaign name).
campaigns.columns = ['date', 'campaigns_name']
campaigns.head(5)

Unnamed: 0,date,campaigns_name
0,2018-02-01,TIJERETAZO I
1,2018-02-02,TIJERETAZO I
2,2018-02-03,TIJERETAZO I
3,2018-02-04,TIJERETAZO I
4,2018-02-05,TIJERETAZO I


In [8]:
# Attach the campaign name to every sales row by calendar date.
# A left join keeps all sales rows; dates with no campaign get NaN.
# NOTE(review): if `campaigns` ever holds more than one row per date, this
# join will duplicate sales rows -- verify the calendar is unique on 'date'.
df = df.merge(campaigns, on='date', how='left')

# Label rows outside any campaign explicitly.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: the in-place form acts on a temporary object, is deprecated in
# pandas 2.x and does not modify `df` under Copy-on-Write.
df['campaigns_name'] = df['campaigns_name'].fillna('no-discount')
df.head()

Unnamed: 0,product_id,combination,store_id,date,quantity,discount_for_event,price_taxes_excluded,campaigns_name
0,230617.7818.TS,250348434AF5,1012300,2018-01-02,1,0.0,25201.679688,no-discount
1,226267.2001.160CM,240BK3OZ8,1010100,2018-01-02,1,0.0,14277.30957,no-discount
2,207172.2088.146CM,240AQ2OA6,1014200,2018-01-02,3,0.0,3357.139893,no-discount
3,127039.1000.MD05,248BE2QK1MD05,1010100,2018-01-02,1,0.0,18478.990234,no-discount
4,207172.2088.146CM,240AQ2OA6,1011000,2018-01-02,1,0.0,3357.139893,no-discount


In [9]:
# Report how many rows have a non-positive quantity (the count is printed below).
non_positive_count = df["quantity"].le(0).sum()
print(f'Records with Quantity <= 0: {non_positive_count}')

# Keep only rows with a strictly positive quantity; rows with quantity <= 0
# are discarded from here on.
df = df.loc[df['quantity'].gt(0)]

Records with Quantity <= 0: 1276948


In [10]:
# Aggregate sales per combination and per week (weekly bins, freq='W-MON').
#   quantity             -> sum over the week
#   discount_for_event   -> mean over the week
#   price_taxes_excluded -> sum over the week
# A fresh pd.Grouper is built for each groupby call rather than sharing one.
weekly_totals = df.groupby(
    ['combination', pd.Grouper(key='date', freq='W-MON')]
).agg({
    'quantity': 'sum',
    'discount_for_event': 'mean',
    'price_taxes_excluded': 'sum',
})

# Campaign label of the week.
# Taking max() over all rows compares strings lexicographically, so the
# lowercase placeholder 'no-discount' wins over uppercase campaign names and a
# week only partially covered by a campaign would be labelled 'no-discount'.
# Instead, take the max over real campaign rows only and fall back to
# 'no-discount' for weeks that have none.
campaign_rows = df[df['campaigns_name'] != 'no-discount']
weekly_campaign = campaign_rows.groupby(
    ['combination', pd.Grouper(key='date', freq='W-MON')]
)['campaigns_name'].max()

weekly_totals['campaigns_name'] = (
    weekly_campaign.reindex(weekly_totals.index).fillna('no-discount')
)

# Keep the original column order so the exported schema is unchanged.
weekly_totals = weekly_totals[
    ['quantity', 'discount_for_event', 'campaigns_name', 'price_taxes_excluded']
]

df_grouped_by_week = weekly_totals.reset_index().rename(columns={'date': 'date_week'})
df_grouped_by_week.tail()

Unnamed: 0,combination,date_week,quantity,discount_for_event,campaigns_name,price_taxes_excluded
452119,281348434BV7157,2021-12-27,2,10.0,SALE NAVIDAD,16798.320312
452120,281348434BV7157,2022-06-06,1,0.0,no-discount,11344.540039
452121,281348434BV7157,2023-07-03,1,10.0,ANIVERSARIO,39411.761719
452122,281348434BV7157,2023-08-07,4,0.0,no-discount,157647.046875
452123,281348434BV7157,2023-08-14,1,0.0,no-discount,39411.761719


In [11]:
# Binary campaign indicator: 1 when the week carries a campaign name,
# 0 when it is labelled 'no-discount'.
has_campaign = df_grouped_by_week['campaigns_name'].ne('no-discount')
df_grouped_by_week['campaign'] = has_campaign.astype(int)

In [12]:
## Export the curated datasets (row-level and weekly) with all transformations applied
export_targets = (
    (df, './datasets/curated/almacenes_si_curated.parquet'),
    (df_grouped_by_week, './datasets/curated/almacenes_si_curated_by_week.parquet'),
)
for frame, target in export_targets:
    # index=False: the DataFrame index is not written to the file.
    frame.to_parquet(target, index=False)