In [15]:
import pandas as pd
from almacenes_si_db import forecast_key_format
import numpy as np
# Raise the column display limit so wide frames are printed without truncating columns.
pd.set_option('display.max_columns', 500)


In [22]:
# Load the two raw inputs: the 2018-2024 sales history and the product information table.
df_sales, df_product_raw = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        './datasets/historico_ventas_2018_2024_raw.parquet',
        './datasets/product_information_raw.parquet',
    )
)

In [3]:
# Inspect a single raw sales row to check the column names before they are renamed.
df_sales.head(1)

Unnamed: 0,codigo_corto,ProductId,StoreId,TicketId,Date,Quantity,PriceTaxesExcluded,SalesPersonId,Origen
0,721957,199980.1001.TM,1013700,L37173777,2023-01-28,1.0,15050.42,9553,POS


In [4]:
# Inspect a single raw product row to check which attribute columns are available.
df_product_raw.head(1)

Unnamed: 0,ProductId,codigo_corto,cod_fami,des_fami,cod_tipo,des_tipo,cod_grup,des_grup,cod_subg,des_subg,cod_dsub,des_dsgp,cod_tlla,des_tlla,cod_dime,des_dime,cod_mndo,des_mndo,cod_marc,des_marc,cod_fndo,des_fndo,cod_pcon,des_pcon,cod_clor,des_clor,cod_unid,Description,fecha_creacion,combination
0,1.0999.UNI,170315,250,VESTUARIO EXTERIOR,310,Bottom,462,MASCULINO,AZ3,Jeans,,No aplica,UNI,Unica,,,JSW,JEANS WEAR,156,IMPORT STA,UNC,Unicolor,MDA,MODA,999,Generico,UN,JEAN STAFF CHEVI,2023-12-31,250310462AZ3


### Process the historical sales and aggregate them by week

In [23]:
# Normalise the column names of both raw frames to snake_case.
# Rebinding (instead of `inplace=True`) avoids mutating the frames that the
# earlier cells loaded and displayed.
# NOTE(review): `df_product_raw` is narrowed to five columns below, so reload
# the raw parquet files before re-running this cell.
df_sales = df_sales.rename(columns={
    'ProductId': 'product_id',
    'StoreId': 'store_id',
    'Date': 'date',
    'Quantity': 'quantity',
    'PriceTaxesExcluded': 'price_taxes_excluded',
})
df_product_raw = df_product_raw.rename(columns={
    'ProductId': 'product_id',
    'Description': 'description',
    'des_fami': 'description_fami',
})

# Generate the forecast key (llave/combination) for every product row and keep
# only the columns needed downstream.
df_product_raw['combination'] = df_product_raw.apply(forecast_key_format, axis=1)
df_product_raw = df_product_raw[['product_id', 'cod_fami', 'combination', 'description_fami', 'description']]

# Attach the product attributes to every sale.
# `validate='many_to_one'` makes the merge fail loudly if `product_id` is
# repeated in the product table; without it, duplicated products would
# silently duplicate sales rows and inflate the weekly quantities.
sales_with_combination = df_sales.merge(
    df_product_raw,
    how='left',
    on='product_id',
    validate='many_to_one',
)
sales_with_combination['date'] = pd.to_datetime(sales_with_combination['date'])

# Aggregate per combination and week. With 'W-MON' every row is labelled with
# the Monday that closes its week. Sales whose product has no combination
# (unmatched in the left merge) are dropped by the groupby.
weekly_aggregations = {
    'cod_fami': 'max',
    'quantity': 'sum',
    'store_id': 'max',
    'price_taxes_excluded': 'min',
    'product_id': 'max',
    'description_fami': 'max',
    'description': 'max',
}
master_sales_by_week = (
    sales_with_combination
    .groupby(['combination', pd.Grouper(key='date', freq='W-MON')])
    .agg(weekly_aggregations)
    .reset_index()
)

In [24]:
# Preview the weekly aggregate (one row per combination and week).
master_sales_by_week.head()

Unnamed: 0,combination,date,cod_fami,quantity,store_id,price_taxes_excluded,product_id,description_fami,description
0,201AA3,2018-01-08,201,1.0,1010100,7554.62,229254.1001.EST,ACCESORIOS BEBE,PEZONERA GBC5103 EN SILICONA
1,201AA3,2018-01-15,201,1.0,1010100,12596.64,206906.1001.EST,ACCESORIOS BEBE,RECOLECTOR IMP2476 INFANT X2
2,201AA3,2018-01-22,201,4.0,1010100,10075.63,233106.1001.EST,ACCESORIOS BEBE,RECOLECTOR IMP2476 INFANT X2
3,201AA3,2018-01-29,201,1.0,1010100,10075.63,206895.1001.EST,ACCESORIOS BEBE,ASPIRADOR NASAL BSL015 INFANT
4,201AA3,2018-02-05,201,6.0,1010100,10075.63,233106.1001.EST,ACCESORIOS BEBE,VASO GBC5047 ENTRENADOR ANTIGT


### Discount campaigns (campañas de descuento)

In [25]:
# Both campaign tables come from the same workbook:
#   - sheet 'fechas': the event calendar, renamed to English column names;
#   - sheet 'descuento por evento': the discounts per family and event, without
#     the `familia` column, which is not used.
events_workbook = './datasets/b_eventos.xlsx'

dates_campaigns_df = (
    pd.read_excel(events_workbook, sheet_name='fechas')
    .rename(columns={'fecha': 'date', 'evento': 'event'})
)
discounts_campaigns_df = (
    pd.read_excel(events_workbook, sheet_name='descuento por evento')
    .drop(columns='familia')
)

In [26]:
# Turn the wide discount sheet (one column per event) into long format:
# one row per (cod_fami, event) with its discount, ordered by family code.
discounts_campaigns_df = (
    discounts_campaigns_df
    .melt(id_vars='cod_fami', var_name='event', value_name='discount')
    .sort_values(by=['cod_fami'])
)

In [27]:
# Collapse the event calendar to one row per week, using the same 'W-MON'
# weekly bins as the sales aggregation. When several events fall in one week,
# `max` keeps the alphabetically last name; weeks without any event come out
# as NaN and are dropped.
dates_campaigns_df = (
    dates_campaigns_df
    .groupby(pd.Grouper(key='date', freq='W-MON'))['event']
    .max()
    .reset_index()
    .dropna()
)

# Convert `cod_fami` from int to str BEFORE merging. If the cast happens after
# the left merge, a single event without a matching discount puts NaN in the
# column, pandas upcasts it to float, and every key becomes e.g. '201.0',
# which no longer matches the string family codes of the sales frame.
discounts_by_family = discounts_campaigns_df.assign(
    cod_fami=discounts_campaigns_df['cod_fami'].map(str)
)

# Merge the weekly event dates with the per-family discount values.
campaigns_df = dates_campaigns_df.merge(discounts_by_family, how='left', on='event')

In [28]:
# Preview the long-format discount table (one row per family code and event).
discounts_campaigns_df.head()

Unnamed: 0,cod_fami,event,discount
175,201,REBAJAS II,20
145,201,REBAJAS I,20
85,201,ANIVERSARIO,20
385,201,SALE,40
205,201,VACACIONES,15


In [29]:
# Preview the weekly event calendar (one event name per week).
dates_campaigns_df.head()

Unnamed: 0,date,event
0,2018-02-05,TIJERETAZO I
1,2018-02-12,TIJERETAZO I
2,2018-02-19,TIJERETAZO I
3,2018-02-26,TIJERETAZO I
6,2018-03-19,VACACIONES


In [30]:
# Preview the campaigns table: weekly events joined with their per-family discounts.
campaigns_df.head()

Unnamed: 0,date,event,cod_fami,discount
0,2018-02-05,TIJERETAZO I,201,30
1,2018-02-05,TIJERETAZO I,202,35
2,2018-02-05,TIJERETAZO I,209,40
3,2018-02-05,TIJERETAZO I,211,20
4,2018-02-05,TIJERETAZO I,214,30


In [31]:
# Show the weekly sales again before joining them with the campaigns on `date` and `cod_fami`.
master_sales_by_week.head()

Unnamed: 0,combination,date,cod_fami,quantity,store_id,price_taxes_excluded,product_id,description_fami,description
0,201AA3,2018-01-08,201,1.0,1010100,7554.62,229254.1001.EST,ACCESORIOS BEBE,PEZONERA GBC5103 EN SILICONA
1,201AA3,2018-01-15,201,1.0,1010100,12596.64,206906.1001.EST,ACCESORIOS BEBE,RECOLECTOR IMP2476 INFANT X2
2,201AA3,2018-01-22,201,4.0,1010100,10075.63,233106.1001.EST,ACCESORIOS BEBE,RECOLECTOR IMP2476 INFANT X2
3,201AA3,2018-01-29,201,1.0,1010100,10075.63,206895.1001.EST,ACCESORIOS BEBE,ASPIRADOR NASAL BSL015 INFANT
4,201AA3,2018-02-05,201,6.0,1010100,10075.63,233106.1001.EST,ACCESORIOS BEBE,VASO GBC5047 ENTRENADOR ANTIGT


### Merge discount campaigns into the weekly master sales, using date and family code

In [32]:
# Attach the campaign (event name + discount) active for each family and week.
# The left join keeps weeks without any campaign (event/discount stay NaN).
# `validate='many_to_one'` raises if `campaigns_df` holds more than one row
# per (date, cod_fami); such duplicates would silently duplicate the already
# aggregated weekly sales rows.
# NOTE(review): re-running this cell on an already merged frame produces
# suffixed `event_x`/`event_y` columns; rebuild `master_sales_by_week` first.
master_sales_by_week = master_sales_by_week.merge(
    campaigns_df,
    how='left',
    on=['date', 'cod_fami'],
    validate='many_to_one',
)

In [33]:
# Weeks without a campaign get a sentinel event name and a 0 discount.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `df[col]`: that chained in-place form emits a
# FutureWarning in pandas 2.x and does not modify the frame under Copy-on-Write.
master_sales_by_week['event'] = master_sales_by_week['event'].fillna('NO EVENT')

# Fill with the number 0 (not the string '0') and cast the discount to integer.
master_sales_by_week['discount'] = (
    master_sales_by_week['discount'].fillna(0).astype('int64')
)

# Round the weekly quantities up to whole units, cast them to integer, and
# replace negative "sales" with 0.
master_sales_by_week['quantity'] = (
    np.ceil(master_sales_by_week['quantity'])
    .astype('int64')
    .clip(lower=0)
)

#### Export the historical master sales (with combination and discounts) up to July 2024

In [34]:
# Persist the curated weekly sales (with campaign event and discount) as parquet,
# without writing the DataFrame index.
master_sales_by_week.to_parquet(
    path='./datasets/master_sales_by_week_curated.parquet',
    index=False,
)