In [1]:
from prophet import Prophet
from prophet.serialize import model_to_json
import pandas as pd
import json
from tqdm import tqdm
from datetime import date
import numpy as np
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
import logging

# Set the logging level for cmdstanpy to WARNING
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)


In [2]:
master_sales_by_week = pd.read_parquet('./datasets/master_sales_by_week_curated.parquet')

In [3]:
# add year column
master_sales_by_week['year'] = master_sales_by_week['date'].apply(lambda x: x.strftime('%Y'))

In [4]:
# get product_ids without any sale
total_sales_by_product_id = master_sales_by_week.groupby(['product_id'], as_index=False).agg({'quantity':'sum', })
products_without_sales = total_sales_by_product_id[total_sales_by_product_id['quantity'] == 0]
product_ids_without_sales = products_without_sales['product_id'].unique()
print(f'Total products_ids without sales: {len(product_ids_without_sales)}')
# remove products_id without any sale from the dataset
master_sales_by_week = master_sales_by_week[~(master_sales_by_week['product_id'].isin(product_ids_without_sales))]

Total products_ids without sales: 266


In [5]:
master_sales_by_week.head(1)

Unnamed: 0,combination,date,cod_fami,quantity,store_id,price_taxes_excluded,product_id,description_fami,description,event,discount,year
0,201AA3,2018-01-08,201,1,1010100,7554.62,229254.1001.EST,ACCESORIOS BEBE,PEZONERA GBC5103 EN SILICONA,NO EVENT,0,2018


In [6]:
for year in master_sales_by_week['year'].unique():
    n_unique_keys = master_sales_by_week[master_sales_by_week['date'].between(f'{year}-01-01',f'{year}-12-31')]['combination'].nunique()
    print(f"Año:{year}\n\t# Llaves Unicas: {n_unique_keys}")

Año:2018
	# Llaves Unicas: 6552
Año:2019
	# Llaves Unicas: 6517
Año:2020
	# Llaves Unicas: 5822
Año:2021
	# Llaves Unicas: 5552
Año:2022
	# Llaves Unicas: 5665
Año:2023
	# Llaves Unicas: 5539
Año:2024
	# Llaves Unicas: 4384


In [7]:
unique_keys_in_2022 = master_sales_by_week[master_sales_by_week['date'].between('2022-01-01','2022-12-31')]['combination'].unique()
unique_keys_in_2023 = master_sales_by_week[master_sales_by_week['date'].between('2023-01-01','2023-12-31')]['combination'].unique()

keys_to_forecast = list(set(unique_keys_in_2022) & set(unique_keys_in_2023)) # we going to forecast all the keys in 2022, and 2023


#### Create dataset for train (2018-2023)

In [8]:
master_sales_by_week_train = master_sales_by_week[master_sales_by_week['date'].between('2018-01-01','2023-12-31')]

In [9]:
weeks_of_information_by_combination = []
for i in tqdm(keys_to_forecast):
    df_temp = master_sales_by_week_train[master_sales_by_week_train['combination'] == i]
    dict_temp = {
        'combination' : i,
        'n_weeks' : df_temp['date'].nunique()
    }
    weeks_of_information_by_combination.append(dict_temp)
weeks_of_information_by_combination_df = pd.DataFrame(weeks_of_information_by_combination)

100%|██████████| 4871/4871 [01:24<00:00, 57.31it/s]


In [10]:
keys_with_less_than_12_weeks_of_info = weeks_of_information_by_combination_df[weeks_of_information_by_combination_df['n_weeks'] <= 12]['combination'].unique()

# keep just the keys present in 2022 and 2023
master_sales_by_week_train = master_sales_by_week_train[master_sales_by_week_train['combination'].isin(keys_to_forecast)]
# drop all the combination withot enoff information in train 
master_sales_by_week_train = master_sales_by_week_train[~(master_sales_by_week_train['combination'].isin(keys_with_less_than_12_weeks_of_info))]
print(f'#Keys with less than 12 weeks of info: {len(keys_with_less_than_12_weeks_of_info)}')

#Keys with less than 12 weeks of info: 289


In [12]:
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)

for combination in tqdm(master_sales_by_week_train['combination'].unique()):
    sales_combination = master_sales_by_week_train[master_sales_by_week_train['combination'] == combination]
    sales_combination = sales_combination[['date','combination','quantity','cod_fami','discount','price_taxes_excluded']]


    first_date = sales_combination['date'].min() # Get the first date in the dataframe
    last_date = sales_combination['date'].max() # Get the last date in the dataframe

    df_dates = pd.DataFrame({'date': pd.date_range(start=f'{first_date.year}-01-08', end='2023-12-31', freq='W-MON')})

    sales_combination = df_dates.merge(sales_combination, on='date', how='left') # Merge the dataframes
    sales_combination['quantity'] = sales_combination['quantity'].fillna(0) # Replace NaN values with 0
    sales_combination = sales_combination[~(sales_combination['combination'].isnull())]

    # Create the prophet dataframe
    prophet_dataframe = pd.DataFrame()
    prophet_dataframe['ds'] = sales_combination['date']
    prophet_dataframe['y'] = sales_combination['quantity']
    prophet_dataframe['y'] = prophet_dataframe['y'].astype(int)
    prophet_dataframe['discount'] = sales_combination['discount']
    prophet_dataframe['price_taxes_excluded'] = sales_combination['price_taxes_excluded']

    n_unique_years = prophet_dataframe['ds'].dt.year.nunique() # Count the number of different years in the dataframe

    # Create the model
    model = Prophet(weekly_seasonality = 15)
    model.add_regressor('discount')
    model.add_regressor('price_taxes_excluded')
    
    model.fit(prophet_dataframe)
    
    # Save Model
    with open(f'./serialized_models/{str(combination)}.json', 'w') as file:
        file.write(model_to_json(model))  # Save model

    

100%|██████████| 4582/4582 [22:00<00:00,  3.47it/s]  
