In [1]:
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly
from prophet.serialize import model_to_json, model_from_json
import pandas as pd
import json
from tqdm import tqdm
from datetime import date
from llaves_more_then_q95 import llaves_more_then_q95
import numpy as np
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')
import logging

# Set the logging level for cmdstanpy to WARNING
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)


In [2]:
# Load the curated weekly dataset from a parquet file (path is relative to the notebook's working directory).
# The displayed sample further down shows the columns combination, date_week, quantity,
# discount_for_event, campaigns_name, price_taxes_excluded and campaign.
df_grouped_by_week = pd.read_parquet('./datasets/almacenes_si_curated_by_week.parquet')

In [5]:
# Show every weekly row recorded for the key 240BC5PC6.
df_grouped_by_week.query("combination == '240BC5PC6'")

Unnamed: 0,combination,date_week,quantity,discount_for_event,campaigns_name,price_taxes_excluded,campaign
88071,240BC5PC6,2023-09-11,47,35.0,TIJERETAZO II,541791.6,1
88072,240BC5PC6,2023-09-18,216,35.0,TIJERETAZO II,2059384.0,1
88073,240BC5PC6,2023-09-25,323,35.0,TIJERETAZO II,2409560.0,1
88074,240BC5PC6,2023-10-02,259,35.0,TIJERETAZO II,2535212.0,1
88075,240BC5PC6,2023-10-09,261,7.241379,no-discount,2738005.0,0
88076,240BC5PC6,2023-10-16,335,0.0,no-discount,2556080.0,0
88077,240BC5PC6,2023-10-23,263,0.0,no-discount,2361475.0,0
88078,240BC5PC6,2023-10-30,125,0.0,no-discount,1844936.0,0
88079,240BC5PC6,2023-11-06,176,0.0,no-discount,1809690.0,0
88080,240BC5PC6,2023-11-13,242,0.0,no-discount,2803595.0,0


In [7]:
# Build the per-family, per-week campaign table: for every (date_week, familia)
# pair keep the maximum discount and the maximum campaign indicator seen across
# the family's keys. `familia` is the first three characters of `combination`.
#
# Written as one chain so that `familia` is created with `.assign` on a fresh
# frame instead of being assigned onto a column-subset of `df_grouped_by_week`
# (the pattern that raises SettingWithCopyWarning), and so the prefix is taken
# with the vectorised `.str` accessor rather than a row-wise lambda.
archivo_campanias = (
    df_grouped_by_week[['date_week', 'combination', 'campaign', 'discount_for_event']]
    .assign(familia=lambda frame: frame['combination'].astype(str).str[:3])
    .groupby(['date_week', 'familia'], as_index=False)[['discount_for_event', 'campaign']]
    .max()
    .loc[:, ['familia', 'date_week', 'discount_for_event', 'campaign']]
)
archivo_campanias.head(1)

Unnamed: 0,familia,date_week,discount_for_event,campaign
0,201,2018-01-08,0.0,0


In [11]:
# Fit one Prophet model per product key ("llave") on its weekly demand, using the
# family-level campaign indicator and discount as extra regressors. When
# `test_model` is True, the 2023 weeks are also predicted and scored with RMSE.
#
# Results left in the notebook namespace:
#   final_prediction_2023 -- columns date / demand_yhat / llave
#   rmse_por_llave        -- columns llave / rsme (one row per scored key)
# Intermediate names (model, prophet_dataframe, forecast, ...) are kept at cell
# level on purpose so that later cells can still inspect the last fitted key.
final_prediction_2023 = pd.DataFrame()
rmse_por_llave = pd.DataFrame()
test_model = False

# Per-key results are collected here and assembled once after the loop, instead
# of re-concatenating (and re-rounding) the full accumulated frame per key.
predictions_2023 = []
rmse_rows = []

if test_model:
    # Loop-invariant: the 2023 slice of the full dataset, used to attach the
    # observed values to the prediction frame.
    df_grouped_by_week_test = df_grouped_by_week[df_grouped_by_week['date_week'].between('2023-01-01', '2023-12-31')]

for combination in tqdm(['240BC5PC6']):

    # All rows of this key; reused for both training and scoring.
    df_key_all = df_grouped_by_week[df_grouped_by_week['combination'] == combination]
    df_combination = df_key_all[['date_week', 'combination', 'quantity']]

    # If the key is in the q95 list and has at least one row dated on/after
    # 2023-01-01, train only on data from 2022-01-01 onwards; otherwise train on
    # everything available.
    # NOTE(review): the original (Spanish) comment said "has data in 2022" while
    # the condition checks 2023 -- confirm which one is intended.
    if combination in llaves_more_then_q95 and (df_combination['date_week'] >= '2023-01-01').any():
        df_combination = df_combination[df_combination['date_week'] >= '2022-01-01']

    familia = str(combination[:3])
    df_discount_and_campaings = archivo_campanias[archivo_campanias['familia'] == familia]

    first_date = df_combination['date_week'].min()  # first week with data for this key
    last_date = df_combination['date_week'].max()   # last week with data for this key

    # Written as `not (... <= 2023)` rather than `> 2023` so that a key with no
    # rows (first_date is NaT, whose year is NaN) is also skipped, as before.
    if not first_date.year <= 2023:
        print(combination)
        continue

    # Monday calendar from the second week of the first year up to the end of 2023.
    df_dates = pd.DataFrame({'date_week': pd.date_range(start=f'{first_date.year}-01-08', end='2023-12-31', freq='W-MON')})

    # NOTE(review): the left merge + fillna would zero-fill the weeks missing for
    # this key, but the following filter drops every row whose `combination` is
    # null -- exactly the rows the merge added. The net effect is only to keep
    # the key's rows whose date is in the Monday calendar. Confirm whether gaps
    # are really meant to be zero-filled before changing this.
    df_combination = df_dates.merge(df_combination, on='date_week', how='left')
    df_combination['quantity'] = df_combination['quantity'].fillna(0)
    df_combination = df_combination[df_combination['combination'].notnull()]
    df_combination = df_combination.merge(df_discount_and_campaings, on='date_week', how='left')

    # Frame in the column layout Prophet expects (ds, y) plus the two regressors.
    prophet_dataframe = (
        df_combination
        .rename(columns={'date_week': 'ds', 'quantity': 'y'})
        [['ds', 'y', 'campaign', 'discount_for_event']]
        .astype({'y': int})
    )

    n_unique_years = prophet_dataframe['ds'].dt.year.nunique()  # distinct calendar years in the training data

    # Fourier order 13 when only one year of data is available, 52 otherwise.
    # NOTE(review): `weekly_seasonality` models the day-of-week cycle; every `ds`
    # here is a Monday, so those terms are constant over the training data.
    # `yearly_seasonality` was possibly intended -- confirm before changing,
    # since it alters the forecasts.
    seasonality_order = 13 if n_unique_years == 1 else 52
    model = Prophet(weekly_seasonality=seasonality_order)
    model.add_regressor('campaign')
    model.add_regressor('discount_for_event')

    try:
        model.fit(prophet_dataframe)
        # with open(f'./serialized_models/{str(combination)}.json', 'w') as file:
        #     file.write(model_to_json(model))  # Save model

        if test_model:
            # Whole weeks between the last training week and the end of 2023.
            weeks = (date(2023, 12, 31) - prophet_dataframe['ds'].max().date()).days // 7

            # Observed 2023 rows of this key plus the family regressors.
            df_combination_test = df_grouped_by_week_test.loc[
                df_grouped_by_week_test['combination'] == combination, ['date_week', 'quantity']
            ]
            df_combination_test = df_combination_test.merge(df_discount_and_campaings, on='date_week', how='left')

            # Prediction frame: history + `weeks` future Mondays, restricted to 2023.
            future_2023 = model.make_future_dataframe(periods=weeks, freq='W-MON')
            future_2023 = future_2023[future_2023['ds'] >= '2023-01-01']
            future_2023 = (
                future_2023
                .merge(df_combination_test, right_on='date_week', left_on='ds', how='left')
                .drop(columns='date_week')
            )
            future_2023['familia'] = familia
            # Weeks without observed rows get 0 for the regressors and the quantity.
            # Done with a dict-based fillna on the frame: `col.fillna(inplace=True)`
            # on a selected column is a chained assignment and does not update the
            # frame under pandas Copy-on-Write.
            future_2023 = future_2023.fillna({'campaign': 0, 'quantity': 0, 'discount_for_event': 0})

            forecast = model.predict(future_2023)
            forecast_2023 = forecast[['ds', 'yhat']].rename(columns={'ds': 'date', 'yhat': 'demand_yhat'})
            # Demand cannot be negative and is expressed in whole units: clip at 0, round up.
            forecast_2023['demand_yhat'] = np.ceil(forecast_2023['demand_yhat'].clip(lower=0))
            forecast_2023['llave'] = combination
            predictions_2023.append(forecast_2023)

            # Score this key against its own forecast only. The original merged
            # the accumulated predictions of every key processed so far, which
            # mixed other keys' forecasts into each key's RMSE.
            # NOTE(review): the model was fitted on every available week,
            # including 2023, so this is an in-sample score rather than a
            # hold-out evaluation.
            X_test = df_key_all[['date_week', 'quantity']].rename(columns={'date_week': 'date', 'quantity': 'y_true'})
            df_ytrue_yhat = forecast_2023.merge(X_test, on='date', how='left')
            df_ytrue_yhat['y_true'] = df_ytrue_yhat['y_true'].fillna(0)

            RMSE = np.sqrt(mean_squared_error(df_ytrue_yhat['y_true'], df_ytrue_yhat['demand_yhat']))
            # The 'rsme' key (sic) is kept unchanged so anything reading that column keeps working.
            rmse_rows.append({'llave': combination, 'rsme': RMSE})
    except Exception as e:
        # Best effort: report the failing key and continue with the next one.
        print(f'problems with this key: {combination}\n{e}')

# Assemble the per-key results once.
if predictions_2023:
    final_prediction_2023 = pd.concat(predictions_2023)
if rmse_rows:
    rmse_por_llave = pd.DataFrame(rmse_rows)

  0%|          | 0/1 [00:00<?, ?it/s]14:35:06 - cmdstanpy - INFO - Chain [1] start processing
14:35:06 - cmdstanpy - INFO - Chain [1] done processing
100%|██████████| 1/1 [00:00<00:00,  6.03it/s]
