In [None]:
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly
from prophet.serialize import model_to_json, model_from_json
import pandas as pd
import json
from tqdm import tqdm
from datetime import date
from llaves_more_then_q95 import llaves_more_then_q95
import numpy as np
from sklearn.metrics import mean_squared_error
import warnings
# Silence every Python warning for the rest of the session
# (this also hides pandas warnings emitted by later cells).
warnings.filterwarnings('ignore')
import logging

# Set the logging level for cmdstanpy to WARNING so only warnings and errors
# from that logger are shown
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)


In [None]:
# Load the curated dataset (one row per key and week, judging by the file name).
# Later cells read its 'date_week', 'combination', 'quantity', 'campaign' and
# 'discount_for_event' columns.
df_grouped_by_week = pd.read_parquet('./datasets/almacenes_si_curated_by_week.parquet')

In [None]:
# Campaign / discount calendar per family (first 3 characters of the key),
# restricted to the weeks of 2023. For every (week, family) pair the maximum
# of each flag across the family's keys is kept.
archivo_insumo = (
    df_grouped_by_week.loc[
        df_grouped_by_week['date_week'].between('2023-01-01', '2023-12-31'),
        ['date_week', 'combination', 'campaign', 'discount_for_event'],
    ]
    .assign(familia=lambda frame: frame['combination'].apply(lambda key: str(key)[:3]))
    .groupby(['date_week', 'familia'], as_index=False)[['discount_for_event', 'campaign']]
    .max()
    .loc[:, ['familia', 'date_week', 'discount_for_event', 'campaign']]
)

In [None]:
# Campaign / discount calendar per family (first 3 characters of the key)
# over the whole history. For every (week, family) pair the maximum of each
# flag across the family's keys is kept.
archivo_campanias = (
    df_grouped_by_week[['date_week', 'combination', 'campaign', 'discount_for_event']]
    .assign(familia=lambda frame: frame['combination'].apply(lambda key: str(key)[:3]))
    .groupby(['date_week', 'familia'], as_index=False)[['discount_for_event', 'campaign']]
    .max()
    .loc[:, ['familia', 'date_week', 'discount_for_event', 'campaign']]
)
archivo_campanias

In [None]:
# Keep only the keys that have enough recent history to be forecast:
# at least 12 weekly rows dated 2022-01-01 or later.
combinations_to_forecast = df_grouped_by_week['combination'].unique()
llaves_con_informacion_historica_suficiente_para_forecast = []
for combination in tqdm(combinations_to_forecast):
    df_combination = df_grouped_by_week[df_grouped_by_week['combination'] == combination]
    semanas_desde_2022 = len(df_combination[df_combination['date_week'] >= '2022-01-01'])
    if semanas_desde_2022 < 12:
        continue  # not enough weeks since 2022
    llaves_con_informacion_historica_suficiente_para_forecast.append(combination)
print(len(llaves_con_informacion_historica_suficiente_para_forecast))

In [None]:
# Quick look at the rows of one key. `combination` is whatever value the most
# recently executed loop left behind, so the output depends on execution order.
df_grouped_by_week.loc[df_grouped_by_week['combination'] == combination]

In [None]:
# Backtest: for every key with enough history, fit Prophet on data up to the
# end of 2022, forecast the weeks of 2023 and score the forecast against the
# observed 2023 quantities.
#
# Outputs of this cell:
#   final_prediction_2023 - columns 'date', 'demand_yhat', 'llave' (all keys stacked)
#   rmse_por_llave        - one row per key with columns 'llave' and 'rsme'
final_prediction_2023 = pd.DataFrame()
rmse_por_llave = pd.DataFrame()
test_model = True

# Hold-out set: observed rows of 2023. Loop-invariant, so it is built once.
df_grouped_by_week_test = df_grouped_by_week[df_grouped_by_week['date_week'].between('2023-01-01','2023-12-31')]

# Per-key results are collected in lists and assembled once after the loop
# (concatenating inside the loop is quadratic in the number of keys).
forecasts_2023 = []
rmse_records = []

for combination in tqdm(llaves_con_informacion_historica_suficiente_para_forecast):

    # Get the dataframe for the combination
    df_combination = df_grouped_by_week[df_grouped_by_week['combination'] == combination]
    df_combination = df_combination[['date_week','combination','quantity']]

    # If the key is in the q95 list and has rows from 2022 onwards, train only on
    # that recent window; otherwise train on everything available.
    if combination in llaves_more_then_q95 and len(df_combination[df_combination['date_week'] >= '2022-01-01']) > 0:
        df_combination = df_combination[df_combination['date_week'] >= '2022-01-01']

    # Family = first 3 characters of the key, derived the same way as the
    # 'familia' column of archivo_campanias.
    familia = str(combination)[:3]
    df_discount_and_campaings = archivo_campanias[archivo_campanias['familia'] == familia]

    first_date = df_combination['date_week'].min() # Get the first date in the dataframe

    if not (test_model == True and first_date.year <= 2022):
        # No training data before 2023 (or backtesting disabled): report the key and skip it.
        print(combination)
        continue

    # Weekly (Monday) calendar from the first year of data up to the end of 2022.
    # Merging against it also removes every 2023 row from the training set.
    df_dates = pd.DataFrame({'date_week': pd.date_range(start=f'{first_date.year}-01-08', end='2022-12-31', freq='W-MON')})

    df_combination = df_dates.merge(df_combination, on='date_week', how='left') # Merge the dataframes
    df_combination['quantity'] = df_combination['quantity'].fillna(0) # Replace NaN values with 0
    # NOTE(review): the next line drops exactly the calendar weeks that had no row for
    # this key, i.e. the weeks that were just zero-filled - confirm this is intended.
    df_combination = df_combination[~(df_combination['combination'].isnull())]
    df_combination = df_combination.merge(df_discount_and_campaings, on='date_week', how='left') # Merge the dataframes

    # Create the prophet dataframe
    prophet_dataframe = pd.DataFrame()
    prophet_dataframe['ds'] = df_combination['date_week']
    prophet_dataframe['y'] = df_combination['quantity'].astype(int)
    prophet_dataframe['campaign'] = df_combination['campaign']
    prophet_dataframe['discount_for_event'] = df_combination['discount_for_event']

    n_unique_years = prophet_dataframe['ds'].dt.year.nunique() # Count the number of different years in the dataframe

    # Create the model.
    # NOTE(review): a numeric `weekly_seasonality` is the Fourier order of Prophet's
    # day-of-week component; on weekly data that component may not be identifiable.
    # Confirm whether a yearly seasonality was intended.
    model = Prophet(weekly_seasonality = 13 if n_unique_years == 1 else 52)
    model.add_regressor('campaign')
    model.add_regressor('discount_for_event')

    try:
        model.fit(prophet_dataframe)
        with open(f'./serialized_models/{str(combination)}.json', 'w') as file:
            file.write(model_to_json(model))  # Save model

        # Number of whole weeks between the last training date and the end of 2023
        weeks = (date(2023, 12, 31) - prophet_dataframe['ds'].max().date()).days // 7

        # Observed 2023 rows of this key, enriched with the family's campaign/discount flags
        df_combination_test = df_grouped_by_week_test[df_grouped_by_week_test['combination'] == combination][['date_week','quantity']]
        df_combination_test = df_combination_test.merge(df_discount_and_campaings, on='date_week', how='left') # Merge the dataframes

        future_2023 = model.make_future_dataframe(periods=weeks, freq='W-MON')
        future_2023 = future_2023[future_2023['ds'] >= '2023-01-01']
        future_2023 = future_2023.merge(df_combination_test, right_on='date_week',left_on= 'ds', how = 'left')
        del future_2023['date_week']
        future_2023['familia'] = familia
        # Plain assignment instead of chained `fillna(inplace=True)`, which is
        # deprecated in recent pandas and does not update the frame under copy-on-write.
        columns_to_fill = ['campaign', 'quantity', 'discount_for_event']
        future_2023[columns_to_fill] = future_2023[columns_to_fill].fillna(0)

        forecast = model.predict(future_2023) # Make the predictions
        # `rename` returns a new frame, so the columns added below do not write into a slice of `forecast`
        forecast_2023 = forecast[['ds', 'yhat']].rename(columns={'ds': 'date', 'yhat': 'demand_yhat'})
        # Demand cannot be negative and is expressed in whole units: clip at 0, then round up
        forecast_2023['demand_yhat'] = np.ceil(forecast_2023['demand_yhat'].clip(lower=0))
        forecast_2023['llave'] = combination # Add the combination column
        forecasts_2023.append(forecast_2023)

        # Score THIS key only: its own 2023 forecast against its own observed quantities
        # (weeks without an observed row count as 0 demand).
        X_test = df_combination_test[['date_week','quantity']].rename(columns={'date_week': 'date', 'quantity': 'y_true'})
        df_ytrue_yhat = forecast_2023.merge(X_test, on='date', how='left')
        df_ytrue_yhat['y_true'] = df_ytrue_yhat['y_true'].fillna(0)

        RMSE = np.sqrt(mean_squared_error(df_ytrue_yhat['y_true'], df_ytrue_yhat['demand_yhat']))
        rmse_records.append({
            'llave' : combination,
            'rsme' : RMSE
        })
    except Exception as e:
        # Best effort: a key that fails to fit/predict is reported and the loop continues
        print(f'problems with this key: {combination}\n{e}')

# Assemble the outputs once; they stay as empty DataFrames when nothing succeeded
if forecasts_2023:
    final_prediction_2023 = pd.concat(forecasts_2023)
if rmse_records:
    rmse_por_llave = pd.DataFrame(rmse_records)

### Train with all data available

In [None]:
# Refit one Prophet model per key using the history available up to the end of
# 2023 and serialize each fitted model to ./serialized_models/<key>.json.
# Null keys are dropped up front: they have no rows to train on and would
# otherwise abort the whole loop before reaching the try/except below.
for combination in tqdm(df_grouped_by_week['combination'].dropna().unique()):

    # Get the dataframe for the combination
    df_combination = df_grouped_by_week[df_grouped_by_week['combination'] == combination]
    df_combination = df_combination[['date_week','combination','quantity']]

    # If the key is in the q95 list and has rows from 2022 onwards, train only on
    # that recent window; otherwise train on everything available.
    if combination in llaves_more_then_q95 and len(df_combination[df_combination['date_week'] >= '2022-01-01']) > 0:
        df_combination = df_combination[df_combination['date_week'] >= '2022-01-01']

    # Family = first 3 characters of the key, derived the same way as the
    # 'familia' column of archivo_campanias.
    familia = str(combination)[:3]
    df_discount_and_campaings = archivo_campanias[archivo_campanias['familia'] == familia]

    first_date = df_combination['date_week'].min() # Get the first date in the dataframe

    # Weekly (Monday) calendar from the first year of data up to the end of 2023
    df_dates = pd.DataFrame({'date_week': pd.date_range(start=f'{first_date.year}-01-08', end='2023-12-31', freq='W-MON')})

    df_combination = df_dates.merge(df_combination, on='date_week', how='left') # Merge the dataframes
    df_combination['quantity'] = df_combination['quantity'].fillna(0) # Replace NaN values with 0
    # NOTE(review): the next line drops exactly the calendar weeks that had no row for
    # this key, i.e. the weeks that were just zero-filled - confirm this is intended.
    df_combination = df_combination[~(df_combination['combination'].isnull())]
    df_combination = df_combination.merge(df_discount_and_campaings, on='date_week', how='left') # Merge the dataframes

    # Create the prophet dataframe
    prophet_dataframe = pd.DataFrame()
    prophet_dataframe['ds'] = df_combination['date_week']
    prophet_dataframe['y'] = df_combination['quantity'].astype(int)
    prophet_dataframe['campaign'] = df_combination['campaign']
    prophet_dataframe['discount_for_event'] = df_combination['discount_for_event']

    n_unique_years = prophet_dataframe['ds'].dt.year.nunique() # Count the number of different years in the dataframe

    # Create the model.
    # NOTE(review): a numeric `weekly_seasonality` is the Fourier order of Prophet's
    # day-of-week component; on weekly data that component may not be identifiable.
    # Confirm whether a yearly seasonality was intended.
    model = Prophet(weekly_seasonality = 13 if n_unique_years == 1 else 52)
    model.add_regressor('campaign')
    model.add_regressor('discount_for_event')

    try:
        model.fit(prophet_dataframe)
        with open(f'./serialized_models/{str(combination)}.json', 'w') as file:
            file.write(model_to_json(model))  # Save model

    except Exception as e:
        # Best effort: a key that fails to fit is reported and the loop continues
        print(f'problems with this key: {combination}\n{e}')




In [147]:
from glob import glob

# Count the entries currently present in the serialized-models folder
serialized_model_paths = glob('./serialized_models/*')
len(serialized_model_paths)

7240

In [None]:
# final_prediction_2024 = pd.DataFrame()
# final_prediction_2024_with_test_data = pd.DataFrame()
# rmse_por_llave_resultados = []
# # Get the list of combinations
# for combination in tqdm(llaves_con_informacion_historica_suficiente_para_forecast):
#     # Get the dataframe for the combination
#     df_combination = df_grouped_by_week[df_grouped_by_week['combination'] == combination]
#     if combination in llaves_more_then_q95 and len(df_combination[df_combination['date_week'] >= '2022-01-01']) > 0: # si la llave esta en las llaves q95 y tiene data en el 2022, se entrena con data 2022, de lo contrario, se entra con todo lo disponible
#         df_combination = df_combination[df_combination['date_week'] >= '2022-01-01']
#     # df_combination_test = df_grouped_by_week_test[df_grouped_by_week_test['combination'] == combination][['date_week','quantity','campaign','discount_for_event']] # get the y_true values of the combination selected
#     df_discount_and_campaings = df_grouped_by_week[df_grouped_by_week['date_week'] >= '2024-01-01']
#     df_discount_and_campaings = df_discount_and_campaings[['combination','date_week','discount_for_event','campaign']]
#     # Complete the missing weeks with 0
#     first_date = df_combination['date_week'].min() # Get the first date in the dataframe
#     last_date = df_combination['date_week'].max() # Get the last date in the dataframe
#     # Create a dataframe with all the weeks between the year of the first date and the year of the last date
#     df_dates = pd.DataFrame({'date_week': pd.date_range(start=f'{first_date.year}-01-08', end=f'{last_date.year}-12-31', freq='W-MON')})    
    
#     df_combination = df_dates.merge(df_combination, on='date_week', how='left') # Merge the dataframes
#     df_combination['quantity'] = df_combination['quantity'].fillna(0) # Replace NaN values with 0
#     df_combination['campaign'] = df_combination['campaign'].fillna(0) # Replace NaN values with 0
#     df_combination['discount_for_event'] = df_combination['discount_for_event'].fillna(0) # Replace NaN values with 0

#     df_combination = df_combination[df_combination['date_week'] < '2023-12-31'] # Drop registers from 2024

#     # Create the prophet dataframe
#     prophet_dataframe = pd.DataFrame()
#     prophet_dataframe['ds'] = df_combination['date_week']
#     prophet_dataframe['y'] = df_combination['quantity']
#     prophet_dataframe['campaign'] = df_combination['campaign']
#     prophet_dataframe['discount_for_event'] = df_combination['discount_for_event']
#     prophet_dataframe['y'] = prophet_dataframe['y'].astype(int)

#     n_unique_years = prophet_dataframe['ds'].dt.year.nunique() # Count the number of different years in the dataframe

#     # Create the model
#     if n_unique_years == 1:
#         model = Prophet(weekly_seasonality = 13)
#         model.add_regressor('campaign')
#         model.add_regressor('discount_for_event')

#     else:
#         model = Prophet(weekly_seasonality = 52)
#         model.add_regressor('campaign')
#         model.add_regressor('discount_for_event')
        
        
#     model.fit(prophet_dataframe)

#     # Calculate how many weeks are missing from the last date in the dataframe to the last week of 2023
#     weeks = (date(2024, 12, 31) - prophet_dataframe['ds'].max().date()).days // 7
#     # Create a dataframe with the dates from the last date in the dataframe to the last week of 2023
#     future_2024 = model.make_future_dataframe(periods=weeks, freq='W-MON')
#     future_2024 = future_2024[future_2024['ds'] >= '2024-01-01']
#     future_2024 = future_2024.merge(df_combination_test, right_on='date_week',left_on= 'ds', how = 'left')
#     del future_2024['date_week']
#     future_2024[['quantity','campaign','discount_for_event']] = future_2024[['quantity','campaign','discount_for_event']].fillna(0)



#     forecast = model.predict(future_2024) # Make the predictions
#     forecast_2024 = forecast[['ds', 'yhat']] # Get the predictions for 2023
#     forecast_2024.columns = ['date', 'demand_yhat'] # Rename columns
#     forecast_2024['llave'] = combination # Add the combination column
#     # Add to the final predictions dataframe
#     final_prediction_2024 = pd.concat([final_prediction_2024, forecast_2024])
#     final_prediction_2024['demand_yhat'] = final_prediction_2024['demand_yhat'].apply(lambda x: 0 if x < 0 else x)
#     # # -----------------------------------------------------------------------------------
#     # # Do the same to generate de prophet_df, but with test data(2023) to compare results
#     # # -----------------------------------------------------------------------------------
#     # # Complete the missing weeks with 0
#     # first_date_test = df_combination_test['date_week'].min() # Get the first date in the dataframe
#     # last_date_test = df_combination_test['date_week'].max() # Get the last date in the dataframe
#     # # Create a dataframe with all the weeks between the year of the first date and the year of the last date
#     # df_dates_test = pd.DataFrame({'date_week': pd.date_range(start=f'{first_date_test.year}-01-08', end=f'{last_date_test.year}-12-31', freq='W-MON')})    
#     # df_combination_test = df_dates_test.merge(df_combination_test, on='date_week', how='left') # Merge the dataframes
#     # df_combination_test['quantity'] = df_combination_test['quantity'].fillna(0) # Replace NaN values with 0
#     # df_combination_test = df_combination_test[df_combination_test['date_week'] < '2024-01-01'] # Drop registers from 2024
    
#     # # Create the prophet TEST dataframe
#     # dataframe_test = pd.DataFrame()
#     # dataframe_test['ds'] = df_combination_test['date_week']
#     # dataframe_test['y_true'] = df_combination_test['quantity']

#     # # Merge the forecast_2024 with the y_true values
#     # prediction = forecast[['ds', 'yhat']].merge(dataframe_test, on='ds', how='left')
#     # prediction = prediction[prediction['ds'] >= '2023-01-01']
#     # prediction['y_true'] = prediction['y_true'].fillna(0) # Fill NaN values with 0
#     # prediction['y_true'] = prediction['y_true'].astype(int)
#     # prediction['yhat'] = prediction['yhat'].apply(lambda x: 0 if x < 0 else x) # Replace negative values with 0
    
    
#     # # Calculate the rsme for test_set
#     # df_rsme = prediction.copy()
#     # rsme = np.sqrt(mean_squared_error(df_rsme['y_true'], df_rsme['yhat']))
    
#     # # generate new column error with the abs(error)
#     # prediction['error'] = prediction['y_true'] - prediction['yhat']
#     # prediction['error'] = prediction['error'].apply(lambda x: abs(x))
#     # prediction['llave'] = combination
#     # final_prediction_2024_with_test_data = pd.concat([final_prediction_2024_with_test_data,prediction])
#     # rmse_por_llave = {
#     #     'llave': combination,
#     #     'rmse' : rsme
#     # }
#     # rmse_por_llave_resultados.append(rmse_por_llave)
#     with open(f'.././serialized_models/{str(combination)}.json', 'w') as file:
#         file.write(model_to_json(model))  # Save model

