In [None]:
from prophet import Prophet
from prophet.serialize import model_to_json
import pandas as pd
import json
from tqdm import tqdm
from datetime import date
import numpy as np
from sklearn.metrics import mean_squared_error
import warnings
# Silence every Python warning for the rest of the notebook session.
# NOTE(review): this blanket filter also hides deprecation and
# chained-assignment warnings raised by the cells below.
warnings.filterwarnings('ignore')
import logging

# Set the logging level for cmdstanpy to WARNING, so records below WARNING
# from that logger are not emitted.
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)


In [None]:
# Load the curated weekly sales dataset from its parquet file.
master_sales_by_week = pd.read_parquet(
    './datasets/master_sales_by_week_curated.parquet'
)

In [None]:
# Store the calendar year of every row as a 4-digit string (e.g. '2023').
master_sales_by_week['year'] = master_sales_by_week['date'].dt.strftime('%Y')

In [None]:
# Total quantity sold per product_id over the whole dataset.
total_sales_by_product_id = (
    master_sales_by_week
    .groupby('product_id', as_index=False)['quantity']
    .sum()
)
# Products whose summed quantity is exactly zero.
products_without_sales = total_sales_by_product_id.loc[
    total_sales_by_product_id['quantity'].eq(0)
]
product_ids_without_sales = products_without_sales['product_id'].unique()
print(f'Total products_ids without sales: {len(product_ids_without_sales)}')
# Keep only the rows whose product_id is not in that zero-sales list.
has_sales = ~master_sales_by_week['product_id'].isin(product_ids_without_sales)
master_sales_by_week = master_sales_by_week[has_sales]

In [None]:
# Show the first row as a quick look at the available columns.
master_sales_by_week.head(n=1)

In [None]:
# For each year found in the data, print how many distinct keys
# ('combination' values) have rows dated within that calendar year.
for year in master_sales_by_week['year'].unique():
    in_year = master_sales_by_week['date'].between(f'{year}-01-01', f'{year}-12-31')
    n_unique_keys = master_sales_by_week.loc[in_year, 'combination'].nunique()
    print(f"Año:{year}\n\t# Llaves Unicas: {n_unique_keys}")

In [None]:
# Distinct keys with at least one row dated in 2023 and in 2024 respectively.
in_2023 = master_sales_by_week['date'].between('2023-01-01', '2023-12-31')
in_2024 = master_sales_by_week['date'].between('2024-01-01', '2024-12-31')
unique_keys_in_2023 = master_sales_by_week.loc[in_2023, 'combination'].unique()
unique_keys_in_2024 = master_sales_by_week.loc[in_2024, 'combination'].unique()

# Only keys present in BOTH years are forecast: 2023 belongs to the training
# window and 2024 is the year used for testing.
keys_to_forecast = list(
    set(unique_keys_in_2023).intersection(set(unique_keys_in_2024))
)


#### Create datasets for train (2018-2023) and test (2024)

In [None]:
# Split by date (both bounds inclusive):
#   train -> 2018-01-01 .. 2023-12-31
#   test  -> 2024-01-01 .. 2024-12-31
is_train_period = master_sales_by_week['date'].between('2018-01-01', '2023-12-31')
is_test_period = master_sales_by_week['date'].between('2024-01-01', '2024-12-31')
master_sales_by_week_train = master_sales_by_week.loc[is_train_period]
master_sales_by_week_test = master_sales_by_week.loc[is_test_period]

In [None]:
# Number of distinct dates (weeks) each key to forecast has in the training
# window. One groupby pass replaces filtering the whole training frame once
# per key, which scaled with (number of keys x number of rows).
weeks_of_information_by_combination_df = (
    master_sales_by_week_train
    .groupby('combination')['date']
    .nunique()
    # Restrict to the keys to forecast; a key with no training rows gets 0.
    .reindex(keys_to_forecast, fill_value=0)
    .rename('n_weeks')
    .rename_axis('combination')
    .reset_index()
)
# Same information as a list of {'combination', 'n_weeks'} records, kept
# under its original name for any code that still reads the list.
weeks_of_information_by_combination = weeks_of_information_by_combination_df.to_dict('records')

In [None]:
# Keys whose training history spans 12 or fewer distinct weeks.
# NOTE(review): the comparison is inclusive (<= 12) while the variable name
# and the printed message say "less than 12" - confirm the intended threshold.
too_short = weeks_of_information_by_combination_df['n_weeks'].le(12)
keys_with_less_than_12_weeks_of_info = (
    weeks_of_information_by_combination_df.loc[too_short, 'combination'].unique()
)

# Keep a row only if its key is present in both 2023 and 2024 AND the key has
# enough training information; the same rule is applied to train and test.
train_keep = (
    master_sales_by_week_train['combination'].isin(keys_to_forecast)
    & ~master_sales_by_week_train['combination'].isin(keys_with_less_than_12_weeks_of_info)
)
test_keep = (
    master_sales_by_week_test['combination'].isin(keys_to_forecast)
    & ~master_sales_by_week_test['combination'].isin(keys_with_less_than_12_weeks_of_info)
)
master_sales_by_week_train = master_sales_by_week_train[train_keep]
master_sales_by_week_test = master_sales_by_week_test[test_keep]
print(f'#Keys with less than 12 weeks of info: {len(keys_with_less_than_12_weeks_of_info)}')

In [None]:
# Fit one Prophet model per key on the training window, forecast the weeks of
# 2024 and score each key's forecast against that key's own 2024 sales.
#
# Outputs of this cell:
#   final_prediction_2024 -> columns: date, demand_yhat, llave
#   rmse_por_llave        -> columns: llave, rmse   (one row per key)
logging.getLogger('cmdstanpy').setLevel(logging.WARNING)

# Extra regressors passed to Prophet in addition to the date.
REGRESSORS = ['discount', 'price_taxes_excluded']

# Results are collected in lists and assembled once after the loop; growing a
# DataFrame with pd.concat on every iteration is quadratic in the key count.
predictions_by_key = []
rmse_records = []

for combination in tqdm(master_sales_by_week_train['combination'].unique()):
    sales_combination = master_sales_by_week_train.loc[
        master_sales_by_week_train['combination'] == combination,
        ['date', 'combination', 'quantity', 'cod_fami', 'discount', 'price_taxes_excluded'],
    ]

    first_date = sales_combination['date'].min()  # first date with data for this key

    # Monday-anchored weekly calendar, from Jan 8 of the key's first year up
    # to the end of 2023.
    df_dates = pd.DataFrame({
        'date': pd.date_range(start=f'{first_date.year}-01-08', end='2023-12-31', freq='W-MON')
    })

    # Keep only the sales rows whose date is on that calendar (inner join).
    # NOTE(review): calendar weeks without a sales row are NOT added as
    # zero-quantity rows; doing so would also require values for the two
    # regressors on those weeks. Confirm whether zero-filling was intended.
    sales_combination = df_dates.merge(sales_combination, on='date', how='inner')
    sales_combination['quantity'] = sales_combination['quantity'].fillna(0)

    # Frame in the layout Prophet expects: 'ds' (date), 'y' (target), regressors.
    prophet_dataframe = pd.DataFrame({
        'ds': sales_combination['date'],
        'y': sales_combination['quantity'].astype(int),
        'discount': sales_combination['discount'],
        'price_taxes_excluded': sales_combination['price_taxes_excluded'],
    })

    # Number of different calendar years covered by the training rows.
    n_unique_years = prophet_dataframe['ds'].dt.year.nunique()

    # NOTE(review): an integer weekly_seasonality is the number of Fourier
    # terms of Prophet's 7-day seasonal component; with one observation per
    # week, confirm this (rather than yearly seasonality) is what is wanted.
    model = Prophet(weekly_seasonality=13 if n_unique_years == 1 else 52)
    for regressor in REGRESSORS:
        model.add_regressor(regressor)
    model.fit(prophet_dataframe)

    # -----------
    # Test Model
    # -----------
    # Whole weeks between the last training date and 2024-12-31.
    weeks = (date(2024, 12, 31) - prophet_dataframe['ds'].max().date()).days // 7

    # Actual 2024 rows of this key: y_true plus the regressor values.
    df_combination_test = master_sales_by_week_test.loc[
        master_sales_by_week_test['combination'] == combination,
        ['date', 'quantity'] + REGRESSORS,
    ]

    # Future frame restricted to 2024, enriched with the observed regressors.
    future_2024 = model.make_future_dataframe(periods=weeks, freq='W-MON')
    future_2024 = future_2024[future_2024['ds'] >= '2024-01-01']
    future_2024 = future_2024.merge(
        df_combination_test.rename(columns={'date': 'ds'}), on='ds', how='left'
    )
    future_2024['familia'] = combination[:3]  # informational only; not a model input
    # Weeks without a test row get 0 for quantity and for both regressors.
    # Assigning the result back (instead of fillna(inplace=True) on a column
    # selection) guarantees the fill is applied to future_2024 itself.
    fill_columns = ['quantity'] + REGRESSORS
    future_2024[fill_columns] = future_2024[fill_columns].fillna(0)

    # Predictions for this key only.
    forecast = model.predict(future_2024)
    forecast_combination = (
        forecast[['ds', 'yhat']]
        .rename(columns={'ds': 'date', 'yhat': 'demand_yhat'})
        .assign(llave=combination)
    )
    # Negative forecasts are clipped to 0 and values are rounded up to whole units.
    forecast_combination['demand_yhat'] = np.ceil(
        forecast_combination['demand_yhat'].clip(lower=0)
    )
    predictions_by_key.append(forecast_combination)

    # RMSE of THIS key: its own forecast against its own actual sales.
    # (Scoring the accumulated predictions of all keys processed so far
    # against a single key's y_true would mix unrelated keys into the error.)
    y_true_combination = (
        df_combination_test[['date', 'quantity']]
        .rename(columns={'quantity': 'y_true'})
    )
    df_ytrue_yhat = forecast_combination.merge(y_true_combination, on='date', how='left')
    df_ytrue_yhat['y_true'] = df_ytrue_yhat['y_true'].fillna(0)  # no test row -> zero sales
    rmse = np.sqrt(mean_squared_error(df_ytrue_yhat['y_true'], df_ytrue_yhat['demand_yhat']))
    rmse_records.append({'llave': combination, 'rmse': rmse})

# Assemble the outputs once; keep the expected columns even with no keys.
if predictions_by_key:
    final_prediction_2024 = pd.concat(predictions_by_key)
else:
    final_prediction_2024 = pd.DataFrame(columns=['date', 'demand_yhat', 'llave'])
rmse_por_llave = pd.DataFrame(rmse_records, columns=['llave', 'rmse'])

In [None]:
# Unweighted mean of the per-key RMSE values.
rmse_por_llave.loc[:, 'rmse'].mean()

### Check performance 

In [None]:
# Reduce the 2024 test set to (date, llave, y_true) so it can be joined with
# the predictions on ['date', 'llave'].
# rename() ignores labels that are not present, so re-running this cell after
# the columns were already renamed is a no-op instead of a KeyError.
master_sales_by_week_test = (
    master_sales_by_week_test
    .rename(columns={'combination': 'llave', 'quantity': 'y_true'})
    .loc[:, ['date', 'llave', 'y_true']]
)
master_sales_by_week_test.head()

In [None]:
# Attach the actual sales to every prediction and keep only the
# (date, llave) pairs for which an actual value exists.
y_true_y_hat = (
    final_prediction_2024
    .merge(master_sales_by_week_test, how='left', on=['date', 'llave'])
    .dropna(subset=['y_true'])
)
# The family code is the first three characters of the key.
y_true_y_hat['cod_fami'] = y_true_y_hat['llave'].str[:3]

In [None]:
# Write both result tables as comma-separated files, without the index.
for frame, csv_path in (
    (y_true_y_hat, './predicciones/2024_prediccion_vs_venta_real.csv'),
    (rmse_por_llave, './predicciones/2024_rmse_por_llave.csv'),
):
    frame.to_csv(csv_path, index=False, sep=',')

In [None]:
# Weekly totals of predicted and actual demand per family.
# The columns are selected with a list: selecting several columns of a
# groupby with a bare tuple (['a', 'b'] without the inner brackets) is
# rejected by pandas >= 2.0.
x = (
    y_true_y_hat
    .groupby(['date', 'cod_fami'], as_index=False)[['demand_yhat', 'y_true']]
    .sum()
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')

# Families (first three characters of each key) present in the 2024 test set,
# sorted so the figures are produced in a stable order.
llaves_pred = sorted({llave[:3] for llave in master_sales_by_week_test['llave'].unique()})

# One figure per family: actual vs predicted weekly totals, saved as PNG.
for familia in llaves_pred:
    fig, ax = plt.subplots(figsize=(15, 5))

    # Weekly totals of this family in chronological order (sorted into a new
    # frame rather than in place on a slice of `x`).
    t = x[x['cod_fami'] == familia].sort_values(by='date')

    sns.lineplot(x=t['date'].values, y=t['y_true'].values, color='#414141',
                 lw=2.2, alpha=0.6, label='Valor Real', ax=ax)
    sns.lineplot(x=t['date'].values, y=t['demand_yhat'].values, color='#39A7FF',
                 lw=2.2, alpha=0.6, label='Prediccion', ax=ax)

    # Legend
    ax.legend(bbox_to_anchor=(0.9, 1.15), loc='upper center', fancybox=True, fontsize=10)
    # Axis labels: the x axis carries the week date, the y axis the summed sales.
    ax.set_xlabel('Date - Week', fontsize=10, color='#414141', labelpad=15)
    ax.set_ylabel('SUM(Ventas)', fontsize=10, color='#414141', labelpad=15)
    # Horizontal reference line at y=0
    ax.axhline(y=0, color='#414141', linewidth=1, alpha=.5)
    ax.tick_params(axis='both', labelsize=10, labelcolor='#414141')
    ax.set_title(f'\nFamilia: {familia}\n\n', fontsize=15, fontweight='bold',
                 color='#414141', loc='left')

    # Save, display, then release the figure so open figures do not pile up
    # in memory while looping over families.
    fig.savefig(f'./plots/model_performance/familia_{familia}.png', bbox_inches='tight', dpi=150)
    plt.show()
    plt.close(fig)
