# fbProphet vs. Bayesian dynamic linear model

## Dynamic Linear Models (DLMs) or state space models
The __pydlm__ package implements the Bayesian dynamic linear model (Harrison and West, 1999) for time series data analysis. Modeling and fitting is simple and easy with pydlm. Complex models can be constructed via simple operations. DLMs define a very general class of non-stationary time series models. Basically, the model uses Kalman filters to estimate the different state matrices. A dynamic linear model can handle non-stationary processes, missing values and non-uniform sampling, as well as observations with varying accuracies.


In [None]:
# install the pydlm package (Bayesian dynamic linear models)
pip install pydlm

In [None]:
#import Packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics
from fbprophet.plot import plot_cross_validation_metric
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import boxcox
from scipy.special import inv_boxcox
import os
import gc
from pydlm import dlm, trend, seasonality, autoReg
#use multiple processing in code
from multiprocessing import Pool, cpu_count
import time
import tqdm

## Data Reduction Function

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (including int8 and non-numeric ones)
    is left untouched. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast in place.
    verbose : bool, default True
        When true, print the memory footprint after optimisation and the
        percentage saved. When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one whose range strictly
    # contains the column's [min, max] wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are exclusive, so a column touching a dtype's limit is
            # stored one size up; if nothing fits the column is left as is.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN/inf extremes fail every comparison and fall back to float64.
            # NOTE: float16/float32 keep the range but not the precision of
            # the original values.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        # Guard the ratio so a zero-byte frame reports 0.0% instead of nan.
        saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Decreased by {:.1f}%'.format(saved_pct))

    return df

## Load data and prepare the forecast DF

In [None]:
# Read the three raw M5 tables, downcasting numeric columns right after each read.
sales_train_validation = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sales_train_validation.csv')
sales_train_validation = reduce_mem_usage(sales_train_validation)
calendar = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/calendar.csv')
calendar = reduce_mem_usage(calendar)
sell_prices = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sell_prices.csv')
sell_prices = reduce_mem_usage(sell_prices)

# Give the price table an "<item>_<store>_validation" key; it is merged on
# 'id' together with the week number further down.
sell_prices['id'] = sell_prices['item_id'] + '_' + sell_prices['store_id'] + '_validation'

# Drop the hierarchy columns, then reshape wide -> long:
# one row per (id, d) with the units sold in 'sales_units'.
ex_columns = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
sales_train_validation = sales_train_validation.drop(columns=ex_columns)
sales_train_validation = reduce_mem_usage(sales_train_validation)
sales_train = sales_train_validation.melt(
    id_vars=["id"],
    var_name="d",
    value_name="sales_units",
)
sales_train = reduce_mem_usage(sales_train)

# Attach the calendar date and week number to every sales row,
# then the weekly sell price.
day_d = reduce_mem_usage(calendar[['date', 'd', 'wm_yr_wk']])
sales_date = reduce_mem_usage(sales_train.merge(day_d, how='left', on='d'))
sell_prices = reduce_mem_usage(sell_prices[['id', 'sell_price', 'wm_yr_wk']])

df_final = reduce_mem_usage(sales_date.merge(sell_prices, how='left', on=['id', 'wm_yr_wk']))

# Target column: units sold shifted by +1 (Box-Cox needs strictly positive
# input) and Box-Cox transformed. `lamb` is the fitted lambda, reused later
# to invert the transform on the forecasts.
x_trans, lamb = boxcox(df_final['sales_units'] + 1)
df_final['y'] = x_trans

# Holidays frame for Prophet: one row per date with a non-null event_name_1,
# with a window of 0 days before and 1 day after.
event_name = calendar[['event_name_1', 'date']].dropna(axis=0)
event_name = event_name.rename(columns={'event_name_1': 'holiday', 'date': 'ds'})
event_name['lower_window'] = 0
event_name['upper_window'] = 1

# Final downcast of both frames.
df_final = reduce_mem_usage(df_final)
event_name = reduce_mem_usage(event_name)

## Remove unwanted dfs

In [None]:
# Rebind every intermediate frame to an empty DataFrame so the large objects
# lose their last reference, then ask the garbage collector to reclaim them.
sales_train_validation = pd.DataFrame()
sales_train = pd.DataFrame()
sales_date = pd.DataFrame()
day_d = pd.DataFrame()
calendar = pd.DataFrame()
sell_prices = pd.DataFrame()
gc.collect()

In [None]:
######################################
#TEST IF MODEL IS PREDICTING 
######################################
# Smoke-test subset: keep only the rows of the first 1000 distinct series ids
# and rename 'date' to 'ds'.
historic_data = (
    df_final
    .loc[df_final['id'].isin(df_final['id'].unique()[:1000])]
    .rename(columns={'date': 'ds'})
)
# Distinct ids present in the subset; the forecasting cells iterate over these.
lists_ids = historic_data['id'].unique()

In [None]:
def forecast_df(lists_ids):
    """Return the 'ds' and 'y' columns of ``historic_data`` for one series id.

    ``lists_ids`` is compared for equality against the 'id' column, so it is
    expected to be a single id value rather than a collection.
    """
    belongs_to_series = historic_data['id'] == lists_ids
    return historic_data.loc[belongs_to_series, ['ds', 'y']]

def prophecy(lists_ids):
    """Fit Prophet to one series and forecast the next 28 days.

    Parameters
    ----------
    lists_ids : str
        A single series id; its history is fetched with ``forecast_df``.

    Returns
    -------
    numpy.ndarray
        1-D object array of length 29: the series id followed by the 28
        forecasts, back-transformed from Box-Cox space, shifted by -1 and
        rounded. Forecasts stay numeric (NaN stays NaN) so that callers can
        post-process them with e.g. ``DataFrame.fillna``.
    """
    hist_data = forecast_df(lists_ids)
    model = Prophet(uncertainty_samples=False,
                    holidays=event_name)

    model.fit(hist_data)
    build_forecast = model.make_future_dataframe(periods=28, freq='D', include_history=False)
    forecast = model.predict(build_forecast)

    # Undo the Box-Cox transform (module-level `lamb`) and the +1 shift that
    # was applied to the target before fitting.
    forecast["yhat"] = (inv_boxcox(forecast["yhat"], lamb) - 1).round()

    # The id array must be object dtype: appending floats to a plain string
    # array would convert every forecast to text ('nan' included), which makes
    # the later fillna(0) on the assembled frame a silent no-op.
    forecast_array = np.append(np.array([lists_ids], dtype=object), forecast['yhat'].values)
    return forecast_array

# Fan the Prophet fits out over one worker process per CPU core.
from multiprocessing import Pool, cpu_count
print(f'Parallelism on {cpu_count()} CPU')

start_time = time.time()
forecast_array = []
with Pool(cpu_count()) as p:
    # imap_unordered yields results as workers finish, so rows arrive in
    # arbitrary order; each row carries its own id in the first position.
    progress = tqdm.tqdm(p.imap_unordered(prophecy, lists_ids), total=len(lists_ids))
    predictions = list(progress)

# One row per series; missing values are replaced with 0.
submission_df = pd.DataFrame(predictions).fillna(0)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
def baysian_forecast_df(lists_ids):
    """Return every column of ``historic_data`` for the rows of one series id."""
    belongs_to_series = historic_data['id'] == lists_ids
    return historic_data.loc[belongs_to_series]

def baysian_forecast(lists_ids):
    """Fit a pydlm dynamic linear model to one series and forecast 28 steps.

    Parameters
    ----------
    lists_ids : str
        A single series id; its history is fetched with
        ``baysian_forecast_df``.

    Returns
    -------
    numpy.ndarray
        1-D object array: the series id followed by the 28 forecasts,
        back-transformed from Box-Cox space, rounded and shifted by -1.
        Forecasts stay numeric (NaN stays NaN) instead of being turned
        into strings.
    """
    hist = baysian_forecast_df(lists_ids)
    hist_data = hist['y'].values

    # Model = linear trend + one seasonal component.
    linear_trend = trend(degree=1, discount=0.95, name='linear_trend', w=100)
    # NOTE(review): this component was labelled "weekly seasonality", but the
    # history looks like daily observations (the Prophet cell forecasts with
    # freq='D'); a weekly cycle on daily data would be period=7 - confirm
    # before changing. The value is left as it was.
    seasonal52 = seasonality(period=52, discount=0.99, name='seasonal52', w=1.0)
    simple_dlm = dlm(hist_data) + linear_trend + seasonal52
    simple_dlm.fit()

    # Predict 28 steps ahead starting from the last observed point; element 0
    # of the returned pair is used as the forecast values.
    forecast = simple_dlm.predictN(date=(len(hist_data) - 1), N=28)[0]
    # Undo the Box-Cox transform (module-level `lamb`) and the +1 shift.
    forecast = (inv_boxcox(forecast, lamb).round() - 1)

    # The id array must be object dtype: appending floats to a plain string
    # array would convert every forecast to text ('nan' included).
    baysian_forecast_array = np.append(np.array([lists_ids], dtype=object), forecast)
    return baysian_forecast_array

# Fan the DLM fits out over worker processes and collect one row per series.
from multiprocessing import Pool, cpu_count
print(f'Parallelism on {cpu_count()} CPU')

start_time = time.time()
forecast_array = []
# Size the pool from cpu_count() so it matches the message printed above and
# the Prophet cell, instead of a hard-coded 10 workers.
with Pool(cpu_count()) as p:
    # imap_unordered yields results as workers finish, so rows arrive in
    # arbitrary order; each row carries its own id in the first position.
    predictions = list(tqdm.tqdm(p.imap_unordered(baysian_forecast, lists_ids), total=len(lists_ids)))

# Build the frame that the following cells rename, melt and merge. Without
# this assignment `submission_df_baysian` is undefined and the column-naming
# cell fails with NameError. Missing forecasts are replaced with 0, as in the
# Prophet cell.
submission_df_baysian = pd.DataFrame(predictions)
submission_df_baysian = submission_df_baysian.fillna(0)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# Borrow the column labels of the sample submission file and apply them to
# both forecast frames. Both frames must already exist at this point and have
# the same number of columns as the sample file.
sample_submission = pd.read_csv('/kaggle/input/m5-forecasting-accuracy/sample_submission.csv')
sample_submission = reduce_mem_usage(sample_submission)
submission_columns = sample_submission.columns
submission_df_baysian.columns = submission_columns
submission_df.columns = submission_columns

In [None]:
# Reshape both forecast frames from wide (one column per forecast day) to
# long (one row per id/day) so the two models can be lined up side by side.
submission_df = pd.melt(
    submission_df,
    id_vars=["id"],
    var_name="day",
    value_name="prophet_forecast_sales_units",
)

submission_df_baysian = pd.melt(
    submission_df_baysian,
    id_vars=["id"],
    var_name="day",
    value_name="baysian_forecast_sales_units",
)

# Inner join: keep only the (id, day) pairs that both models produced.
combined_forecast_df = pd.merge(
    submission_df,
    submission_df_baysian,
    how='inner',
    on=['id', 'day'],
)