# **Prophet with variable changepoint_prior_scale**

## Import packages

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics
from fbprophet.plot import plot_cross_validation_metric
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import boxcox
from scipy.special import inv_boxcox
from scipy import stats
import datetime
import time
import tqdm
import os
import gc

## Memory reduction function

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that holds their range.

    The frame is modified in place and also returned, so the call can be
    wrapped around a loader (``reduce_mem_usage(pd.read_csv(...))``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose int16/32/64 and float16/32/64 columns are downcast.
        Columns of any other dtype are left untouched.
    verbose : bool, default True
        When true, print the memory usage after optimization and the
        percentage saved. When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float columns are chosen by value range only, so a column may end up as
    float16 and lose precision even though every value "fits".
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a value equal to the type's min/max still fits.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN (all comparisons false).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

## Load data and reduce size

In [3]:
# Directory holding the competition CSV files; change this one constant to run
# the notebook against a different copy of the data.
DATA_DIR = '/kaggle/input/m5-forecasting-accuracy'

# Each file is downcast right after it is read so the full-size frame is never
# kept around. reduce_mem_usage prints one report per file, in this order.
sales_train_validation = reduce_mem_usage(pd.read_csv(os.path.join(DATA_DIR, 'sales_train_validation.csv')))
calendar = reduce_mem_usage(pd.read_csv(os.path.join(DATA_DIR, 'calendar.csv')))
sell_prices = reduce_mem_usage(pd.read_csv(os.path.join(DATA_DIR, 'sell_prices.csv')))
submission_sample = reduce_mem_usage(pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv')))

Memory usage after optimization is: 95.00 MB
Decreased by 78.7%
Memory usage after optimization is: 0.12 MB
Decreased by 41.9%
Memory usage after optimization is: 130.48 MB
Decreased by 37.5%
Memory usage after optimization is: 2.09 MB
Decreased by 84.5%


## Transpose and transform the data and create holiday effects DF

In [4]:
# Build an `id` key on the price table as `<item_id>_<store_id>_validation` so it
# can be merged with the sales rows on `id` below.
# NOTE(review): presumably this matches the `id` format in sales_train_validation — confirm.
sell_prices['id'] = sell_prices.item_id+'_'+ sell_prices.store_id+'_validation'
# These descriptive columns are dropped before reshaping; only `id` and the
# remaining columns are melted.
ex_columns = ['item_id','dept_id','cat_id','store_id','state_id']
sales_train_validation = reduce_mem_usage(sales_train_validation.drop(ex_columns, axis = 1))
# Wide -> long: one row per (id, d) pair with the value stored in `sales_units`.
sales_train = reduce_mem_usage(sales_train_validation.melt(id_vars=["id"], 
        var_name="d", 
        value_name="sales_units"))


# Lookup from the `d` label to its `date` and `wm_yr_wk`, taken from the calendar.
day_d = reduce_mem_usage(calendar[['date','d','wm_yr_wk']])
# Attach `date` and `wm_yr_wk` to every sales row.
sales_date = reduce_mem_usage(sales_train.merge(day_d, on = 'd', how = 'left'))
sell_prices = reduce_mem_usage(sell_prices[['id','sell_price','wm_yr_wk']])

# Left join keeps every sales row; rows without a matching (id, wm_yr_wk) price get NaN.
df_final = reduce_mem_usage(sales_date.merge(sell_prices, on=['id','wm_yr_wk'], how = 'left'))
df_final['y'] = df_final['sales_units']

# Box-Cox needs strictly positive input, hence the +1 shift. A single lambda is
# fitted over the whole column (all ids together); `lamb` is reused by the
# forecasting function to invert the transform.
y_trans, lamb = boxcox(df_final['y']+1)
# Alternative transform that was tried and left disabled:
#y_trans, lamb = stats.yeojohnson(df_final['y'])

df_final['y'] = y_trans

# Create the holidays data frame in the layout passed to Prophet(holidays=...):
# columns `holiday`, `ds`, `lower_window`, `upper_window`. Only `event_name_1`
# is used, and rows without an event are dropped.

event_name = calendar[['event_name_1','date']].dropna(axis = 0)
event_name.columns = ['holiday','ds']
event_name['lower_window'] = 0
event_name['upper_window'] = 1
# Reduce dataframe size.
# NOTE(review): reduce_mem_usage picks float dtypes by value range, so the
# transformed `y` may be stored as float16 here — confirm the precision is acceptable.
df_final = reduce_mem_usage(df_final)
event_name = reduce_mem_usage(event_name)
# Column labels of the sample submission, reused as the header of the output file.
submission_columns = submission_sample.columns

Memory usage after optimization is: 93.83 MB
Decreased by 0.0%
Memory usage after optimization is: 1001.26 MB
Decreased by 0.0%
Memory usage after optimization is: 0.03 MB
Decreased by 0.0%
Memory usage after optimization is: 2002.51 MB
Decreased by 0.0%
Memory usage after optimization is: 78.29 MB
Decreased by 0.0%
Memory usage after optimization is: 2113.76 MB
Decreased by 0.0%
Memory usage after optimization is: 2225.01 MB
Decreased by 13.0%
Memory usage after optimization is: 0.00 MB
Decreased by 35.0%


## Build the modelling frame (`ds`, `y`) and limit the history window

In [5]:
######################################
#TEST IF MODEL IS PREDICTING 
######################################
#[df_final['id'].isin(df_final.id.unique()[:10000].tolist())]
######################################
# Length of the training window in days (800 days is roughly 2.2 years).
HISTORY_DAYS = 800

# Keep only rows dated on or after (latest date - HISTORY_DAYS).
# Assumes `date` holds ISO 'YYYY-MM-DD' strings, since it is compared against a
# '%Y-%m-%d' formatted string — TODO confirm.
min_date = pd.to_datetime(df_final['date'].max()) - datetime.timedelta(HISTORY_DAYS)
in_window = df_final['date'] >= min_date.strftime("%Y-%m-%d")

# Select just the columns the models need and rename `date` to `ds`.
# `df_final` itself is not modified, so this cell can be re-run safely.
historic_data = (
    df_final.loc[in_window, ['id', 'y', 'date']]
    .rename(columns={'date': 'ds'})
)

## Clear dataframes from memory

In [6]:
# Rebind every large intermediate frame to an empty DataFrame so the original
# objects lose their references, then ask the garbage collector to run.
# Rebinding (rather than `del`) keeps the names defined, so the cell can be re-run.

# Raw inputs
sales_train_validation = pd.DataFrame()
calendar = pd.DataFrame()
sell_prices = pd.DataFrame()

# Reshaped / merged intermediates
sales_train = pd.DataFrame()
day_d = pd.DataFrame()
sales_date = pd.DataFrame()
df_final = pd.DataFrame()

gc.collect()

40

## Forecasting dataframe

In [7]:
def forecast_df(lists_ids):
    """Return the ``ds``/``y`` rows of ``historic_data`` for one series.

    ``lists_ids`` is compared with ``==`` against the ``id`` column of the
    module-level ``historic_data`` frame; the matching rows are returned with
    only the ``ds`` and ``y`` columns, in that order.
    """
    is_requested = historic_data['id'] == lists_ids
    return historic_data.loc[is_requested, ['ds', 'y']]

## Forecasting Block

In [8]:
def prophecy(lists_ids):
    """Fit Prophet on one series and return its 28-day forecast as one row.

    Reads the module-level ``event_name`` (holidays frame) and ``lamb``
    (Box-Cox lambda), and fetches the series history through ``forecast_df``.

    Parameters
    ----------
    lists_ids
        Identifier of the series to forecast (a single id, despite the name).

    Returns
    -------
    numpy.ndarray
        1-D array whose first element is ``lists_ids`` followed by the 28
        forecast values. Because the id is a string, ``np.append`` produces
        a string-typed array, so the forecasts are cleaned *before* they are
        appended.
    """
    hist_data = forecast_df(lists_ids)
    model = Prophet(uncertainty_samples=False,
                    holidays=event_name)

    model.fit(hist_data)
    build_forecast = model.make_future_dataframe(periods=28, freq='D', include_history=False)
    forecast = model.predict(build_forecast)

    # Undo the Box-Cox transform and the +1 shift applied to the target.
    yhat = inv_boxcox(forecast["yhat"].to_numpy(), lamb) - 1
    # NaN must be replaced here: once mixed with the id string it would become
    # the text 'nan', which a later DataFrame.fillna(0) does not recognise.
    yhat = np.where(np.isnan(yhat), 0.0, yhat)
    # Unit sales cannot be negative; clamp before rounding so values in
    # (-1, 0) do not round to -1.
    yhat = np.maximum(yhat, 0.0).round()

    forecast_array = np.append(np.array([lists_ids]), yhat)
    return forecast_array

## Run and submit the model

In [16]:
# Use multiprocessing so that the per-series models can be fitted in parallel.
# NOTE(review): the batch cells create Pool(10) regardless of the CPU count
# printed here — confirm the pool size is intentional.
from multiprocessing import Pool, cpu_count
print(f'Parallelism on {cpu_count()} CPU')

Parallelism on 4 CPU


## Forecasting List

In [10]:
# Sorted array of every distinct series id present in the history window.
lists_ids = np.sort(historic_data['id'].unique())

# The ids are forecast in two batches: the first 15000 ids and the remainder.
# Copies are taken so each batch owns its data instead of viewing `lists_ids`.
first_list, last_list = lists_ids[:15000].copy(), lists_ids[15000:].copy()

In [11]:
#start_time = time.time()
#forecast_array = []
#with Pool(10) as p:
#    predictions = list(tqdm.tqdm(p.imap_unordered(prophecy, first_list),total=len(first_list)))
#    
#submission_df = pd.DataFrame(predictions)
#submission_df = submission_df.fillna(0)
#submission_df.columns = submission_columns
#submission_df.to_csv('submission.csv', header='column_names', index=False)
#
#print("--- %s seconds ---" % (time.time() - start_time))

100%|██████████| 15000/15000 [7:03:27<00:00,  1.69s/it]  


--- 25409.785083293915 seconds ---


In [15]:
# Sanity check: number of distinct ids in the submission frame.
# NOTE(review): `submission_df` is only created by a batch cell, and the
# first-batch cell above is commented out, so on a fresh kernel this line
# fails unless a batch cell has been run first.
len(submission_df.id.unique())

15000

## Last iteration

In [None]:
start_time = time.time()

# Fit one model per id in parallel. imap_unordered yields rows in completion
# order, which is fine because every row carries its own id as first element.
# NOTE(review): 10 workers are requested regardless of cpu_count() — confirm.
with Pool(10) as p:
    predictions = list(tqdm.tqdm(p.imap_unordered(prophecy, last_list), total=len(last_list)))

submission_df = pd.DataFrame(predictions)
submission_df.columns = submission_columns

# Each prediction row mixes the id string with the forecasts, so the forecast
# cells may arrive as text (including the literal 'nan'). Coerce them to numbers
# first; otherwise fillna(0) has nothing to replace.
forecast_cols = submission_df.columns[1:]
submission_df[forecast_cols] = (
    submission_df[forecast_cols]
    .apply(pd.to_numeric, errors='coerce')
    .fillna(0)
)

# header=True writes the column labels assigned above.
submission_df.to_csv('submission1.csv', header=True, index=False)

print("--- %s seconds ---" % (time.time() - start_time))