In [1]:
%run ./imports_models.py
%run ./dataset_preparation_v2.ipynb
%run -i ./utils.py

Stored 'df_complete' (DataFrame)
Stored 'df_grouped_dates' (DataFrame)
Stored 'series_grouped_dates' (Series)
Stored 'pandas_dataframe_groupby_site' (DataFrameGroupBy)
Stored 'sites_names' (list)


In [2]:
PATH_SERIALIZER = './../serialized_files/'

# Data per site

In [3]:
%store -r df_grouped_dates
%store -r series_grouped_dates
%store -r pandas_dataframe_groupby_site
%store -r sites_names

### Select group

In [56]:
site_index = 0
print('%s has been selected'%sites_names[site_index])
site_name = sites_names[site_index]

site_df_complete_data = pandas_dataframe_groupby_site.get_group(site_name)
site_df_grouped_dates = site_df_complete_data.groupby(['DATES'], as_index=False).sum()

CB-EHE: CardAdminAt-PR-EX has been selected


In [57]:
df_data = pd.DataFrame({'ds':site_df_grouped_dates['DATES'], 
                        'y':site_df_grouped_dates['OCCUPANCY_COUNT']})

## Scaling data (optional)

In [58]:
occ_values_to_scale = site_df_grouped_dates['OCCUPANCY_COUNT'].values.reshape(-1,1)
scaler = MinMaxScaler()
scaler_fit = scaler.fit(occ_values_to_scale)

df_data['y'] = scaler_fit.transform(occ_values_to_scale)
df_data

Unnamed: 0,ds,y
0,2018-01-01 01:00:00,0.000000
1,2018-01-01 02:00:00,0.000000
2,2018-01-01 03:00:00,0.000000
3,2018-01-01 04:00:00,0.047619
4,2018-01-01 05:00:00,0.000000
...,...,...
25203,2020-12-31 19:00:00,0.285714
25204,2020-12-31 20:00:00,0.285714
25205,2020-12-31 21:00:00,0.095238
25206,2020-12-31 22:00:00,0.095238


# Save scaler into serialized file

In [None]:
filename_scaler = ''
pickle.dump(scaler_fit, open(PATH_SERIALIZER+filename_model,'wb'))

# Split train and test

In [59]:
df_data_train = df_data[:16791]
df_data_test = df_data[16791:]

# Hyperparameter tuning

In [60]:
print('Tuning...')
start_time_tuning = time.time()
cutoffs = pd.to_datetime(['2018-12-31', '2019-05-30', '2019-09-30'])
param_grid = {
    'changepoint_prior_scale': [0.001, 0.01, 0.1],
    'seasonality_prior_scale': [0.01, 0.1, 1.0]
}

all_params = [dict(zip(param_grid.keys(), v)) for v in itertools.product(*param_grid.values())]
rmses = []

for params in tqdm(all_params):
    p = Prophet(**params
                ).add_seasonality(name='monthly', 
                                  period=30.5, 
                                  fourier_order=5, 
                                  prior_scale=0.02
                                  ).fit(df_data_train)
    df_cv = cross_validation(p, cutoffs=cutoffs, horizon='30 days', parallel="processes")
    df_p = performance_metrics(df_cv, rolling_window=1)
    rmses.append(df_p['rmse'].values[0])

tuning_results = pd.DataFrame(all_params)
tuning_results['rmse'] = rmses
print("--- Tuning ended at %s seconds ---" % (time.time() - start_time_tuning))

Tuning...


  0%|          | 0/9 [00:00<?, ?it/s]

INFO:fbprophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x0000016394C98248>
INFO:fbprophet:Skipping MAPE because y close to 0
INFO:fbprophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x0000016394C70C88>
INFO:fbprophet:Skipping MAPE because y close to 0
INFO:fbprophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x0000016394C7BAC8>
INFO:fbprophet:Skipping MAPE because y close to 0
INFO:fbprophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x0000016394C74F48>
INFO:fbprophet:Skipping MAPE because y close to 0
INFO:fbprophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x000001639820B088>
INFO:fbprophet:Skipping MAPE because y close to 0
INFO:fbprophet:Applying in parallel with <concurrent.futures.process.ProcessPoolExecutor object at 0x00000163981FDD08>
INFO:fbprophet:Skipping MAPE because

--- Tuning ended at 350.3074400424957 seconds ---


In [61]:
tuning_results

Unnamed: 0,changepoint_prior_scale,seasonality_prior_scale,rmse
0,0.001,0.01,0.10105
1,0.001,0.1,0.101622
2,0.001,1.0,0.101599
3,0.01,0.01,0.096893
4,0.01,0.1,0.097169
5,0.01,1.0,0.097218
6,0.1,0.01,0.09635
7,0.1,0.1,0.096677
8,0.1,1.0,0.096694


In [62]:
best_parameters = tuning_results[tuning_results['rmse'] == tuning_results['rmse'].min()]
site_seasonality_prior_scale = best_parameters['seasonality_prior_scale']
site_changepoint_prior_scale = best_parameters['changepoint_prior_scale']

best_parameters

Unnamed: 0,changepoint_prior_scale,seasonality_prior_scale,rmse
6,0.1,0.01,0.09635


### Save hyperparameter tuning into serialized file

In [63]:
filename_hyperparameter = ''
pickle.dump(tuning_results, open(PATH_SERIALIZER+filename_hyperparameter,'wb'))

PermissionError: [Errno 13] Permission denied: './../serialized_files/'

# Instantiate model

In [None]:
print('Training...')
start_time_training = time.time()
prophet_instance = Prophet(yearly_seasonality=True, 
                           weekly_seasonality=True,
                           seasonality_prior_scale=site_seasonality_prior_scale,
                           changepoint_prior_scale=site_changepoint_prior_scale,
                           interval_width=0.95)
prophet_instance.add_seasonality(name='monthly', 
                                 period=30.5, 
                                 fourier_order=5, 
                                 prior_scale=0.02)

prophet_instance.fit(df_data_train)
print("--- Training ended at %s seconds ---" % (time.time() - start_time_training))

# Cross validation

In [None]:
cross_validation_results = cross_validation(prophet_instance, 
                                            initial='12000 hours', 
                                            period='3600 hours',
                                            horizon='168 hours')
cross_validation_results

In [None]:
cross_validation_results_no_negative = replace_negative_values_with_ceros(cross_validation_results.copy())
performance_overall_metrics = performance_metrics(cross_validation_results_no_negative)
mse_mean_training = np.mean(performance_overall_metrics['mse'])
rmse_mean_training = np.mean(performance_overall_metrics['rmse'])
mae_mean_training = np.mean(performance_overall_metrics['mae'])

In [None]:
training_performace_df = pd.DataFrame({'mse_training':mse_mean_training, 
                                       'rmsetraining':rmse_mean_training, 
                                       'mae_training':mae_mean_training},
                                     index=[0])
training_performace_df

In [None]:
fig_mae_training = plot_cross_validation_metric(cross_validation_results, metric='mae')

# Forecast

In [None]:
print('Forecasting...')
start_time_forecast = time.time()
forecast = prophet_instance.predict(df_data_test)
print("--- Forecasting ended at %s seconds ---" % (time.time() - start_time_forecast))
forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].head()

In [None]:
forecast_no_negative = forecast.copy()
forecast_no_negative['yhat'] = replace_negative_values_with_ceros(forecast.copy(), forecast_df=True)
forecast_no_negative[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

# Forecast performance

In [None]:
y_true = df_data_test['y']
y_pred = forecast_no_negative['yhat']
metrics_evaluation_forecast = get_performance_metrics_forecast(y_true, y_pred)
mse_forecast = metrics_evaluation_forecast[0]
rmse_forecast = metrics_evaluation_forecast[1]
mae_forecast = metrics_evaluation_forecast[2]
maape_forecast = metrics_evaluation_forecast[3]

In [None]:
forecast_performace_df = pd.DataFrame({'mse_forecast':mse_forecast, 
                                       'rmse_forecast':rmse_forecast, 
                                       'mae_forecast':mae_forecast,
                                       'maape_forecast':maape_forecast},
                                     index=[0])
forecast_performace_df

# Save model into serialized file

In [None]:
filename_model = ''
# pickle.dump([prophet_instance, forecast, cross_validation_results, performance_overall_metrics], 
#             open(PATH_SERIALIZER+filename_model,'wb'))

# Visualizating

In [None]:
fig_forecast = prophet_instance.plot(forecast_no_negative, xlabel='date')
a = add_changepoints_to_plot(fig_forecast.gca(), prophet_instance, forecast_no_negative)

In [None]:
fig_forecast_components = prophet_instance.plot_components(forecast)

# Plot by year

In [None]:
y_pred_df = forecast_no_negative[['ds','yhat']]
y_pred_df.index = series_grouped_dates[16791:].index
y_true_df = df_data_test.copy()
y_true_df['ds'] = pd.to_datetime(y_true_df['ds'])
y_true_df.index = series_grouped_dates[16791:].index

In [None]:
init_date = '2020-01-01'
end_date = '2020-12-31'
ax_2020=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_2020)
plt.title('Year 2020')

# Plots by month

In [None]:
init_date = '2020-01-01'
end_date = '2020-01-31'
ax_jan=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_jan)
plt.title('January\n %s to %s'%(init_date, end_date))

In [None]:
init_date = '2020-05-01'
end_date = '2020-05-31'
ax_aug=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_aug)
plt.title('May\n %s to %s'%(init_date, end_date))

In [None]:
init_date = '2020-07-01'
end_date = '2020-07-31'
ax_aug=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_aug)
plt.title('July\n %s to %s'%(init_date, end_date))

# Plots per week

In [None]:
init_date = '2020-05-04'
end_date = '2020-05-11'
ax_aug=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_aug)
plt.title('May\n %s to %s'%(init_date, end_date))

In [None]:
init_date = '2020-05-18'
end_date = '2020-05-25'
ax_aug=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_aug)
plt.title('May\n %s to %s'%(init_date, end_date))

# Plots per day

In [None]:
init_date = '2020-05-04 01:00:00'
end_date = '2020-05-04 23:00:00'
ax_aug=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_aug)
plt.title('May: monday\n %s to %s'%(init_date, end_date))

In [None]:
init_date = '2020-05-05 01:00:00'
end_date = '2020-05-05 23:00:00'
ax_aug=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_aug)
plt.title('May: tuesday\n %s to %s'%(init_date, end_date))

In [None]:
init_date = '2020-05-06 01:00:00'
end_date = '2020-05-06 23:00:00'
ax_aug=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_aug)
plt.title('May: wednesday\n %s to %s'%(init_date, end_date))

In [None]:
init_date = '2020-05-07 01:00:00'
end_date = '2020-05-07 23:00:00'
ax_aug=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_aug)
plt.title('May: thursday\n %s to %s'%(init_date, end_date))

In [None]:
init_date = '2020-05-08 01:00:00'
end_date = '2020-05-08 23:00:00'
ax_aug=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_aug)
plt.title('May: friday\n %s to %s'%(init_date, end_date))

In [None]:
init_date = '2020-05-09 01:00:00'
end_date = '2020-05-09 23:00:00'
ax_aug=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_aug)
plt.title('May: saturday\n %s to %s'%(init_date, end_date))

In [None]:
init_date = '2020-05-10 01:00:00'
end_date = '2020-05-10 23:00:00'
ax_aug=y_pred_df[init_date:end_date].plot(x='ds',y='yhat',legend=True,label='y_pred',figsize=(20,8))
y_true_df[init_date:end_date].plot(x='ds',y='y',legend=True,label='y',ax=ax_aug)
plt.title('May: sunday\n %s to %s'%(init_date, end_date))