### Imports & installation

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
pip uninstall holidays -y

In [None]:
pip install holidays==0.23

In [None]:
pip install prophet

In [48]:
from prophet import Prophet

### Loading & transforming the data

In [49]:
train_data = pd.read_csv('burglary_train.csv')
test_data = pd.read_csv('burglary_test.csv')

In [4]:
def to_datetime(df):
    # Changes date to datetime 
    for index, value in enumerate(df['Month']):
        df.at[index, 'Month'] = datetime.strptime(value, '%Y-%m')
    return df

def get_most_frequent_locations(df):
    # Returns a list of most frequent location per month 
    df_grouped = df.groupby([pd.Grouper(key='Month', freq='M'), 'LSOA code']).size()
    most_frequent_location = df_grouped.groupby(level=0).idxmax()
    locs = []
    for loc in range(len(most_frequent_location)):
        locs.append(most_frequent_location[loc][1])
    return locs

def count_per_month(df):
    # Return a dataframe with crimes count per month
    df_per_month = df.groupby(pd.Grouper(key='Month', freq='M')).size()
    df_per_month = pd.DataFrame(df_per_month)
    df_per_month['ds'] = df_per_month.index
    df_per_month = df_per_month.rename(columns={0: 'y'})
    return df_per_month 

def add_locs(df, locs):
    # Adds most frequent location column to the dataframe
    df['loc'] = locs
    return df

In [5]:
def loc_encoding(df):
    # Returns a dataframe with encoded locations
    one_hot_encoded = pd.get_dummies(df['loc'])
    df_encoded = pd.concat([df, one_hot_encoded], axis=1)
    df_encoded = df_encoded.drop('loc', axis=1)
    return df_encoded

def get_cols_for_pred(df):
    # Returns a list of additional columns for regression 
    columns = df.iloc[:, 2:]
    return columns

In [50]:
train_data = to_datetime(train_data)
data_month = (train_data.resample('M', on='Month').mean()).iloc[:, 1:7]
locs = get_most_frequent_locations(train_data)
train_data = count_per_month(train_data)
train_data = add_locs(train_data, locs)
train_data = loc_encoding(train_data)
pred_cols = get_cols_for_pred(train_data)
train_data = pd.concat([train_data, data_month], axis=1)

train_data

Unnamed: 0_level_0,y,ds,E01000116,E01000118,E01000122,E01000123,E01000128,E01000129,E01000130,E01000131,...,E01000315,E01000316,E01000322,E01033572,Ward - % All Working-age (16-64),Ward - % All Older people aged 65+,Ward - Population density (persons per sq km),Ward - % Not Born in UK,Ward - House price lower third,Ward - House price upper third
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-12-31,372,2010-12-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.525269,13.755914,51603.037634,38.946237,0.075269,0.271505
2011-01-31,486,2011-01-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.399177,14.056996,50323.456790,38.524486,0.082305,0.242798
2011-02-28,403,2011-02-28,0,0,0,0,0,0,0,0,...,0,0,0,0,65.477419,13.908933,52452.573201,38.984119,0.086849,0.267990
2011-03-31,408,2011-03-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.405637,13.919118,52567.068627,39.178186,0.063725,0.252451
2011-04-30,414,2011-04-30,0,0,0,0,0,0,0,0,...,0,0,0,0,65.312802,14.005314,51209.925121,38.998309,0.065217,0.210145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-07-31,281,2019-07-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.401779,13.961922,52871.786477,39.655516,0.081851,0.241993
2019-08-31,227,2019-08-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.612775,13.615859,53337.136564,39.715859,0.074890,0.233480
2019-09-30,270,2019-09-30,0,0,0,0,0,0,0,0,...,0,0,0,0,65.550370,13.748889,53791.411111,39.787778,0.103704,0.229630
2019-10-31,275,2019-10-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.534545,13.642545,51196.120000,38.878909,0.112727,0.174545


In [51]:
test_data = to_datetime(test_data)
data_month_test = (test_data.resample('M', on='Month').mean()).iloc[:, 1:7]
locs = get_most_frequent_locations(test_data)
test_data = count_per_month(test_data)
test_data = add_locs(test_data, locs)
test_data = loc_encoding(test_data) 
pred_cols_test = get_cols_for_pred(test_data)
test_data = pd.concat([test_data, data_month_test], axis=1)

In [28]:
test_data

Unnamed: 0_level_0,y,ds,E01000124,E01000125,E01000128,E01000129,E01000131,E01000136,E01000138,E01000141,...,E01000299,E01000308,E01000316,E01033573,Ward - % All Working-age (16-64),Ward - % All Older people aged 65+,Ward - Population density (persons per sq km),Ward - % Not Born in UK,Ward - House price lower third,Ward - House price upper third
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-11-30,101,2019-11-30,0,0,0,0,0,0,0,0,...,0,0,0,0,65.515842,13.327723,55216.623762,39.89802,0.059406,0.306931
2019-12-31,346,2019-12-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.501445,13.704913,52379.514451,38.715318,0.109827,0.231214
2020-01-31,280,2020-01-31,0,0,0,0,0,0,0,0,...,1,0,0,0,65.464643,13.768929,51599.507143,39.186429,0.082143,0.292857
2020-02-29,282,2020-02-29,0,0,0,0,0,0,0,0,...,0,0,0,0,65.300709,13.713121,53267.283688,39.437589,0.102837,0.230496
2020-03-31,258,2020-03-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.417829,13.763566,51912.023256,39.081008,0.096899,0.232558
2020-04-30,258,2020-04-30,0,0,0,0,0,0,0,0,...,0,0,0,0,65.528682,13.488372,51430.674419,40.699225,0.077519,0.24031
2020-05-31,160,2020-05-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.6975,13.3525,53288.40625,40.58625,0.0625,0.23125
2020-06-30,166,2020-06-30,0,0,0,0,0,0,0,0,...,0,0,0,0,65.422289,13.83494,52784.849398,39.63494,0.090361,0.216867
2020-07-31,186,2020-07-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.494086,13.805376,53584.860215,40.303763,0.080645,0.204301
2020-08-31,239,2020-08-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.233054,13.861925,50428.175732,39.837238,0.066946,0.246862


NameError: name 'prophet_params' is not defined

In [52]:
all_data = pd.concat([train_data, test_data[1:13]], axis=0).fillna(0)
all_data

common_cols_1 =list(set.intersection(set(all_data), set(train_data)))

len(common_cols_1)

test = all_data[common_cols_1]

test

Unnamed: 0_level_0,E01000146,E01000174,E01000207,E01000223,Ward - House price upper third,E01000224,E01033572,E01000150,E01000161,Ward - % All Older people aged 65+,...,Ward - % Not Born in UK,E01000142,E01000240,E01000307,E01000130,E01000228,E01000291,E01000129,E01000275,E01000281
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-12-31,0,0.0,0.0,0.0,0.271505,0,0.0,0.0,0.0,13.755914,...,38.946237,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
2011-01-31,0,0.0,0.0,0.0,0.242798,0,0.0,0.0,0.0,14.056996,...,38.524486,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
2011-02-28,0,0.0,0.0,0.0,0.267990,0,0.0,0.0,0.0,13.908933,...,38.984119,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
2011-03-31,0,0.0,0.0,0.0,0.252451,0,0.0,0.0,0.0,13.919118,...,39.178186,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
2011-04-30,0,0.0,0.0,0.0,0.210145,0,0.0,0.0,0.0,14.005314,...,38.998309,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-07-31,0,0.0,0.0,0.0,0.204301,0,0.0,0.0,0.0,13.805376,...,40.303763,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
2020-08-31,0,0.0,0.0,0.0,0.246862,0,0.0,0.0,0.0,13.861925,...,39.837238,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
2020-09-30,0,0.0,0.0,0.0,0.192913,0,0.0,0.0,0.0,13.629134,...,39.794882,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0
2020-10-31,0,0.0,0.0,0.0,0.276667,0,0.0,0.0,0.0,13.925667,...,38.948333,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0


# Prophet

### Basic implementation on aggregated data

In [None]:
prophet_model = Prophet(yearly_seasonality=True)

prophet_model.fit(train_data)

future = prophet_model.make_future_dataframe(periods=12, freq='M')
predictions = prophet_model.predict(future)

In [None]:
future

In [None]:
plt = prophet_model.plot(predictions)
plt.legend()
plt.show()

In [None]:
test_predictions = predictions[-12:]
test_predictions_reset = test_predictions.reset_index(drop=True)
test_data_reset = test_data[-12:].reset_index(drop=True)

test_data_reset

In [None]:
mse = ((test_predictions_reset['yhat'] - test_data_reset['y']) ** 2).mean()
rmse = mse ** 0.5
print('RMSE: {:.2f}'.format(rmse))

In [None]:
components = prophet_model.plot_components(predictions)

In [None]:
all_regressors_test.set_index(pd.Index(range(108,120)))

#### On aggregated data with most common location per month as and additional regressor

In [53]:
# new model with location as additional regressor (in dummy variables)
prophet_model_extra = Prophet(yearly_seasonality=True)

# adds all location dummies to the model and fits the training data
all_regressors = pd.concat([pred_cols,data_month],axis=1)

for col in all_regressors:
    prophet_model_extra.add_regressor(col)

prophet_model_extra.fit(train_data)

10:07:09 - cmdstanpy - INFO - Chain [1] start processing
10:07:09 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x1b94fb56e20>

In [32]:
pred_cols

Unnamed: 0_level_0,E01000116,E01000118,E01000122,E01000123,E01000128,E01000129,E01000130,E01000131,E01000134,E01000135,...,E01000296,E01000299,E01000305,E01000307,E01000310,E01000314,E01000315,E01000316,E01000322,E01033572
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-12-31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2011-01-31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2011-02-28,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2011-03-31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2011-04-30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-07-31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-08-31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-09-30,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-10-31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# extract column names. This is needed to create the future dataset
colnames = all_regressors.columns.values.tolist()

In [55]:
# create future dataframe, merge it with the columns where the date matches
future_extra = prophet_model_extra.make_future_dataframe(periods=12, freq='M')
future_extra = train_data[['ds']+colnames].merge(future_extra, how='outer', on='ds')

future_extra

Unnamed: 0,ds,E01000116,E01000118,E01000122,E01000123,E01000128,E01000129,E01000130,E01000131,E01000134,...,E01000315,E01000316,E01000322,E01033572,Ward - % All Working-age (16-64),Ward - % All Older people aged 65+,Ward - Population density (persons per sq km),Ward - % Not Born in UK,Ward - House price lower third,Ward - House price upper third
0,2010-12-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,65.525269,13.755914,51603.037634,38.946237,0.075269,0.271505
1,2011-01-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,65.399177,14.056996,50323.456790,38.524486,0.082305,0.242798
2,2011-02-28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,65.477419,13.908933,52452.573201,38.984119,0.086849,0.267990
3,2011-03-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,65.405637,13.919118,52567.068627,39.178186,0.063725,0.252451
4,2011-04-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,65.312802,14.005314,51209.925121,38.998309,0.065217,0.210145
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,2020-07-31,,,,,,,,,,...,,,,,,,,,,
116,2020-08-31,,,,,,,,,,...,,,,,,,,,,
117,2020-09-30,,,,,,,,,,...,,,,,,,,,,
118,2020-10-31,,,,,,,,,,...,,,,,,,,,,


In [56]:
test = test.drop(columns='y')

In [57]:
test = test.reset_index(drop=True)

test

Unnamed: 0,E01000146,E01000174,E01000207,E01000223,Ward - House price upper third,E01000224,E01033572,E01000150,E01000161,Ward - % All Older people aged 65+,...,Ward - % Not Born in UK,E01000142,E01000240,E01000307,E01000130,E01000228,E01000291,E01000129,E01000275,E01000281
0,0,0.0,0.0,0.0,0.271505,0,0.0,0.0,0.0,13.755914,...,38.946237,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
1,0,0.0,0.0,0.0,0.242798,0,0.0,0.0,0.0,14.056996,...,38.524486,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
2,0,0.0,0.0,0.0,0.267990,0,0.0,0.0,0.0,13.908933,...,38.984119,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
3,0,0.0,0.0,0.0,0.252451,0,0.0,0.0,0.0,13.919118,...,39.178186,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
4,0,0.0,0.0,0.0,0.210145,0,0.0,0.0,0.0,14.005314,...,38.998309,1.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,0,0.0,0.0,0.0,0.204301,0,0.0,0.0,0.0,13.805376,...,40.303763,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
116,0,0.0,0.0,0.0,0.246862,0,0.0,0.0,0.0,13.861925,...,39.837238,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0
117,0,0.0,0.0,0.0,0.192913,0,0.0,0.0,0.0,13.629134,...,39.794882,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,0.0
118,0,0.0,0.0,0.0,0.276667,0,0.0,0.0,0.0,13.925667,...,38.948333,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0


In [58]:
forecast = prophet_model_extra.predict(test) 

In [None]:
plt = prophet_model_extra.plot(forecast)
plt.legend()
plt.show()

In [59]:
test_predictions = forecast[-12:]
test_predictions_reset = test_predictions.reset_index(drop=True)
test_data_reset = test_data[-12:].reset_index(drop=True)

In [60]:
test_data_reset

Unnamed: 0,y,ds,E01000124,E01000125,E01000128,E01000129,E01000131,E01000136,E01000138,E01000141,...,E01000299,E01000308,E01000316,E01033573,Ward - % All Working-age (16-64),Ward - % All Older people aged 65+,Ward - Population density (persons per sq km),Ward - % Not Born in UK,Ward - House price lower third,Ward - House price upper third
0,171,2022-04-30,0,0,0,0,0,0,0,0,...,0,0,0,0,65.629825,13.716959,50951.064327,40.52924,0.064327,0.187135
1,218,2022-05-31,0,0,0,0,0,0,0,0,...,0,0,0,0,66.000459,13.124312,57932.486239,41.08578,0.09633,0.174312
2,195,2022-06-30,0,0,0,0,1,0,0,0,...,0,0,0,0,65.677436,13.397949,55591.394872,40.630769,0.128205,0.169231
3,190,2022-07-31,0,0,0,0,0,0,0,0,...,0,0,0,1,65.617368,13.369474,52933.621053,40.679474,0.078947,0.284211
4,171,2022-08-31,0,0,0,0,0,0,0,0,...,0,0,0,0,65.94152,13.122222,55886.929825,41.680117,0.099415,0.192982
5,189,2022-09-30,0,0,0,0,0,0,0,0,...,0,0,0,0,65.764021,13.375661,53620.253968,40.424339,0.084656,0.222222
6,248,2022-10-31,0,0,0,0,0,0,1,0,...,0,0,0,0,65.841129,13.291532,54608.798387,41.189919,0.084677,0.189516
7,246,2022-11-30,0,0,0,0,0,1,0,0,...,0,0,0,0,65.758537,13.540244,54461.01626,41.49878,0.085366,0.186992
8,228,2022-12-31,0,0,0,1,0,0,0,0,...,0,0,0,0,65.504386,13.582895,52818.27193,40.627632,0.070175,0.254386
9,242,2023-01-31,0,0,0,0,0,0,0,1,...,0,0,0,0,65.906612,13.218595,54600.061983,40.892562,0.115702,0.206612


In [61]:
mse = ((test_predictions_reset['yhat'] - test_data_reset['y']) ** 2).mean()
rmse = mse ** 0.5
print('RMSE: {:.2f}'.format(rmse))

RMSE: 145.04


The plot differs a bit from the first one, but RMSE did not improve.

## Without aggregation

In [None]:
train_data = pd.read_csv('burglary_train.csv')
test_data = pd.read_csv('burglary_test.csv')

In [None]:
train_data = to_datetime(train_data)
test_data = to_datetime(test_data)

In [None]:
def drop(df):
    # Dropping columns (which are not encoded and not used for further predictions)
    df = df.drop(df.columns[0], axis=1)
    df = df.drop(['Crime ID', 'Reported by', 'Longitude', 'Latitude', 'Location', 'LSOA name', 'Crime type', 'Last outcome category'], axis=1)
    return df

In [None]:
train_data = drop(train_data)
test_data = drop(test_data)

In [None]:
def loc_enc(df):
    # Returns a dataframe with encoded locations (based on LSOA code)
    one_hot_encoded = pd.get_dummies(df['LSOA code'])
    df_encoded = pd.concat([df, one_hot_encoded], axis=1)
    df_encoded = df_encoded.drop('LSOA code', axis=1)
    return df_encoded

In [None]:
train_data = loc_enc(train_data)
test_data = loc_enc(test_data)

#### Predicting for one location (0 - 1 crime takes place or not) - without agreggation 
Prophet model can take only one y value, for each chosen location y value is its encoded column

In [None]:
# new dataframe to not to get confused
df_prophet = pd.DataFrame()

loc = 'E01000302' # CHOOSING LOCATION FOR WHICH WE MAKE PREDICTIONS

# adding ds and y columns for prophet
df_prophet['ds'] = train_data['Month']
df_prophet['y'] = train_data[loc]
test_data['ds'] = test_data['Month']
test_data['y'] = test_data[loc]

In [None]:
model = Prophet()
model.fit(df_prophet)

In [None]:
future_dates = model.make_future_dataframe(periods=12, freq='M')

In [None]:
predictions = model.predict(future_dates)

In [None]:
plt = model.plot(predictions)
plt.legend()
plt.show()

In [None]:
test_predictions = predictions[-12:]
test_predictions_reset = test_predictions.reset_index(drop=True)
test_data_reset = test_data[-12:].reset_index(drop=True)

In [None]:
mse = ((test_predictions_reset['yhat'] - test_data_reset['y']) ** 2).mean()
rmse = mse ** 0.5
print('RMSE: {:.2f}'.format(rmse))

This implementation is wrong, as periods=12 with freq=Month `for all data points` will take only 12 last datapoints (so 12 last crimes in the dataset) for predictions and testing. It is not surprising that RMSE is that low and only 0s are predicted, as probably in these 12 test points there was no crime for chosen location... Decided to leave it here as it was a good lesson.

#### Predicting for one location, with aggregation (monthly counts)

In [None]:
# choosing only rows where crimes occur for chosen location
desired_rows = df_prophet.query("y == 1")

In [None]:
def count_per_loc(df):
    # Return a dataframe with crimes count per month (grouped by ds)
    df_per_month = df.groupby(pd.Grouper(key='ds', freq='M')).size()
    df_per_month = pd.DataFrame(df_per_month)
    df_per_month['ds'] = df_per_month.index
    df_per_month = df_per_month.rename(columns={0: 'y'})
    return df_per_month 

In [None]:
data = count_per_loc(desired_rows)
test_data = test_data.query("y == 1")
test_data = count_per_loc(test_data)

In [None]:
model = Prophet()
model.fit(data)

In [None]:
future_dates = model.make_future_dataframe(periods=12, freq='M')
predictions = model.predict(future_dates)
plt = model.plot(predictions)
plt.legend()
plt.show()

In [None]:
test_predictions = predictions[-12:]
test_predictions_reset = test_predictions.reset_index(drop=True)
test_data_reset = test_data[-12:].reset_index(drop=True)

In [None]:
mse = ((test_predictions_reset['yhat'] - test_data_reset['y']) ** 2).mean()
rmse = mse ** 0.5
print('RMSE: {:.2f}'.format(rmse))

Very poor result, probably due to the lack of information, which is not surprising. Fitting on all data points and obtaining scores for each location is a better idea, it may take a while though.

#### Fitting per case for each location, saving RMSE, location and crime amount in the test set in a DF

In [None]:
# Loading and transforming data again, so the notebook can be run from top to bottom
train_data = pd.read_csv('burglary_train.csv')
test_data = pd.read_csv('burglary_test.csv')
train_data = to_datetime(train_data)
test_data = to_datetime(test_data)
train_data = drop(train_data)
test_data = drop(test_data)
train_data = loc_enc(train_data)
test_data = loc_enc(test_data)

In [None]:
more_train = test_data.iloc[:6326, :]

new_test = test_data.iloc[6326:, :]

In [None]:
merged = pd.concat([train_data, more_train], axis=0)

In [None]:
train_data = merged
test_data = new_test

In [None]:
# creating a dataframe to save the results
results = pd.DataFrame({'LSOA':[], 'RMSE':[], 'Burglaries in train data (over 41739)':[], 'Burglaries in test data (over 2527)':[], 'Predicted burglaries (sum of prob)':[]})
i = 0

In [None]:
df_prophet = pd.DataFrame()
df_prophet['ds'] = train_data['Month']
test_data['ds'] = test_data['Month']

In [None]:
# going though lsoas in encoded columns (first one is ds, thus is omitted)
rmse_mean = []
for lsoa in train_data.columns[1:30]:
    # setting y value to the lsoa
    df_prophet['y'] = train_data[lsoa]
    test_data['y'] = test_data[lsoa]
    # fitting the model
    model = Prophet(yearly_seasonality=True)
    model.fit(df_prophet)
    # making predictions 
    # I am not sure how to set the number of periods
    future_dates = model.make_future_dataframe(periods=12, freq='M')
    predictions = model.predict(future_dates)
    test_predictions = predictions[-12:]
    test_predictions_reset = test_predictions.reset_index(drop=True)
    test = test_data[['Month', lsoa]]
    test = test.groupby(pd.Grouper(key='Month', freq='M')).sum()
    test = pd.DataFrame(test)
    test['ds'] = test.index
    test = test.rename(columns={lsoa: 'y'})
    test = test.reset_index(drop=True)
    test_data_reset = test[:12].reset_index(drop=True)
    # evaluation
    mse = ((test_predictions_reset['yhat'] - test_data_reset['y']) ** 2).mean()
    rmse = mse ** 0.5
    # adding to the dataframe
    results.loc[i] = lsoa, rmse, df_prophet['y'].sum(), test_data_reset['y'].sum(), test_predictions_reset['yhat'].sum()
    i += 1
    rmse_mean.append(rmse)

print('Mean RMSE per LSOA: {}'.format(sum(rmse_mean)/len(rmse_mean)))

In [None]:
predictions[-12:]

In [None]:
results

##### The goal is setting the right number for periods, so that it can make use of all points in test data, but also let us extract predictions for the upcoming year (we can also change the research question if necessary...)
##### yhat is not an integer, I don't not if its a problem, but it should be added to the table for a better overview
##### Other perfomance metrics should be implemented and addded to the table

As for each location the model consideres Os already, the only thing left (apart from figuring out the periods) is adding additional data as regressors

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=986fb8fd-fd48-481b-91de-f6577d6c21b9' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>