In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
import warnings
from skforecast.exceptions import LongTrainingWarning
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

In [2]:
# Load the data
train_data = pd.read_csv("add_burglary_train.csv")
val_data = pd.read_csv("add_burglary_validation.csv")
test_data = pd.read_csv("add_burglary_test.csv")


In [3]:
# The LSOA name will be the y, so create new data for it
y_train = train_data["LSOA name"]
y_val = val_data["LSOA name"]
y_test = test_data["LSOA name"]

# Combine all labels
all_labels = pd.concat([y_train, y_val, y_test])

# Encode labels with integer values
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Transform labels for each dataset
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

In [4]:
# Delete the y from the x data and unwanted columns
drop = ["Last outcome category", "LSOA name", "LSOA11NM"]
x_train = train_data.drop(labels=drop, axis=1, inplace=False)
x_val = val_data.drop(labels=drop, axis=1, inplace=False)
x_test = test_data.drop(labels=drop, axis=1, inplace=False)

# Set categorical features format to category
x_train['Location'] = x_train['Location'].astype('category')
x_train['LSOA code'] = x_train['LSOA code'].astype('category')
x_train["Ward name"] = x_train["Ward name"].astype("category")
x_train["Ward - House price lower third"] = x_train["Ward - House price lower third"].astype("category")
x_train["Ward - House price upper third"] = x_train["Ward - House price upper third"].astype("category")

x_val['Location'] = x_val['Location'].astype('category')
x_val['LSOA code'] = x_val['LSOA code'].astype('category')
x_val["Ward name"] = x_val["Ward name"].astype("category")
x_val["Ward - House price lower third"] = x_val["Ward - House price lower third"].astype("category")
x_val["Ward - House price upper third"] = x_val["Ward - House price upper third"].astype("category")

x_test['Location'] = x_test['Location'].astype('category')
x_test['LSOA code'] = x_test['LSOA code'].astype('category')
x_test["Ward name"] = x_test["Ward name"].astype("category")
x_test["Ward - House price lower third"] = x_test["Ward - House price lower third"].astype("category")
x_test["Ward - House price upper third"] = x_test["Ward - House price upper third"].astype("category")

# Change format of the Month column from string to datetime
x_train['Month'] = pd.to_datetime(x_train['Month'], format='%Y-%m')
x_val['Month'] = pd.to_datetime(x_val['Month'], format='%Y-%m')
x_test['Month'] = pd.to_datetime(x_test['Month'], format='%Y-%m')


In [5]:
exog = ['Month', 'Longitude', 'Latitude', 'Location', 'LSOA code', 'Ward name', 'Ward - House price lower third', 'Ward - House price upper third', 'Ward - % All Working-age (16-64)', 'Ward - % All Older people aged 65+', 'Ward - Population density (persons per sq km)', 'Ward - % Not Born in UK']

In [6]:
# Transformer: Ordinal encoding + cast to category type
pipeline_categorical = make_pipeline(
                            OrdinalEncoder(
                                dtype=int,
                                handle_unknown="use_encoded_value",
                              unknown_value=-1,
                                encoded_missing_value=-1
                            ),
                            FunctionTransformer(
                                func=lambda x: x.astype('category'),
                                feature_names_out= 'one-to-one'
                            )
                       )
transformer_exog = make_column_transformer(
                        (
                            pipeline_categorical,
                            make_column_selector(dtype_exclude=np.number)
                        ),
                        remainder="passthrough",
                        verbose_feature_names_out=False,
                   ).set_output(transform="pandas")


In [7]:
# Create forecaster with automatic categorical detection
forecaster = ForecasterAutoregDirect(
    regressor=xgb.XGBRFRegressor(tree_method='hist', random_state=42, enable_categorical='auto'),
    lags=12,
    steps=12,
    transformer_exog=transformer_exog
)

In [8]:
# Create forecaster with automatic categorical detection
forecaster1 = ForecasterAutoregDirect(
    regressor=xgb.XGBRFClassifier(tree_method="hist", objective="multi:softmax", random_state=42, enable_categorical='auto'),
    lags=12,
    steps=12,
    transformer_exog=transformer_exog
)

In [9]:

# Define hyperparameters
param_grid = {
    'n_estimators': [100],
    'max_depth': [4],
    'learning_rate': [0.35],
}
# 100,

#, 0.75, 0.85, 0.95, 1
# Lags used as predictors
lags_grid = [12, 24, 36]


In [10]:

# Obtaining best model
x = pd.concat([x_train, x_val], ignore_index=True)
y = np.append(y_train, y_val)
y = pd.Series(y)
y = y.astype(int)
warnings.simplefilter('ignore', category=LongTrainingWarning)

result_grid = grid_search_forecaster(
    forecaster=forecaster,
    y=y,
    exog=x.loc[:, exog],
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=12,
    refit=False,
    metric=['mean_squared_error', 'mean_absolute_error'],
    initial_train_size=len(x_train),
    fixed_train_size=False,
    return_best=True,
    verbose=False
)


Number of models compared: 3.


lags grid:   0%|          | 0/3 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

params grid:   0%|          | 0/1 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24] 
  Parameters: {'learning_rate': 0.35, 'max_depth': 4, 'n_estimators': 100}
  Backtesting metric: 962.2181774808413



In [11]:
# Include the RSME value into the results grid for all the models compared and print the grid to compare them

result_grid.insert(0, "RMSE", np.sqrt(result_grid["mean_squared_error"]))

result_grid

Unnamed: 0,RMSE,lags,params,mean_squared_error,mean_absolute_error,learning_rate,max_depth,n_estimators
1,31.019642,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.35, 'max_depth': 4, 'n_est...",962.218177,25.794049,0.35,4.0,100.0
2,31.245618,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.35, 'max_depth': 4, 'n_est...",976.288644,25.811872,0.35,4.0,100.0
0,31.407933,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'learning_rate': 0.35, 'max_depth': 4, 'n_est...",986.458225,26.678894,0.35,4.0,100.0


In [12]:

# Backtesting with the test data the best model
x2 = pd.concat([x, x_test], ignore_index=True)
y = np.append(y, y_test)
y = pd.Series(y)

# Predictions for the next 12 months without exogenous features
fut_exog = x2.loc[:, exog]
# Adjust the index of the future_exog to start from the appropriate index
#fut_exog.index = range(len(x2), len(x2) + len(x_test))

metric, predictions = backtesting_forecaster(
    forecaster=forecaster,
    y=y,
    initial_train_size=len(x),
    fixed_train_size=True,
    steps=12,
    refit=False,
    metric=['mean_squared_error', 'mean_absolute_error'],
    verbose=False
)

  0%|          | 0/738 [00:00<?, ?it/s]

In [13]:
# Obtain the RSME of the Backstesting and the Absolute Mean error of the backtesting
rsme_backtest = np.sqrt(metric[0])

absolute_backtest = metric[1]

rsme_backtest, absolute_backtest

(77.24303709824838, 66.84677219458196)

In [14]:
predictions = predictions.astype(float)
predictions = predictions.round().astype(int)
lsoa_names = label_encoder.inverse_transform(predictions)
lsoa_names

  y = column_or_1d(y, warn=True)


array(['Barnet 003B', 'Barnet 003B', 'Barnet 003B', ..., 'Barnet 006C',
       'Barnet 006C', 'Barnet 006E'], dtype=object)

In [15]:
np.unique(lsoa_names)

array(['Barnet 001D', 'Barnet 003B', 'Barnet 004C', 'Barnet 004D',
       'Barnet 005B', 'Barnet 005C', 'Barnet 005D', 'Barnet 006B',
       'Barnet 006C', 'Barnet 006E', 'Barnet 007A', 'Barnet 007B',
       'Barnet 007D', 'Barnet 007E', 'Barnet 007F', 'Barnet 008A',
       'Barnet 008D', 'Barnet 008E', 'Barnet 009A', 'Barnet 009B',
       'Barnet 009C', 'Barnet 009E', 'Barnet 010A', 'Barnet 010B',
       'Barnet 010C', 'Barnet 010D', 'Barnet 011E', 'Barnet 012A',
       'Barnet 012B', 'Barnet 012C', 'Barnet 012E', 'Barnet 013A',
       'Barnet 013B', 'Barnet 013C', 'Barnet 014F'], dtype=object)

In [18]:
# Predictions for the next 12 months without exogenous features
future_exog = x_test.loc[:, exog]
# Adjust the index of the future_exog to start from the appropriate index
future_exog.index = range(len(x), len(x) + len(x_test))
# Predict for the next 12 months
predicted = forecaster.predict(steps=12, exog=future_exog)

In [19]:
predicted = predicted.astype(float)
predicted = predicted.round().astype(int)
predicted_lsoa_names = label_encoder.inverse_transform(predicted)
predicted_lsoa_names

array(['Barnet 003B', 'Barnet 003B', 'Barnet 003B', 'Barnet 003B',
       'Barnet 003B', 'Barnet 003B', 'Barnet 003B', 'Barnet 003B',
       'Barnet 003B', 'Barnet 003B', 'Barnet 003B', 'Barnet 003B'],
      dtype=object)

In [23]:
import geojson

df = pd.read_csv('burglary_data.csv')

df = df[df['LSOA name'].isin(lsoa_names)]

df.drop(labels=[ 'Location', 'Crime type', "Last outcome category", 'Crime ID', 'Reported by', 'Month'], axis=1, inplace=True)

df = df.drop_duplicates(subset='LSOA name')

In [24]:
df

Unnamed: 0,Longitude,Latitude,LSOA code,LSOA name
13,-0.158203,51.650167,E01000168,Barnet 003B
15,-0.194493,51.648905,E01000288,Barnet 004C
17,-0.175755,51.644326,E01000270,Barnet 005D
19,-0.168221,51.646565,E01000171,Barnet 006B
20,-0.219121,51.651188,E01000253,Barnet 007B
29,-0.188808,51.628684,E01000280,Barnet 007D
30,-0.219216,51.647664,E01000286,Barnet 007E
31,-0.225182,51.64628,E01000290,Barnet 007F
32,-0.178126,51.642142,E01000271,Barnet 008A
36,-0.176329,51.633626,E01000281,Barnet 008D


In [29]:
import folium
from selenium import webdriver

# Create a map centered on Barnet
map_barnet = folium.Map(location=[51.6167, -0.2070], zoom_start=12)

# Add markers for the filtered wards
for _, ward in df.iterrows():
    folium.Marker(
        location=[ward['Latitude'], ward['Longitude']],
        popup=ward['LSOA name']
    ).add_to(map_barnet)

# Save the map as an HTML file
map_barnet