In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
import warnings
from skforecast.exceptions import LongTrainingWarning
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster
import folium

In [2]:
# Load the original data
train_data = pd.read_csv("burglary_train.csv")
val_data = pd.read_csv("burglary_validation.csv")
test_data = pd.read_csv("burglary_test.csv")


In [2]:
# DO NOT RUN THIS CELL IF USING ORIGINAL DATA
# Load the data with additional dataset
train_data = pd.read_csv("add_burglary_train.csv")
val_data = pd.read_csv("add_burglary_validation.csv")
test_data = pd.read_csv("add_burglary_test.csv")


In [3]:
# The LSOA name will be the y, so create new data for it
y_train = train_data["LSOA name"]
y_val = val_data["LSOA name"]
y_test = test_data["LSOA name"]

# Combine all labels
all_labels = pd.concat([y_train, y_val, y_test])

# Encode labels with integer values
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Transform labels for each dataset
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

In [4]:
# Delete the y from the x data and unwanted columns
drop = ["Last outcome category", "LSOA name"]
x_train = train_data.drop(labels=drop, axis=1, inplace=False)
x_val = val_data.drop(labels=drop, axis=1, inplace=False)
x_test = test_data.drop(labels=drop, axis=1, inplace=False)

# Set categorical features format to category
x_train['Location'] = x_train['Location'].astype('category')
x_train['LSOA code'] = x_train['LSOA code'].astype('category')

x_val['Location'] = x_val['Location'].astype('category')
x_val['LSOA code'] = x_val['LSOA code'].astype('category')

x_test['Location'] = x_test['Location'].astype('category')
x_test['LSOA code'] = x_test['LSOA code'].astype('category')

# Change format of the Month column from string to datetime
x_train['Month'] = pd.to_datetime(x_train['Month'], format='%Y-%m')
x_val['Month'] = pd.to_datetime(x_val['Month'], format='%Y-%m')
x_test['Month'] = pd.to_datetime(x_test['Month'], format='%Y-%m')


In [None]:
# For the data with the additional dataset run this cell as well

# Delete extra unwanted column called LSOA11NM
add_drop = ["LSOA11NM"]
x_train = train_data.drop(labels=add_drop, axis=1, inplace=False)
x_val = val_data.drop(labels=add_drop, axis=1, inplace=False)
x_test = test_data.drop(labels=add_drop, axis=1, inplace=False)

# Set categorical features format to category only in the additional dataset
x_train["Ward name"] = x_train["Ward name"].astype("category")
x_train["Ward - House price lower third"] = x_train["Ward - House price lower third"].astype("category")
x_train["Ward - House price upper third"] = x_train["Ward - House price upper third"].astype("category")

x_val["Ward name"] = x_val["Ward name"].astype("category")
x_val["Ward - House price lower third"] = x_val["Ward - House price lower third"].astype("category")
x_val["Ward - House price upper third"] = x_val["Ward - House price upper third"].astype("category")

x_test["Ward name"] = x_test["Ward name"].astype("category")
x_test["Ward - House price lower third"] = x_test["Ward - House price lower third"].astype("category")
x_test["Ward - House price upper third"] = x_test["Ward - House price upper third"].astype("category")

In [5]:
# Exogeneous variables from original data
exog = ['Month', 'Longitude', 'Latitude', 'Location', 'LSOA code']

# Exogeneous variables from additional data
add_exog = ['Month', 'Longitude', 'Latitude', 'Location', 'LSOA code', 'Ward name', 'Ward - House price lower third', 'Ward - House price upper third', 'Ward - % All Working-age (16-64)', 'Ward - % All Older people aged 65+', 'Ward - Population density (persons per sq km)', 'Ward - % Not Born in UK']

In [6]:
# Creating a pipeline for categorical features
pipeline_categorical = make_pipeline(
                            OrdinalEncoder(
                                dtype=int,
                                handle_unknown="use_encoded_value",
                              unknown_value=-1,
                                encoded_missing_value=-1
                            ),
                            FunctionTransformer(
                                func=lambda x: x.astype('category'),
                                feature_names_out= 'one-to-one'
                            )
                       )
# Creating a column transformer
transformer_exog = make_column_transformer(
                        (
                            pipeline_categorical, # Applying the categorical pipeline to selected columns
                            make_column_selector(dtype_exclude=np.number) # Selecting columns with non-numeric data types
                        ),
                        remainder="passthrough", # Passing through the remaining columns without any transformation
                        verbose_feature_names_out=False, # Setting verbose feature names to False
                   ).set_output(transform="pandas") # Setting the output format of the transformed data as pandas DataFrame


In [7]:
# Create forecaster with automatic categorical detection
forecaster = ForecasterAutoregDirect(
    regressor=xgb.XGBRFRegressor(tree_method='hist', random_state=42, enable_categorical='auto'),
    lags=12,
    steps=12,
    transformer_exog=transformer_exog
)

In [9]:
# Define hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.1, 0.5, 0.75, 1],
}

# Lags used as predictors
lags_grid = [ 24, 36, 48]


In [10]:
# Obtaining best model
# Concatenate x_train and x_val into one
x = pd.concat([x_train, x_val], ignore_index=True)
# Concatenare y_train and y_val into one
y = np.append(y_train, y_val)
y = pd.Series(y) # Set y as a pandas series
y = y.astype(int) # Set all y values as integers
warnings.simplefilter('ignore', category=LongTrainingWarning) # Remove warning regarding Long Training

# Perform a grid search for all combinations of the hyperparameters and output is the table with all of them nd its performance
result_grid = grid_search_forecaster(
    forecaster=forecaster,
    y=y,
    exog=x.loc[:, exog], #or add_exog if using additional dataset
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=12,
    refit=False,
    metric=['mean_squared_error', 'mean_absolute_error'],
    initial_train_size=len(x_train),
    fixed_train_size=False,
    return_best=True,
    verbose=False
)


Number of models compared: 48.


lags grid:   0%|          | 0/4 [00:00<?, ?it/s]

params grid:   0%|          | 0/12 [00:00<?, ?it/s]

params grid:   0%|          | 0/12 [00:00<?, ?it/s]

params grid:   0%|          | 0/12 [00:00<?, ?it/s]

params grid:   0%|          | 0/12 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48] 
  Parameters: {'learning_rate': 1, 'max_depth': 10, 'n_estimators': 200}
  Backtesting metric: 0.024123825111409737



In [11]:
# Include the RSME value into the results grid for all the models compared and print the grid to compare them

result_grid.insert(2, "RMSE", np.sqrt(result_grid["mean_squared_error"]))
result_grid.drop(labels=["mean_squared_error"], axis=1, inplace=True)
result_grid = result_grid.rename(columns={'mean_absolute_error': 'MAE'})

result_grid

Unnamed: 0,RMSE,lags,params,mean_absolute_error,learning_rate,max_depth,n_estimators
46,0.155318,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.037028,1,10,200
47,0.15801,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.036286,1,10,300
45,0.163813,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.038082,1,10,100
23,0.173403,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.037598,1,10,300
35,0.173553,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.041017,1,10,300
34,0.173683,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.041612,1,10,200
22,0.175358,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.038164,1,10,200
33,0.178906,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.042988,1,10,100
11,0.179645,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.043842,1,10,300
10,0.180123,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.043511,1,10,200


In [12]:

# Backtesting with the test data the best model
x2 = pd.concat([x, x_test], ignore_index=True)
y = np.append(y, y_test)
y = pd.Series(y)

# Predictions for the next 12 months without exogenous features
fut_exog = x2.loc[:, exog]   #or add_exog if using additional dataset
# Adjust the index of the future_exog to start from the appropriate index
#fut_exog.index = range(len(x2), len(x2) + len(x_test))

metric, predictions = backtesting_forecaster(
    forecaster=forecaster,
    y=y,
    initial_train_size=len(x),
    fixed_train_size=True,
    steps=12,
    refit=False,
    metric=['mean_squared_error', 'mean_absolute_error'],
    verbose=False
)

  0%|          | 0/738 [00:00<?, ?it/s]

In [13]:
# Obtain the RSME of the Backstesting
rsme_backtest = np.sqrt(metric[0])
# and the Absolute Mean error of the backtesting
absolute_backtest = metric[1]

print('The RSME value of backtesting is: ' + str(rsme_backtest))
print('The MAE value of backtesting is: ' + str(absolute_backtest))

The RSME value of backtesting is: 15.352051211423044
The MAE value of backtesting is: 3.7870968856758203


In [20]:
# Declare predicted as float
predictions = predictions.astype(float)
# Round up the predictions and turn them to integers
predictions = predictions.round().astype(int)
# Decode the predictions to the categorical labels
lsoa_names = label_encoder.inverse_transform(predictions)
# Print the predicted lsoa names with no repetition
np.unique(lsoa_names)

  y = column_or_1d(y, warn=True)


array(['Barnet 022F', 'Barnet 023A', 'Barnet 023B', ..., 'Barnet 028D',
       'Barnet 027F', 'Barnet 024D'], dtype=object)

  y = column_or_1d(y, warn=True)


array(['Barnet 022F', 'Barnet 023A', 'Barnet 023B', ..., 'Barnet 028D',
       'Barnet 027F', 'Barnet 024D'], dtype=object)

In [16]:
# Predictions for the next 12 months without exogenous features
future_exog = x_test.loc[:, exog]  #or add_exog if using additional dataset
# Adjust the index of the future_exog to start from the appropriate index
future_exog.index = range(len(x), len(x) + len(x_test))
# Predict for the next 12 months
predicted = forecaster.predict(steps=12, exog=future_exog)
# Declare predicted as float
predicted = predicted.astype(float)
# Round up the predictions and turn them to integers
predicted = predicted.round().astype(int)
# Decode the predictions to the categorical labels
predicted_lsoa_names = label_encoder.inverse_transform(predicted)
# Print the predicted lsoa names
predicted_lsoa_names

In [23]:
# Merge the two lists with predictions together
merged_predictions = np.concatenate((lsoa_names, predicted_lsoa_names))
# Obtain the values of the merged lists and the number of times they show
unique_values, counts = np.unique(merged_predictions, return_counts=True)
# Create a dictionary with the labels and the number of repetitions
dic = {'LSOA names': unique_values, 'Count': counts}
# Turn the dictionary into pandas DataFrame
chance_area = pd.DataFrame(dic)
# Sort the dataframe by the repetition from highest to lowest
chance_area = chance_area.sort_values(by='Count', ascending=False)
# Save the dataframe into a csv file
chance_area.to_csv('predicted_lsoa_count.csv', index=False)
# Print the dataframe
chance_area

Unnamed: 0,LSOA names,Count
196,Barnet 038D,165
197,Barnet 038E,135
198,Barnet 039A,91
114,Barnet 023B,69
190,Barnet 037D,67
...,...,...
204,Barnet 040C,18
203,Barnet 040B,11
0,Barnet 001B,9
205,Barnet 040D,2


In [51]:
# Given the large dataframe, it is reduced by keeping only those with a count higher than 75-percentile thus the most likely
# Obtain the 75th percentile of count
third_quartile = np.percentile(counts, 75)
# Filter out those values with count lower than 75-percentile
chance_area = chance_area[chance_area['Count'] > third_quartile]
# Load the burglary dataset
df = pd.read_csv('burglary_data.csv')
# Keep only the data for the predicted labels
df = df[df['LSOA name'].isin(chance_area['LSOA names'])]
# Delete all the columns below
df.drop(labels=[ 'Location', 'Crime type', "Last outcome category", 'Crime ID', 'Reported by', 'Month'], axis=1, inplace=True)
# Delete duplicates
df = df.drop_duplicates(subset='LSOA name')
# Obtain the center of all the labels, by calculating mean latitude and mean longitude
center_latitude = df['Latitude'].mean()
center_longitude = df['Longitude'].mean()
# Save dataframe to csv file
df.to_csv('most_likely_lsoa.csv', index=False)
# Print dataframe
df

Unnamed: 0,Longitude,Latitude,LSOA code,LSOA name
20,-0.219121,51.651188,E01000253,Barnet 007B
43,-0.149385,51.621415,E01000124,Barnet 010E
44,-0.159244,51.622676,E01000266,Barnet 011B
46,-0.160137,51.627393,E01000274,Barnet 011C
70,-0.245641,51.616508,E01000264,Barnet 016D
71,-0.211346,51.608887,E01000257,Barnet 017B
72,-0.236108,51.618371,E01000261,Barnet 017C
83,-0.183163,51.609416,E01000302,Barnet 019C
86,-0.176381,51.608906,E01000318,Barnet 019E
107,-0.163636,51.610138,E01000315,Barnet 022F


In [52]:
# Create a map centered on Barnet
map_barnet = folium.Map(location=[center_latitude, center_longitude], zoom_start=12)

# Add markers for the filtered wards
for _, ward in df.iterrows():
    folium.Marker(
        location=[ward['Latitude'], ward['Longitude']],
        popup=ward['LSOA name']
    ).add_to(map_barnet)

# Show the map
map_barnet