In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

In [107]:

train_data = pd.read_csv("burglary_train.csv")
val_data = pd.read_csv("burglary_validation.csv")
test_data = pd.read_csv("burglary_test.csv")

# The LSOA name will be the y, so create new data for it
y_train = train_data["LSOA name"]
y_val = val_data["LSOA name"]
y_test = test_data["LSOA name"]

# Combine all labels
all_labels = pd.concat([y_train, y_val, y_test])

# Encode labels with integer values
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Transform labels for each dataset
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)


In [61]:
# Delete the y from the x data and unwanted columns
drop = ["Crime type", "Last outcome category", "Reported by", "Crime ID", "LSOA name"]
x_train = train_data.drop(labels=drop, axis=1, inplace=False)
x_val = val_data.drop(labels=drop, axis=1, inplace=False)
x_test = test_data.drop(labels=drop, axis=1, inplace=False)

# Set format of Location and LSOA code to category
x_train['Location'] = x_train['Location'].astype('category')
x_train['LSOA code'] = x_train['LSOA code'].astype('category')

x_val['Location'] = x_val['Location'].astype('category')
x_val['LSOA code'] = x_val['LSOA code'].astype('category')

x_test['Location'] = x_test['Location'].astype('category')
x_test['LSOA code'] = x_test['LSOA code'].astype('category')

# Change format of the Month column and set it to index
x_train['Time'] = pd.to_datetime(x_train['Month'], format='%Y-%m')
x_val['Time'] = pd.to_datetime(x_val['Month'], format='%Y-%m')
x_test['Time'] = pd.to_datetime(x_test['Month'], format='%Y-%m')


In [62]:
x_train.tail(1)

Unnamed: 0.1,Unnamed: 0,Month,Longitude,Latitude,Location,LSOA code,Time
30985,30985,2012-12,-0.15361,51.613424,On or near Friern Barnet Road,E01000158,2012-12-01


In [63]:
x_val.head(1)

Unnamed: 0.1,Unnamed: 0,Month,Longitude,Latitude,Location,LSOA code,Time
0,30987,2012-12,-0.148134,51.613113,On or near Royal Drive,E01000160,2012-12-01


In [64]:
exog = ['Location', 'LSOA code']

In [65]:
# Transformer: Ordinal encoding + cast to category type
pipeline_categorical = make_pipeline(
                            OrdinalEncoder(
                                dtype=int,
                                handle_unknown="use_encoded_value",
                                unknown_value=-1,
                                encoded_missing_value=-1
                            ),
                            FunctionTransformer(
                                func=lambda x: x.astype('category'),
                                feature_names_out= 'one-to-one'
                            )
                       )
transformer_exog = make_column_transformer(
                        (
                            pipeline_categorical,
                            make_column_selector(dtype_exclude=np.number)
                        ),
                        remainder="passthrough",
                        verbose_feature_names_out=False,
                   ).set_output(transform="pandas")

# Create forecaster with automatic categorical detection
forecaster = ForecasterAutoregDirect(
    regressor=xgb.XGBRFRegressor(tree_method='hist', random_state=123, enable_categorical='auto'),
    lags=12,
    steps=12,
    transformer_exog=transformer_exog
)

In [66]:

# Define hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1]
}

# Lags used as predictors
lags_grid = [4, 12, 24, 48, 72]


In [67]:
# Obtaining best model
x = pd.concat([x_train, x_val], ignore_index=True)
y = np.append(y_train, y_val)
y = pd.Series(y)

result_grid = grid_search_forecaster(
    forecaster=forecaster,
    y=y,
    exog=x.loc[:, exog],
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=12,
    refit=False,
    metric='mean_absolute_error',
    initial_train_size=len(x_train),
    fixed_train_size=False,
    return_best=True,
    verbose=False
)


Number of models compared: 120.


lags grid:   0%|          | 0/5 [00:00<?, ?it/s]

params grid:   0%|          | 0/24 [00:00<?, ?it/s]

params grid:   0%|          | 0/24 [00:00<?, ?it/s]

params grid:   0%|          | 0/24 [00:00<?, ?it/s]

params grid:   0%|          | 0/24 [00:00<?, ?it/s]

params grid:   0%|          | 0/24 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [1 2 3 4] 
  Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
  Backtesting metric: 98.03029047980925



In [80]:

# Backtesting with the test data the best model
x2 = pd.concat([x, x_test], ignore_index=True)
y = np.append(y, y_test)
y = pd.Series(y)

metric, predictions = backtesting_forecaster(
    forecaster=forecaster,
    y=y,
    initial_train_size=len(x),
    fixed_train_size=False,
    steps=12,
    refit=False,
    metric='mean_absolute_error',
    verbose=False
)

  0%|          | 0/2214 [00:00<?, ?it/s]

In [81]:

predictions

Unnamed: 0,pred
35412,11.675626
35413,12.057780
35414,12.041039
35415,11.990654
35416,11.918101
...,...
61966,13.620949
61967,13.333845
61968,20.375689
61969,17.651146


In [110]:
lsoa_names = []
for prediction in predictions:
    try:
        label = label_encoder.inverse_transform(prediction)
        lsoa_names.append(label)
    except ValueError:
        # Handle unseen label
        lsoa_names.append("Unknown Label")

In [111]:
lsoa_names

['Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label']

In [82]:
metric

97.91539102238639

In [92]:
# Predictions for the next 12 months
future_exog = x_test.loc[35410:35423, exog]  # Exogenous features for the future period
future_predictions = forecaster.predict(steps=12, exog=future_exog)

# Convert the predicted labels back to the original LSOA names
label_encoder = result_grid.best_forecaster_._label_encoder
predicted_lsoa_names = label_encoder.inverse_transform(future_predictions)

ValueError: Forecaster trained with exogenous variable/s. Same variable/s must be provided when predicting.

In [113]:
# Predictions for the next 12 months without exogenous features
future_exog = x_test.loc[:, exog]
# Adjust the index of the future_exog to start from the appropriate index
future_exog.index = range(len(x), len(x) + len(x_test))
# Predict for the next 12 months
predictions = forecaster.predict(steps=12, exog=future_exog)

# Convert the predicted labels back to the original LSOA names
predicted_lsoa_names = []
for prediction in predictions:
    try:
        label = label_encoder.inverse_transform([prediction])[0]
        predicted_lsoa_names.append(label)
    except ValueError:
        # Handle unseen label
        predicted_lsoa_names.append("Unknown Label")



In [114]:
predicted_lsoa_names

['Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label',
 'Unknown Label']