In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
import warnings
from skforecast.exceptions import LongTrainingWarning
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

In [2]:
# Load the data
train_data = pd.read_csv("burglary_train.csv")
val_data = pd.read_csv("burglary_validation.csv")
test_data = pd.read_csv("burglary_test.csv")


In [3]:
# The LSOA name will be the y, so create new data for it
y_train = train_data["LSOA name"]
y_val = val_data["LSOA name"]
y_test = test_data["LSOA name"]

# Combine all labels
all_labels = pd.concat([y_train, y_val, y_test])

# Encode labels with integer values
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Transform labels for each dataset
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

In [4]:
# Delete the y from the x data and unwanted columns
drop = ["Crime type", "Last outcome category", "Reported by", "Crime ID", "LSOA name"]
x_train = train_data.drop(labels=drop, axis=1, inplace=False)
x_val = val_data.drop(labels=drop, axis=1, inplace=False)
x_test = test_data.drop(labels=drop, axis=1, inplace=False)

# Set format of Location and LSOA code to category
x_train['Location'] = x_train['Location'].astype('category')
x_train['LSOA code'] = x_train['LSOA code'].astype('category')

x_val['Location'] = x_val['Location'].astype('category')
x_val['LSOA code'] = x_val['LSOA code'].astype('category')

x_test['Location'] = x_test['Location'].astype('category')
x_test['LSOA code'] = x_test['LSOA code'].astype('category')

# Change format of the Month column and set it to index
x_train['Time'] = pd.to_datetime(x_train['Month'], format='%Y-%m')
x_val['Time'] = pd.to_datetime(x_val['Month'], format='%Y-%m')
x_test['Time'] = pd.to_datetime(x_test['Month'], format='%Y-%m')


In [5]:
x_train.tail(1)

Unnamed: 0.1,Unnamed: 0,Month,Longitude,Latitude,Location,LSOA code,Time
30985,30985,2012-12,-0.15361,51.613424,On or near Friern Barnet Road,E01000158,2012-12-01


In [6]:
x_val.head(1)

Unnamed: 0.1,Unnamed: 0,Month,Longitude,Latitude,Location,LSOA code,Time
0,30987,2012-12,-0.148134,51.613113,On or near Royal Drive,E01000160,2012-12-01


In [7]:
exog = ['Location', 'LSOA code']

In [8]:
# Transformer: Ordinal encoding + cast to category type
pipeline_categorical = make_pipeline(
                            OrdinalEncoder(
                                dtype=int,
                                handle_unknown="use_encoded_value",
                              unknown_value=-1,
                                encoded_missing_value=-1
                            ),
                            FunctionTransformer(
                                func=lambda x: x.astype('category'),
                                feature_names_out= 'one-to-one'
                            )
                       )
transformer_exog = make_column_transformer(
                        (
                            pipeline_categorical,
                            make_column_selector(dtype_exclude=np.number)
                        ),
                        remainder="passthrough",
                        verbose_feature_names_out=False,
                   ).set_output(transform="pandas")

# Create forecaster with automatic categorical detection
forecaster = ForecasterAutoregDirect(
    regressor=xgb.XGBRFRegressor(tree_method='hist', random_state=123, enable_categorical='auto'),
    lags=12,
    steps=12,
    transformer_exog=transformer_exog
)

In [22]:

# Define hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1]
}

# Lags used as predictors
lags_grid = [12, 24, 48, 60]


In [23]:

# Obtaining best model
x = pd.concat([x_train, x_val], ignore_index=True)
y = np.append(y_train, y_val)
y = pd.Series(y)

warnings.simplefilter('ignore', category=LongTrainingWarning)

result_grid = grid_search_forecaster(
    forecaster=forecaster,
    y=y,
    exog=x.loc[:, exog],
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=12,
    refit=True,
    metric='mean_absolute_error',
    initial_train_size=len(x_train),
    fixed_train_size=False,
    return_best=True,
    verbose=False
)


Number of models compared: 72.


lags grid:   0%|          | 0/4 [00:00<?, ?it/s]

params grid:   0%|          | 0/18 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [11]:

# Backtesting with the test data the best model
x2 = pd.concat([x, x_test], ignore_index=True)
y = np.append(y, y_test)
y = pd.Series(y)

# Predictions for the next 12 months without exogenous features
fut_exog = x2.loc[:, exog]
# Adjust the index of the future_exog to start from the appropriate index
#fut_exog.index = range(len(x2), len(x2) + len(x_test))

metric, predictions = backtesting_forecaster(
    forecaster=forecaster,
    y=y,
    initial_train_size=len(x),
    fixed_train_size=True,
    steps=12,
    refit=False,
    metric='mean_absolute_error',
    verbose=False
)

  0%|          | 0/738 [00:00<?, ?it/s]

In [12]:

result_grid

Unnamed: 0,lags,params,mean_squared_error,learning_rate,max_depth,n_estimators
92,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.1, 'max_depth': 10, 'n_est...",12745.820687,0.10,10.0,100.0
95,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.1, 'max_depth': 10, 'n_est...",12745.821023,0.10,10.0,500.0
94,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.1, 'max_depth': 10, 'n_est...",12745.826051,0.10,10.0,300.0
44,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'learning_rate': 0.1, 'max_depth': 10, 'n_est...",12745.834401,0.10,10.0,100.0
93,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.1, 'max_depth': 10, 'n_est...",12745.865764,0.10,10.0,200.0
...,...,...,...,...,...,...
27,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",15424.367283,0.01,3.0,500.0
3,"[1, 2, 3, 4]","{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",15424.624825,0.01,3.0,500.0
2,"[1, 2, 3, 4]","{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",15424.640973,0.01,3.0,300.0
0,"[1, 2, 3, 4]","{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",15424.643230,0.01,3.0,100.0


In [13]:
predictions = predictions.astype(float)
predictions = predictions.round().astype(int)
lsoa_names = label_encoder.inverse_transform(predictions)


  y = column_or_1d(y, warn=True)


In [14]:
lsoa_names

array(['Barnet 003B', 'Barnet 003B', 'Barnet 003B', ..., 'Barnet 004A',
       'Barnet 003D', 'Barnet 003B'], dtype=object)

In [15]:
predictions

Unnamed: 0,pred
35412,12
35413,12
35414,12
35415,12
35416,12
...,...
44260,17
44261,15
44262,15
44263,14


In [16]:
metric

97.95554336705182

In [17]:
# Predictions for the next 12 months without exogenous features
future_exog = x_test.loc[:, exog]
# Adjust the index of the future_exog to start from the appropriate index
future_exog.index = range(len(x), len(x) + len(x_test))
# Predict for the next 12 months
predicted = forecaster.predict(steps=12, exog=future_exog)

In [18]:
predicted = predicted.astype(float)
predicted = predicted.round().astype(int)
predicted_lsoa_names = label_encoder.inverse_transform(predicted)
predicted_lsoa_names

array(['Barnet 003B', 'Barnet 003B', 'Barnet 003B', 'Barnet 003B',
       'Barnet 003B', 'Barnet 003B', 'Barnet 003B', 'Barnet 003B',
       'Barnet 003C', 'Barnet 003C', 'Barnet 003C', 'Barnet 003C'],
      dtype=object)

In [19]:
predicted

35412    12
35413    12
35414    12
35415    12
35416    12
35417    12
35418    12
35419    12
35420    13
35421    13
35422    13
35423    13
Name: pred, dtype: int64

In [20]:
predicted_lsoa_names

array(['Barnet 003B', 'Barnet 003B', 'Barnet 003B', 'Barnet 003B',
       'Barnet 003B', 'Barnet 003B', 'Barnet 003B', 'Barnet 003B',
       'Barnet 003C', 'Barnet 003C', 'Barnet 003C', 'Barnet 003C'],
      dtype=object)