In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
import warnings
from skforecast.exceptions import LongTrainingWarning
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

In [2]:
# Load the data
train_data = pd.read_csv("burglary_train.csv")
val_data = pd.read_csv("burglary_validation.csv")
test_data = pd.read_csv("burglary_test.csv")


In [3]:
# The LSOA name will be the y, so create new data for it
y_train = train_data["LSOA name"]
y_val = val_data["LSOA name"]
y_test = test_data["LSOA name"]

# Combine all labels
all_labels = pd.concat([y_train, y_val, y_test])

# Encode labels with integer values
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Transform labels for each dataset
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

In [4]:
# Delete the y from the x data and unwanted columns
drop = ["Crime type", "Last outcome category", "Reported by", "Crime ID", "LSOA name"]
x_train = train_data.drop(labels=drop, axis=1, inplace=False)
x_val = val_data.drop(labels=drop, axis=1, inplace=False)
x_test = test_data.drop(labels=drop, axis=1, inplace=False)

# Set format of Location and LSOA code to category
x_train['Location'] = x_train['Location'].astype('category')
x_train['LSOA code'] = x_train['LSOA code'].astype('category')

x_val['Location'] = x_val['Location'].astype('category')
x_val['LSOA code'] = x_val['LSOA code'].astype('category')

x_test['Location'] = x_test['Location'].astype('category')
x_test['LSOA code'] = x_test['LSOA code'].astype('category')

# Change format of the Month column and set it to index
x_train['Time'] = pd.to_datetime(x_train['Month'], format='%Y-%m')
x_val['Time'] = pd.to_datetime(x_val['Month'], format='%Y-%m')
x_test['Time'] = pd.to_datetime(x_test['Month'], format='%Y-%m')


In [5]:
exog = ['Location', 'LSOA code']

In [6]:
# Transformer: Ordinal encoding + cast to category type
pipeline_categorical = make_pipeline(
                            OrdinalEncoder(
                                dtype=int,
                                handle_unknown="use_encoded_value",
                              unknown_value=-1,
                                encoded_missing_value=-1
                            ),
                            FunctionTransformer(
                                func=lambda x: x.astype('category'),
                                feature_names_out= 'one-to-one'
                            )
                       )
transformer_exog = make_column_transformer(
                        (
                            pipeline_categorical,
                            make_column_selector(dtype_exclude=np.number)
                        ),
                        remainder="passthrough",
                        verbose_feature_names_out=False,
                   ).set_output(transform="pandas")

# Create forecaster with automatic categorical detection
forecaster = ForecasterAutoregDirect(
    regressor=xgb.XGBRFRegressor(tree_method='hist', random_state=123, enable_categorical='auto'),
    lags=12,
    steps=12,
    transformer_exog=transformer_exog
)

In [22]:

# Define hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10],
    'learning_rate': [0.65, 0.75, 0.85, 0.95, 1]
}

# Lags used as predictors
lags_grid = [12, 24, 36, 48]


In [23]:

# Obtaining best model
x = pd.concat([x_train, x_val], ignore_index=True)
y = np.append(y_train, y_val)
y = pd.Series(y)

warnings.simplefilter('ignore', category=LongTrainingWarning)

result_grid = grid_search_forecaster(
    forecaster=forecaster,
    y=y,
    exog=x.loc[:, exog],
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=12,
    refit=False,
    metric='mean_squared_error',
    initial_train_size=len(x_train),
    fixed_train_size=False,
    return_best=True,
    verbose=False
)


Number of models compared: 60.


lags grid:   0%|          | 0/4 [00:00<?, ?it/s]

params grid:   0%|          | 0/15 [00:00<?, ?it/s]

params grid:   0%|          | 0/15 [00:00<?, ?it/s]

params grid:   0%|          | 0/15 [00:00<?, ?it/s]

params grid:   0%|          | 0/15 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48] 
  Parameters: {'learning_rate': 1, 'max_depth': 10, 'n_estimators': 100}
  Backtesting metric: 0.06352016207905069



In [24]:
result_grid.insert(0, "RMSE", np.sqrt(result_grid["mean_squared_error"]))

result_grid

Unnamed: 0,RMSE,lags,params,mean_squared_error,learning_rate,max_depth,n_estimators
57,0.252032,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.06352,1.0,10.0,100.0
59,0.279904,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.078346,1.0,10.0,300.0
58,0.326448,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.106568,1.0,10.0,200.0
43,0.336938,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.113527,1.0,10.0,200.0
44,0.346768,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.120248,1.0,10.0,300.0
14,0.357374,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.127716,1.0,10.0,300.0
28,0.363229,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.131935,1.0,10.0,200.0
12,0.366295,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.134172,1.0,10.0,100.0
13,0.372222,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.138549,1.0,10.0,200.0
29,0.390212,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 1, 'max_depth': 10, 'n_estim...",0.152265,1.0,10.0,300.0


In [25]:

# Backtesting with the test data the best model
x2 = pd.concat([x, x_test], ignore_index=True)
y = np.append(y, y_test)
y = pd.Series(y)

# Predictions for the next 12 months without exogenous features
fut_exog = x2.loc[:, exog]
# Adjust the index of the future_exog to start from the appropriate index
#fut_exog.index = range(len(x2), len(x2) + len(x_test))

metric, predictions = backtesting_forecaster(
    forecaster=forecaster,
    y=y,
    initial_train_size=len(x),
    fixed_train_size=True,
    steps=12,
    refit=False,
    metric='mean_squared_error',
    verbose=False
)

  0%|          | 0/738 [00:00<?, ?it/s]

In [26]:
rsme_backtest = np.sqrt(metric)

rsme_backtest

15.392895780353877

In [27]:
predictions = predictions.astype(float)
predictions = predictions.round().astype(int)
lsoa_names = label_encoder.inverse_transform(predictions)
lsoa_names

  y = column_or_1d(y, warn=True)


array(['Barnet 022F', 'Barnet 023A', 'Barnet 023B', ..., 'Barnet 028E',
       'Barnet 027F', 'Barnet 023B'], dtype=object)

In [28]:
# Predictions for the next 12 months without exogenous features
future_exog = x_test.loc[:, exog]
# Adjust the index of the future_exog to start from the appropriate index
future_exog.index = range(len(x), len(x) + len(x_test))
# Predict for the next 12 months
predicted = forecaster.predict(steps=12, exog=future_exog)

In [29]:
predicted = predicted.astype(float)
predicted = predicted.round().astype(int)
predicted_lsoa_names = label_encoder.inverse_transform(predicted)
predicted_lsoa_names

array(['Barnet 023A', 'Barnet 023A', 'Barnet 024A', 'Barnet 024A',
       'Barnet 024A', 'Barnet 024A', 'Barnet 024A', 'Barnet 024C',
       'Barnet 024D', 'Barnet 024D', 'Barnet 024D', 'Barnet 024D'],
      dtype=object)