In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
import warnings
from skforecast.exceptions import LongTrainingWarning
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster

In [2]:
# Load the data
train_data = pd.read_csv("add_burglary_train.csv")
val_data = pd.read_csv("add_burglary_validation.csv")
test_data = pd.read_csv("add_burglary_test.csv")


In [3]:
# The LSOA name will be the y, so create new data for it
y_train = train_data["LSOA name"]
y_val = val_data["LSOA name"]
y_test = test_data["LSOA name"]

# Combine all labels
all_labels = pd.concat([y_train, y_val, y_test])

# Encode labels with integer values
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Transform labels for each dataset
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

In [4]:
# Delete the y from the x data and unwanted columns
drop = ["Last outcome category", "LSOA name", "LSOA11NM"]
x_train = train_data.drop(labels=drop, axis=1, inplace=False)
x_val = val_data.drop(labels=drop, axis=1, inplace=False)
x_test = test_data.drop(labels=drop, axis=1, inplace=False)

# Set categorical features format to category
x_train['Location'] = x_train['Location'].astype('category')
x_train['LSOA code'] = x_train['LSOA code'].astype('category')
x_train["Ward name"] = x_train["Ward name"].astype("category")
x_train["Ward - House price lower third"] = x_train["Ward - House price lower third"].astype("category")
x_train["Ward - House price upper third"] = x_train["Ward - House price upper third"].astype("category")

x_val['Location'] = x_val['Location'].astype('category')
x_val['LSOA code'] = x_val['LSOA code'].astype('category')
x_val["Ward name"] = x_val["Ward name"].astype("category")
x_val["Ward - House price lower third"] = x_val["Ward - House price lower third"].astype("category")
x_val["Ward - House price upper third"] = x_val["Ward - House price upper third"].astype("category")

x_test['Location'] = x_test['Location'].astype('category')
x_test['LSOA code'] = x_test['LSOA code'].astype('category')
x_test["Ward name"] = x_test["Ward name"].astype("category")
x_test["Ward - House price lower third"] = x_test["Ward - House price lower third"].astype("category")
x_test["Ward - House price upper third"] = x_test["Ward - House price upper third"].astype("category")

# Change format of the Month column from string to datetime
x_train['Month'] = pd.to_datetime(x_train['Month'], format='%Y-%m')
x_val['Month'] = pd.to_datetime(x_val['Month'], format='%Y-%m')
x_test['Month'] = pd.to_datetime(x_test['Month'], format='%Y-%m')


In [6]:
exog = ['Month', 'Longitude', 'Latitude', 'Location', 'LSOA code', 'Ward name', 'Ward - House price lower third', 'Ward - House price upper third', 'Ward - % All Working-age (16-64)', 'Ward - % All Older people aged 65+', 'Ward - Population density (persons per sq km)', 'Ward - % Not Born in UK']

In [7]:
# Transformer: Ordinal encoding + cast to category type
pipeline_categorical = make_pipeline(
                            OrdinalEncoder(
                                dtype=int,
                                handle_unknown="use_encoded_value",
                              unknown_value=-1,
                                encoded_missing_value=-1
                            ),
                            FunctionTransformer(
                                func=lambda x: x.astype('category'),
                                feature_names_out= 'one-to-one'
                            )
                       )
transformer_exog = make_column_transformer(
                        (
                            pipeline_categorical,
                            make_column_selector(dtype_exclude=np.number)
                        ),
                        remainder="passthrough",
                        verbose_feature_names_out=False,
                   ).set_output(transform="pandas")


In [8]:
# Create forecaster with automatic categorical detection
forecaster = ForecasterAutoregDirect(
    regressor=xgb.XGBRFRegressor(tree_method='hist', random_state=42, enable_categorical='auto'),
    lags=12,
    steps=12,
    transformer_exog=transformer_exog
)

In [9]:
# Create forecaster with automatic categorical detection
forecaster1 = ForecasterAutoregDirect(
    regressor=xgb.XGBRFClassifier(tree_method="hist", objective="multi:softmax", random_state=42, enable_categorical='auto'),
    lags=12,
    steps=12,
    transformer_exog=transformer_exog
)

In [12]:

# Define hyperparameters
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 7, 10],
    'learning_rate': [0.35],
}
# 100,

#, 0.75, 0.85, 0.95, 1
# Lags used as predictors
lags_grid = [12, 24, 36, 48]


In [13]:

# Obtaining best model
x = pd.concat([x_train, x_val], ignore_index=True)
y = np.append(y_train, y_val)
y = pd.Series(y)
y = y.astype(int)
warnings.simplefilter('ignore', category=LongTrainingWarning)

result_grid = grid_search_forecaster(
    forecaster=forecaster,
    y=y,
    exog=x.loc[:, exog],
    param_grid=param_grid,
    lags_grid=lags_grid,
    steps=12,
    refit=False,
    metric=['mean_squared_error', 'mean_absolute_error'],
    initial_train_size=len(x_train),
    fixed_train_size=False,
    return_best=True,
    verbose=False
)


Number of models compared: 120.


lags grid:   0%|          | 0/4 [00:00<?, ?it/s]

params grid:   0%|          | 0/30 [00:00<?, ?it/s]

params grid:   0%|          | 0/30 [00:00<?, ?it/s]

params grid:   0%|          | 0/30 [00:00<?, ?it/s]

params grid:   0%|          | 0/30 [00:00<?, ?it/s]

`Forecaster` refitted using the best-found lags and parameters, and the whole data set: 
  Lags: [ 1  2  3  4  5  6  7  8  9 10 11 12] 
  Parameters: {'learning_rate': 0.35, 'max_depth': 4, 'n_estimators': 100}
  Backtesting metric: 965.1255925213918



In [33]:
# Include the RSME value into the results grid for all the models compared and print the grid to compare them

result_grid.insert(0, "RMSE", np.sqrt(result_grid["mean_squared_error"]))

result_grid

Unnamed: 0,RMSE,lags,params,mean_squared_error,mean_absolute_error,learning_rate,max_depth,n_estimators
21,31.019642,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.35, 'max_depth': 4, 'n_est...",962.218177,25.794049,0.35,4.0,100.0
39,31.245618,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.35, 'max_depth': 4, 'n_est...",976.288644,25.811872,0.35,4.0,100.0
23,31.296489,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.35, 'max_depth': 4, 'n_est...",979.470231,26.282742,0.35,4.0,300.0
22,31.328753,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.35, 'max_depth': 4, 'n_est...",981.490793,26.370290,0.35,4.0,200.0
41,31.332893,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.35, 'max_depth': 4, 'n_est...",981.750165,26.191350,0.35,4.0,300.0
...,...,...,...,...,...,...,...,...
33,38.835660,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.45, 'max_depth': 5, 'n_est...",1508.208521,32.934293,0.45,5.0,100.0
69,38.860244,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.45, 'max_depth': 5, 'n_est...",1510.118593,32.919672,0.45,5.0,100.0
70,38.907327,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","{'learning_rate': 0.45, 'max_depth': 5, 'n_est...",1513.780099,32.927268,0.45,5.0,200.0
17,39.336407,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]","{'learning_rate': 0.45, 'max_depth': 5, 'n_est...",1547.352921,33.404178,0.45,5.0,300.0


In [12]:

# Backtesting with the test data the best model
x2 = pd.concat([x, x_test], ignore_index=True)
y = np.append(y, y_test)
y = pd.Series(y)

# Predictions for the next 12 months without exogenous features
fut_exog = x2.loc[:, exog]
# Adjust the index of the future_exog to start from the appropriate index
#fut_exog.index = range(len(x2), len(x2) + len(x_test))

metric, predictions = backtesting_forecaster(
    forecaster=forecaster,
    y=y,
    initial_train_size=len(x),
    fixed_train_size=True,
    steps=12,
    refit=False,
    metric=['mean_squared_error', 'mean_absolute_error'],
    verbose=False
)

  0%|          | 0/738 [00:00<?, ?it/s]

In [13]:
# Obtain the RSME of the Backstesting and the Absolute Mean error of the backtesting
rsme_backtest = np.sqrt(metric[0])

absolute_backtest = metric[1]

rsme_backtest, absolute_backtest

array([10.25638995,  2.8136443 ])

In [14]:
predictions = predictions.astype(float)
predictions = predictions.round().astype(int)
lsoa_names = label_encoder.inverse_transform(predictions)
lsoa_names

  y = column_or_1d(y, warn=True)


array(['Barnet 009A', 'Barnet 009A', 'Barnet 009A', ..., 'Barnet 014D',
       'Barnet 015A', 'Barnet 015D'], dtype=object)

In [15]:
# Predictions for the next 12 months without exogenous features
future_exog = x_test.loc[:, exog]
# Adjust the index of the future_exog to start from the appropriate index
future_exog.index = range(len(x), len(x) + len(x_test))
# Predict for the next 12 months
predicted = forecaster.predict(steps=12, exog=future_exog)

In [16]:
predicted = predicted.astype(float)
predicted = predicted.round().astype(int)
predicted_lsoa_names = label_encoder.inverse_transform(predicted)
predicted_lsoa_names

array(['Barnet 009A', 'Barnet 009A', 'Barnet 009A', 'Barnet 009A',
       'Barnet 009A', 'Barnet 009A', 'Barnet 009A', 'Barnet 009A',
       'Barnet 009A', 'Barnet 009A', 'Barnet 009A', 'Barnet 009A'],
      dtype=object)