In [26]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

import xgboost as xgb
from xgboost import DMatrix

import matplotlib.pyplot as plt
import seaborn as sns
import time

In [43]:
train_data = pd.read_csv("burglary_train.csv")
val_data = pd.read_csv("burglary_validation.csv")
test_data = pd.read_csv("burglary_test.csv")

# The LSOA name will be the y, so create new data for it
y_train = train_data["LSOA name"]
y_val = val_data["LSOA name"]
y_test = test_data["LSOA name"]

# Change the y data to numerical data for model by replacing letter at the end and removing the word Barnet
mapping = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5}
print('The LSOA name will be changed to numerical data by replacing letter at the end: ' + str(mapping))
print('For example, Barnet 023A will be 230.')

num_train = np.array([int(value.split()[1][:3]) for value in y_train])
let_train = np.array([mapping[value.split()[1][-1]] for value in y_train])
y_train = num_train * 10 + let_train


num_val = np.array([int(value.split()[1][:3]) for value in y_val])
let_val = np.array([mapping[value.split()[1][-1]] for value in y_val])
y_val = num_val * 10 + let_val
y_val = y_val.reshape(-1, 1)

num_test = np.array([int(value.split()[1][:3]) for value in y_test])
let_test = np.array([mapping[value.split()[1][-1]] for value in y_test])
y_test = num_test * 10 + let_test
y_test = y_test.reshape(-1, 1)
print("The outcome data is now numerical.")

# Delete the y from the x data and unwanted columns
drop = ["Crime type", "Last outcome category", "Reported by", "Crime ID", "Location", "LSOA name"]
x_train = train_data.drop(labels=drop, axis=1, inplace=False)
x_val = val_data.drop(labels=drop, axis=1, inplace=False)
x_test = test_data.drop(labels=drop, axis=1, inplace=False)

# Change month data to numerical data
x_train["Month"] = x_train["Month"].str.replace("-", "").astype(int)
x_val["Month"] = x_val["Month"].str.replace("-", "").astype(int)
x_test["Month"] = x_test["Month"].str.replace("-", "").astype(int)

# Extract numerical data of LSOA code
x_train["LSOA code"] = x_train["LSOA code"].str[1:].astype(int)
x_val["LSOA code"] = x_val["LSOA code"].str[1:].astype(int)
x_test["LSOA code"] = x_test["LSOA code"].str[1:].astype(int)

data_list = [x_train, x_val, x_test]

def data_encoder(lst: list):
    for data in lst:
        if 'LSOA code' in data.columns:
            encoder = LabelEncoder()
            try:
                encoded_labels = encoder.fit_transform(data['LSOA code'])
                data['LSOA_Encoded'] = np.where(encoded_labels <= 210, encoded_labels, -1)
            except Exception as e:
                print(f"Error occurred while encoding column: {e}")
                continue
        else:
            print("Column 'LSOA code' not found in dataframe.")

data_encoder(data_list)

The LSOA name will be changed to numerical data by replacing letter at the end: {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5}
For example, Barnet 023A will be 230.
The outcome data is now numerical.


In [44]:
dtrain_reg_1 = xgb.DMatrix(x_train, y_train, enable_categorical = True)
dtest_reg_1 = xgb.DMatrix(x_test, y_test, enable_categorical = True)

In [45]:
dtrain_reg = xgb.DMatrix(x_train, label = y_train)
dtest_reg = xgb.DMatrix(x_test, label = y_test)

In [46]:
dtrain_reg_1 == dtrain_reg

False

In [47]:
x_train

Unnamed: 0.1,Unnamed: 0,Month,Longitude,Latitude,LSOA code,LSOA_Encoded
0,0,202211,-0.204414,51.653895,1000248,132
1,1,202211,-0.207948,51.654471,1000248,132
2,2,202211,-0.209802,51.658770,1000249,133
3,3,202211,-0.199524,51.652885,1000250,134
4,4,202211,-0.196977,51.653682,1000250,134
...,...,...,...,...,...,...
30981,30981,201212,-0.282356,51.621970,1000191,75
30982,30982,201212,-0.274363,51.620927,1000191,75
30983,30983,201212,-0.151981,51.615476,1000158,42
30984,30984,201212,-0.150158,51.616625,1000158,42


In [48]:
num_classes = len(np.unique(y_train))
print(len(y_train))

print(np.unique(y_test))
print(num_classes)

30986
[ 10  11  12  13  14  15  20  21  22  23  24  30  31  32  33  40  41  42
  43  44  45  50  51  52  53  60  61  62  63  64  70  71  72  73  74  75
  80  81  82  83  84  90  91  92  93  94 100 101 102 103 104 110 111 112
 113 114 120 121 122 123 124 130 131 132 133 134 135 140 141 142 143 144
 145 150 151 152 153 154 155 160 161 162 163 170 171 172 173 180 181 182
 183 184 190 191 192 193 194 195 200 201 202 203 204 210 211 212 213 214
 220 221 222 223 224 225 230 231 232 233 240 241 242 243 244 245 250 251
 252 253 254 260 261 262 263 264 270 271 272 273 274 275 280 281 282 283
 284 290 291 292 293 294 295 300 301 303 304 305 310 311 312 313 320 321
 322 323 324 325 330 331 332 333 334 335 340 341 342 343 350 351 352 353
 354 355 360 361 362 363 364 365 370 371 372 373 374 375 380 381 382 383
 384 390 391 392 393 400 401 402 403 410 411 412 413]
211


In [36]:
params = {
    'objective': 'multi:softmax',  # Binary classification
    'eval_metric': 'auc', # Log loss as the evaluation metric
    'tree_method': 'gpu_hist'  
}   
params['num_class'] = num_classes      

n = 10
model = xgb.train(
   params=params,
   dtrain=dtrain_reg_1,
   num_boost_round=n,
)

XGBoostError: [17:37:52] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-07593ffd91cd9da33-1\xgboost\xgboost-ci-windows\src\objective\multiclass_obj.cu:123: SoftmaxMultiClassObj: label must be in [0, num_class).

## Issue with code

LSOA in y variables do not match the encoding of x variables. 
Need to match them
