In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import zipfile
import io
import requests
from pathlib import Path
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor

# Fixed seed shared by NumPy's global generator and the models defined below.
SEED = 42
np.random.seed(SEED)

# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")

In [None]:
def download_data(data_dir=None, force_download=False, timeout=60):
    """Fetch the two crime CSVs and the London boundary shapefile archive.

    The CSVs are read from their URLs with pandas and re-saved locally; the
    shapefile archive is downloaded with ``requests`` and unpacked into
    ``<data_dir>/shapefiles``. Nothing is downloaded when both CSVs and the
    ``statistical-gis-boundaries-london`` folder are already present, unless
    ``force_download`` is true.

    Parameters
    ----------
    data_dir : str or pathlib.Path, optional
        Target folder; defaults to ``./crime_data``. Missing parent folders
        are created.
    force_download : bool, default False
        Re-download everything even if the local files exist.
    timeout : float, default 60
        Seconds to wait for the shapefile server before giving up, so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict or None
        ``{'recent_crime': <path>, 'historical_crime': <path>}`` with string
        paths, or ``None`` if downloading/extracting the shapefile failed
        (the error is printed). Errors while fetching the CSVs propagate.
    """
    # Data source URLs
    urls = {
        'recent_crime': 'https://raw.githubusercontent.com/IflyNY2PR/DSSS_cw/6bac9ee3834c73d705106153bf91b315bb1faf01/MPS%20LSOA%20Level%20Crime%20(most%20recent%2024%20months).csv',
        'historical_crime': 'https://raw.githubusercontent.com/IflyNY2PR/DSSS_cw/refs/heads/main/MPS%20LSOA%20Level%20Crime%20(Historical).csv',
        'shapefile': 'https://github.com/IflyNY2PR/DSSS_cw/raw/main/statistical-gis-boundaries-london.zip'
    }

    # Accept plain strings as well as Path objects for the target folder.
    data_dir = Path('./crime_data') if data_dir is None else Path(data_dir)
    shapefile_dir = data_dir / 'shapefiles'
    # parents=True creates data_dir (and anything above it) in the same call.
    shapefile_dir.mkdir(parents=True, exist_ok=True)

    # Define file paths
    paths = {
        'recent_crime': str(data_dir / 'recent_crime.csv'),
        'historical_crime': str(data_dir / 'historical_crime.csv')
    }

    # The shapefile check assumes the archive unpacks into a folder with this
    # name -- TODO confirm against the archive layout.
    files_exist = all([
        Path(paths['recent_crime']).exists(),
        Path(paths['historical_crime']).exists(),
        (shapefile_dir / 'statistical-gis-boundaries-london').exists()
    ])

    if not files_exist or force_download:
        print("Downloading data files...")
        # Download CSV files
        for name in ['recent_crime', 'historical_crime']:
            print(f"Downloading {name}...")
            pd.read_csv(urls[name]).to_csv(paths[name], index=False)

        # Download and extract shapefile
        print("Downloading and extracting shapefile...")
        try:
            response = requests.get(urls['shapefile'], timeout=timeout)
            response.raise_for_status()
            # Context manager closes the archive even if extraction fails.
            with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
                archive.extractall(shapefile_dir)
        except Exception as e:
            # Best-effort: report the problem and signal failure with None.
            print(f"Error downloading shapefile: {e}")
            return None
    else:
        print("Data files already exist.")
    return paths

def preprocess_data(historical_df, recent_df):
    """Reshape the two wide crime tables into one long, de-duplicated table.

    Every column other than the five identifier columns is treated as a
    ``YYYYMM`` month label. Both inputs are melted to long form, stacked
    (historical first, recent second), and the month labels are parsed as the
    first day of that month. Where the same LSOA / major / minor / date key
    occurs more than once, the last occurrence is kept. The result is sorted
    by date and identifiers and gains ``month``, ``year`` and ``day_of_week``
    columns derived from ``date``.
    """
    id_columns = ['LSOA Code', 'LSOA Name', 'Borough', 'Major Category', 'Minor Category']
    record_key = ['LSOA Code', 'Major Category', 'Minor Category', 'date']

    def _to_long(wide_df):
        # Anything that is not an identifier column is a month column.
        month_columns = [name for name in wide_df.columns if name not in id_columns]
        return wide_df.melt(
            id_vars=id_columns,
            value_vars=month_columns,
            var_name='date',
            value_name='count',
        )

    # Historical rows come first so that keep='last' below favours recent data.
    long_df = pd.concat([_to_long(historical_df), _to_long(recent_df)])

    # Append day "01" so each YYYYMM label parses as a full date.
    long_df['date'] = pd.to_datetime(long_df['date'] + '01', format='%Y%m%d')

    is_repeated = long_df.duplicated(subset=record_key, keep=False)
    if is_repeated.any():
        print(f"Found {is_repeated.sum()} duplicate entries. Keeping most recent data.")
        long_df = long_df.drop_duplicates(subset=record_key, keep='last')

    long_df = long_df.sort_values(['date', 'LSOA Code', 'Major Category', 'Minor Category'])

    # Calendar features taken from the parsed date.
    stamps = long_df['date'].dt
    return long_df.assign(
        month=stamps.month,
        year=stamps.year,
        day_of_week=stamps.dayofweek,
    )

In [3]:
# Load and preprocess data
data_paths = download_data()
if data_paths is None:
    # download_data() returns None when the shapefile step fails; stop here
    # with a clear message instead of a TypeError on the subscript below.
    raise RuntimeError("Data download failed; see the error message printed above.")

recent_crime_df = pd.read_csv(data_paths['recent_crime'])
historical_crime_df = pd.read_csv(data_paths['historical_crime'])

# Long-format table with one row per LSOA / category / month.
crime_df = preprocess_data(historical_crime_df, recent_crime_df)

# Filter for demo categories
demo_categories = ['THEFT', 'VIOLENCE AGAINST THE PERSON', 'VEHICLE OFFENCES']
crime_df = crime_df[crime_df['Major Category'].isin(demo_categories)]

print(f"Filtered dataset shape: {crime_df.shape}")
print(f"Categories: {crime_df['Major Category'].unique()}")

Data files already exist.
Filtered dataset shape: (8703900, 10)
Categories: ['THEFT' 'VEHICLE OFFENCES' 'VIOLENCE AGAINST THE PERSON']


In [4]:
def get_category_data(df, category):
    """Return per-date, per-LSOA totals of ``count`` for one major category.

    Rows whose ``Major Category`` equals ``category`` are selected and their
    counts are summed over everything that shares the same date, LSOA code,
    month and year (i.e. minor categories are collapsed). The input frame is
    not modified.
    """
    group_keys = ['date', 'LSOA Code', 'month', 'year']
    in_category = df['Major Category'] == category
    totals = df.loc[in_category].groupby(group_keys)['count'].sum()
    return totals.reset_index()

def prepare_tabular_data(data, window_size=3):
    """Add ``lag_1`` .. ``lag_<window_size>`` columns of past counts per LSOA.

    Each ``lag_i`` holds the ``count`` found ``i`` rows earlier within the same
    ``LSOA Code`` group. Because the lags are row-based, the rows are first put
    into chronological order (stable sort on ``date``, when that column
    exists); input that is already date-ordered is left in the same order.
    Rows containing any missing value -- which includes the first
    ``window_size`` rows of every LSOA -- are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``LSOA Code`` and ``count``; ``date`` is used for
        ordering when present. The frame is not modified.
    window_size : int, default 3
        Number of lag columns to create.

    Returns
    -------
    pandas.DataFrame
        A new frame with the lag columns added and incomplete rows removed.
    """
    df = data.copy()
    if 'date' in df.columns:
        # shift() looks back by row position, so rows must be in time order;
        # a stable sort keeps the existing order within each date.
        df = df.sort_values('date', kind='stable')

    # Create lag features
    for i in range(1, window_size + 1):
        df[f'lag_{i}'] = df.groupby('LSOA Code')['count'].shift(i)

    return df.dropna()

In [None]:
def _regression_metrics(y_true, y_pred):
    """Return (MAE, RMSE, WMAPE in percent, relative RMSE) for two 1-D arrays."""
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    # WMAPE divides the total absolute error by the total of the actuals, so
    # zero-count rows do not blow up the ratio; 1e-8 guards an all-zero total.
    wmape = (np.sum(np.abs(y_true - y_pred)) / (np.sum(y_true) + 1e-8)) * 100
    rrmse = rmse / (np.mean(y_true) + 1e-8)
    return mae, rmse, wmape, rrmse


def run_xgboost(category, window_size=3, train_frac=0.8, source_df=None):
    """Train and evaluate an XGBoost regressor on lagged counts for one category.

    The category's per-LSOA series is turned into lag features, the distinct
    dates are split chronologically (earliest ``train_frac`` share for
    training, the rest for testing), a model is fitted on the lag columns plus
    ``month`` and ``year``, and predictions are clipped at zero before
    scoring.

    Parameters
    ----------
    category : str
        Value of ``Major Category`` to model.
    window_size : int, default 3
        Number of lag features.
    train_frac : float, default 0.8
        Share of distinct dates used for training.
    source_df : pandas.DataFrame, optional
        Long-format crime table; defaults to the notebook-level ``crime_df``.

    Returns
    -------
    tuple
        ``(model, mae, rmse, wmape, rrmse)`` where ``wmape`` is in percent.

    Raises
    ------
    ValueError
        If the date split leaves the training or the test set empty.
    """
    print(f"Running XGBoost for {category}...")
    # Fall back to the notebook-level table when no frame is passed in.
    frame = crime_df if source_df is None else source_df
    data = get_category_data(frame, category)
    df = prepare_tabular_data(data, window_size)

    # Chronological split on distinct dates (no shuffling across time).
    dates = sorted(df['date'].unique())
    split_idx = int(len(dates) * train_frac)
    if split_idx == 0 or split_idx >= len(dates):
        raise ValueError(
            f"Cannot split {len(dates)} distinct date(s) with train_frac={train_frac}: "
            "both the training and the test period need at least one date."
        )
    train_dates = dates[:split_idx]
    test_dates = dates[split_idx:]

    train_df = df[df['date'].isin(train_dates)]
    test_df = df[df['date'].isin(test_dates)]

    features = [f'lag_{i}' for i in range(1, window_size + 1)] + ['month', 'year']
    target = 'count'

    X_train = train_df[features].values
    y_train = train_df[target].values
    X_test = test_df[features].values
    y_test = test_df[target].values

    model = XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=SEED,
        n_jobs=-1
    )

    model.fit(X_train, y_train)

    # Ensure non-negative predictions
    preds = np.maximum(model.predict(X_test), 0)

    mae, rmse, mape, rrmse = _regression_metrics(y_test, preds)

    print(f"XGBoost Results for {category}: MAE={mae:.4f}, RMSE={rmse:.4f}, WMAPE={mape:.4f}%, rRMSE={rrmse:.4f}")
    return model, mae, rmse, mape, rrmse

# Fit one model per demo category and keep its four evaluation metrics.
metric_labels = ('MAE', 'RMSE', 'WMAPE', 'rRMSE')
xgboost_results = {}
for cat in demo_categories:
    _, mae, rmse, mape, rrmse = run_xgboost(cat)
    xgboost_results[cat] = dict(zip(metric_labels, (mae, rmse, mape, rrmse)))

# Display results: one row per category, one column per metric.
results_df = pd.DataFrame(xgboost_results).T
print("\nSummary of XGBoost Results:")
print(results_df)

Running XGBoost for THEFT...
XGBoost Results for THEFT: MAE=2.2457, RMSE=12.2009, WMAPE=12.2564%, rRMSE=2.6636
Running XGBoost for VIOLENCE AGAINST THE PERSON...
XGBoost Results for VIOLENCE AGAINST THE PERSON: MAE=1.9546, RMSE=2.6734, WMAPE=12.4798%, rRMSE=0.6828
Running XGBoost for VEHICLE OFFENCES...
XGBoost Results for VEHICLE OFFENCES: MAE=1.1679, RMSE=1.6435, WMAPE=17.4471%, rRMSE=0.9820

Summary of XGBoost Results:
                                  MAE       RMSE      WMAPE     rRMSE
THEFT                        2.245677  12.200862  12.256428  2.663589
VIOLENCE AGAINST THE PERSON  1.954618   2.673394  12.479815  0.682762
VEHICLE OFFENCES             1.167941   1.643459  17.447124  0.982023
