In [1]:
import time

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from xgboost import XGBRegressor

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# read and verify the dataset
df = pd.read_csv("sydney_traffic_incidents_clean.csv")
df.head()

Unnamed: 0,Main_Category,Longitude,Latitude,Day,duration,Primary_Vehicle,Secondary_Vehicle,Is_Major_Incident,Advice_A,Advice_B,...,Emergency services,Transport for NSW,Tow truck,Heavy vehicle tow truck,Incident_Type,Num_Vehicles_Involved,Month,Hour,distance_to_CBD,duration_class
0,1,151.058,-33.9431,5,39.31995,1,2,0,10,0,...,1,0,0,0,0,2,8,19,15.781064,1
1,0,151.159,-33.8096,6,12.029,1,9,0,10,134,...,0,0,0,0,1,1,8,0,8.323913,0
2,1,150.875,-33.7344,6,16.956667,1,9,0,26,134,...,1,0,0,0,0,1,8,1,34.354983,0
3,1,150.973,-33.8172,6,42.804117,5,2,0,10,0,...,1,1,0,0,0,1,8,1,22.500677,1
4,1,150.981,-33.7919,6,40.84,3,4,0,10,134,...,0,0,0,0,0,3,8,1,22.753003,1


In [3]:
# for linear regression we need the continuous "duration" column
y = df["duration"]

# drop both duration and duration_class as they are highly corelated
X = df.drop(columns=["duration_class", "duration"])

In [4]:
# much like SVM, we need to scale the numeric fields
numeric_cols = X.select_dtypes(include=["float64", "float32"]).columns.tolist()

# also like SVM, categories should be one-hot encoded
int_cols = X.select_dtypes(include=["int64", "int32"]).columns.tolist()

# I will similarly eliminate the categories that are simply too large
# to deal with this, I will have to remove the high cardinality columns before one-hot encoding because they choke the model
cardinality = X[int_cols].nunique().sort_values()

low_cardinality = [c for c in int_cols if X[c].nunique() <= 50]
high_cardinality = [c for c in int_cols if X[c].nunique() > 50]

print("Low-cardinality categorical:", low_cardinality[:10])
print("High-cardinality to drop:", high_cardinality[:10])

Low-cardinality categorical: ['Main_Category', 'Day', 'Primary_Vehicle', 'Secondary_Vehicle', 'Is_Major_Incident', 'Advice_A', 'Closure_Type', 'Direction', 'Affected_Lanes', 'Actual_Number_of_Lanes']
High-cardinality to drop: ['Advice_B', 'Main_Street', 'Suburb', 'SA2_CODE21', '12_NoN', '13_NDEs', '15_NNC3L', '16_NNC4L', '18_NE', '21 NBS']


In [5]:
# drop the high cardinality columns
X_filtered = X.drop(columns=high_cardinality)

#one-hot encode and scale numerics
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), low_cardinality)
    ]
)

In [6]:
# finally, split and train the model
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y, test_size=0.2, random_state=42, shuffle=True
)

lin_reg = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LinearRegression())
])
start_time = time.time()
lin_reg.fit(X_train, y_train)
end_time = time.time()

y_pred = lin_reg.predict(X_test)

print(f"Training time: {end_time - start_time:.2f} seconds")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score:", r2_score(y_test, y_pred))

Training time: 3.25 seconds
MAE: 25.98973634487247
RMSE: 41.65279368704854
R2 Score: 0.185416093757792


In [7]:
# now, let's compare to the XGBoost version of linear regression, using the same preprocessing and default values

xgb_linear = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", XGBRegressor(
        booster="gblinear",
        objective="reg:squarederror",
        n_estimators=100,
        reg_lambda=1.0,
        reg_alpha=0.0,
        n_jobs=-1,
        random_state=42,
    )),
])

start_time = time.time()
xgb_linear.fit(X_train, y_train)
end_time = time.time()

y_pred = xgb_linear.predict(X_test)

print(f"Training time: {end_time - start_time:.2f} seconds")
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("R2 Score:", r2_score(y_test, y_pred))

Training time: 1.10 seconds
MAE: 27.816036021811282
RMSE: 44.648345221062314
R2 Score: 0.06403784211544539
