In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Random seed for reproducibility
np.random.seed(17)

In [2]:
# ===================== LOAD AND CLEAN DATA =====================
data_ksi = pd.read_csv("./Total_KSI.csv")

# Initial data overview
"""Displays an overview of the dataset."""
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

print("\n===================== DATA OVERVIEW =====================")
print("\nFirst 3 Records:\n", data_ksi.head(3))
print("\nShape of the DataFrame:", data_ksi.shape)
print("\nData Types:\n", data_ksi.dtypes)

print("\n===================== DATA DESCRIPTION =====================")
print("\nStatistical Summary:\n", data_ksi.describe())

print("\n===================== COLUMN INFORMATION =====================")
data_ksi.info()

print("\n===================== MISSING VALUES =====================")
missing_data = data_ksi.isnull().sum().sort_values(ascending=False)
missing_percent = (missing_data / len(data_ksi)) * 100
print(pd.concat([missing_data, missing_percent], axis=1, keys=['Total Missing', 'Percent Missing']))

print("Class Distribution:\n", data_ksi['ACCLASS'].value_counts())

print("\n===================== UNIQUE VALUES =====================")
for column in data_ksi.columns:
    print(f"\nUnique values in {column} ({len(data_ksi[column].unique())}):", data_ksi[column].unique())


# Drop unnecessary columns

columns_to_drop = [ 'OBJECTID', 'INDEX',  # index_id 
    'FATAL_NO', # sequence No. - high missing values
    'OFFSET', #high missing values
    'x', 'y','CYCLISTYPE', 'PEDTYPE', 'PEDACT', # high correlation
    'EMERG_VEH',       # 0 permutation importance 
    'CYCCOND',         # 0 permutation importance 
    "HOOD_140","NEIGHBOURHOOD_140","HOOD_158","STREET1","STREET2"
]





First 3 Records:
    OBJECTID    INDEX    ACCNUM                  DATE  TIME       STREET1  \
0         1  3389067  893184.0  1/1/2006 10:00:00 AM   236  WOODBINE AVE   
1         2  3389068  893184.0  1/1/2006 10:00:00 AM   236  WOODBINE AVE   
2         3  3389069  893184.0  1/1/2006 10:00:00 AM   236  WOODBINE AVE   

       STREET2 OFFSET      ROAD_CLASS               DISTRICT   LATITUDE  \
0  O CONNOR DR    NaN  Major Arterial  Toronto and East York  43.699595   
1  O CONNOR DR    NaN  Major Arterial  Toronto and East York  43.699595   
2  O CONNOR DR    NaN  Major Arterial  Toronto and East York  43.699595   

   LONGITUDE                ACCLOC    TRAFFCTL VISIBILITY LIGHT RDSFCOND  \
0 -79.318797  Intersection Related  No Control      Clear  Dark      Wet   
1 -79.318797  Intersection Related  No Control      Clear  Dark      Wet   
2 -79.318797  Intersection Related  No Control      Clear  Dark      Wet   

            ACCLASS    IMPACTYPE    INVTYPE    INVAGE INJURY  FATAL_N

In [3]:
# Drop unnecessary columns
data_ksi.drop(columns=columns_to_drop, inplace=True)

# Handle missing target values and specific rows
data_ksi['ACCLASS'] = data_ksi['ACCLASS'].fillna('Fatal')
data_ksi.drop(data_ksi[data_ksi['ACCLASS'] == 'Property Damage O'].index, inplace=True)
data_ksi.drop_duplicates(inplace=True)

print("Class Distribution:\n", data_ksi['ACCLASS'].value_counts())


Class Distribution:
 ACCLASS
Non-Fatal Injury    15744
Fatal                2587
Name: count, dtype: int64


In [4]:
def aggregate_rows(group):
    # Find the row with the maximum number of non-null values in the non-'Fatal' group
    max_non_null_row_idx = group.notnull().sum(axis=1).idxmax()
    max_non_null_row = group.loc[max_non_null_row_idx].copy()  
    
    # Apply aggregation based on the column type (mean for numerical, mode for categorical)
    for col in max_non_null_row.index:
        if pd.api.types.is_numeric_dtype(group[col]):
            # For numerical columns, apply the mean
            mean_value = group[col].mean()
            max_non_null_row[col] = mean_value
        else:
            # For categorical columns, apply the mode
            mode_value = group[col].mode().iloc[0] if not group[col].mode().empty else None
            max_non_null_row[col] = mode_value
    
    # Return the processed non-Fatal row
    return max_non_null_row.to_frame().T


In [5]:
# Separate fatal rows
fatal_rows = data_ksi[data_ksi['ACCLASS'] == 'Fatal']

# Separate non-fatal rows
non_fatal_rows = data_ksi[data_ksi['ACCLASS'] != 'Fatal']

# Apply aggregation logic on non-fatal rows based on ACCNUM
aggregated_data = non_fatal_rows.groupby(['ACCNUM'], as_index=False).apply(aggregate_rows, include_groups=False).reset_index(drop=True)

# Combine the aggregated data with the fatal rows
data_ksi = pd.concat([aggregated_data, fatal_rows], ignore_index=True)

# Shuffle the combined data
data_ksi = data_ksi.sample(frac=1, random_state=42).reset_index(drop=True)

print("Class Distribution:\n", data_ksi['ACCLASS'].value_counts())


Class Distribution:
 ACCLASS
Non-Fatal Injury    4324
Fatal               2587
Name: count, dtype: int64


In [6]:
# Map month numbers to seasons using the MONTH column
def month_to_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

In [7]:
# Format date and time
data_ksi["DATE"] = pd.to_datetime(data_ksi["DATE"]).dt.to_period("D").astype(str)

# Extract date components from the 'DATE' column
data_ksi['MONTH'] = pd.to_datetime(data_ksi['DATE']).dt.month
data_ksi['DAY'] = pd.to_datetime(data_ksi['DATE']).dt.day
data_ksi['DAYOFWEEK'] = pd.to_datetime(data_ksi['DATE']).dt.dayofweek

# Extract week of month as number (1 to 5)
data_ksi['WEEK_OF_MONTH'] = ((data_ksi['DAY'] - 1) // 7 + 1)

# Extract season
data_ksi['SEASON'] = data_ksi['MONTH'].apply(month_to_season)

# Replace specific values
data_ksi['ROAD_CLASS'] = data_ksi['ROAD_CLASS'].str.replace(r'MAJOR ARTERIAL ', 'MAJOR ARTERIAL', regex=False)

# Fill missing values
unknown_columns = ['PEDCOND', 'DRIVCOND', 'MANOEUVER', 'CYCACT',
                   'INJURY', 'VEHTYPE', 'INVTYPE', 'IMPACTYPE', 'DISTRICT', 'INITDIR']
other_columns = ['ROAD_CLASS', 'ACCLOC', 'VISIBILITY', 'LIGHT', 'RDSFCOND', 'DRIVACT']
boolean_columns = ['PEDESTRIAN', 'CYCLIST', 'MOTORCYCLE',
                   'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL', 'TRSN_CITY_VEH', 'DISABILITY','AUTOMOBILE','TRUCK']

data_ksi[boolean_columns] = data_ksi[boolean_columns].fillna("No")

data_ksi['INVAGE'] = data_ksi['INVAGE'].fillna("unknown")

# Handle age column
data_ksi['INVAGE'] = data_ksi['INVAGE'].replace('unknown', np.nan)
data_ksi['INVAGE'] = data_ksi['INVAGE'].str.replace('OVER 95', '95 to 100')
data_ksi[['min_age', 'max_age']] = data_ksi['INVAGE'].str.split(' to ', expand=True)
data_ksi['min_age'] = pd.to_numeric(data_ksi['min_age'], errors='coerce')
data_ksi['max_age'] = pd.to_numeric(data_ksi['max_age'], errors='coerce')
data_ksi['AVG_AGE'] = data_ksi[['min_age', 'max_age']].mean(axis=1).astype(float)

data_ksi.drop(columns=['INVAGE','min_age', 'max_age'], inplace=True)
data_ksi[other_columns] = data_ksi[other_columns].fillna("Other")
data_ksi[unknown_columns] = data_ksi[unknown_columns].fillna("Unknown")

# Convert boolean columns to numeric
pd.set_option('future.no_silent_downcasting', True)
data_ksi[boolean_columns] = data_ksi[boolean_columns].replace({'Yes': 1, 'No': 0}).astype(float)

data_ksi["TRAFFCTL"] = data_ksi["TRAFFCTL"].fillna("No_Control")


In [8]:
# Apply the function to extract hours and minutes
data_ksi['HOUR'] = data_ksi['TIME'].apply(lambda x: f"{int(x) // 100:02d}" if x >= 100 else '00')  # Extract hours for 3 or 4 digits
data_ksi['MINUTE'] = data_ksi['TIME'].apply(lambda x: f"{int(x) % 100:02d}" if x >= 100 else f"{int(x):02d}")  # Extract minutes

data_ksi['HOUR'] = data_ksi['HOUR'].astype(int)
data_ksi['MINUTE'] = data_ksi['MINUTE'].astype(int)

data_ksi.drop(columns=['TIME','DATE','MONTH','DAY','ACCNUM'], inplace=True)

from imblearn.under_sampling import RandomUnderSampler
# Handle class imbalance
rus = RandomUnderSampler(random_state=17)
X_res, y_res = rus.fit_resample(data_ksi.drop(columns=['ACCLASS']), data_ksi['ACCLASS'])
data_ksi = pd.concat([X_res, y_res], axis=1).sample(frac=1, random_state=17).reset_index(drop=True)

print("\n===================== DATA CLEANING DONE =====================")
print("\nShape of the DataFrame after cleaning:", data_ksi.shape)
print("Class Distribution:\n", data_ksi['ACCLASS'].value_counts())



Shape of the DataFrame after cleaning: (5174, 40)
Class Distribution:
 ACCLASS
Fatal               2587
Non-Fatal Injury    2587
Name: count, dtype: int64


In [9]:
print("\n===================== UNIQUE VALUES =====================")
for column in data_ksi.columns:
    print(f"\nUnique values in {column}:", data_ksi[column].unique())



Unique values in ROAD_CLASS: ['Other' 'Collector' 'Major Arterial' 'Minor Arterial' 'Local'
 'Expressway' 'Expressway Ramp' 'Laneway' 'Pending' 'Major Arterial '
 'Major Shoreline']

Unique values in DISTRICT: ['Toronto and East York' 'North York' 'Scarborough' 'Unknown'
 'Etobicoke York']

Unique values in LATITUDE: [43.636344 43.726345 43.672345 ... 43.675574 43.770243 43.730945]

Unique values in LONGITUDE: [-79.466892 -79.43439 -79.37959 ... -79.348621 -79.537082 -79.460712]

Unique values in ACCLOC: ['Other' 'At Intersection' 'Intersection Related' 'Non Intersection'
 'At/Near Private Drive' 'Laneway' 'Overpass or Bridge' 'Private Driveway'
 'Underpass or Tunnel']

Unique values in TRAFFCTL: ['No Control' 'Stop Sign' 'Traffic Signal' 'Pedestrian Crossover'
 'Traffic Controller' 'Traffic Gate' 'School Guard' 'No_Control'
 'Yield Sign' 'Streetcar (Stop for)' 'Police Control']

Unique values in VISIBILITY: ['Clear' 'Rain' 'Other' 'Snow' 'Fog, Mist, Smoke, Dust' 'Freezing Rain'
 'St

In [10]:
features = data_ksi.drop(columns=["ACCLASS"])
target = data_ksi["ACCLASS"]

unseen_features = features[-10:]
unseen_labels = target[-10:]

features = features[:-10]
target = target[:-10]

cleaned_df = data_ksi.drop(data_ksi.index[-10:])

# Encode the target variable
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(target)

# Encode unseen labels
unseen_labels = label_encoder.transform(unseen_labels)

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Split the data into train & test
X_train, X_test, y_train, y_test = train_test_split(
        features, target, stratify=target, test_size=0.2, random_state=17)

# Preprocess the data
num_features = features.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = features.select_dtypes(include=['object']).columns.tolist()

print("\n===================== FEATURES INFO =====================")
print("\nNumerical Features:", num_features)
print("\nCategorical Features:", cat_features)

num_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])

cat_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value='mode')),
        ('encoder', OneHotEncoder(handle_unknown="ignore", sparse_output=False))
    ])

preprocessor = ColumnTransformer([
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ])




Numerical Features: ['PEDESTRIAN', 'CYCLIST', 'AUTOMOBILE', 'MOTORCYCLE', 'TRUCK', 'TRSN_CITY_VEH', 'PASSENGER', 'SPEEDING', 'AG_DRIV', 'REDLIGHT', 'ALCOHOL', 'DISABILITY', 'AVG_AGE']

Categorical Features: ['ROAD_CLASS', 'DISTRICT', 'LATITUDE', 'LONGITUDE', 'ACCLOC', 'TRAFFCTL', 'VISIBILITY', 'LIGHT', 'RDSFCOND', 'IMPACTYPE', 'INVTYPE', 'INJURY', 'INITDIR', 'VEHTYPE', 'MANOEUVER', 'DRIVACT', 'DRIVCOND', 'PEDCOND', 'CYCACT', 'NEIGHBOURHOOD_158', 'DIVISION', 'SEASON']


In [12]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [13]:
def fit_and_predict(classifiers, X_train, y_train, X_test, y_test, voting_type):
    from sklearn.metrics import accuracy_score,f1_score, roc_auc_score, confusion_matrix, precision_score, recall_score, classification_report
    
    print(f"\n===================== {voting_type.upper()} VOTING PREDICTIONS =====================")
    print("\npredicting for the first three instances of test data:")
    for classifier in classifiers:
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test[:3])
        print(f"\nClassifier: {classifier}")
        print("Confusion Matrix:\n", confusion_matrix(y_test, classifier.predict(X_test)))
        print("Classification Report:\n", classification_report(y_test, classifier.predict(X_test)))

        print ("Training Accuracy Score:", classifier.score(X_train, y_train))
        print("Testing Accuracy Score:", accuracy_score(y_test, classifier.predict(X_test)))

        print("Precision Score:", precision_score(y_test, classifier.predict(X_test)))
        print("Recall Score:", recall_score(y_test, classifier.predict(X_test)))
        print("F1 Score:", f1_score(y_test, classifier.predict(X_test)))
        print("ROC AUC Score:", roc_auc_score(y_test, classifier.predict(X_test)))
            
        print(f"Predictions: {predictions}")
        print(f"Actual Values: {y_test[:3]}")

        classifier.fit(unseen_features, unseen_labels)
        unseen_predictions = classifier.predict(unseen_features)
        unseen_accuracy = classifier.score(unseen_features, unseen_labels)

        print("\n===================== UNSEEN DATA METRICS =====================")
        print("\nUnseen Predictions:", unseen_predictions)
        for i in range(len(unseen_features)):
            print(f"Predicted: {unseen_predictions[i]} Actual: {unseen_labels[i]}")
        
        print(f"Unseen Data Accuracy: {unseen_accuracy:.4f}")


In [14]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Define the classifiers
log_reg_H = LogisticRegression(max_iter=1400)

dt_H = DecisionTreeClassifier(criterion='entropy', max_depth=42)

nn_H = MLPClassifier(
    activation='tanh',
    alpha=0.01,
    hidden_layer_sizes=(15, 10, 1),
    learning_rate='invscaling',
    max_iter=1000,
    solver='adam')

svm_H = SVC(C=1, kernel='linear')

svm_soft_H = SVC(C=1, kernel='linear',probability=True)  # For soft voting

rf_H = RandomForestClassifier(
    n_estimators=1000, 
    random_state=37, 
    n_jobs=-1, 
    class_weight='balanced')


In [15]:
# hard voting
voting_H = VotingClassifier(estimators=[('lr', log_reg_H), ('rf', rf_H), ('svm', svm_H), ('dt', dt_H), ('nn', nn_H)], voting='hard')
voting_H.fit(X_train, y_train)
classifiers_hard = [log_reg_H, rf_H, svm_H, dt_H, nn_H, voting_H]
unseen_features = preprocessor.transform(unseen_features)
fit_and_predict(classifiers_hard, X_train, y_train, X_test, y_test, "Hard")



predicting for the first three instances of test data:

Classifier: LogisticRegression(max_iter=1400)
Confusion Matrix:
 [[502  14]
 [  7 510]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       516
           1       0.97      0.99      0.98       517

    accuracy                           0.98      1033
   macro avg       0.98      0.98      0.98      1033
weighted avg       0.98      0.98      0.98      1033

Training Accuracy Score: 0.9917695473251029
Testing Accuracy Score: 0.9796708615682478
Precision Score: 0.9732824427480916
Recall Score: 0.9864603481624759
F1 Score: 0.9798270893371758
ROC AUC Score: 0.9796642826083697
Predictions: [1 0 1]
Actual Values: [1 0 1]


Unseen Predictions: [1 1 1 0 0 1 0 0 0 1]
Predicted: 1 Actual: 1
Predicted: 1 Actual: 1
Predicted: 1 Actual: 1
Predicted: 0 Actual: 0
Predicted: 0 Actual: 0
Predicted: 1 Actual: 1
Predicted: 0 Actual: 0
Predicted: 0 Actual: 0
Predicted: 0

In [17]:
# Soft voting
voting_S = VotingClassifier(estimators=[('lr', log_reg_H), ('rf', rf_H), ('svm', svm_soft_H), ('dt', dt_H), ('nn', nn_H)], voting='soft')
voting_S.fit(X_train, y_train)
classifiers_soft = [log_reg_H, rf_H, svm_soft_H, dt_H, nn_H, voting_S]

fit_and_predict(classifiers_soft, X_train, y_train, X_test, y_test, "Soft")




predicting for the first three instances of test data:

Classifier: LogisticRegression(max_iter=1400)
Confusion Matrix:
 [[502  14]
 [  7 510]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       516
           1       0.97      0.99      0.98       517

    accuracy                           0.98      1033
   macro avg       0.98      0.98      0.98      1033
weighted avg       0.98      0.98      0.98      1033

Training Accuracy Score: 0.9917695473251029
Testing Accuracy Score: 0.9796708615682478
Precision Score: 0.9732824427480916
Recall Score: 0.9864603481624759
F1 Score: 0.9798270893371758
ROC AUC Score: 0.9796642826083697
Predictions: [1 0 1]
Actual Values: [1 0 1]


Unseen Predictions: [1 1 1 0 0 1 0 0 0 1]
Predicted: 1 Actual: 1
Predicted: 1 Actual: 1
Predicted: 1 Actual: 1
Predicted: 0 Actual: 0
Predicted: 0 Actual: 0
Predicted: 1 Actual: 1
Predicted: 0 Actual: 0
Predicted: 0 Actual: 0
Predicted: 0





Unseen Predictions: [1 1 1 0 0 1 0 0 0 1]
Predicted: 1 Actual: 1
Predicted: 1 Actual: 1
Predicted: 1 Actual: 1
Predicted: 0 Actual: 0
Predicted: 0 Actual: 0
Predicted: 1 Actual: 1
Predicted: 0 Actual: 0
Predicted: 0 Actual: 0
Predicted: 0 Actual: 0
Predicted: 1 Actual: 1
Unseen Data Accuracy: 1.0000

Classifier: VotingClassifier(estimators=[('lr', LogisticRegression(max_iter=1400)),
                             ('rf',
                              RandomForestClassifier(class_weight='balanced',
                                                     n_estimators=1000,
                                                     n_jobs=-1,
                                                     random_state=37)),
                             ('svm',
                              SVC(C=1, kernel='linear', probability=True)),
                             ('dt',
                              DecisionTreeClassifier(criterion='entropy',
                                                     max_depth=42))