In [94]:

# Importing necessary libraries
from gettext import install
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Modeling
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Evaluation
from sklearn.metrics import classification_report, f1_score


In [95]:
# Data Loading
df = pd.read_csv('Traffic_Crashes.csv')
df.head()

  df = pd.read_csv('Traffic_Crashes.csv')


Unnamed: 0,CRASH_RECORD_ID,CRASH_DATE_EST_I,CRASH_DATE,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,...,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LATITUDE,LONGITUDE,LOCATION
0,97f1975e8f3e9a1b53ae1abfb6982a374074d8649d9e97...,,01/28/2026 10:56:00 PM,30,NO CONTROLS,NO CONTROLS,SNOW,"DARKNESS, LIGHTED ROAD",FIXED OBJECT,NOT DIVIDED,...,0.0,0.0,1.0,0.0,22,4,1,41.713829,-87.551093,POINT (-87.551093105845 41.713829100033)
1,1a00190102664f10ee5c2ee8767d45c331991692f12dfc...,,01/28/2026 10:25:00 PM,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,...,0.0,0.0,2.0,0.0,22,4,1,41.796711,-87.755202,POINT (-87.755202215729 41.796710893317)
2,a4fc7133c8193ec53288a9acec055321dee47515621012...,Y,01/28/2026 10:10:00 PM,30,OTHER,OTHER,OTHER,UNKNOWN,PARKED MOTOR VEHICLE,OTHER,...,0.0,0.0,2.0,0.0,22,4,1,41.813005,-87.603823,POINT (-87.603822899265 41.813004951227)
3,e79f2db27a528710d42b2eb1991876b7a9bf029aee3685...,,01/28/2026 10:10:00 PM,30,STOP SIGN/FLASHER,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,FOUR WAY,...,0.0,0.0,3.0,0.0,22,4,1,41.868335,-87.705668,POINT (-87.705668192505 41.868335288795)
4,48040347f534c316e38421a60b65ab7017ae47cb4a0c3c...,,01/28/2026 10:05:00 PM,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,NOT DIVIDED,...,0.0,0.0,2.0,0.0,22,4,1,41.866618,-87.696128,POINT (-87.696128029764 41.866617682133)


In [96]:
# Data Overview
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1024029 entries, 0 to 1024028
Data columns (total 48 columns):
 #   Column                         Non-Null Count    Dtype  
---  ------                         --------------    -----  
 0   CRASH_RECORD_ID                1024029 non-null  object 
 1   CRASH_DATE_EST_I               74318 non-null    object 
 2   CRASH_DATE                     1024029 non-null  object 
 3   POSTED_SPEED_LIMIT             1024029 non-null  int64  
 4   TRAFFIC_CONTROL_DEVICE         1024029 non-null  object 
 5   DEVICE_CONDITION               1024029 non-null  object 
 6   WEATHER_CONDITION              1024029 non-null  object 
 7   LIGHTING_CONDITION             1024029 non-null  object 
 8   FIRST_CRASH_TYPE               1024029 non-null  object 
 9   TRAFFICWAY_TYPE                1024029 non-null  object 
 10  LANE_CNT                       199035 non-null   object 
 11  ALIGNMENT                      1024029 non-null  object 
 12  ROADWAY_SURFAC

In [97]:
#Gettint the null values in the dataset
df.isnull().sum()

CRASH_RECORD_ID                        0
CRASH_DATE_EST_I                  949711
CRASH_DATE                             0
POSTED_SPEED_LIMIT                     0
TRAFFIC_CONTROL_DEVICE                 0
DEVICE_CONDITION                       0
WEATHER_CONDITION                      0
LIGHTING_CONDITION                     0
FIRST_CRASH_TYPE                       0
TRAFFICWAY_TYPE                        0
LANE_CNT                          824994
ALIGNMENT                              0
ROADWAY_SURFACE_COND                   0
ROAD_DEFECT                            0
REPORT_TYPE                        34014
CRASH_TYPE                             0
INTERSECTION_RELATED_I            788579
NOT_RIGHT_OF_WAY_I                978151
HIT_AND_RUN_I                     702756
DAMAGE                                 0
DATE_POLICE_NOTIFIED                   0
PRIM_CONTRIBUTORY_CAUSE                0
SEC_CONTRIBUTORY_CAUSE                 0
STREET_NO                              0
STREET_DIRECTION

In [98]:
df.columns

Index(['CRASH_RECORD_ID', 'CRASH_DATE_EST_I', 'CRASH_DATE',
       'POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'LANE_CNT', 'ALIGNMENT', 'ROADWAY_SURFACE_COND',
       'ROAD_DEFECT', 'REPORT_TYPE', 'CRASH_TYPE', 'INTERSECTION_RELATED_I',
       'NOT_RIGHT_OF_WAY_I', 'HIT_AND_RUN_I', 'DAMAGE', 'DATE_POLICE_NOTIFIED',
       'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO',
       'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE',
       'PHOTOS_TAKEN_I', 'STATEMENTS_TAKEN_I', 'DOORING_I', 'WORK_ZONE_I',
       'WORK_ZONE_TYPE', 'WORKERS_PRESENT_I', 'NUM_UNITS',
       'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH',
       'LATITUDE', 

In [99]:
# Getting the percentage of missing values in each column 
(df.isnull().mean() * 100).sort_values(ascending=False)

WORKERS_PRESENT_I                99.861625
DOORING_I                        99.681454
WORK_ZONE_TYPE                   99.586828
WORK_ZONE_I                      99.458218
PHOTOS_TAKEN_I                   98.572990
STATEMENTS_TAKEN_I               97.610322
NOT_RIGHT_OF_WAY_I               95.519853
CRASH_DATE_EST_I                 92.742588
LANE_CNT                         80.563539
INTERSECTION_RELATED_I           77.007487
HIT_AND_RUN_I                    68.626572
REPORT_TYPE                       3.321586
LOCATION                          0.760428
LONGITUDE                         0.760428
LATITUDE                          0.760428
MOST_SEVERE_INJURY                0.217767
INJURIES_TOTAL                    0.216400
INJURIES_FATAL                    0.216400
INJURIES_INCAPACITATING           0.216400
INJURIES_NON_INCAPACITATING       0.216400
INJURIES_REPORTED_NOT_EVIDENT     0.216400
INJURIES_NO_INDICATION            0.216400
INJURIES_UNKNOWN                  0.216400
BEAT_OF_OCC

In [100]:
df.duplicated().value_counts()

False    1024029
Name: count, dtype: int64

In [101]:
df1 = df.drop(
    columns=[
        'CRASH_RECORD_ID',
        'NOT_RIGHT_OF_WAY_I',
        'LANE_CNT',
        'INTERSECTION_RELATED_I',
        'HIT_AND_RUN_I',
        'CRASH_DATE_EST_I',
        'PHOTOS_TAKEN_I',
        'STATEMENTS_TAKEN_I',
        'DOORING_I',
        'WORK_ZONE_I',
        'WORK_ZONE_TYPE',
        'WORKERS_PRESENT_I',
        'LATITUDE',
        'LONGITUDE',
        'CRASH_DATE'
    ]
)
df1.head()

Unnamed: 0,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,...,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LOCATION
0,30,NO CONTROLS,NO CONTROLS,SNOW,"DARKNESS, LIGHTED ROAD",FIXED OBJECT,NOT DIVIDED,STRAIGHT AND LEVEL,SNOW OR SLUSH,NO DEFECTS,...,0.0,0.0,0.0,0.0,1.0,0.0,22,4,1,POINT (-87.551093105845 41.713829100033)
1,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,22,4,1,POINT (-87.755202215729 41.796710893317)
2,30,OTHER,OTHER,OTHER,UNKNOWN,PARKED MOTOR VEHICLE,OTHER,STRAIGHT AND LEVEL,OTHER,UNKNOWN,...,0.0,0.0,0.0,0.0,2.0,0.0,22,4,1,POINT (-87.603822899265 41.813004951227)
3,30,STOP SIGN/FLASHER,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,3.0,0.0,22,4,1,POINT (-87.705668192505 41.868335288795)
4,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,22,4,1,POINT (-87.696128029764 41.866617682133)


In [102]:
# Getting the percentage of missing values in each column after dropping unnecessary columns
(df1.isnull().mean() * 100).sort_values(ascending=False)

REPORT_TYPE                      3.321586
LOCATION                         0.760428
MOST_SEVERE_INJURY               0.217767
INJURIES_REPORTED_NOT_EVIDENT    0.216400
INJURIES_TOTAL                   0.216400
INJURIES_FATAL                   0.216400
INJURIES_INCAPACITATING          0.216400
INJURIES_NON_INCAPACITATING      0.216400
INJURIES_NO_INDICATION           0.216400
INJURIES_UNKNOWN                 0.216400
BEAT_OF_OCCURRENCE               0.000488
STREET_DIRECTION                 0.000391
STREET_NAME                      0.000098
CRASH_HOUR                       0.000000
CRASH_DAY_OF_WEEK                0.000000
CRASH_MONTH                      0.000000
NUM_UNITS                        0.000000
POSTED_SPEED_LIMIT               0.000000
TRAFFIC_CONTROL_DEVICE           0.000000
SEC_CONTRIBUTORY_CAUSE           0.000000
PRIM_CONTRIBUTORY_CAUSE          0.000000
DATE_POLICE_NOTIFIED             0.000000
DAMAGE                           0.000000
CRASH_TYPE                       0

In [103]:
# Get the Shape 
df1.shape 

(1024029, 33)

In [104]:
# Checking the columns of the new dataframe
df1.columns

Index(['POSTED_SPEED_LIMIT', 'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION',
       'WEATHER_CONDITION', 'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE',
       'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT',
       'REPORT_TYPE', 'CRASH_TYPE', 'DAMAGE', 'DATE_POLICE_NOTIFIED',
       'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE', 'STREET_NO',
       'STREET_DIRECTION', 'STREET_NAME', 'BEAT_OF_OCCURRENCE', 'NUM_UNITS',
       'MOST_SEVERE_INJURY', 'INJURIES_TOTAL', 'INJURIES_FATAL',
       'INJURIES_INCAPACITATING', 'INJURIES_NON_INCAPACITATING',
       'INJURIES_REPORTED_NOT_EVIDENT', 'INJURIES_NO_INDICATION',
       'INJURIES_UNKNOWN', 'CRASH_HOUR', 'CRASH_DAY_OF_WEEK', 'CRASH_MONTH',
       'LOCATION'],
      dtype='object')

In [105]:
# Separate categorical columns
cat_cols = df1.select_dtypes(include='object').columns
# Fill missing categorical values with the mode
df1[cat_cols] = df1[cat_cols].fillna(df1[cat_cols].mode().iloc[0])

In [106]:
# Separate numeric columns
num_cols = df1.select_dtypes(include=['int', 'float']).columns
# Fill missing numeric values with the median
df1[num_cols] = df1[num_cols].fillna(
    df1[num_cols].median()
)

In [107]:
# Getting the percentage of missing values in each column after dropping unnecessary columns
(df1.isnull().mean() * 100).sort_values(ascending=False)

POSTED_SPEED_LIMIT               0.0
STREET_DIRECTION                 0.0
CRASH_MONTH                      0.0
CRASH_DAY_OF_WEEK                0.0
CRASH_HOUR                       0.0
INJURIES_UNKNOWN                 0.0
INJURIES_NO_INDICATION           0.0
INJURIES_REPORTED_NOT_EVIDENT    0.0
INJURIES_NON_INCAPACITATING      0.0
INJURIES_INCAPACITATING          0.0
INJURIES_FATAL                   0.0
INJURIES_TOTAL                   0.0
MOST_SEVERE_INJURY               0.0
NUM_UNITS                        0.0
BEAT_OF_OCCURRENCE               0.0
STREET_NAME                      0.0
STREET_NO                        0.0
TRAFFIC_CONTROL_DEVICE           0.0
SEC_CONTRIBUTORY_CAUSE           0.0
PRIM_CONTRIBUTORY_CAUSE          0.0
DATE_POLICE_NOTIFIED             0.0
DAMAGE                           0.0
CRASH_TYPE                       0.0
REPORT_TYPE                      0.0
ROAD_DEFECT                      0.0
ROADWAY_SURFACE_COND             0.0
ALIGNMENT                        0.0
T

In [108]:
# Create a mapping dictionary
# This reduces 40 specific causes into 5 broad "Buckets"
cause_mapping = {
    # DRIVER ERROR (The biggest category)
    'FOLLOWING TOO CLOSELY': 'Driver Error',
    'FAILING TO YIELD RIGHT-OF-WAY': 'Driver Error',
    'FAILING TO REDUCE SPEED TO AVOID CRASH': 'Driver Error',
    'IMPROPER BACKING': 'Driver Error',
    'IMPROPER OVERTAKING/PASSING': 'Driver Error',
    'IMPROPER TURNING/NO SIGNAL': 'Driver Error',
    'DRIVING SKILLS/KNOWLEDGE/EXPERIENCE': 'Driver Error',
    'DISREGARDING TRAFFIC SIGNALS': 'Driver Error',
    'OPERATING VEHICLE IN ERRATIC, RECKLESS, CARELESS, NEGLIGENT OR AGGRESSIVE MANNER': 'Driver Error',
    'TEXTING': 'Driver Error',
    'DISTRACTION - FROM INSIDE VEHICLE': 'Driver Error',
    'DISTRACTION - FROM OUTSIDE VEHICLE': 'Driver Error',
    'PHYSICAL CONDITION OF DRIVER': 'Driver Error',
    
    # EXTERNAL FACTORS
    'WEATHER': 'External Factors',
    'ROAD ENGINEERING/SURFACE/MARKING DEFECTS': 'External Factors',
    'VISION OBSCURED (SIGNS, TREE LIMBS, BUILDINGS, ETC.)': 'External Factors',
    'ANIMAL': 'External Factors',
    
    # VEHICLE DEFECTS
    'EQUIPMENT - VEHICLE CONDITION': 'Vehicle Defect',
    'BRAKESLESS/FAILURE': 'Vehicle Defect',
    
    # UNKNOWN (Usually the biggest or second biggest)
    'UNABLE TO DETERMINE': 'Unknown',
    'NOT APPLICABLE': 'Unknown'
}

# 1. Apply the mapping
# If a cause is NOT in the dictionary, we default it to 'Other'
df_crashes = df1.copy()
df_crashes['New_Target_Category'] = df_crashes['PRIM_CONTRIBUTORY_CAUSE'].map(cause_mapping).fillna('Other')

# 2. Check the new counts
print("New grouped categories:")
print(df_crashes['New_Target_Category'].value_counts())
from sklearn.preprocessing import LabelEncoder

# Instantiate
le = LabelEncoder()

# Encode the NEW grouped column
df_crashes['Target_Encoded'] = le.fit_transform(df_crashes['New_Target_Category'])

# Update your y variable
y = df_crashes['Target_Encoded']

print("Encoding complete. Classes are:", le.classes_)

New grouped categories:
New_Target_Category
Driver Error        464480
Unknown             456049
Other                72885
External Factors     24364
Vehicle Defect        6251
Name: count, dtype: int64
Encoding complete. Classes are: ['Driver Error' 'External Factors' 'Other' 'Unknown' 'Vehicle Defect']


In [109]:
# Target variable (use the encoded target created earlier)
y = df_crashes['Target_Encoded']

# Features (all remaining columns) — drop the target columns if present
X = df_crashes.drop(columns=['New_Target_Category', 'Target_Encoded'], errors='ignore')


The target variable was encoded into numerical form to support multi-class classification. The original categorical target was removed from the feature set to prevent data leakage. All remaining variables were used as predictors to allow the models to learn from the full crash context.

We can now create one preprocessing pipeline that we can use across all models  

In [110]:
# Removing the 'PRIM_CONTRIBUTORY_CAUSE' column to avoid leakage
leakage_cols = [
    col for col in X.columns
    if 'PRIM_CONTRIBUTORY_CAUSE' in col
    or 'SEC_CONTRIBUTORY_CAUSE' in col
]

X = X.drop(columns=leakage_cols, errors='ignore')
X.head()


Unnamed: 0,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,...,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LOCATION
0,30,NO CONTROLS,NO CONTROLS,SNOW,"DARKNESS, LIGHTED ROAD",FIXED OBJECT,NOT DIVIDED,STRAIGHT AND LEVEL,SNOW OR SLUSH,NO DEFECTS,...,0.0,0.0,0.0,0.0,1.0,0.0,22,4,1,POINT (-87.551093105845 41.713829100033)
1,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,22,4,1,POINT (-87.755202215729 41.796710893317)
2,30,OTHER,OTHER,OTHER,UNKNOWN,PARKED MOTOR VEHICLE,OTHER,STRAIGHT AND LEVEL,OTHER,UNKNOWN,...,0.0,0.0,0.0,0.0,2.0,0.0,22,4,1,POINT (-87.603822899265 41.813004951227)
3,30,STOP SIGN/FLASHER,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,3.0,0.0,22,4,1,POINT (-87.705668192505 41.868335288795)
4,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,22,4,1,POINT (-87.696128029764 41.866617682133)


In [111]:
# Identify categorical and numerical features
categorical_features = X.select_dtypes(include=['object', 'category']).columns
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

In [112]:
#Train-test split
# 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42, stratify=y
)

In [113]:
## Import libraries for model building and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score

In [114]:
categorical_cols = df1.select_dtypes(include=['object']).columns

In [115]:
X_train

Unnamed: 0,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,...,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LOCATION
899405,40,NO CONTROLS,NO CONTROLS,CLEAR,DARKNESS,FIXED OBJECT,DIVIDED - W/MEDIAN BARRIER,"CURVE, LEVEL",DRY,NO DEFECTS,...,0.0,0.0,1.0,0.0,0.0,0.0,3,1,11,POINT (-87.618091911783 41.898389053094)
479809,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,UNKNOWN,UNKNOWN,...,0.0,0.0,0.0,0.0,10.0,0.0,13,1,9,POINT (-87.642384512979 41.940186722574)
553121,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,SNOW,"DARKNESS, LIGHTED ROAD",ANGLE,FOUR WAY,STRAIGHT AND LEVEL,SNOW OR SLUSH,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,22,3,1,POINT (-87.80634529093 41.930744417308)
992763,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,4.0,0.0,22,2,8,POINT (-87.70096006787 41.877305760362)
514753,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,UNKNOWN,...,0.0,0.0,0.0,0.0,1.0,0.0,13,7,6,POINT (-87.688304588055 41.953491697799)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
919601,30,OTHER,OTHER,CLEAR,DAYLIGHT,TURNING,DIVIDED - W/MEDIAN BARRIER,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,15,2,9,POINT (-87.660174752888 41.991780377892)
784211,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,UNKNOWN,...,0.0,0.0,0.0,0.0,1.0,0.0,15,5,11,POINT (-87.663815590987 41.907808782674)
673016,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,TURNING,DIVIDED - W/MEDIAN (NOT RAISED),STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,5,6,10,POINT (-87.704257223316 41.811600339039)
236334,40,NO CONTROLS,NO CONTROLS,CLEAR,DUSK,ANGLE,DIVIDED - W/MEDIAN BARRIER,STRAIGHT AND LEVEL,DRY,WORN SURFACE,...,0.0,0.0,0.0,0.0,2.0,0.0,16,5,12,POINT (-87.653110814421 41.985449532208)


In [116]:
X_test


Unnamed: 0,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,...,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LOCATION
426857,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",SIDESWIPE SAME DIRECTION,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,23,1,3,POINT (-87.766253945447 41.880333955527)
42008,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,SIDESWIPE SAME DIRECTION,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,16,6,9,POINT (-87.70508348897 41.798881170398)
39997,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,DAYLIGHT,REAR END,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,9,6,9,POINT (-87.562908182042 41.76619520231)
620460,30,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,15,6,5,POINT (-87.617719481874 41.758471711463)
731576,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,PARKING LOT,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,4.0,0.0,22,6,4,POINT (-87.614709812077 41.721991872532)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
640257,20,NO CONTROLS,NO CONTROLS,UNKNOWN,UNKNOWN,PARKED MOTOR VEHICLE,ONE-WAY,STRAIGHT AND LEVEL,UNKNOWN,UNKNOWN,...,0.0,0.0,0.0,0.0,3.0,0.0,18,1,2,POINT (-87.631180891189 41.89093246868)
969443,30,NO CONTROLS,NO CONTROLS,CLEAR,DUSK,TURNING,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,9.0,0.0,15,4,1,POINT (-87.607532022114 41.767416946505)
304252,30,NO CONTROLS,NO CONTROLS,CLEAR,UNKNOWN,PARKED MOTOR VEHICLE,DIVIDED - W/MEDIAN BARRIER,STRAIGHT AND LEVEL,UNKNOWN,NO DEFECTS,...,0.0,0.0,0.0,0.0,1.0,0.0,11,7,5,POINT (-87.765519558311 41.880346236952)
971835,30,NO CONTROLS,NO CONTROLS,CLEAR,DAWN,PARKED MOTOR VEHICLE,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,1.0,0.0,0,2,12,POINT (-87.575890342787 41.753911617689)


Preprocessing X train 

In [117]:
X_train_cat = X_train.select_dtypes(include= ['object', 'string']).copy()
X_train_cat.head()

Unnamed: 0,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,REPORT_TYPE,CRASH_TYPE,DAMAGE,DATE_POLICE_NOTIFIED,STREET_DIRECTION,STREET_NAME,MOST_SEVERE_INJURY,LOCATION
899405,NO CONTROLS,NO CONTROLS,CLEAR,DARKNESS,FIXED OBJECT,DIVIDED - W/MEDIAN BARRIER,"CURVE, LEVEL",DRY,NO DEFECTS,ON SCENE,INJURY AND / OR TOW DUE TO CRASH,"OVER $1,500",11/19/2017 03:46:00 AM,N,LAKE SHORE DR SB,NONINCAPACITATING INJURY,POINT (-87.618091911783 41.898389053094)
479809,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,UNKNOWN,UNKNOWN,NOT ON SCENE (DESK REPORT),NO INJURY / DRIVE AWAY,"$501 - $1,500",09/20/2021 04:20:00 PM,W,BELMONT AVE,NO INDICATION OF INJURY,POINT (-87.642384512979 41.940186722574)
553121,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,SNOW,"DARKNESS, LIGHTED ROAD",ANGLE,FOUR WAY,STRAIGHT AND LEVEL,SNOW OR SLUSH,NO DEFECTS,ON SCENE,INJURY AND / OR TOW DUE TO CRASH,"OVER $1,500",01/19/2021 10:15:00 PM,N,HARLEM AVE,NO INDICATION OF INJURY,POINT (-87.80634529093 41.930744417308)
992763,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,NOT ON SCENE (DESK REPORT),NO INJURY / DRIVE AWAY,"$501 - $1,500",08/08/2016 10:40:00 PM,S,SACRAMENTO BLVD,NO INDICATION OF INJURY,POINT (-87.70096006787 41.877305760362)
514753,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,UNKNOWN,NOT ON SCENE (DESK REPORT),NO INJURY / DRIVE AWAY,"OVER $1,500",06/06/2021 01:43:00 PM,N,WESTERN AVE,NO INDICATION OF INJURY,POINT (-87.688304588055 41.953491697799)


In [118]:
# checking the shape of categorical training data 
X_train_cat.head()
X_train_cat.shape


(819223, 17)

In [119]:
X_train_cat.nunique().sort_values(ascending=False)

DATE_POLICE_NOTIFIED      648600
LOCATION                  301590
STREET_NAME                 1626
TRAFFICWAY_TYPE               20
TRAFFIC_CONTROL_DEVICE        19
FIRST_CRASH_TYPE              18
WEATHER_CONDITION             12
DEVICE_CONDITION               8
ROADWAY_SURFACE_COND           7
ROAD_DEFECT                    7
ALIGNMENT                      6
LIGHTING_CONDITION             6
MOST_SEVERE_INJURY             5
STREET_DIRECTION               4
REPORT_TYPE                    3
DAMAGE                         3
CRASH_TYPE                     2
dtype: int64

In [120]:
X_train_cat = X_train_cat.drop(columns=['date_police_notified','location'], errors='ignore')
X_train_cat.head()

Unnamed: 0,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,REPORT_TYPE,CRASH_TYPE,DAMAGE,DATE_POLICE_NOTIFIED,STREET_DIRECTION,STREET_NAME,MOST_SEVERE_INJURY,LOCATION
899405,NO CONTROLS,NO CONTROLS,CLEAR,DARKNESS,FIXED OBJECT,DIVIDED - W/MEDIAN BARRIER,"CURVE, LEVEL",DRY,NO DEFECTS,ON SCENE,INJURY AND / OR TOW DUE TO CRASH,"OVER $1,500",11/19/2017 03:46:00 AM,N,LAKE SHORE DR SB,NONINCAPACITATING INJURY,POINT (-87.618091911783 41.898389053094)
479809,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,UNKNOWN,UNKNOWN,NOT ON SCENE (DESK REPORT),NO INJURY / DRIVE AWAY,"$501 - $1,500",09/20/2021 04:20:00 PM,W,BELMONT AVE,NO INDICATION OF INJURY,POINT (-87.642384512979 41.940186722574)
553121,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,SNOW,"DARKNESS, LIGHTED ROAD",ANGLE,FOUR WAY,STRAIGHT AND LEVEL,SNOW OR SLUSH,NO DEFECTS,ON SCENE,INJURY AND / OR TOW DUE TO CRASH,"OVER $1,500",01/19/2021 10:15:00 PM,N,HARLEM AVE,NO INDICATION OF INJURY,POINT (-87.80634529093 41.930744417308)
992763,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,NOT ON SCENE (DESK REPORT),NO INJURY / DRIVE AWAY,"$501 - $1,500",08/08/2016 10:40:00 PM,S,SACRAMENTO BLVD,NO INDICATION OF INJURY,POINT (-87.70096006787 41.877305760362)
514753,NO CONTROLS,NO CONTROLS,CLEAR,DAYLIGHT,PARKED MOTOR VEHICLE,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,UNKNOWN,NOT ON SCENE (DESK REPORT),NO INJURY / DRIVE AWAY,"OVER $1,500",06/06/2021 01:43:00 PM,N,WESTERN AVE,NO INDICATION OF INJURY,POINT (-87.688304588055 41.953491697799)


In [121]:
X_train_cat.isna().sum()

TRAFFIC_CONTROL_DEVICE    0
DEVICE_CONDITION          0
WEATHER_CONDITION         0
LIGHTING_CONDITION        0
FIRST_CRASH_TYPE          0
TRAFFICWAY_TYPE           0
ALIGNMENT                 0
ROADWAY_SURFACE_COND      0
ROAD_DEFECT               0
REPORT_TYPE               0
CRASH_TYPE                0
DAMAGE                    0
DATE_POLICE_NOTIFIED      0
STREET_DIRECTION          0
STREET_NAME               0
MOST_SEVERE_INJURY        0
LOCATION                  0
dtype: int64

In [122]:
X_train_cat.shape

(819223, 17)

In [123]:
# Keep only top 50 streets, rest as 'Other'
top_streets = X_train_cat['STREET_NAME'].value_counts().nlargest(50).index
X_train_cat['STREET_NAME_GROUPED'] = X_train_cat['STREET_NAME'].where(X_train_cat['STREET_NAME'].isin(top_streets), 'Other')

In [124]:
X_train_cat.columns

Index(['TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION',
       'LIGHTING_CONDITION', 'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE',
       'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT', 'REPORT_TYPE',
       'CRASH_TYPE', 'DAMAGE', 'DATE_POLICE_NOTIFIED', 'STREET_DIRECTION',
       'STREET_NAME', 'MOST_SEVERE_INJURY', 'LOCATION', 'STREET_NAME_GROUPED'],
      dtype='object')

In [125]:
X_train_cat = X_train_cat.drop(columns=["STREET_NAME"], errors="ignore")

In [126]:
X_train_cat.nunique().sort_values(ascending=False)

DATE_POLICE_NOTIFIED      648600
LOCATION                  301590
STREET_NAME_GROUPED           51
TRAFFICWAY_TYPE               20
TRAFFIC_CONTROL_DEVICE        19
FIRST_CRASH_TYPE              18
WEATHER_CONDITION             12
DEVICE_CONDITION               8
ROADWAY_SURFACE_COND           7
ROAD_DEFECT                    7
ALIGNMENT                      6
LIGHTING_CONDITION             6
MOST_SEVERE_INJURY             5
STREET_DIRECTION               4
REPORT_TYPE                    3
DAMAGE                         3
CRASH_TYPE                     2
dtype: int64

In [127]:
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

In [128]:
# preprocessing for Categorical preprocessing
# reuse the OneHotEncoder instance already created (ohe)
categorical_transformer = Pipeline(steps=[
    ('encoder', ohe)
])


In [129]:
#  Numerical preprocessing
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

In [130]:
# Combine preprocessing for numeric features and categorical features
preprocessor = ColumnTransformer(
    transformers=[
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features)]
)

Whitebox model : Logistic Regression And Decision Tree


In [131]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    stratify=y
)


In [132]:
# checking for leakage in the target related columns
[col for col in X.columns if 'cause' in col.lower()]
[col for col in X.columns if 'target' in col.lower()]


[]

In [133]:
# Memory-efficient mutual information estimation:
# - sample rows
# - one-hot encode only a reduced set of categorical columns (or use `cat_reduced` if available)
# - keep numeric columns
import scipy.sparse as sp

n_samples = min(100_000, len(X))
rng = np.random.RandomState(42)
sample_idx = rng.choice(len(X), n_samples, replace=False)
X_sample = X.iloc[sample_idx]
y_sample = y.iloc[sample_idx]

# choose categorical columns to include (prefer precomputed `cat_reduced` if available)
if 'cat_reduced' in globals():
    cat_cols_small = [c for c in cat_reduced if c in X_sample.columns]
else:
    exclude = {'LOCATION', 'STREET_NAME', 'DATE_POLICE_NOTIFIED'}
    cat_cols_small = [
        c for c in X_sample.select_dtypes(include=['object', 'category']).columns
        if X_sample[c].nunique() <= 50 and c not in exclude
    ]

num_cols_small = X_sample.select_dtypes(include=[np.number]).columns.tolist()

# One-hot encode categorical columns to sparse matrix
X_cat_sparse = None
cat_feature_names = []
if len(cat_cols_small) > 0:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=True)
    X_cat_sparse = encoder.fit_transform(X_sample[cat_cols_small])
    try:
        cat_feature_names = encoder.get_feature_names_out(cat_cols_small).tolist()
    except Exception:
        # fallback name construction
        cat_feature_names = [
            f"{c}_{v}" for c in cat_cols_small for v in encoder.categories_[cat_cols_small.index(c)]
        ]

# Numeric columns as sparse
X_num_sparse = None
if len(num_cols_small) > 0:
    X_num_sparse = sp.csr_matrix(X_sample[num_cols_small].values)

# Combine sparse parts
if X_cat_sparse is not None and X_num_sparse is not None:
    X_enc = sp.hstack([X_cat_sparse, X_num_sparse], format='csr')
    feature_names = cat_feature_names + num_cols_small
elif X_cat_sparse is not None:
    X_enc = X_cat_sparse
    feature_names = cat_feature_names
else:
    X_enc = X_num_sparse
    feature_names = num_cols_small

# Estimate mutual information on the reduced/sampled set
mi = mutual_info_classif(X_enc, y_sample, discrete_features='auto', random_state=42)
mi_scores = pd.Series(mi, index=feature_names)
mi_scores.sort_values(ascending=False).head(10)


STREET_NO                                0.134397
FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE    0.028704
INJURIES_NO_INDICATION                   0.025010
DEVICE_CONDITION_FUNCTIONING PROPERLY    0.016345
LIGHTING_CONDITION_UNKNOWN               0.014879
ROADWAY_SURFACE_COND_DRY                 0.013696
WEATHER_CONDITION_UNKNOWN                0.013610
BEAT_OF_OCCURRENCE                       0.012572
ROADWAY_SURFACE_COND_UNKNOWN             0.012420
FIRST_CRASH_TYPE_REAR END                0.012155
dtype: float64

In [134]:
# Removing the 'PRIM_CONTRIBUTORY_CAUSE' column to avoid leakage
leakage_cols = [
    col for col in X.columns
    if 'PRIM_CONTRIBUTORY_CAUSE' in col
    or 'SEC_CONTRIBUTORY_CAUSE' in col
]

X = X.drop(columns=leakage_cols, errors='ignore')
X.head()



Unnamed: 0,POSTED_SPEED_LIMIT,TRAFFIC_CONTROL_DEVICE,DEVICE_CONDITION,WEATHER_CONDITION,LIGHTING_CONDITION,FIRST_CRASH_TYPE,TRAFFICWAY_TYPE,ALIGNMENT,ROADWAY_SURFACE_COND,ROAD_DEFECT,...,INJURIES_FATAL,INJURIES_INCAPACITATING,INJURIES_NON_INCAPACITATING,INJURIES_REPORTED_NOT_EVIDENT,INJURIES_NO_INDICATION,INJURIES_UNKNOWN,CRASH_HOUR,CRASH_DAY_OF_WEEK,CRASH_MONTH,LOCATION
0,30,NO CONTROLS,NO CONTROLS,SNOW,"DARKNESS, LIGHTED ROAD",FIXED OBJECT,NOT DIVIDED,STRAIGHT AND LEVEL,SNOW OR SLUSH,NO DEFECTS,...,0.0,0.0,0.0,0.0,1.0,0.0,22,4,1,POINT (-87.551093105845 41.713829100033)
1,30,TRAFFIC SIGNAL,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,22,4,1,POINT (-87.755202215729 41.796710893317)
2,30,OTHER,OTHER,OTHER,UNKNOWN,PARKED MOTOR VEHICLE,OTHER,STRAIGHT AND LEVEL,OTHER,UNKNOWN,...,0.0,0.0,0.0,0.0,2.0,0.0,22,4,1,POINT (-87.603822899265 41.813004951227)
3,30,STOP SIGN/FLASHER,FUNCTIONING PROPERLY,CLEAR,"DARKNESS, LIGHTED ROAD",REAR END,FOUR WAY,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,3.0,0.0,22,4,1,POINT (-87.705668192505 41.868335288795)
4,30,NO CONTROLS,NO CONTROLS,CLEAR,"DARKNESS, LIGHTED ROAD",PARKED MOTOR VEHICLE,NOT DIVIDED,STRAIGHT AND LEVEL,DRY,NO DEFECTS,...,0.0,0.0,0.0,0.0,2.0,0.0,22,4,1,POINT (-87.696128029764 41.866617682133)


In [135]:
# Rechecking the sanity of the dataset 
X.shape, y.shape


((1024029, 31), (1024029,))

In [136]:
from sklearn.linear_model import LogisticRegression

# Use the existing ColumnTransformer to preprocess categorical & numeric features before logistic regression
log_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('logreg', LogisticRegression(max_iter=1000, multi_class='auto', class_weight='balanced', n_jobs=-1))
])

# Drop very high-cardinality categorical columns from OHE to save memory
high_card = ['LOCATION', 'STREET_NAME', 'DATE_POLICE_NOTIFIED']
cat_reduced = [c for c in categorical_features if c not in high_card]

preprocessor_light = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True), cat_reduced),
        ('num', StandardScaler(), numeric_features)
    ]
)

# use a new pipeline that uses the memory-efficient preprocessor
log_pipe = Pipeline([
    ('preprocessor', preprocessor_light),
    ('logreg', LogisticRegression(max_iter=1000, multi_class='auto', class_weight='balanced', n_jobs=-1))
])

log_pipe.fit(X_train, y_train)
y_log_pred = log_pipe.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_log_pred))
print(classification_report(y_test, y_log_pred, zero_division=0))



Accuracy: 0.39111158852767985
              precision    recall  f1-score   support

           0       0.61      0.40      0.48    139344
           1       0.13      0.67      0.22      7309
           2       0.15      0.48      0.23     21866
           3       0.69      0.35      0.46    136815
           4       0.03      0.58      0.05      1875

    accuracy                           0.39    307209
   macro avg       0.32      0.50      0.29    307209
weighted avg       0.60      0.39      0.45    307209



In [137]:
# Check for strong correlations with the target (memory-safe)
temp = X.copy()
temp['target'] = y

# Select only numeric features + low-cardinality categorical features to avoid exploding one-hot encoding
if 'cat_reduced' in globals():
	low_card_cat = [c for c in cat_reduced if c in temp.columns]
else:
	low_card_cat = [c for c in temp.select_dtypes(include=['object', 'category']).columns
					if temp[c].nunique() <= 50]

# Exclude known very-high-card columns if present
exclude_high_card = {'LOCATION', 'STREET_NAME', 'DATE_POLICE_NOTIFIED'}
low_card_cat = [c for c in low_card_cat if c not in exclude_high_card]

# Ensure we do not include 'target' twice (avoid duplicate column names)
num_cols_local = [c for c in temp.select_dtypes(include=[np.number]).columns.tolist() if c != 'target']
cols_to_use = num_cols_local + low_card_cat

# Work on a reduced DataFrame to save memory (keep target once)
temp_small = temp[cols_to_use + ['target']].copy()

# One-hot encode only the low-card categorical columns, use compact dtype
temp_encoded = pd.get_dummies(temp_small, columns=low_card_cat, drop_first=False, dtype='uint8')

# Drop any duplicated column names just in case
temp_encoded = temp_encoded.loc[:, ~temp_encoded.columns.duplicated()]

# Get absolute correlations with the target (Series) and sort
correlations = temp_encoded.corr()['target'].abs().sort_values(ascending=False)
print(correlations.head(15))


target                                   1.000000
FIRST_CRASH_TYPE_PARKED MOTOR VEHICLE    0.230134
DEVICE_CONDITION_FUNCTIONING PROPERLY    0.172350
LIGHTING_CONDITION_UNKNOWN               0.149284
TRAFFIC_CONTROL_DEVICE_TRAFFIC SIGNAL    0.144905
WEATHER_CONDITION_UNKNOWN                0.144595
ROADWAY_SURFACE_COND_UNKNOWN             0.140044
INJURIES_NO_INDICATION                   0.136918
FIRST_CRASH_TYPE_TURNING                 0.123237
TRAFFIC_CONTROL_DEVICE_NO CONTROLS       0.117636
ROAD_DEFECT_NO DEFECTS                   0.117336
FIRST_CRASH_TYPE_REAR END                0.116750
ROAD_DEFECT_UNKNOWN                      0.116220
TRAFFIC_CONTROL_DEVICE_UNKNOWN           0.113659
DEVICE_CONDITION_NO CONTROLS             0.110279
Name: target, dtype: float64
