Import Packages

In [44]:
import os
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

Data Preparation

In [45]:
%run /Users/jbslaunwhite01/projects/hfx_traffic_analysis_app/data/transform/transform_data.py

file_path = '/Users/jbslaunwhite01/projects/hfx_traffic_analysis_app/data/raw/Traffic_Collisions.csv'
data = transform_data(file_path)
print(data.columns)
# Assuming 'data' is your DataFrame loaded from 'Traffic_Collisions.csv'
# Let's focus on geographical coordinates as primary features for hotspot prediction
features = data[['WGS84_LAT_COORD', 'WGS84_LON_COORD']]

# For the target, we create a 'Hotspot' label based on collision frequency in the area
# This is a simplified approach, and more sophisticated methods could be applied
# Calculate collision counts by coordinates rounding to approximate for general areas
data['Hotspot'] = data.groupby(['WGS84_LAT_COORD', 'WGS84_LON_COORD'])['OBJECTID'].transform('count')
# Label areas with collisions above a certain percentile as hotspots
hotspot_threshold = data['Hotspot'].quantile(0.75) # Adjust the percentile based on the desired threshold
data['Is_Hotspot'] = (data['Hotspot'] >= hotspot_threshold).astype(int)

# Prepare the features and target variable
X = features
y = data['Is_Hotspot']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional: Scale the features (useful if extending the feature set beyond coordinates)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


<bound method DataFrame.value_counts of                X          Y  OBJECTID  WGS84_LAT_COORD  WGS84_LON_COORD  \
0     -63.682392  44.601085         1        44.601076       -63.682394   
1     -63.777518  44.768255         2        44.768246       -63.777520   
2     -63.765056  44.667540         3        44.667531       -63.765058   
3     -63.843731  44.709729         4        44.709720       -63.843733   
4     -63.598575  44.687277         5        44.687268       -63.598577   
...          ...        ...       ...              ...              ...   
31953 -63.856055  44.703075     31954        44.703066       -63.856057   
31954 -63.623132  44.656582     31955        44.656573       -63.623134   
31955 -63.468309  44.745645     31956        44.745635       -63.468311   
31956 -63.528903  44.670606     31957        44.670597       -63.528905   
31957 -63.571562  44.636434     31958        44.636425       -63.571564   

                                ROAD_LOCATION_1  \
0       

Model Training

In [46]:
# Build a 100-tree random forest with a fixed seed and fit it on the scaled
# training features. fit() returns the estimator itself, so the fitted model is
# bound to `clf` and then shown as the cell's result.
clf = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
).fit(X_train_scaled, y_train)
clf


Model Evaluation

In [47]:
# Score the held-out rows with the fitted classifier.
y_pred = clf.predict(X_test_scaled)

# Print the confusion matrix first, then the per-class precision/recall report.
for report in (
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
):
    print(report)


[[4589   46]
 [   0 1757]]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00      4635
           1       0.97      1.00      0.99      1757

    accuracy                           0.99      6392
   macro avg       0.99      1.00      0.99      6392
weighted avg       0.99      0.99      0.99      6392



**Interpretation & Use**
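A model trained to predict collision hotspots can guide traffic safety improvements: high-risk areas identified by the model could be prioritized for interventions such as enhanced signage, road redesign, or increased enforcement of traffic laws. Note, however, that the hotspot label here is derived from collision counts at the same coordinates that are used as features, so the scores above mainly show that the model reproduces already-known hotspots; its ability to flag new locations should be validated on held-out locations before it is used for planning.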