In [48]:
import numpy as np
import pandas as pd

In [49]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [50]:
# Load the accident records from the CSV file in the working directory.
# The file is decoded as latin-1.
df = pd.read_csv(
    "accident_hotspot.csv",
    encoding="latin-1",
)

In [51]:
# Aggregation helper: most frequent value of a Series
def get_mode(series):
    """Return the most frequent value in ``series``.

    ``Series.mode()`` can return several values when there is a tie; the
    first entry of that result is used. If ``mode()`` returns nothing
    (for example for an empty Series), the sentinel ``-1`` is returned.
    """
    candidates = series.mode()
    if candidates.empty:
        return -1
    return candidates.iloc[0]

In [52]:
# Per grid cell: number of distinct accidents and the mean of their coordinates
accident_cols = ["grid_id", "Accident_Index", "longitude", "latitude"]
# Keep one row per (Accident_Index, grid_id) pair so an accident that appears
# on several rows is only counted once per cell
unique_accidents = df[accident_cols].drop_duplicates(
    subset=["Accident_Index", "grid_id"]
)
df_agg = (
    unique_accidents
    .groupby("grid_id")
    .agg(
        accident_count=("Accident_Index", "count"),
        mean_longitude=("longitude", "mean"),
        mean_latitude=("latitude", "mean"),
    )
    .reset_index()
)

In [53]:
# Define a hotspot as any grid_id whose accident count is at or above the
# 90th percentile of per-cell counts. Cells tied with the threshold are all
# included, so the hotspot class can be somewhat larger than exactly 10%.
hotspot_percentile = 0.90
threshold = df_agg["accident_count"].quantile(hotspot_percentile)
df_agg["is_hotspot"] = (df_agg["accident_count"] >= threshold).astype(int)

n_hotspots = df_agg["is_hotspot"].sum()
n_non_hotspots = len(df_agg) - n_hotspots

# The original spec "0f" is a zero-fill flag without a precision, which falls
# back to six decimals; ".2f" gives a compact value and still shows a
# fractional quantile if the interpolated threshold is not a whole number.
print(f"Hotspot Threshold (Accident Count >=): {threshold:.2f}")
print(f"Number of hotspot (Class 1): {n_hotspots}")
print(f"Number of non-hotspot (Class 0): {n_non_hotspots}")

Hotspot Threshold (Accident Count >=): 19.000000
Number of hotspot (Class 1): 633
Number of non-hotspot (Class 0): 4985


In [54]:
# Feature Engineering
# Static road-environment attributes: summarised per grid cell by their
# most frequent value (see get_mode)
mode_features = [
    "Speed_limit",
    "Road_Type",
    "Junction_Detail",
    "Urban_or_Rural_Area",
    "1st_Road_Class",
]
# Temporal flags: averaged per cell. Presumably 0/1 indicators, in which case
# the mean is the share of accidents with the flag set - confirm upstream.
prop_features = ["is_night", "is_rush_hour"]
# Road-condition flag: 1 where Road_Surface_Conditions equals 2 (taken here to
# mean wet/damp), 0 otherwise. This adds a column to df itself.
df["is_wet_damp"] = df["Road_Surface_Conditions"].eq(2).astype(int)
mean_features = prop_features + ["is_wet_damp"]

# Column -> aggregator: mode for the static attributes, mean for the flags
agg_dict = {
    **{col: get_mode for col in mode_features},
    **{col: "mean" for col in mean_features},
}

# NOTE(review): for cells with very few accidents the averaged flags can only
# take extreme values (0.0 / 1.0), so these proportions may partly encode the
# accident count that defines the label - worth checking before trusting the
# feature importances.
df_features_agg = (
    df[["grid_id"] + mode_features + mean_features]
    .groupby("grid_id")
    .agg(agg_dict)
    .reset_index()
    # Give the averaged flags names that say what they measure
    .rename(
        columns={
            "is_night": "prop_accidents_at_night",
            "is_rush_hour": "prop_accidents_at_rush_hour",
            "is_wet_damp": "prop_accidents_on_wet_road",
        }
    )
)

In [55]:
# Attach the per-cell features to the hotspot summary; the left join keeps
# every row of df_agg
final_df = df_agg.merge(df_features_agg, on="grid_id", how="left")

In [56]:
# Show the merged per-grid table (label, counts, mean coordinates and features)
final_df

Unnamed: 0,grid_id,accident_count,mean_longitude,mean_latitude,is_hotspot,Speed_limit,Road_Type,Junction_Detail,Urban_or_Rural_Area,1st_Road_Class,prop_accidents_at_night,prop_accidents_at_rush_hour,prop_accidents_on_wet_road
0,883c0000bdfffff,1,86.077481,28.492818,0,60,6,0,2,6,0.0,0.0,0.0
1,883c004189fffff,1,85.778423,28.122818,0,40,3,3,2,3,0.0,0.0,0.0
2,883c004a31fffff,1,85.701866,28.050795,0,30,6,3,2,4,0.0,0.0,0.0
3,883c007841fffff,1,86.033655,28.131821,0,40,6,0,2,3,0.0,0.0,0.0
4,883c007943fffff,1,86.004546,28.082948,0,30,6,0,1,3,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5613,8864dd8487fffff,1,88.386699,16.826621,0,60,6,0,2,4,0.0,1.0,0.0
5614,8864dda25bfffff,1,88.180526,16.790139,0,30,6,0,2,6,0.0,1.0,1.0
5615,8864dda5cbfffff,1,87.982137,16.866721,0,30,6,3,2,6,0.0,1.0,0.0
5616,8864ddb145fffff,1,88.341512,16.810509,0,60,6,0,2,4,0.0,0.0,1.0


In [57]:
# Prepare for Modeling
# The identifier, the raw count, the mean coordinates and the label itself
# are kept out of the feature matrix
non_feature_cols = [
    "grid_id",
    "accident_count",
    "mean_longitude",
    "mean_latitude",
    "is_hotspot",
]
Y = final_df["is_hotspot"]
X = final_df.drop(columns=non_feature_cols)

In [58]:
# Hold out 30% of the grid cells for testing; stratifying on Y keeps the
# hotspot / non-hotspot ratio the same in both parts, and the fixed seed
# makes the split repeatable
split_parts = train_test_split(X, Y, test_size=0.3, random_state=42, stratify=Y)
X_train, X_test, Y_train, Y_test = split_parts

print(f"Training set size {len(X_train)}")
print(f"Test set size {len(X_test)}")

Training set size 3932
Test set size 1686


In [59]:
# Model Training: Random Forest Classifier
# The mode-aggregated columns hold category codes, so they are one-hot
# encoded; every remaining column of X_train is passed through unchanged
categorical_features = mode_features
numerical_features = X_train.columns.drop(categorical_features).tolist()

# handle_unknown="ignore": a category not seen during fit is encoded as all
# zeros instead of raising at predict time
onehot_step = ("onehot", OneHotEncoder(handle_unknown="ignore"), categorical_features)
passthrough_step = ("num", "passthrough", numerical_features)

# Preprocessing stage of the pipeline; columns not listed above are dropped
preprocessor = ColumnTransformer(
    transformers=[onehot_step, passthrough_step],
    remainder="drop",
)

In [60]:
# Full model: preprocessing followed by a random forest.
# class_weight="balanced" re-weights the classes inversely to their frequency,
# which matters here because hotspots are the minority class.
classifier = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    class_weight="balanced",
)
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier),
    ]
)

In [61]:
# Fit the preprocessing step and the classifier on the training split only
print("Model training start")
model_pipeline.fit(X_train, Y_train)
print("Model training completed")

Model training start
Model training completed


In [62]:
# Model Evaluation on the held-out test split
Y_pred = model_pipeline.predict(X_test)
test_accuracy = accuracy_score(Y_test, Y_pred)
report_text = classification_report(Y_test, Y_pred)

print(f"Accuracy : {test_accuracy:.4f}")
print("Classification Report")
print(report_text)

Accuracy : 0.7028
Classification Report
              precision    recall  f1-score   support

           0       0.98      0.68      0.80      1496
           1       0.26      0.92      0.41       190

    accuracy                           0.70      1686
   macro avg       0.62      0.80      0.61      1686
weighted avg       0.90      0.70      0.76      1686



In [63]:
# Get feature importance from the trained classifier
# Feature names must follow the ColumnTransformer's output order:
# one-hot columns first, then the passthrough columns
fitted_encoder = model_pipeline.named_steps["preprocessor"].named_transformers_["onehot"]
feature_names = [
    *fitted_encoder.get_feature_names_out(categorical_features),
    *numerical_features,
]
importances = model_pipeline.named_steps["classifier"].feature_importances_

# Keep only the ten most important features, highest first
feature_importances_df = (
    pd.DataFrame({"Feature": feature_names, "Importance": importances})
    .sort_values(by="Importance", ascending=False)
    .head(10)
)
print("-----Top 10 Feature Importance-----")
print(feature_importances_df.to_markdown(index=False))

-----Top 10 Feature Importance-----
| Feature                     |   Importance |
|:----------------------------|-------------:|
| prop_accidents_on_wet_road  |    0.403184  |
| prop_accidents_at_rush_hour |    0.306538  |
| Speed_limit_30              |    0.054018  |
| Road_Type_6                 |    0.0431086 |
| Speed_limit_60              |    0.0427752 |
| Junction_Detail_6           |    0.0285859 |
| Road_Type_3                 |    0.0203877 |
| Urban_or_Rural_Area_1       |    0.0123381 |
| 1st_Road_Class_3            |    0.0113092 |
| Urban_or_Rural_Area_2       |    0.0110102 |


In [70]:
# Plotting the hotspots on an interactive map
import folium
from folium.plugins import MarkerCluster

# Centre the base map on the average of the per-cell mean coordinates
center_lat = final_df["mean_latitude"].mean()
center_lon = final_df["mean_longitude"].mean()
map_center = [center_lat, center_lon]
m = folium.Map(location=map_center, zoom_start=12, tiles="OpenStreetMap")

# Only hotspot cells are drawn, to keep the map readable
hotspots = final_df.loc[final_df["is_hotspot"] == 1]

# Appearance shared by every marker
marker_style = dict(
    radius=5,
    color="red",
    fill=True,
    fill_color="red",
    fill_opacity=0.7,
)

# One circle per hotspot cell, with the grid id and accident count in the popup
for idx, row in hotspots.iterrows():
    marker = folium.CircleMarker(
        location=[row["mean_latitude"], row["mean_longitude"]],
        popup=f"Grid Id: {row['grid_id']}<br>Accidents: {row['accident_count']}",
        **marker_style,
    )
    marker.add_to(m)

# Write a standalone HTML copy, then display the map as the cell output
m.save("accident_hotspots_map.html")
m

In [71]:
# Create a new Git repository in the notebook's working directory
!git init

Initialized empty Git repository in C:/Desktop/Inovation/Accident Hotspot/.git/


In [79]:
!git add ..

fatal: ..: '..' is outside repository at 'C:/Desktop/Inovation/Accident Hotspot'


In [73]:
# Record the staged files as the first commit; files must have been staged
# with "git add" beforehand, otherwise there is nothing to commit
!git commit -m "This is my first commit"

On branch master

Initial commit

Untracked files:
  (use "git add <file>..." to include in what will be committed)
	accident_hotspot.csv
	accident_hotspots_map.html
	main.ipynb

nothing added to commit but untracked files present (use "git add" to track)


In [74]:
# Rename the current branch to "main" (-M forces the rename)
!git branch -M main 

In [77]:
# Register the GitHub repository as the remote named "origin"; this fails if
# a remote called "origin" has already been added
!git remote add origin "https://github.com/letscodesatish/Accident_hotspots.git"

error: remote origin already exists.


In [78]:
# Push the local "main" branch to "origin" and set it as the upstream (-u);
# "main" must contain at least one commit for the push to succeed
!git push -u origin main

error: src refspec main does not match any
error: failed to push some refs to 'https://github.com/letscodesatish/Accident_hotspots.git'
