Notebook run using:
    
Python version: 3.13.5

Torch version: 2.8.0+cu128

CUDA runtime: 12.8

GPU: NVIDIA L4

# Model Building

In this notebook, we will begin building different classification models to predict crash severity.

### Load Dataframe

In [1]:
import os
from pathlib import Path

import kagglehub
import pandas as pd

# Download the dataset if it is not stored locally:
# path = kagglehub.dataset_download("sobhanmoosavi/us-accidents")

# Locate the already-downloaded dataset inside the kagglehub cache instead of
# hardcoding one user's home directory. kagglehub caches under
# ~/.cache/kagglehub unless the KAGGLEHUB_CACHE environment variable overrides it.
kagglehub_cache = Path(
    os.environ.get("KAGGLEHUB_CACHE", Path.home() / ".cache" / "kagglehub")
)
# Kept as a plain string so any later cell that formats `path` keeps working.
path = str(
    kagglehub_cache / "datasets" / "sobhanmoosavi" / "us-accidents" / "versions" / "13"
)

# Load the CSV into pandas
df_orig = pd.read_csv(f"{path}/US_Accidents_March23.csv")
print("pandas df loaded")

pandas df loaded


In [2]:
# Work on an independent copy so `df` can be reset from `df_orig` at any time
# without re-reading the CSV.
raw_shape = df_orig.shape
df = df_orig.copy(deep=True)

print("First 5 records:")
print("shape:", raw_shape)
df.head(n=5)

First 5 records:
shape: (7728394, 46)


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day


## Data Preprocessing

See the "Model Building" PDF report for the rationale behind all removed, transformed, and imputed data.

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the working frame; used to spot implausible values before cleaning.
df.describe()

Unnamed: 0,Severity,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in)
count,7728394.0,7728394.0,7728394.0,4325632.0,4325632.0,7728394.0,7564541.0,5729375.0,7554250.0,7587715.0,7551296.0,7157161.0,5524808.0
mean,2.212384,36.20119,-94.70255,36.26183,-95.72557,0.5618423,61.66329,58.25105,64.83104,29.53899,9.090376,7.68549,0.00840721
std,0.4875313,5.076079,17.39176,5.272905,18.10793,1.776811,19.01365,22.38983,22.82097,1.00619,2.688316,5.424983,0.1102246
min,1.0,24.5548,-124.6238,24.56601,-124.5457,0.0,-89.0,-89.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,33.39963,-117.2194,33.46207,-117.7543,0.0,49.0,43.0,48.0,29.37,10.0,4.6,0.0
50%,2.0,35.82397,-87.76662,36.18349,-88.02789,0.03,64.0,62.0,67.0,29.86,10.0,7.0,0.0
75%,2.0,40.08496,-80.35368,40.17892,-80.24709,0.464,76.0,75.0,84.0,30.03,10.0,10.4,0.0
max,4.0,49.0022,-67.11317,49.075,-67.10924,441.75,207.0,207.0,100.0,58.63,140.0,1087.0,36.47


In [11]:
# Column names and dtypes of the working frame.
# NOTE(review): the saved output lists 27 columns, i.e. it looks like it was
# captured after the "Remove columns" cell had run; on a fresh top-to-bottom run
# this cell would report all 46 original columns - confirm the intended cell order.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 27 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Severity               int64  
 1   Distance(mi)           float64
 2   Temperature(F)         float64
 3   Wind_Chill(F)          float64
 4   Humidity(%)            float64
 5   Pressure(in)           float64
 6   Visibility(mi)         float64
 7   Wind_Speed(mph)        float64
 8   Precipitation(in)      float64
 9   Weather_Condition      object 
 10  Amenity                bool   
 11  Bump                   bool   
 12  Crossing               bool   
 13  Give_Way               bool   
 14  Junction               bool   
 15  No_Exit                bool   
 16  Railway                bool   
 17  Roundabout             bool   
 18  Station                bool   
 19  Stop                   bool   
 20  Traffic_Calming        bool   
 21  Traffic_Signal         bool   
 22  Turning_Loop      

### Remove columns

In [10]:
# Identifier, timestamp, location, and free-text columns that are not used as
# model features (rationale is in the "Model Building" report).
columns_to_remove = [
    "ID",
    "Source",
    "Start_Time",
    "End_Time",
    "Start_Lat",
    "Start_Lng",
    "End_Lat",
    "End_Lng",
    "Description",
    "Street",
    "City",
    "County",
    "State",
    "Zipcode",
    "Country",
    "Timezone",
    "Airport_Code",
    "Weather_Timestamp",
    "Wind_Direction",
]
df = df.drop(columns=columns_to_remove)

### Transform columns

#### Weather_Condition

In [30]:
from collections import Counter

# Distinct, non-null weather descriptions (each description is counted once,
# regardless of how many accident records carry it).
weather_vals = df["Weather_Condition"].dropna().unique()

# Break every description into whitespace-separated words and pool them.
all_words = []
for description in weather_vals:
    all_words.extend(description.split())

# Tally how many distinct descriptions each word appears in.
word_counts = Counter(all_words)

# Show the tally, most frequent words first.
print(word_counts)

Counter({'/': 50, 'Windy': 45, 'Light': 31, 'Snow': 31, 'Rain': 26, 'Heavy': 23, 'and': 18, 'Thunder': 14, 'Sleet': 13, 'Fog': 11, 'Freezing': 10, 'Drizzle': 9, 'Blowing': 8, 'Dust': 8, 'Shower': 8, 'Thunderstorms': 7, 'Cloudy': 6, 'Showers': 6, 'Sand': 6, 'Hail': 6, 'with': 4, 'T-Storm': 4, 'Wintry': 4, 'Mix': 4, 'Haze': 3, 'Smoke': 3, 'Drifting': 3, 'Ice': 3, 'Pellets': 3, 'Mostly': 2, 'Partly': 2, 'Shallow': 2, 'Mist': 2, 'Patches': 2, 'of': 2, 'Fair': 2, 'Widespread': 2, 'in': 2, 'the': 2, 'Vicinity': 2, 'Thunderstorm': 2, 'Squalls': 2, 'Grains': 2, 'Small': 2, 'Whirlwinds': 2, 'Whirls': 2, 'Nearby': 2, 'Partial': 2, 'Overcast': 1, 'Scattered': 1, 'Clouds': 1, 'Clear': 1, 'Volcanic': 1, 'Ash': 1, 'Funnel': 1, 'Cloud': 1, 'Low': 1, 'N/A': 1, 'Precipitation': 1, 'Tornado': 1, 'Duststorm': 1})


In [36]:
# Rows without a weather description cannot be encoded, so remove them first.
df.dropna(subset=["Weather_Condition"], inplace=True)

# Words deliberately NOT encoded because another column already carries them:
#   'Windy'    -> wind speed column
#   'Rain'     -> precipitation column
#   'Freezing' -> temperature column

# Each new indicator column is 1 when the description contains any of its
# keywords and 0 otherwise.
weather_flag_keywords = {
    # Intensity qualifiers: they can indicate how severe the conditions were.
    "Light_Weather": ["Light"],
    "Heavy_Weather": ["Heavy"],
    # All thunder-related spellings collapse into one "thundering" flag.
    "Thundering_Weather": ["Thunder", "T-Storm", "Thunderstorms", "Thunderstorm"],
    # Fog is frequent enough to keep, even if it may correlate with visibility.
    "Fog_Weather": ["Fog"],
    # Cloud-cover variants collapse into one "cloudy" flag.
    "Cloudy_Weather": ["Clouds", "Cloudy", "Cloud", "Overcast"],
    # Clear skies are treated as their own distinct condition.
    "Clear_Weather": ["Clear"],
}

for flag_col, keywords in weather_flag_keywords.items():
    keyword_pattern = "|".join(keywords)
    df[flag_col] = df["Weather_Condition"].str.contains(keyword_pattern).astype(int)

# The free-text column has been fully replaced by the indicator columns.
df = df.drop(columns="Weather_Condition")

In [37]:
# Preview the first rows to check the newly added *_Weather indicator columns.
df.head()

Unnamed: 0,Severity,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in),Amenity,...,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,Light_Weather,Heavy_Weather,Thundering_Weather,Fog_Weather,Cloudy_Weather,Clear_Weather
0,3,0.01,36.9,,91.0,29.68,10.0,,0.02,False,...,Night,Night,Night,Night,1,0,0,0,0,0
1,2,0.01,37.9,,100.0,29.65,10.0,,0.0,False,...,Night,Night,Night,Day,1,0,0,0,0,0
2,2,0.01,36.0,33.3,100.0,29.67,10.0,3.5,,False,...,Night,Night,Day,Day,0,0,0,0,1,0
3,3,0.01,35.1,31.0,96.0,29.64,9.0,4.6,,False,...,Night,Day,Day,Day,0,0,0,0,1,0
4,2,0.01,36.0,33.3,89.0,29.65,6.0,3.5,,False,...,Day,Day,Day,Day,0,0,0,0,1,0


#### Binary encoding of the Sunrise_Sunset and Twilight columns

In [39]:
from sklearn.preprocessing import LabelEncoder

# The four day/night indicator columns to encode as integers.
twilight_cols = [
    "Sunrise_Sunset",
    "Civil_Twilight",
    "Nautical_Twilight",
    "Astronomical_Twilight",
]

# Remove rows with a missing day/night flag BEFORE encoding. LabelEncoder does
# not keep NaN as "missing": depending on the scikit-learn version it either
# raises on the mixed str/float column or gives NaN its own integer class,
# which would silently add a third category to columns meant to be binary.
# (inplace=True matches how the other dropna cells in this notebook are written.)
df.dropna(subset=twilight_cols, inplace=True)

le = LabelEncoder()
for col in twilight_cols:
    # Classes are sorted, so a column holding only "Day"/"Night" maps to 0/1.
    # The encoder is refit per column; after the loop `le` reflects the last one.
    df[col] = le.fit_transform(df[col])

### Drop rows with NA values

In [48]:
# Core weather measurements that are required rather than imputed: any row
# missing at least one of them is discarded.
required_weather_cols = [
    "Temperature(F)",
    "Humidity(%)",
    "Pressure(in)",
    "Visibility(mi)",
]
df.dropna(axis="index", how="any", subset=required_weather_cols, inplace=True)

### Impute columns

In [54]:
# Fill missing wind speed and precipitation with the mean of the SAME column.
# (Wind speed was previously filled with the mean of "Visibility(mi)", which
# injected values from an unrelated measurement.)
# NOTE(review): the means are computed over the full dataset, before the
# train/test split further down - confirm this is acceptable for evaluation.
for mean_imputed_col in ["Wind_Speed(mph)", "Precipitation(in)"]:
    df[mean_imputed_col] = df[mean_imputed_col].fillna(df[mean_imputed_col].mean())

# Fill NA wind chill column with temperature value for that sample
df["Wind_Chill(F)"] = df["Wind_Chill(F)"].fillna(df["Temperature(F)"])



In [58]:
# Confirm no more missing values
na_counts = df.isna().sum()
print(na_counts)

Severity                 0
Distance(mi)             0
Temperature(F)           0
Wind_Chill(F)            0
Humidity(%)              0
Pressure(in)             0
Visibility(mi)           0
Wind_Speed(mph)          0
Precipitation(in)        0
Amenity                  0
Bump                     0
Crossing                 0
Give_Way                 0
Junction                 0
No_Exit                  0
Railway                  0
Roundabout               0
Station                  0
Stop                     0
Traffic_Calming          0
Traffic_Signal           0
Turning_Loop             0
Sunrise_Sunset           0
Civil_Twilight           0
Nautical_Twilight        0
Astronomical_Twilight    0
Light_Weather            0
Heavy_Weather            0
Thundering_Weather       0
Fog_Weather              0
Cloudy_Weather           0
Clear_Weather            0
dtype: int64


## Model training

### Prepare the data

In [59]:
from sklearn.model_selection import train_test_split

# Separate the features from the prediction target (crash severity).
target_col = "Severity"
X = df.drop(columns=[target_col])
y = df[target_col]

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

### Random forest classifier

NOTE: Run on NVIDIA L4 GPU

In [None]:
import cudf
from cuml.ensemble import RandomForestClassifier as cuRF
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Move the feature matrix and the target onto the GPU as cuDF objects.
X_gpu = cudf.DataFrame.from_pandas(X)
y_gpu = cudf.Series(y)

# 80/20 split of the GPU data. This rebinds X_train / X_test / y_train / y_test,
# replacing the pandas split produced by the previous cell.
# NOTE(review): this uses scikit-learn's splitter on cuDF objects - confirm it
# behaves as intended, or whether cuML's own splitter should be used instead.
gpu_split = train_test_split(X_gpu, y_gpu, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = gpu_split

# Build and fit the GPU random forest.
# NOTE(review): X mixes bool/int/float columns and Severity labels start at 1;
# verify the installed cuML version accepts these dtypes and labels as-is.
rf_params = {"n_estimators": 100, "max_depth": 10, "random_state": 42}
rf_gpu = cuRF(**rf_params)
rf_gpu.fit(X_train, y_train)

# Predict on the held-out portion.
y_pred = rf_gpu.predict(X_test)

# scikit-learn metrics work on host data, so copy predictions and labels back.
y_pred_cpu = y_pred.to_pandas()
y_test_cpu = y_test.to_pandas()
test_accuracy = accuracy_score(y_test_cpu, y_pred_cpu)
print("Accuracy:", test_accuracy)