Notebook run using Python 3.13.5, PyTorch 2.8.0

# Data Exploration

In [1]:
# Check your system specs
import sys
print("Python version:", sys.version)

## IF APPLICABLE (GPU available)
# import torch
# # Check your torch version and CUDA availability
# print("Torch version:", torch.__version__)
# print("CUDA available:", torch.cuda.is_available())
# print("CUDA runtime:", torch.version.cuda)
# if torch.cuda.is_available():
#     print("GPU:", torch.cuda.get_device_name(0))

Python version: 3.13.7 | packaged by conda-forge | (main, Sep  3 2025, 14:30:35) [GCC 14.3.0]


## Import data

In [3]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "US_Accidents_March23.csv"

# Load the latest version
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "sobhanmoosavi/us-accidents",
  file_path
)
print("pandas df loaded")

pandas df loaded


In [4]:
print("First 5 records:")
df.head()

First 5 records:


Unnamed: 0,ID,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,Source2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,Source2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,0.01,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,Source2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,0.01,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,Source2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,0.01,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,Source2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,0.01,...,False,False,False,False,True,False,Day,Day,Day,Day


## Visualize data

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 46 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   Severity               int64  
 3   Start_Time             object 
 4   End_Time               object 
 5   Start_Lat              float64
 6   Start_Lng              float64
 7   End_Lat                float64
 8   End_Lng                float64
 9   Distance(mi)           float64
 10  Description            object 
 11  Street                 object 
 12  City                   object 
 13  County                 object 
 14  State                  object 
 15  Zipcode                object 
 16  Country                object 
 17  Timezone               object 
 18  Airport_Code           object 
 19  Weather_Timestamp      object 
 20  Temperature(F)         float64
 21  Wind_Chill(F)          float64
 22  Humidity(%)       

In [6]:
df.describe()

Unnamed: 0,Severity,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in)
count,7728394.0,7728394.0,7728394.0,4325632.0,4325632.0,7728394.0,7564541.0,5729375.0,7554250.0,7587715.0,7551296.0,7157161.0,5524808.0
mean,2.212384,36.20119,-94.70255,36.26183,-95.72557,0.5618423,61.66329,58.25105,64.83104,29.53899,9.090376,7.68549,0.00840721
std,0.4875313,5.076079,17.39176,5.272905,18.10793,1.776811,19.01365,22.38983,22.82097,1.00619,2.688316,5.424983,0.1102246
min,1.0,24.5548,-124.6238,24.56601,-124.5457,0.0,-89.0,-89.0,1.0,0.0,0.0,0.0,0.0
25%,2.0,33.39963,-117.2194,33.46207,-117.7543,0.0,49.0,43.0,48.0,29.37,10.0,4.6,0.0
50%,2.0,35.82397,-87.76662,36.18349,-88.02789,0.03,64.0,62.0,67.0,29.86,10.0,7.0,0.0
75%,2.0,40.08496,-80.35368,40.17892,-80.24709,0.464,76.0,75.0,84.0,30.03,10.0,10.4,0.0
max,4.0,49.0022,-67.11317,49.075,-67.10924,441.75,207.0,207.0,100.0,58.63,140.0,1087.0,36.47


### Principle Component Analysis

#### Drop all non numeric columns in order to perform PCA

In [7]:
df_numeric = df.select_dtypes(include=['number'])

print("Numeric columns in df:")
df_numeric.columns

Numeric columns in df:


Index(['Severity', 'Start_Lat', 'Start_Lng', 'End_Lat', 'End_Lng',
       'Distance(mi)', 'Temperature(F)', 'Wind_Chill(F)', 'Humidity(%)',
       'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)',
       'Precipitation(in)'],
      dtype='object')

#### Understand and clean missing numeric values in numeric df

In [8]:
# Lets check the NaN values in our remaining numeric df
na_counts = df_numeric.isna().sum()
print(na_counts)

# As we see from the print statement, some columns have a ton of missing values... maybe this is an intentional data design...
# 'Precipitation' column has a lot of NaN values, lets understand why
print()
print("Number of present 'Precipitation' values:", df_numeric['Precipitation(in)'].notna().sum())
print("Number of missing 'Precipitation' values:", df_numeric['Precipitation(in)'].isna().sum())
print("Number of unique 'Precipitation' values:", df_numeric['Precipitation(in)'].nunique())

# Visualize some of these NaN rows
df_filtered = df_numeric[df_numeric['Precipitation(in)'].isna()]
df_filtered.head(10)


Severity                   0
Start_Lat                  0
Start_Lng                  0
End_Lat              3402762
End_Lng              3402762
Distance(mi)               0
Temperature(F)        163853
Wind_Chill(F)        1999019
Humidity(%)           174144
Pressure(in)          140679
Visibility(mi)        177098
Wind_Speed(mph)       571233
Precipitation(in)    2203586
dtype: int64

Number of present 'Precipitation' values: 5524808
Number of missing 'Precipitation' values: 2203586
Number of unique 'Precipitation' values: 299


Unnamed: 0,Severity,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Temperature(F),Wind_Chill(F),Humidity(%),Pressure(in),Visibility(mi),Wind_Speed(mph),Precipitation(in)
2,2,39.063148,-84.032608,,,0.01,36.0,33.3,100.0,29.67,10.0,3.5,
3,3,39.747753,-84.205582,,,0.01,35.1,31.0,96.0,29.64,9.0,4.6,
4,2,39.627781,-84.188354,,,0.01,36.0,33.3,89.0,29.65,6.0,3.5,
6,2,39.758274,-84.230507,,,0.0,34.0,31.0,100.0,29.66,7.0,3.5,
7,3,39.770382,-84.194901,,,0.01,34.0,31.0,100.0,29.66,7.0,3.5,
8,2,39.778061,-84.172005,,,0.0,33.3,,99.0,29.67,5.0,1.2,
10,3,39.952812,-83.119293,,,0.01,35.6,30.7,93.0,29.64,5.0,5.8,
12,2,39.737633,-84.149933,,,0.0,33.8,,100.0,29.63,3.0,2.3,
13,2,39.79076,-84.241547,,,0.01,36.0,31.1,89.0,29.65,10.0,5.8,
15,2,39.745888,-84.17041,,,0.01,33.8,,100.0,29.63,3.0,2.3,


#### Impute missing numerical data

In [10]:
# For the 'Precipitation' column, we can assume the missing data is supposed to be 0
df_numeric['Precipitation(in)'] = df_numeric['Precipitation(in)'].fillna(0)
df_numeric.isna().sum()

Severity                   0
Start_Lat                  0
Start_Lng                  0
End_Lat              3402762
End_Lng              3402762
Distance(mi)               0
Temperature(F)        163853
Wind_Chill(F)        1999019
Humidity(%)           174144
Pressure(in)          140679
Visibility(mi)        177098
Wind_Speed(mph)       571233
Precipitation(in)          0
dtype: int64

In [42]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

scaler = StandardScaler()
df_numeric_scaled = scaler.fit_transform(df_numeric)

pca = PCA(n_components=2)
principal_components = pca.fit_transform(df_numeric_scaled)


ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

## Goal: Predict crash severity

In [None]:
## TODO