Impute data for collision_data.csv and person_data.csv

In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [3]:
df = pd.read_csv('data/collision_data.csv')

In [4]:
# transform severity into binary classification: 1 (fatal), 0 (non-fatal)
df["C_SEV"] = df["C_SEV"].map({1: 1, 2: 0})

# cyclical encoding for month/day/hour
df["C_MNTH_sin"] = np.sin(2*np.pi * df["C_MNTH"] / 12)
df["C_MNTH_cos"] = np.cos(2*np.pi * df["C_MNTH"] / 12)

df["C_WDAY_sin"] = np.sin(2*np.pi * df["C_WDAY"] / 7)
df["C_WDAY_cos"] = np.cos(2*np.pi * df["C_WDAY"] / 7)

df["C_HOUR_sin"] = np.sin(2*np.pi * df["C_HOUR"] / 24)
df["C_HOUR_cos"] = np.cos(2*np.pi * df["C_HOUR"] / 24)

# drop original month/day/hour columns
df = df.drop(columns=["C_MNTH", "C_WDAY", "C_HOUR"])

df = df.drop(columns=["C_CASE"])

In [5]:
numeric_features = [
    "C_YEAR", 
    "C_VEHS",
    "C_MNTH_sin", "C_MNTH_cos",
    "C_WDAY_sin", "C_WDAY_cos",
    "C_HOUR_sin", "C_HOUR_cos"
]

categorical_features = [
    "C_CONF",
    "C_RCFG",
    "C_WTHR",
    "C_RSUR",
    "C_RALN",
    "C_TRAF"
]

In [6]:
df.isna().sum()

C_YEAR             0
C_SEV              0
C_VEHS            52
C_CONF        169942
C_RCFG        204764
C_WTHR         34148
C_RSUR         83648
C_RALN        126823
C_TRAF        122316
C_MNTH_sin        81
C_MNTH_cos        81
C_WDAY_sin       170
C_WDAY_cos       170
C_HOUR_sin     20096
C_HOUR_cos     20096
dtype: int64

In [7]:
df = df.dropna(subset=numeric_features) # drop numerical columns with na
df = df.replace(np.nan, 0)  # fill categorical na with 0

In [8]:
df.value_counts

<bound method DataFrame.value_counts of          C_YEAR  C_SEV  C_VEHS  C_CONF  C_RCFG  C_WTHR  C_RSUR  C_RALN  \
0          2005      0     1.0     4.0     2.0     4.0     5.0     3.0   
1          2005      0     2.0     2.0     3.0     1.0     3.0     1.0   
2          2005      0     2.0    35.0     2.0     1.0     1.0     1.0   
3          2005      0     1.0     4.0     0.0     4.0     4.0     3.0   
4          2005      0     1.0     2.0     0.0     1.0     3.0     3.0   
...         ...    ...     ...     ...     ...     ...     ...     ...   
1950731    2020      0     2.0    21.0     1.0     1.0     2.0     1.0   
1950732    2020      0     2.0    35.0     2.0     2.0     2.0     1.0   
1950733    2020      0     1.0     3.0     2.0     2.0     3.0     2.0   
1950734    2020      0     1.0     2.0     3.0     1.0     1.0     2.0   
1950735    2020      0     1.0     4.0     1.0     7.0     3.0     1.0   

         C_TRAF    C_MNTH_sin  C_MNTH_cos    C_WDAY_sin  C_WDAY_cos  \


In [9]:
df.to_csv('data/collision_data_no_na.csv', index=False)

In [13]:
df = pd.read_csv('data/person_data.csv')

In [14]:
# drop ID columns and rows with missing target
df = df.drop(columns=["C_CASE", "P_ID", "V_ID"], errors="ignore")
df = df.dropna(subset=["P_ISEV"])

# map injury severity to 0, 1, 2
df["P_ISEV"] = df["P_ISEV"].map({1: 0, 2: 1, 3: 2})

# map sex to numerical values
df["P_SEX"] = df["P_SEX"].map({"F": 0, "M": 1, "N": -1})

for col in ['V_TYPE', 'V_YEAR', 'P_SEX', 'P_PSN']:
    df[col] = df[col].astype('category').cat.codes


numeric_features = [
    "V_YEAR", 
    "P_AGE"
]

categorical_features = [
    "V_TYPE",
    "P_SEX",
    "P_PSN",
    "P_ISEV",
    "P_SAFE",
    "P_USER"
]

In [20]:
df.isna().sum()

V_TYPE    0
V_YEAR    0
P_SEX     0
P_AGE     0
P_PSN     0
P_ISEV    0
P_SAFE    0
P_USER    0
dtype: int64

In [19]:
df = df.dropna()

In [21]:
df.to_csv('data/person_data_no_na.csv', index=False)