In [42]:
import numpy as np
import pandas as pd
import datetime
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split

In [43]:
# Remove every pandas display limit (rows, columns, total width, per-cell width)
# so that frames are rendered without truncation.
for display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(display_option, None)

In [44]:
folder = "../00_Raw_Data/"

# One CSV per quarter, from 2015_Q1 through 2022_Q4 (range upper bounds are exclusive).
years = range(2015, 2023)
quarters = range(1, 5)

files = []
for year in years:
    for quarter in quarters:
        files.append("{}_Q{}_Traffic_Crashes.csv".format(year, quarter))

# Read every quarterly file, then stack them into one frame with a fresh 0..n-1 index.
dfs = []
for file in files:
    dfs.append(pd.read_csv(folder + file))
df = pd.concat(dfs, ignore_index=True)

### Time-related feature engineering

In [45]:
# Parse the raw timestamp strings (month/day/year, 12-hour clock with AM/PM) once and
# derive every calendar feature from the parsed series.
collision_ts = pd.to_datetime(df['collision_date'], format='%m/%d/%Y %I:%M:%S %p')
df['collision_date'] = collision_ts
df['date'] = collision_ts.dt.date

# Day of week is numbered 0 (Monday) through 6 (Sunday); 5 and 6 count as weekend.
df['day_of_week'] = collision_ts.dt.dayofweek
df['weekend'] = df['day_of_week'].isin([5, 6]).astype('int64')

day_map = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))
df['day_name'] = df['day_of_week'].map(day_map)
day_onehot = pd.get_dummies(df['day_name'], prefix='day')
df = pd.concat([df, day_onehot], axis=1)

# Clock time, hour of day and month number.
df['time'] = collision_ts.dt.time
df['hour'] = collision_ts.dt.hour
df['month'] = collision_ts.dt.month
month_map = dict(enumerate(
    ['January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'],
    start=1,
))
df['month_name'] = df['month'].map(month_map)
month_onehot = pd.get_dummies(df['month_name'], prefix='month')
df = pd.concat([df, month_onehot], axis=1)

df['year'] = collision_ts.dt.year

def assign_season(month):
    """Return the season label for a month number.

    12, 1, 2 -> 'winter'; 3-5 -> 'spring'; 6-8 -> 'summer'.
    Every other value (including 9-11) falls through to 'fall'.
    """
    season_months = (
        ('winter', (12, 1, 2)),
        ('spring', (3, 4, 5)),
        ('summer', (6, 7, 8)),
    )
    for season, months in season_months:
        if month in months:
            return season
    return 'fall'

# One-hot encode the season derived from the month number. The season labels are only
# an intermediate value, so they are fed straight into get_dummies instead of being
# stored on the frame and dropped again.
season_onehot = pd.get_dummies(df['month'].apply(assign_season), prefix='season')
df = pd.concat([df, season_onehot], axis=1)


def time_window(hour):
    """Map an hour of day to a coarse time-of-day label.

    23:00-04:59 -> 'late night', 05:00-10:59 -> 'morning',
    11:00-16:59 -> 'afternoon', 17:00-22:59 -> 'night'.
    A value that satisfies none of the comparisons (e.g. NaN) yields 'unknown'.
    """
    # The late-night window wraps around midnight, so it is checked on its own.
    if hour >= 23 or hour < 5:
        return 'late night'

    # Remaining windows are half-open [start, stop) ranges.
    daytime_windows = (
        ('morning', 5, 11),
        ('afternoon', 11, 17),
        ('night', 17, 23),
    )
    for label, start, stop in daytime_windows:
        if start <= hour < stop:
            return label
    return 'unknown'
    
# 1 for crashes dated on or after 2020-03-15, otherwise 0.
covid_start = pd.Timestamp('2020-03-15').date()
df['covid'] = df['date'].ge(covid_start).astype(int)

# Coarse time-of-day label derived from the hour.
df['time_window'] = df['hour'].map(time_window)

### Accident-related feature engineering

In [46]:
# Right-closed buckets: [0, 1], (1, 2], (2, 3], (3, 5], (5, inf).
# include_lowest=True means a count of exactly 0 also lands in the first bucket.
bins = [0, 1, 2, 3, 5, float('inf')]
labels = ['1 vehicle', '2 vehicles', '3 vehicles', '4-5 vehicles', '6+ vehicles']

# The binned series is only an intermediate value, so it is one-hot encoded directly
# rather than being added to the frame and dropped afterwards.
vehicles_onehot = pd.get_dummies(
    pd.cut(df['vehicles_involved'], bins=bins, labels=labels, right=True, include_lowest=True),
    prefix='vehicles',
)
df = pd.concat([df, vehicles_onehot], axis=1)

In [47]:
# Right-closed buckets: [-1, 0], (0, 1], (1, 2], (2, 4], (4, 6], (6, inf).
bins = [-1, 0, 1, 2, 4, 6, float('inf')]
labels = ['0 injured', '1 injured', '2 injured', '3-4 injured', '5-6 injured', '7+ injured']

# One-hot encode the binned injury count directly; no helper column is kept on the frame.
injured_onehot = pd.get_dummies(
    pd.cut(df['number_injured'], bins=bins, labels=labels, right=True, include_lowest=True),
    prefix='injured',
)
df = pd.concat([df, injured_onehot], axis=1)

In [48]:
# Right-closed buckets: [-1, 0], (0, 1], (1, inf).
bins = [-1, 0, 1, float('inf')]
labels = ['0 dead', '1 dead', '2+ dead']

# One-hot encode the binned fatality count directly; no helper column is kept on the frame.
dead_onehot = pd.get_dummies(
    pd.cut(df['number_dead'], bins=bins, labels=labels, right=True, include_lowest=True),
    prefix='dead',
)
df = pd.concat([df, dead_onehot], axis=1)

In [49]:
# Fold the two placeholder values into a single 'Unknown/Missing' category,
# then one-hot encode the manner of crash.
df['manner_of_crash'] = df['manner_of_crash'].replace(
    {'Missing': 'Unknown/Missing', 'Unknown': 'Unknown/Missing'}
)

crash_onehot = pd.get_dummies(df['manner_of_crash'], prefix='crash')
df = pd.concat([df, crash_onehot], axis=1)

In [50]:
columns_to_convert = ['pedestrian', 'bicycle', 'scooter', 'hitrun', 'parking']
yes_no_codes = {'yes': 1, 'no': 0}

# Recode each yes/no flag column to 1/0, keeping its position in the frame.
# Any value other than exactly 'yes' or 'no' becomes NaN.
df = df.assign(**{column: df[column].map(yes_no_codes) for column in columns_to_convert})

### Location-related feature engineering

In [51]:
# keep top 100 roads (by number of crash records); every other road, including
# missing names, is collapsed into a single 'Other' category
road_counts = df['roadway_name'].value_counts()
top_100_roads = road_counts.head(100).index

is_top_road = df['roadway_name'].isin(top_100_roads)
roadway_grouped = df['roadway_name'].where(is_top_road, 'Other')

# One-hot encode the grouped road name directly; no helper column is kept on the frame.
roadway_onehot = pd.get_dummies(roadway_grouped, prefix='road')
df = pd.concat([df, roadway_onehot], axis=1)

In [52]:
# Encode the intersection indicator as 0/1, treating both NaN and the "Missing"
# placeholder as "N".
#
# The cleaning is done on a returned Series instead of calling fillna/replace with
# inplace=True on `df[col]`: that chained in-place form emits a FutureWarning on
# pandas 2.2 and leaves `df` untouched once Copy-on-Write is enabled.
mapping = {"N": 0, "Y": 1}
intersection_indicator = (
    df['intersection_indicator']
    .fillna("N")
    .replace("Missing", "N")
)

# Values other than "N"/"Y" have no entry in `mapping` and therefore become NaN.
df['intersection_encoded'] = intersection_indicator.map(mapping)
df = df.drop(columns='intersection_indicator')

In [53]:
# Collapse the catch-all values into 'Unknown/Other' and one-hot encode the result.
#
# replace() is applied to a returned Series rather than in place on `df[col]`:
# the chained inplace=True form emits a FutureWarning on pandas 2.2 and has no
# effect on `df` once Copy-on-Write is enabled.
intersect_type_clean = df['intersect_type'].replace(
    ['OTHER', 'Missing', 'Unknown'], 'Unknown/Other'
)
intersect_type_onehot = pd.get_dummies(intersect_type_clean, prefix='intersect_type')
df = pd.concat([df, intersect_type_onehot], axis=1)
df = df.drop(columns='intersect_type')

In [54]:
# Collapse the catch-all values into 'Unknown/Other' and one-hot encode the result.
#
# replace() is applied to a returned Series rather than in place on `df[col]`:
# the chained inplace=True form emits a FutureWarning on pandas 2.2 and has no
# effect on `df` once Copy-on-Write is enabled.
relation_clean = df['relation_to_junction'].replace(
    ['Missing', 'OtherLocation', 'Unknown'], 'Unknown/Other'
)
relation_onehot = pd.get_dummies(relation_clean, prefix='relation_to_junction')
df = pd.concat([df, relation_onehot], axis=1)
df = df.drop(columns='relation_to_junction')

In [55]:
# "Campus" and "OHO" are treated as one category and encoded as 1;
# every other value (e.g. "False" or a missing cell) is encoded as 0.
df['mou'] = df['mou'].isin(['Campus', 'OHO']).astype('int64')

### Environment-related feature engineering

In [56]:
# 1 when a concrete work-zone type is recorded, 0 otherwise.
# Missing cells (NaN) are handled together with the "Missing"/"Unknown" placeholders:
# NaN compares unequal to both strings, so an equality-only check flags it as 1.
# NOTE(review): assumes an empty work_zone_type means "no work zone" - confirm
# against the data dictionary.
work_zone_type = df['work_zone_type']
no_work_zone = work_zone_type.isna() | work_zone_type.isin(['Missing', 'Unknown'])
df['work_zone_present'] = (~no_work_zone).astype(int)

In [57]:
def simplify_weather(weather):
    """Collapse a raw weather description into one of eight coarse categories.

    Rules are applied in order and the first match wins:
    missing value -> 'Unknown/Other'; text containing 'Snow', 'Sleet/Hail' or 'Rain'
    -> that keyword; exactly 'Fog'/'Smoke'/'Smog' -> 'Fog'; text containing 'Cloudy'
    -> 'Cloudy'; exactly 'Clear' -> 'Clear'; exactly 'Missing'/'Unknown'/'Other'
    -> 'Unknown/Other'; anything else -> 'Other Conditions'.
    """
    if pd.isna(weather):
        return 'Unknown/Other'

    # Precipitation keywords are substring matches, checked in priority order.
    for keyword in ('Snow', 'Sleet/Hail', 'Rain'):
        if keyword in weather:
            return keyword

    if weather in ('Fog', 'Smoke', 'Smog'):
        return 'Fog'
    if 'Cloudy' in weather:
        return 'Cloudy'
    if weather == 'Clear':
        return 'Clear'
    if weather in ('Missing', 'Unknown', 'Other'):
        return 'Unknown/Other'
    return 'Other Conditions'


# Simplify the raw weather text, one-hot encode it with a 'weather' prefix, and
# replace the raw column with the indicator columns (appended at the end of the frame).
weather_onehot = pd.get_dummies(
    df['weather_condition(s)'].apply(simplify_weather),
    prefix='weather',
)
df = pd.concat([df.drop(columns='weather_condition(s)'), weather_onehot], axis=1)

### Drop columns that won't be used

In [58]:
# Columns removed before modelling: record/agency fields, free-text location fields,
# and source columns whose encoded versions were created in the cells above.
columns_to_drop = [
    # record / agency fields
    "master_record_number",
    "agency",
    "agency_tracking_number",
    "county",
    "city",
    # raw timestamp and counts (already binned / encoded)
    "collision_date",
    "vehicles_involved",
    "number_injured",
    "number_dead",
    # roadway and reference-point fields
    "roadway_name",
    "roadway_suffix",
    "roadway_number",
    "roadway_local_id",
    "distance_from_reference",
    "miles-feet_indicator",
    "direction_from_reference",
    # intersecting-road fields
    "intersection_road_name",
    "intersection_road_name_suffix",
    "intersection_road_number",
    "intersection_local_id",
    "mile_marker",
    "interchange_related_indicator",
    # work-zone / construction fields
    "work_zone_type",
    "construction_maintenance_zone",
    "construction_maintenance_zone_location",
    "fatal_case_number",
    # intermediate columns from the feature-engineering cells
    "manner_of_crash",
    "date",
    "day_of_week",
    "day_name",
    "month",
    "month_name",
    # reporting officer
    "officer_first_name",
    "officer_last_name",
]

df = df.drop(columns_to_drop, axis=1)

In [59]:
# Spot-check five randomly chosen rows of the engineered frame (unseeded, so the rows
# shown differ from run to run).
df.sample(n=5)

Unnamed: 0,latitude,longitude,pedestrian,bicycle,scooter,parking,hitrun,mou,weekend,day_Friday,day_Monday,day_Saturday,day_Sunday,day_Thursday,day_Tuesday,day_Wednesday,time,hour,month_April,month_August,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September,year,season_fall,season_spring,season_summer,season_winter,covid,time_window,vehicles_1 vehicle,vehicles_2 vehicles,vehicles_3 vehicles,vehicles_4-5 vehicles,vehicles_6+ vehicles,injured_0 injured,injured_1 injured,injured_2 injured,injured_3-4 injured,injured_5-6 injured,injured_7+ injured,dead_0 dead,dead_1 dead,dead_2+ dead,crash_Angle,crash_Front to Rear,crash_HeadOn,crash_Not Collision with Motor Vehicle in Transport,crash_Other,crash_Rear to Rear,crash_Rear to Side,"crash_Sideswipe, Opposite Direction","crash_Sideswipe, Same Direction",crash_Unknown/Missing,road_-- UNKNOWN RAMP/ROAD --,road_12TH AVS,road_14TH AVN,road_21ST AVS,road_28TH AVN,road_2ND AVS,road_4TH AVS,road_8TH AVS,road_ANDREW JACKSON PKWY,road_ANTIOCH PKE,road_BELL RD,road_BRICK CHURCH PKE,road_BRILEY PKWY,road_BRILEY PKWYE,road_BRILEY PKWYN,road_BRILEY PKWYS,road_BRILEY PKWYW,road_BROADWAY,road_CENTRAL PKE,road_CHARLOTTE AV,road_CHARLOTTE PKE,road_CHURCH ST,road_CLARKSVILLE PKE,road_CONFERENCE DR,road_DEMONBREUN ST,road_DICKERSON PKE,road_DIVISION ST,road_DONELSON PKE,road_DR D B TODD JR BLVD,road_E TRINITY LN,road_EDMONDSON PKE,road_ELLINGTON PKWYS,road_ELM HILL PKE,road_FRANKLIN PKE,road_GALLATIN AV,road_GALLATIN PKE,road_GALLATIN PKEN,road_GALLATIN PKES,road_HAMILTON CHURCH RD,road_HARDING PKE,road_HARDING PL,road_HAYWOOD LN,road_HERMITAGE AV,road_HIGHWAY 100,road_HIGHWAY 70S,road_HILLSBORO PKE,road_I 24,road_I 40,road_I 440,road_I 65,road_I24 E ENT RAMP,road_I24 E EXT RAMP,road_I24 W ENT RAMP,road_I24 W EXT RAMP,road_I24E,road_I40 E ENT RAMP,road_I40 E EXT RAMP,road_I40 W ENT RAMP,road_I40 W EXT RAMP,road_I65 N ENT RAMP,road_I65 N EXT RAMP,road_I65 S ENT 
RAMP,road_I65 S EXT RAMP,road_JAMES ROBERTSON PKWY,road_JEFFERSON ST,road_LAFAYETTE ST,road_LEBANON PKE,road_MAIN ST,road_MCGAVOCK PKE,road_MT VIEW RD,road_MURFREESBORO PKE,road_MYATT DR,road_NEELYS BEND RD,road_NOLENSVILLE PKE,road_OLD HICKORY BLVD,road_OLD HICKORY BLVDW,road_Other,road_POWELL AV,road_ROSA L PARKS BLVD,road_SHELBY AV,road_SMITH SPRINGS RD,road_SPRING ST,road_STATE HWY 155,road_STATE HWY 254,road_STATE HWY 255,road_STATE HWY 45,road_STATE RT 45,road_STEWARTS FERRY PKE,road_THOMPSON LN,road_US HWY 31 ALT,road_US HWY 31E,road_US HWY 41,road_US HWY 431,road_US HWY 70,road_US HWY 70S,road_W TRINITY LN,road_WEDGEWOOD AV,road_WEST END AV,road_WHITE BRIDGE PKE,road_WHITES CREEK PKE,road_WOODMONT BLVD,intersection_encoded,intersect_type_Five Point Or More,intersect_type_Four-Way,intersect_type_Not at Intersection,intersect_type_Roundabout,intersect_type_T-Intersection,intersect_type_Traffic Circle,intersect_type_Unknown/Other,intersect_type_Y-Intersection,relation_to_junction_AccelDecelLane,relation_to_junction_Crossover-Related,"relation_to_junction_Driveway, Alley Access, etc",relation_to_junction_Entrance/Exit Ramp Related,relation_to_junction_Intersection,relation_to_junction_Intersection-Related,relation_to_junction_Non-Junction,relation_to_junction_Rail Grade Crossing,relation_to_junction_SharedUsePathOrTrail,relation_to_junction_Unknown/Other,work_zone_present,weather_Clear,weather_Cloudy,weather_Fog,weather_Other Conditions,weather_Rain,weather_Sleet/Hail,weather_Snow,weather_Unknown/Other
16095,36.12462,-86.80686,0,0,0,0,0,0,0,False,False,False,False,True,False,False,18:14:00,18,False,False,False,False,False,False,True,False,False,False,False,False,2015,False,False,True,False,0,night,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,1,True,False,False,False,False,False,False,False
100843,36.16531,-86.79428,0,0,0,0,0,0,0,True,False,False,False,False,False,False,16:45:00,16,False,True,False,False,False,False,False,False,False,False,False,False,2017,False,False,True,False,0,afternoon,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,1,True,False,False,False,False,False,False,False
92377,36.23839,-86.67141,0,0,0,0,1,0,0,False,False,False,False,False,True,False,15:05:00,15,False,False,False,False,False,False,True,False,False,False,False,False,2017,False,False,True,False,0,afternoon,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,1,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,1,False,True,False,False,False,False,False,False
259187,36.17644,-86.76593,0,0,0,0,0,0,0,False,False,False,False,False,True,False,10:22:00,10,True,False,False,False,False,False,False,False,False,False,False,False,2022,False,True,False,False,1,morning,False,True,False,False,False,True,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,1,False,False,False,False,True,False,False,False
3513,36.11595,-86.78603,0,0,0,0,0,0,0,False,False,False,False,False,True,False,16:24:00,16,False,False,False,True,False,False,False,False,False,False,False,False,2015,False,False,False,True,0,afternoon,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,1,True,False,False,False,False,False,False,False


In [60]:
# Turn the time-of-day values into trimmed strings so they can be compared to a literal.
df['time'] = df['time'].astype(str).str.strip()

# Rows stamped exactly "00:00:00" are treated as having no recorded time.
is_midnight = df['time'] == "00:00:00"
missing_time_df = df[is_midnight]
has_time_df = df[~is_midnight]

In [61]:
# Remove both clock columns from the rows that have no recorded time.
columns_to_drop = ["time", "hour"]
missing_time_df = missing_time_df.drop(columns_to_drop, axis=1)

In [62]:
# `time_window` is defined once, in the time-related feature engineering cell; it is
# reused here rather than redefined so the bucketing rules live in a single place.

# Work on an explicit copy so the column assignment below does not target a slice of `df`.
has_time_df = has_time_df.copy()
has_time_df['time_window'] = has_time_df['hour'].apply(time_window)

In [63]:
# With time_window in place, the raw clock columns are removed from the modelling frame.
columns_to_drop = ["time", "hour"]
has_time_df = has_time_df.drop(columns_to_drop, axis=1)

In [64]:
# 70 / 20 / 10 split, stratified on time_window at both stages:
# first hold out 30% of the rows, then send one third of that hold-out to the test set
# and the remaining two thirds to the validation set.
train_df, temp_df = train_test_split(
    has_time_df,
    test_size=0.3,
    stratify=has_time_df['time_window'],
    random_state=42,
)

valid_df, test_df = train_test_split(
    temp_df,
    test_size=1/3,
    stratify=temp_df['time_window'],
    random_state=42,
)

for split_label, split_df in (
    ("Training set:", train_df),
    ("Validation set:", valid_df),
    ("Testing set:", test_df),
):
    print(split_label, len(split_df))

Training set: 183941
Validation set: 52554
Testing set: 26278


In [65]:
def print_value_counts(df, df_name):
    """Print a ``--- df_name ---`` banner followed by the ``time_window`` counts of ``df``.

    The counts are followed by two blank lines so consecutive reports are visually separated.
    """
    banner = f"--- {df_name} ---"
    print(banner)
    window_counts = df['time_window'].value_counts()
    # Same bytes as printing the counts and then a lone "\n".
    print(window_counts, end="\n\n\n")

# Report the time_window class balance of each split, in train / validation / test order.
for split_df, split_name in (
    (train_df, "Training set"),
    (valid_df, "Validation set"),
    (test_df, "Testing set"),
):
    print_value_counts(split_df, split_name)

--- Training set ---
time_window
afternoon     72309
night         52011
morning       44885
late night    14736
Name: count, dtype: int64


--- Validation set ---
time_window
afternoon     20659
night         14860
morning       12824
late night     4211
Name: count, dtype: int64


--- Testing set ---
time_window
afternoon     10330
night          7430
morning        6413
late night     2105
Name: count, dtype: int64




In [66]:
def bool_to_int(df):
    """Return a copy of ``df`` in which every ``bool`` column is cast to integer 0/1.

    The input frame is left untouched. The conversion is done with a single
    ``astype`` call on the whole frame instead of assigning column by column,
    which avoids writing into a frame that may be a slice of another one
    (the source of ``SettingWithCopyWarning``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``bool``-dtype columns should be converted.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; non-boolean columns
        are carried over unchanged.
    """
    bool_columns = df.select_dtypes(include=['bool']).columns
    return df.astype({column: int for column in bool_columns})

# Cast the boolean indicator columns of every split to 0/1 integers.
train_df, valid_df, test_df = (
    bool_to_int(split_df) for split_df in (train_df, valid_df, test_df)
)

In [67]:
# Persist each split as CSV without the index column.
for split_df, csv_path in (
    (train_df, "../03_Data_for_Modeling/train.csv"),
    (valid_df, "../03_Data_for_Modeling/valid.csv"),
    (test_df, "../03_Data_for_Modeling/test.csv"),
):
    split_df.to_csv(csv_path, index=False)