In [1]:
import numpy as np
import pandas as pd
import datetime
import matplotlib
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Lift every pandas display limit so frames and long strings are shown in full.
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

In [3]:
folder = "../00_Raw_Data/"

# One CSV per quarter, 2015_Q1 through 2022_Q4 (range upper bounds are exclusive).
years = range(2015, 2023)
quarters = range(1, 5)

files = []
for year in years:
    for quarter in quarters:
        files.append("{}_Q{}_Traffic_Crashes.csv".format(year, quarter))

# Read each quarterly file, then stack them into one frame with a fresh 0..n-1 index.
dfs = []
for file in files:
    dfs.append(pd.read_csv(folder + file))
df = pd.concat(dfs, ignore_index=True)

In [4]:
# Parse the raw timestamp once; every calendar feature below is derived from it.
df['collision_date'] = pd.to_datetime(df['collision_date'], format='%m/%d/%Y %I:%M:%S %p')
df['date'] = df['collision_date'].dt.date

df['day_of_week'] = df['collision_date'].dt.dayofweek  # 0: Monday - 6: Sunday
# Vectorised membership test (Saturday=5, Sunday=6) instead of a per-row lambda.
df['weekend'] = df['day_of_week'].isin([5, 6]).astype('int64')

df['time'] = df['collision_date'].dt.time
df['hour'] = df['collision_date'].dt.hour
df['month'] = df['collision_date'].dt.month
df['year'] = df['collision_date'].dt.year
# Compare on the datetime64 column rather than the object-dtype `date` column:
# the cutoff is midnight, so the result is the same for every valid timestamp,
# the comparison stays vectorised, and rows with a missing timestamp become 0.
df['covid'] = (df['collision_date'] >= pd.Timestamp('2020-03-15')).astype(int)

def time_window(hour):
    """Bucket an hour of day into a coarse, named time window.

    Args:
        hour: Hour of day as a number (0-23). NaN is accepted.

    Returns:
        'late night' for hours >= 23 or < 5, 'morning' for [5, 11),
        'afternoon' for [11, 17), 'night' for [17, 23), and 'unknown'
        when no comparison holds (e.g. NaN).
    """
    if hour >= 23 or hour < 5:
        return 'late night'
    # Past the guard, the first upper bound the hour falls under decides the label.
    # NaN fails every comparison and drops through to 'unknown'.
    for upper_bound, label in ((11, 'morning'), (17, 'afternoon'), (23, 'night')):
        if hour < upper_bound:
            return label
    return 'unknown'
# Label every row with its time window, derived element-wise from the hour.
df['time_window'] = df['hour'].map(time_window)

In [5]:
columns_to_convert = ['pedestrian', 'bicycle', 'scooter', 'hitrun', 'parking']

# 'yes' -> 1 and 'no' -> 0; any other value becomes NaN because it is absent from the lookup.
yes_no_codes = {'yes': 1, 'no': 0}
for flag_column in columns_to_convert:
    df[flag_column] = df[flag_column].map(yes_no_codes)

In [6]:
# Collapse NaN and the 'Missing' / 'Unknown' placeholders into a single category.
df['manner_of_crash'] = (
    df['manner_of_crash']
    .fillna('Unknown/Missing')
    .replace(['Missing', 'Unknown'], 'Unknown/Missing')
)

# Integer-encode the categories; codes follow the sorted order of the labels.
encoder = LabelEncoder()
df['manner_of_crash'] = encoder.fit_transform(df['manner_of_crash'])

# Show the label -> code lookup so the encoded column can be read back.
mapping = {label: code for code, label in enumerate(encoder.classes_)}
print(mapping)

{'Angle': 0, 'Front to Rear': 1, 'HeadOn': 2, 'Not Collision with Motor Vehicle in Transport': 3, 'Other': 4, 'Rear to Rear': 5, 'Rear to Side': 6, 'Sideswipe, Opposite Direction': 7, 'Sideswipe, Same Direction': 8, 'Unknown/Missing': 9}


In [7]:
# Handle missing values and standardize placeholders
# Fold NaN and the various "unnamed road" placeholders into one 'UNKNOWN' label.
df['roadway_name'] = (
    df['roadway_name']
    .fillna('UNKNOWN')
    .replace(['UNNAMED ROAD', 'UNNAMED STREET', '-- UNKNOWN RAMP/ROAD --'], 'UNKNOWN')
)

# Dedicated encoder for road names, kept so the codes can be decoded later.
roadway_encoder = LabelEncoder()
df['roadway_name'] = roadway_encoder.fit_transform(df['roadway_name'])

# Label -> code lookup for the encoded road names.
roadway_mapping = {label: code for code, label in enumerate(roadway_encoder.classes_)}
print(roadway_mapping)

{'0CEANFRONT CIRCLE S': 0, '1 65 NORTH': 1, '1-24 E': 2, '10 AIRWAYS BLVD': 3, '100 OAKS SHOPPING CTR': 4, '1000 RIVERGATE': 5, '1000 RIVERGATE PARKWAY': 6, '101 RIVERGATE DRIVE': 7, '10TH': 8, '10TH AVE N': 9, '10TH AVE NORTH': 10, '10TH AVE S': 11, '10TH AVE SOUTH': 12, '10TH AVENUE SOUTH': 13, '10TH AVN': 14, '10TH AVS': 15, '10TH CIR N': 16, '10TH CIRN': 17, '10TH ST': 18, '11 AVE N': 19, '112 LONG HOLLOW PIKE': 20, '112 STEWARTS FERRY PIKE': 21, '11TH AVE N': 22, '11TH AVE S': 23, '11TH AVENUE NORTH': 24, '11TH AVENUE SOUTH': 25, '11TH AVN': 26, '11TH AVS': 27, '11TH ST': 28, '120 CARTWRIGHT': 29, '1215 ANTIOCH': 30, '123 NORTHCREEK': 31, '123 NORTHCREEK BLVD': 32, '12761 OLD HICKORY BLVD': 33, '12TH': 34, '12TH AV S': 35, '12TH AVE N': 36, '12TH AVE S': 37, '12TH AVE SOUTH': 38, '12TH AVENUE SOUTH': 39, '12TH AVN': 40, '12TH AVS': 41, '12TH ST': 42, '1310 ANTIOCH PIKE': 43, '1311 9TH': 44, '133 PARK': 45, '13TH AV CIR': 46, '13TH AVE S': 47, '13TH AVS': 48, '13TH CT': 49, '13TH S

In [8]:
# Encode the intersection flag as 0/1, treating NaN and "Missing" as "N".
# Each step is assigned back instead of calling replace(..., inplace=True) on
# df[col]: that chained in-place call does not update the frame under pandas
# Copy-on-Write, which would leave "Missing" unmapped (NaN) and make the
# int64 cast fail.
mapping = {"N": 0, "Y": 1}
df['intersection_indicator'] = (
    df['intersection_indicator']
    .fillna("N")
    .replace("Missing", "N")
    .map(mapping)
    .astype('int64')
)

In [9]:
# Fold NaN and the 'OTHER' / 'Missing' / 'Unknown' placeholders into one category.
# The result is assigned back instead of using replace(..., inplace=True) on
# df[col], which does not update the frame under pandas Copy-on-Write.
df['intersect_type'] = (
    df['intersect_type']
    .fillna('Unknown/Other')
    .replace(['OTHER', 'Missing', 'Unknown'], 'Unknown/Other')
)

# Instantiate a new encoder
intersect_type_encoder = LabelEncoder()

# Fit and transform the intersect_type column
df['intersect_type'] = intersect_type_encoder.fit_transform(df['intersect_type'])

# Create and print the mapping
intersect_type_mapping = dict(zip(intersect_type_encoder.classes_, range(len(intersect_type_encoder.classes_))))
print(intersect_type_mapping)

{'Five Point Or More': 0, 'Four-Way': 1, 'Not at Intersection': 2, 'Roundabout': 3, 'T-Intersection': 4, 'Traffic Circle': 5, 'Unknown/Other': 6, 'Y-Intersection': 7}


In [10]:
# Fold NaN and the 'Missing' / 'OtherLocation' / 'Unknown' placeholders into one category.
# The result is assigned back instead of using replace(..., inplace=True) on
# df[col], which does not update the frame under pandas Copy-on-Write.
df['relation_to_junction'] = (
    df['relation_to_junction']
    .fillna('Unknown/Other')
    .replace(['Missing', 'OtherLocation', 'Unknown'], 'Unknown/Other')
)
relation_to_junction_encoder = LabelEncoder()

# Fit and transform the relation_to_junction column
df['relation_to_junction'] = relation_to_junction_encoder.fit_transform(df['relation_to_junction'])

# Create and print the mapping for relation_to_junction
relation_to_junction_mapping = dict(zip(relation_to_junction_encoder.classes_, range(len(relation_to_junction_encoder.classes_))))
print(relation_to_junction_mapping)

{'AccelDecelLane': 0, 'Crossover-Related': 1, 'Driveway, Alley Access, etc': 2, 'Entrance/Exit Ramp Related': 3, 'Intersection': 4, 'Intersection-Related': 5, 'Non-Junction': 6, 'Rail Grade Crossing': 7, 'SharedUsePathOrTrail': 8, 'Unknown/Other': 9}


In [11]:
# Combine "Campus" and "OHO" categories
df['mou'] = df['mou'].replace('OHO', 'Campus')

# Now, encode "Campus" as 1 and "False" as 0
df['mou'] = df['mou'].apply(lambda x: 1 if x == 'Campus' else 0)

In [12]:
# Binary work-zone flag: 1 when a concrete work-zone type is recorded, else 0.
# NaN is grouped with the "Missing" / "Unknown" placeholders, matching how the
# other columns in this notebook treat missing values. A plain == test against
# the two strings would let NaN fall through and be encoded as 1.
work_zone_raw = df['work_zone_type']
df['work_zone_type'] = (
    work_zone_raw.notna() & ~work_zone_raw.isin(["Missing", "Unknown"])
).astype('int64')

In [13]:
def simplify_weather(weather):
    """Reduce a raw weather description to one of eight coarse categories.

    Checks run in a fixed priority order, so a description that mentions
    several conditions gets the first category that matches.

    Args:
        weather: Raw weather string, or a missing value (NaN / None).

    Returns:
        One of 'Snow', 'Sleet/Hail', 'Rain', 'Fog', 'Cloudy', 'Clear',
        'Unknown/Other' or 'Other Conditions'.
    """
    if pd.isna(weather):
        return 'Unknown/Other'

    # Precipitation keywords are substring matches and take priority, in this order.
    for keyword in ('Snow', 'Sleet/Hail', 'Rain'):
        if keyword in weather:
            return keyword

    # Fog, smoke and smog share one bucket, but only as exact values.
    if weather in ('Fog', 'Smoke', 'Smog'):
        return 'Fog'
    if 'Cloudy' in weather:
        return 'Cloudy'
    if weather == 'Clear':
        return 'Clear'
    if weather in ('Missing', 'Unknown', 'Other'):
        return 'Unknown/Other'

    # Anything recognisable as a string but not covered above.
    return 'Other Conditions'


# Replace each raw description with its coarse category.
df['weather_condition(s)'] = df['weather_condition(s)'].map(simplify_weather)

# Integer-encode the simplified categories with a dedicated encoder.
simplified_weather_encoder = LabelEncoder()
df['weather_condition(s)'] = simplified_weather_encoder.fit_transform(df['weather_condition(s)'])

# Label -> code lookup for the simplified weather categories.
simplified_weather_mapping = {
    label: code for code, label in enumerate(simplified_weather_encoder.classes_)
}
print(simplified_weather_mapping)

{'Clear': 0, 'Cloudy': 1, 'Fog': 2, 'Other Conditions': 3, 'Rain': 4, 'Sleet/Hail': 5, 'Snow': 6, 'Unknown/Other': 7}


In [14]:
# Missing city names reuse the dataset's own 'UNKNOWN' label. Filling with a
# differently-cased 'Unknown' would give the encoder a second, separate
# "unknown" category.
df['city'] = df['city'].fillna('UNKNOWN')

# Instantiate a new LabelEncoder
city_encoder = LabelEncoder()

# Fit and transform the 'city' column with the encoder
df['city'] = city_encoder.fit_transform(df['city'])

# Create and print the mapping from city names to encoded values
city_mapping = dict(zip(city_encoder.classes_, range(len(city_encoder.classes_))))
print(city_mapping)

{'Antioch': 0, 'Belle Meade': 1, 'Bellevue': 2, 'Berry Hill': 3, 'Donelson': 4, 'Edenwold': 5, 'Forest Hills': 6, 'Goodlettsville': 7, 'Hermitage': 8, 'Joelton': 9, 'Lakewood': 10, 'MILLERSVILLE': 11, 'Madison': 12, 'Nashville': 13, 'Not in City Limit': 14, 'OTHER': 15, 'Oak Hill': 16, 'Old Hickory': 17, 'Ridgetop': 18, 'UNKNOWN': 19, 'Whites Creek': 20}


In [15]:
def bool_to_int(df):
    """Cast every boolean column of ``df`` to integers (True -> 1, False -> 0).

    The frame is modified in place and also returned, so the call can be
    used as an expression. Non-boolean columns are left untouched.

    Args:
        df: DataFrame whose ``bool`` columns should become integer columns.

    Returns:
        The same DataFrame object that was passed in.
    """
    boolean_columns = df.select_dtypes(include=['bool']).columns
    for name in boolean_columns:
        df[name] = df[name].astype(int)
    return df

df = bool_to_int(df)

# Render the clock time as a trimmed string so it can be compared to a literal.
df['time'] = df['time'].astype(str).str.strip()

columns_to_drop = ["time", "hour"]

# Rows stamped exactly 00:00:00 go to missing_time_df; all others to has_time_df.
# Presumably midnight is the placeholder for "time not recorded" — confirm
# against the data dictionary.
# NOTE(review): missing_time_df keeps the time_window column that was computed
# on the full frame earlier in the notebook; only time and hour are dropped here.
time_is_placeholder = df['time'] == "00:00:00"
missing_time_df = df[time_is_placeholder].drop(columns=columns_to_drop)
has_time_df = df[~time_is_placeholder]

# NOTE(review): this repeats the time_window definition from the date-features
# cell earlier in the notebook; consider keeping a single copy.
def time_window(hour):
    """Bucket an hour of day into a coarse, named time window.

    Args:
        hour: Hour of day as a number (0-23). NaN is accepted.

    Returns:
        'late night' for hours >= 23 or < 5, 'morning' for [5, 11),
        'afternoon' for [11, 17), 'night' for [17, 23), and 'unknown'
        when no comparison holds (e.g. NaN).
    """
    if hour >= 23 or hour < 5:
        return 'late night'
    # Past the guard, the first upper bound the hour falls under decides the label.
    # NaN fails every comparison and drops through to 'unknown'.
    for upper_bound, label in ((11, 'morning'), (17, 'afternoon'), (23, 'night')):
        if hour < upper_bound:
            return label
    return 'unknown'

# Work on a fresh frame: recompute the target label from the hour, then drop
# the raw time columns it was derived from.
has_time_df = (
    has_time_df
    .assign(time_window=has_time_df['hour'].map(time_window))
    .drop(columns=columns_to_drop)
)

In [16]:
# Identifier, free-text and location-reference columns removed from both frames.
columns_to_drop = [
    "master_record_number",
    "agency",
    "agency_tracking_number",
    "county",
    "collision_date",
    "roadway_suffix",
    "roadway_number",
    "roadway_local_id",
    "distance_from_reference",
    "miles-feet_indicator",
    "direction_from_reference",
    "intersection_road_name",
    "intersection_road_name_suffix",
    "intersection_road_number",
    "intersection_local_id",
    "mile_marker",
    "interchange_related_indicator",
    "construction_maintenance_zone",
    "construction_maintenance_zone_location",
    "fatal_case_number",
    "date",
    "officer_first_name",
    "officer_last_name",
]

has_time_df = has_time_df.drop(labels=columns_to_drop, axis="columns")
missing_time_df = missing_time_df.drop(labels=columns_to_drop, axis="columns")

In [17]:
# Hold out 30% of the rows, then give one third of that hold-out to the test
# set: roughly 70% train / 20% validation / 10% test. Both splits are
# stratified on time_window and use a fixed seed.
train_df, temp_df = train_test_split(
    has_time_df,
    test_size=0.3,
    stratify=has_time_df['time_window'],
    random_state=42,
)
valid_df, test_df = train_test_split(
    temp_df,
    test_size=1/3,
    stratify=temp_df['time_window'],
    random_state=42,
)

# Report the size of each split.
for split_label, split_frame in (
    ("Training set:", train_df),
    ("Validation set:", valid_df),
    ("Testing set:", test_df),
):
    print(split_label, len(split_frame))

Training set: 183941
Validation set: 52554
Testing set: 26278


In [18]:
def print_value_counts(df, df_name):
    """Print the ``time_window`` class counts of a frame under a named banner.

    Args:
        df: DataFrame that has a ``time_window`` column.
        df_name: Human-readable name shown in the banner line.

    Returns:
        None. The banner, the counts and two trailing blank lines go to stdout.
    """
    print(f"--- {df_name} ---")
    # The counts, the "\n" argument and print's own line ending together give
    # the counts followed by two blank lines.
    print(df['time_window'].value_counts(), "\n", sep="\n")

# Show the class balance of each split, in train / validation / test order.
for split_frame, split_name in (
    (train_df, "Training set"),
    (valid_df, "Validation set"),
    (test_df, "Testing set"),
):
    print_value_counts(split_frame, split_name)

--- Training set ---
time_window
afternoon     72309
night         52011
morning       44885
late night    14736
Name: count, dtype: int64


--- Validation set ---
time_window
afternoon     20659
night         14860
morning       12824
late night     4211
Name: count, dtype: int64


--- Testing set ---
time_window
afternoon     10330
night          7430
morning        6413
late night     2105
Name: count, dtype: int64




In [19]:
# Export of the three splits to CSV is currently disabled; uncomment the lines
# below to write them (index column omitted).
# train_df.to_csv("../03_Data_for_Modeling/train.csv", index=False)
# valid_df.to_csv("../03_Data_for_Modeling/valid.csv", index=False)
# test_df.to_csv("../03_Data_for_Modeling/test.csv", index=False)

In [20]:
# Preview ten random training rows; the fixed seed keeps the displayed sample
# identical across Restart & Run All.
train_df.sample(10, random_state=42)

Unnamed: 0,city,vehicles_involved,number_injured,number_dead,latitude,longitude,roadway_name,intersection_indicator,intersect_type,relation_to_junction,work_zone_type,weather_condition(s),manner_of_crash,pedestrian,bicycle,scooter,parking,hitrun,mou,day_of_week,weekend,month,year,covid,time_window
146263,13,2,0,0,36.0461,-86.65783,827,0,2,6,1,1,1,0,0,0,0,1,0,3,0,10,2018,0,morning
130977,13,2,0,0,36.0821,-86.7617,5799,1,1,5,1,1,4,0,0,0,0,0,0,4,0,6,2018,0,morning
244677,13,2,2,0,36.15579,-86.79599,1540,0,6,6,1,0,0,0,0,0,0,0,0,0,0,10,2021,1,afternoon
81083,13,2,1,0,36.16198,-86.82574,214,0,2,6,1,1,1,0,0,0,0,0,0,1,0,2,2017,0,afternoon
165812,13,2,1,0,36.20662,-86.7779,6578,1,1,4,1,0,8,0,0,0,0,0,0,1,0,4,2019,0,morning
59458,13,1,1,0,36.24552,-86.76008,5820,0,2,6,1,4,3,0,0,0,0,0,0,5,1,8,2016,0,night
213895,13,2,0,0,36.03718,-86.7859,6810,0,2,6,1,7,8,0,0,0,1,0,0,1,0,9,2020,1,afternoon
142963,14,2,0,0,36.28306,-86.80623,3330,0,2,6,1,4,8,0,0,0,0,1,0,2,0,9,2018,0,morning
208187,13,1,0,0,36.08252,-86.76371,3354,0,2,9,1,7,9,0,0,0,0,0,0,4,0,7,2020,1,afternoon
244899,13,2,0,0,36.15486,-86.62726,3398,0,2,3,1,0,1,0,0,0,0,0,0,3,0,10,2021,1,afternoon


In [21]:
# (rows, columns) of the training split.
train_df.shape

(183941, 25)