In [28]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import numpy as np
import random as rd
import matplotlib.pyplot as plt 
import datetime
from sklearn.cluster import DBSCAN
from sklearn.model_selection import train_test_split
from sklearn.neighbors import BallTree
import copy
from scipy.stats import mode

In [29]:
# Lift every pandas display limit (row count, column count, total width and
# per-cell width) so frames are rendered in full.
for _display_option in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(_display_option, None)

In [30]:
folder = "../00_Raw_Data/"

# One CSV per quarter, named <year>_Q<quarter>_Traffic_Crashes.csv,
# covering 2015_Q1 through 2022_Q4.
years = range(2015, 2023)
quarters = range(1, 5)

files = [
    "{}_Q{}_Traffic_Crashes.csv".format(year, quarter)
    for year in years
    for quarter in quarters
]

# Read every quarterly file and stack them into a single frame with a fresh index.
dfs = [pd.read_csv(folder + file) for file in files]
df = pd.concat(dfs, ignore_index=True)

# Keep only crashes whose coordinates fall inside the bounding box
# (both bounds inclusive on each axis).
df = df[
    df['longitude'].between(-87.05, -86.5)
    & df['latitude'].between(35.96, 36.395)
]

In [31]:
# Parse the crash timestamp (month/day/year, 12-hour clock with AM/PM) and
# derive the calendar features used downstream from it.
df['collision_date'] = pd.to_datetime(df['collision_date'], format='%m/%d/%Y %I:%M:%S %p')
df['date'] = df['collision_date'].dt.date

df['day_of_week'] = df['collision_date'].dt.dayofweek # 0: Monday - 6: Sunday

df['time'] = df['collision_date'].dt.time
df['hour'] = df['collision_date'].dt.hour
df['month'] = df['collision_date'].dt.month
df['year'] = df['collision_date'].dt.year
# 1 for crashes dated on or after 2020-03-15, 0 for earlier ones.
df['covid'] = (df['date'] >= pd.Timestamp('2020-03-15').date()).astype(int)

def categorize_time_window(hour):
    """Map an hour of the day to one of four time-window labels.

    Hours 6-9 give 'rush_morning', 15-18 give 'rush_evening' and 10-14 give
    'non_rush_day' (all bounds inclusive). Any other value, including one
    that satisfies none of the range checks, gives 'non_rush_night'.
    """
    # (first hour, last hour, label), checked in this order.
    windows = (
        (6, 9, 'rush_morning'),
        (15, 18, 'rush_evening'),
        (10, 14, 'non_rush_day'),
    )
    for first_hour, last_hour, label in windows:
        if first_hour <= hour <= last_hour:
            return label
    return 'non_rush_night'

# Label every crash with the time-of-day bucket for its hour.
df['time_window'] = df['hour'].apply(categorize_time_window)

# Flag columns stored as 'yes'/'no' text; each is recoded to 1/0 below.
columns_to_convert = ['pedestrian', 'bicycle', 'scooter', 'hitrun', 'parking']

for column in columns_to_convert:
    # NOTE(review): only the exact strings 'yes' and 'no' are in the mapping;
    # any other value would presumably come out as NaN — confirm the raw data
    # has no other spellings.
    df[column] = df[column].map({'yes': 1, 'no': 0})
    
# Fold absent values and the 'Missing' / 'Unknown' labels into one
# 'Unknown/Missing' category before encoding.
df['manner_of_crash'] = (
    df['manner_of_crash']
    .fillna('Unknown/Missing')
    .replace(['Missing', 'Unknown'], 'Unknown/Missing')
)

# One indicator column per crash manner (prefixed 'crash'); the original
# text column is removed once the indicators are attached.
crash_onehot = pd.get_dummies(df['manner_of_crash'], prefix='crash')
df = pd.concat([df, crash_onehot], axis=1).drop(columns='manner_of_crash')


# The ten most frequent road names keep their own category; every other
# road (including missing names) is pooled into 'Other'.
top_10_roads = df['roadway_name'].value_counts().head(10).index

df['roadway_grouped'] = np.where(
    df['roadway_name'].isin(top_10_roads),
    df['roadway_name'],
    'Other',
)

# One indicator column per grouped road (prefixed 'road'); both the grouped
# helper column and the raw road name are removed afterwards.
roadway_onehot = pd.get_dummies(df['roadway_grouped'], prefix='road')
df = pd.concat([df, roadway_onehot], axis=1).drop(
    columns=['roadway_grouped', 'roadway_name']
)

# Pool the uninformative intersection-type labels into a single
# 'Unknown/Other' bucket. The result is assigned back to the column rather
# than calling replace(..., inplace=True) on `df['intersect_type']`: that
# chained in-place form acts on a temporary Series, triggers a
# chained-assignment warning on recent pandas and leaves `df` unchanged once
# Copy-on-Write is active.
df['intersect_type'] = df['intersect_type'].replace(
    ['OTHER', 'Missing', 'Unknown'], 'Unknown/Other'
)

# One indicator column per intersection type; the text column is then removed.
intersect_type_onehot = pd.get_dummies(df['intersect_type'], prefix='intersect_type')
df = pd.concat([df, intersect_type_onehot], axis=1)
df = df.drop(columns='intersect_type')

# Pool the uninformative junction-relation labels into a single
# 'Unknown/Other' bucket. The result is assigned back to the column rather
# than calling replace(..., inplace=True) on `df['relation_to_junction']`:
# that chained in-place form acts on a temporary Series, triggers a
# chained-assignment warning on recent pandas and leaves `df` unchanged once
# Copy-on-Write is active.
df['relation_to_junction'] = df['relation_to_junction'].replace(
    ['Missing', 'OtherLocation', 'Unknown'], 'Unknown/Other'
)

# One indicator column per junction relation; the text column is then removed.
relation_onehot = pd.get_dummies(df['relation_to_junction'], prefix='relation_to_junction')
df = pd.concat([df, relation_onehot], axis=1)
df = df.drop(columns='relation_to_junction')

df = df.drop(columns='city')

# Recode the intersection flag to 0/1. Absent values and the literal
# 'Missing' are both treated as "N". Every step is assigned back to the
# column: the previous `df[col].replace(..., inplace=True)` form operated on
# a temporary Series, which warns on recent pandas and has no effect on `df`
# once Copy-on-Write is active.
df['intersection_indicator'] = (
    df['intersection_indicator']
    .fillna("N")
    .replace("Missing", "N")
)
mapping = {"N": 0, "Y": 1}
# Any value outside the mapping becomes NaN, which makes the int64 cast fail
# loudly instead of passing through unnoticed.
df['intersection_indicator'] = df['intersection_indicator'].map(mapping).astype('int64')

# Treat 'OHO' as 'Campus', then reduce the column to a 0/1 flag:
# 1 where the value is 'Campus', 0 for anything else.
df['mou'] = df['mou'].replace('OHO', 'Campus')
df['mou'] = df['mou'].apply(lambda value: int(value == 'Campus'))

# Work-zone flag: 0 when the type is absent, 'Missing' or 'Unknown',
# 1 for every other recorded type.
df['work_zone_type'] = df['work_zone_type'].fillna('Missing')
df['work_zone_type'] = df['work_zone_type'].apply(
    lambda value: 0 if (value is None or value in ('Missing', 'Unknown')) else 1
)

def simplify_weather(weather):
    """Collapse a raw weather description into a small set of categories.

    Missing values map to 'Unknown/Other'. A description containing 'Snow',
    'Sleet/Hail', 'Rain' or 'Cloudy' (checked in that priority order) maps to
    that keyword. Otherwise exact matches are used: 'Fog', 'Smoke' and 'Smog'
    map to 'Fog'; 'Clear' maps to 'Clear'; 'Missing', 'Unknown' and 'Other'
    map to 'Unknown/Other'. Anything else maps to 'Other Conditions'.
    """
    if pd.isna(weather):
        return 'Unknown/Other'

    # Substring rules: the first keyword found in the description wins.
    for keyword in ('Snow', 'Sleet/Hail', 'Rain', 'Cloudy'):
        if keyword in weather:
            return keyword

    # Whole-string rules. None of these keys contains a keyword from the
    # loop above, so evaluating them afterwards gives the same outcome.
    exact_matches = {
        'Fog': 'Fog',
        'Smoke': 'Fog',
        'Smog': 'Fog',
        'Clear': 'Clear',
        'Missing': 'Unknown/Other',
        'Unknown': 'Unknown/Other',
        'Other': 'Unknown/Other',
    }
    return exact_matches.get(weather, 'Other Conditions')

# Reduce the raw weather text to the simplified categories, one-hot encode
# them as 'weather_*' columns (the helper column is consumed by get_dummies)
# and discard the raw text column.
df['simplified_weather'] = df['weather_condition(s)'].apply(simplify_weather)
df = pd.get_dummies(
    df, columns=['simplified_weather'], prefix='weather'
).drop(columns='weather_condition(s)')

def bool_to_int(df):
    """Convert every boolean column of ``df`` to integers, in place.

    Columns of any other dtype are left untouched. The same DataFrame object
    that was passed in is returned.
    """
    boolean_columns = list(df.select_dtypes(include=['bool']).columns)
    for name in boolean_columns:
        df[name] = df[name].astype(int)
    return df

# Turn every boolean column still in the frame into 0/1 integers.
df = bool_to_int(df)

# Columns removed before modeling. 'collision_date' and 'date' were only
# needed to derive the calendar features; 'scooter' is dropped here even
# though it was recoded to 0/1 earlier in this cell.
columns_to_drop = [
    "agency", "agency_tracking_number", "county", 
    "collision_date", "roadway_suffix", "roadway_number", "roadway_local_id", 
    "distance_from_reference", "miles-feet_indicator", "direction_from_reference",
    "intersection_road_name", "intersection_road_name_suffix", "intersection_road_number",
    "intersection_local_id", "mile_marker", "interchange_related_indicator", 
    "construction_maintenance_zone", "construction_maintenance_zone_location",
    "fatal_case_number", "date", "officer_first_name", "officer_last_name", 'scooter'
]

df = df.drop(columns=columns_to_drop)

In [32]:
# Exclude crashes stamped exactly "00:00:00" (presumably a placeholder for an
# unrecorded time — confirm against the data dictionary), then discard the
# helper 'time' column.
df['time'] = df['time'].astype(str)
df['time'] = df['time'].str.strip()
has_time = df[df['time'] != "00:00:00"]
has_time = has_time.drop('time', axis=1)


# Keep Monday-Friday crashes only (day_of_week 0-4) and shuffle them with a
# fixed seed so the row order is reproducible.
weekday_df = has_time[has_time['day_of_week'].between(0, 4)].copy()  
weekday_df = weekday_df.sample(frac=1, random_state=42).reset_index(drop=True)
time_window_mapping = {
    'rush_morning': 0,
    'rush_evening': 1,
    'non_rush_day': 2,
    'non_rush_night': 3
}

# Encode the labels with an explicit map and integer cast. Series.replace on
# a text column only yields integers through pandas' silent downcasting,
# which is deprecated; without it the column stays object-typed. The cast
# also fails loudly if a label is ever missing from the mapping.
weekday_df['time_window'] = weekday_df['time_window'].map(time_window_mapping).astype('int64')

In [33]:
# Work on an independent deep copy so columns added to has_time_df later do
# not appear on weekday_df.
has_time_df = weekday_df.copy(deep=True)

# Stratified split on time_window: 10% is held out as the test set, then 2/9
# of the remaining 90% (20% of all rows) becomes the validation set, leaving
# 70% for training.
train_val, test = train_test_split(
    has_time_df,
    test_size=0.1,
    stratify=has_time_df['time_window'],
    random_state=42,
)
train, val = train_test_split(
    train_val,
    test_size=2/9,
    stratify=train_val['time_window'],
    random_state=42,
)

# Spatial index over the training coordinates only.
# NOTE(review): the coordinates are raw latitude/longitude degrees and no
# metric is passed, so neighbours are presumably ranked by plain Euclidean
# distance in degrees rather than great-circle distance — confirm this is
# acceptable for the study area.
tree = BallTree(train[['latitude', 'longitude']].to_numpy())

k_value = 4

# Seeded generator used only to break ties between equally common time
# windows, so a fresh Run All reproduces the same feature values.
_tie_break_rng = np.random.default_rng(42)


def calculate_mode_time_window_of_nearest(train_tree, train_data, point, k=k_value):
    """Return the most common ``time_window`` among the k nearest training rows.

    Parameters
    ----------
    train_tree : object
        Spatial index exposing ``query(point, k=k)`` that returns a
        ``(distances, indices)`` pair; only the first row of ``indices`` is used.
    train_data : pandas.DataFrame
        Frame with a 'time_window' column. The returned indices are applied
        positionally (``iloc``), so its row order must match the order of the
        points the index was built from.
    point : array-like
        Query point(s) in the layout ``train_tree.query`` expects; the caller
        in this notebook passes a single ``[[latitude, longitude]]`` row.
    k : int, optional
        Number of neighbours to consider (defaults to ``k_value``).

    Returns
    -------
    scalar
        The modal time window as a plain Python value, or ``np.nan`` when
        fewer than ``k`` neighbours (or none at all) were returned. When
        several windows share the highest count, one of them is drawn at
        random from the seeded generator.

    Notes
    -----
    The earlier version delegated to ``scipy.stats.mode``, which reports a
    single mode, so its tie branch never ran (and referred to a ``random``
    name that is not imported under that name). Ties therefore always
    resolved to the smallest code; they are now genuinely randomised.
    """
    _, ind = train_tree.query(point, k=k)
    time_windows = train_data.iloc[ind[0]]['time_window']

    if len(time_windows) == 0 or len(time_windows) < k:
        return np.nan

    # Count each distinct window among the neighbours and keep every value
    # that reaches the highest count; tolist() yields plain Python scalars.
    values, counts = np.unique(time_windows.to_numpy(), return_counts=True)
    candidates = values[counts == counts.max()].tolist()

    if len(candidates) > 1:
        return candidates[int(_tie_break_rng.integers(len(candidates)))]
    return candidates[0]
    
    
# For every crash, look up the most common time window among its k_value
# nearest training crashes.
# NOTE(review): has_time_df also contains the training rows themselves and
# the tree is built from `train`, so a training row is likely returned as one
# of its own neighbours and its own time_window would count toward this
# feature — confirm whether that target leakage is intended.
has_time_df['most_common_time_window_{}_neigh'.format(k_value)] = has_time_df.apply(
    lambda row: calculate_mode_time_window_of_nearest(
        tree,
        train[['time_window']],
        np.array([[row['latitude'], row['longitude']]]),
    ),
    axis=1,
)

# Key the new feature by record number so it can be joined onto each split.
df_with_avg_hour = has_time_df.set_index('master_record_number')[
    ['most_common_time_window_{}_neigh'.format(k_value)]
]

# Attach the feature to each split, then restore the record number as a column.
train_with_avg_hour = (
    train.set_index('master_record_number').join(df_with_avg_hour).reset_index()
)
val_with_avg_hour = (
    val.set_index('master_record_number').join(df_with_avg_hour).reset_index()
)
test_with_avg_hour = (
    test.set_index('master_record_number').join(df_with_avg_hour).reset_index()
)

# Final modeling tables: the same frames without 'hour' and the record number.
train_xgboost = train_with_avg_hour.drop(columns=['hour', 'master_record_number']).copy()
val_xgboost = val_with_avg_hour.drop(columns=['hour', 'master_record_number']).copy()
test_xgboost = test_with_avg_hour.drop(columns=['hour', 'master_record_number']).copy()

In [34]:
# Write the three modeling tables to CSV files; index=False keeps the
# DataFrame index out of the output.
train_xgboost.to_csv("../03_Data_for_Modeling/train_xgboost.csv", index=False)
val_xgboost.to_csv("../03_Data_for_Modeling/val_xgboost.csv", index=False)
test_xgboost.to_csv("../03_Data_for_Modeling/test_xgboost.csv", index=False)

In [35]:
# Show five randomly chosen rows of the training table as a quick sanity
# check; no random_state is passed, so the rows differ from run to run.
train_xgboost.sample(5)

Unnamed: 0,vehicles_involved,number_injured,number_dead,latitude,longitude,intersection_indicator,work_zone_type,pedestrian,bicycle,parking,hitrun,mou,day_of_week,month,year,covid,time_window,crash_Angle,crash_Front to Rear,crash_HeadOn,crash_Not Collision with Motor Vehicle in Transport,crash_Other,crash_Rear to Rear,crash_Rear to Side,"crash_Sideswipe, Opposite Direction","crash_Sideswipe, Same Direction",crash_Unknown/Missing,road_BELL RD,road_HARDING PL,road_I 24,road_I 40,road_I 440,road_I 65,road_LEBANON PKE,road_MURFREESBORO PKE,road_NOLENSVILLE PKE,road_OLD HICKORY BLVD,road_Other,intersect_type_Five Point Or More,intersect_type_Four-Way,intersect_type_Not at Intersection,intersect_type_Roundabout,intersect_type_T-Intersection,intersect_type_Traffic Circle,intersect_type_Unknown/Other,intersect_type_Y-Intersection,relation_to_junction_AccelDecelLane,relation_to_junction_Crossover-Related,"relation_to_junction_Driveway, Alley Access, etc",relation_to_junction_Entrance/Exit Ramp Related,relation_to_junction_Intersection,relation_to_junction_Intersection-Related,relation_to_junction_Non-Junction,relation_to_junction_Rail Grade Crossing,relation_to_junction_SharedUsePathOrTrail,relation_to_junction_Unknown/Other,weather_Clear,weather_Cloudy,weather_Fog,weather_Other Conditions,weather_Rain,weather_Sleet/Hail,weather_Snow,weather_Unknown/Other,most_common_time_window_4_neigh
75578,1,0,0,36.17291,-86.76868,0,0,0,0,0,0,0,0,1,2022,1,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2
77869,2,1,0,36.23967,-86.78078,1,0,0,0,0,0,0,0,11,2018,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1
87873,1,1,0,36.2245,-86.77454,0,0,0,0,0,0,0,3,12,2018,0,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0
112264,4,1,0,36.10714,-86.67529,0,0,0,0,0,0,0,2,1,2019,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1
60887,2,0,0,36.15421,-86.77941,0,0,0,0,0,0,0,2,9,2019,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
