In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import copy
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Remove every pandas display limit (rows, columns, total width, cell width)
# so frames are rendered without truncation.
for option_name in (
    'display.max_rows',
    'display.max_columns',
    'display.width',
    'display.max_colwidth',
):
    pd.set_option(option_name, None)

In [3]:
folder = "../00_Raw_Data/"

# One CSV export per quarter, covering 2015_Q1 through 2022_Q4.
years = range(2015, 2023)
quarters = range(1, 5)

files = [f"{year}_Q{quarter}_Traffic_Crashes.csv" for year in years for quarter in quarters]

# Read every quarterly file and stack the frames under a single fresh index.
dfs = [pd.read_csv(folder + file) for file in files]
df = pd.concat(dfs, ignore_index=True)

# Keep only rows whose coordinates lie inside the bounding box below (both
# ends inclusive). Rows with a missing coordinate fail the test and are dropped.
in_longitude_range = df['longitude'].between(-87.05, -86.5)
in_latitude_range = df['latitude'].between(35.96, 36.395)
df = df[in_longitude_range & in_latitude_range]

In [4]:
# Parse the raw timestamp text (month/day/year, 12-hour clock with AM/PM marker).
df['collision_date'] = pd.to_datetime(df['collision_date'], format='%m/%d/%Y %I:%M:%S %p')

# Reuse a single .dt accessor for all of the calendar parts below.
stamp = df['collision_date'].dt
df['date'] = stamp.date
df['day_of_week'] = stamp.dayofweek  # 0: Monday - 6: Sunday
df['time'] = stamp.time
df['hour'] = stamp.hour
df['day_of_month'] = stamp.day
df['month'] = stamp.month

# 1 for collisions dated on or after 2020-03-15, 0 otherwise.
covid_start = pd.Timestamp('2020-03-15').date()
df['covid'] = (df['date'] >= covid_start).astype(int)

def categorize_time_window(hour):
    """Return the time-of-day label for an hour value.

    Args:
        hour: Hour of the day as a number.

    Returns:
        'rush_morning' for 6-9, 'non_rush_day' for 10-14, 'rush_evening'
        for 15-18 (all bounds inclusive), and 'non_rush_night' for any
        other value.
    """
    # (first hour, last hour, label); the ranges do not overlap, so the
    # lookup order has no effect on the result.
    windows = (
        (6, 9, 'rush_morning'),
        (10, 14, 'non_rush_day'),
        (15, 18, 'rush_evening'),
    )
    for first_hour, last_hour, label in windows:
        if first_hour <= hour <= last_hour:
            return label
    return 'non_rush_night'

# Label every collision with the part of the day its hour falls into.
df['time_window'] = df['hour'].apply(categorize_time_window)


# Flag columns stored as 'yes'/'no' text; any other value becomes NaN.
columns_to_convert = ['pedestrian', 'bicycle', 'scooter', 'hitrun', 'parking']

for column in columns_to_convert:
    df[column] = df[column].map({'yes': 1, 'no': 0})

df = df.drop(columns='city')

# Missing values and the literal "Missing" are both treated as "N".
# The replacement is assigned back instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series,
# which pandas warns about and which, under copy-on-write, never reaches `df`
# (leaving "Missing" to become NaN in the map and break the int64 cast).
mapping = {"N": 0, "Y": 1}
df['intersection_indicator'] = (
    df['intersection_indicator']
    .fillna("N")
    .replace("Missing", "N")
    .map(mapping)
    .astype('int64')
)

# "Campus" and "OHO" form one category encoded as 1; every other value
# (including missing ones) is encoded as 0.
df['mou'] = df['mou'].isin(['Campus', 'OHO']).astype('int64')

# 1 when a work-zone type is recorded; 0 when the value is absent,
# "Missing" or "Unknown".
df['work_zone_type'] = (
    ~df['work_zone_type'].fillna('Missing').isin(['Missing', 'Unknown'])
).astype('int64')


def simplify_weather(weather):
    """Collapse a raw weather description into a coarse category.

    Args:
        weather: Raw weather text, or a missing value.

    Returns:
        One of 'Snow', 'Sleet/Hail', 'Rain', 'Fog', 'Cloudy', 'Clear',
        'Unknown/Other' or 'Other Conditions'.
    """
    if pd.isna(weather):
        return 'Unknown/Other'

    # Substring matches, checked in priority order; the keyword doubles as
    # the returned category.
    for keyword in ('Snow', 'Sleet/Hail', 'Rain'):
        if keyword in weather:
            return keyword

    # Fog, smoke and smog only count when they are the entire description.
    if weather in ('Fog', 'Smoke', 'Smog'):
        return 'Fog'
    if 'Cloudy' in weather:
        return 'Cloudy'
    if weather == 'Clear':
        return 'Clear'
    if weather in ('Missing', 'Unknown', 'Other'):
        return 'Unknown/Other'
    return 'Other Conditions'

# Reduce the raw weather text to the coarse labels from simplify_weather,
# discard the raw column, then one-hot encode the labels as weather_<label>
# columns appended at the end of the frame.
df['simplified_weather'] = df['weather_condition(s)'].apply(simplify_weather)
df = df.drop(columns='weather_condition(s)')
df = pd.get_dummies(df, prefix='weather', columns=['simplified_weather'])

def bool_to_int(df):
    """Convert every bool-dtype column of ``df`` to integers.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame whose boolean columns should become 0/1 integers.

    Returns:
        The same DataFrame object that was passed in.
    """
    bool_columns = list(df.select_dtypes(include=['bool']).columns)
    for name in bool_columns:
        df[name] = df[name].astype(int)
    return df

# Turn the one-hot (bool) columns into 0/1 integers.
df = bool_to_int(df)

# Normalise the clock time to text and exclude rows stamped exactly midnight.
# NOTE(review): presumably "00:00:00" marks records with no recorded time —
# confirm against the data source.
df['time'] = df['time'].astype(str).str.strip()
has_time_df = df[df['time'] != "00:00:00"]

columns_to_drop = [
    "agency", "agency_tracking_number", "county",
    "collision_date", "roadway_suffix", "roadway_number", "roadway_local_id",
    "distance_from_reference", "miles-feet_indicator", "direction_from_reference",
    "intersection_road_name", "intersection_road_name_suffix", "intersection_road_number",
    "intersection_local_id", "mile_marker", "interchange_related_indicator",
    "construction_maintenance_zone", "construction_maintenance_zone_location",
    "fatal_case_number", "date", "officer_first_name", "officer_last_name", "time"
]

has_time_df = has_time_df.drop(columns=columns_to_drop)

# Keep Monday (0) through Friday (4) only.
weekday_df = has_time_df[has_time_df['day_of_week'].between(0, 4)].copy()

# Shuffle the rows with a fixed seed so the order is reproducible.
weekday_df = weekday_df.sample(frac=1, random_state=42).reset_index(drop=True)

time_window_mapping = {
    'rush_morning': 0,
    'rush_evening': 1,
    'non_rush_day': 2,
    'non_rush_night': 3
}

# `.map` rather than `.replace`: a dict-based replace that turns string labels
# into integers depends on implicit object-to-int downcasting, which pandas has
# deprecated; `.map` yields the integer codes directly.
weekday_df['time_window'] = weekday_df['time_window'].map(time_window_mapping)

# Rescale both coordinates to the [0, 1] range (MinMaxScaler default).
scaler = MinMaxScaler()
weekday_df[['latitude', 'longitude']] = scaler.fit_transform(weekday_df[['latitude', 'longitude']])

selected_cols = ['master_record_number', 'latitude', 'longitude', 'intersection_indicator', 'hitrun', 'parking', 'hour', 'vehicles_involved',
                 'day_of_week', 'month', 'number_injured', 'number_dead', 'covid', 'time_window']

# From here on `has_time_df` holds the final weekday-only modeling frame.
has_time_df = weekday_df[selected_cols].copy()

In [8]:
# Write the modeling frame to disk, leaving out the index column.
has_time_df.to_csv(path_or_buf="../03_Data_for_Modeling/knn.csv", index=False)