In [1]:
import pandas as pd
import numpy as np
import re
import os

In [5]:
# read in the data
accidents_data = pd.read_csv('processed_us_car_accidents_2022.csv')

# show columns
print(accidents_data.columns)

Index(['Unnamed: 0', 'ID', 'Source', 'Severity', 'Start_Time', 'End_Time',
       'Start_Lattitude', 'Start_Longitude', 'End_Lattitude', 'End_Longitude',
       'Distance(mi)', 'Description', 'Street', 'City', 'County', 'State',
       'Zipcode', 'Country', 'Timezone', 'Weather_Timestamp', 'Temperature(F)',
       'Wind_Chill(F)', 'Humidity(%)', 'Pressure(in)', 'Visibility(mi)',
       'Wind_Direction', 'Wind_Speed(mph)', 'Precipitation(in)',
       'Weather_Condition', 'Traffic_Signal', 'Sunrise_Sunset', 'State_Abbrv',
       'Population'],
      dtype='object')


To automate the task of determining whether an accident occurred on a highway and its orientation (North, South, East, West), we can write a script that processes the descriptions. The script will use regular expressions to search for keywords indicating highway involvement and direction. If a highway is mentioned with a direction, it will return the respective orientation. If no highway or direction is indicated, it will return "Not_Highway".

In the US, highways are generally categorized into three types:

- Interstate Highways: Usually denoted with an 'I-' followed by a number, e.g., I-95, I-405, I-10.
- U.S. Routes: Often signified by 'US-' followed by a number, e.g., US-1, US-66.
- State Highways: These vary by state and can include state abbreviations or names, followed by a number, e.g., CA-1 for California, NY-27 for New York.

In [10]:
def analyze_description(description):
    # Regular expressions to find highway and direction mentions
    highway_pattern = re.compile(r'\b(I-\d+|US-\d+|[A-Z]{2}-\d+)\b', re.IGNORECASE)
    direction_pattern = re.compile(r'\b(Northbound|Southbound|Eastbound|Westbound|NB|SB|EB|WB|North|South|East|West|N|S|E|W)\b')

    # Capitalize the description to ensure case-insensitive matching for directions
    description = description.upper()

    # Search for patterns in the description
    highway_match = highway_pattern.search(description)
    direction_match = direction_pattern.search(description)

    # Determine if the incident is on a highway and its orientation
    if highway_match:
        if direction_match:
            # Extract the first word of the direction match as the orientation
            direction = direction_match.group(0).split()[0]
            return direction[0]  # Return only the first letter of the direction
        else:
            return "Not_Highway"
    else:
        return "Not_Highway"

In [11]:
# Apply the analyze_description function to the description column
accidents_data['highway_direction'] = accidents_data['Description'].apply(analyze_description)

In [12]:
# show counts of highway_direction
print(accidents_data['highway_direction'].value_counts())

highway_direction
Not_Highway    933241
S              233473
N              231585
E              179109
W              176036
Name: count, dtype: int64


In [13]:
# sample random 10 data point on highway_direction and description
print(accidents_data[['highway_direction', 'Description']].sample(10))

        highway_direction                                        Description
905517                  W  Stationary traffic on I-44 W from MO-19/Exit 2...
494999        Not_Highway   Incident on I-295 near I-295 Drive with caution.
156236        Not_Highway  Lane blocked due to crash on Margaret Wallace ...
1123117       Not_Highway  Incident on PLANK RD near PLANK RD Drive with ...
997630        Not_Highway  Slow traffic from Biscayne Way to E Roselane S...
1163083       Not_Highway  A disabled vehicle is creating a hazard causin...
364644                  W  Stationary traffic on AZ-202 Loop W - Red Moun...
1056329                 E  Incident on US-60 EB near N COLONNADE WAY Cent...
1685608       Not_Highway     Incident on I-95 near I-95 Drive with caution.
382935                  S  Slow traffic on US-41 S from 97th Ave (US-41) ...


In [15]:
import datetime
from suntime import Sun, SunTimeException

In [17]:
# lets filter out the data that is not on highway
accidents_data = accidents_data[accidents_data['highway_direction'] != 'Not_Highway']

In [21]:
# define a function to calculate the sunrise and sunset time based on the latitude and longitude
def calculate_sunrise_sunset(latitude, longitude, date):
    # Initialize the Sun class for the given latitude and longitude
    sun = Sun(latitude, longitude)

    # Get sunrise and sunset for the given date
    sunrise = sun.get_local_sunrise_time(date)
    sunset = sun.get_local_sunset_time(date)

    # Return the tuple of sunrise and sunset
    return (sunrise, sunset)

In [19]:
# seprate the date and time from the start_time column
accidents_data['date'] = pd.to_datetime(accidents_data['Start_Time']).dt.date
accidents_data['time'] = pd.to_datetime(accidents_data['Start_Time']).dt.time

In [22]:
# calculate the sunrise and sunset time for each data point
accidents_data['sunrise'], accidents_data['sunset'] = zip(*accidents_data.apply(lambda x: calculate_sunrise_sunset(x['Start_Lattitude'], x['Start_Longitude'], x['date']), axis=1))

In [24]:
# show the first 5 rows of the data of their accident time, sunrise and sunset time, direction, severity and description
print(accidents_data[['time', 'sunrise', 'sunset', 'highway_direction', 'Severity', 'Description']].head())

         time                   sunrise                    sunset  \
28   05:04:29 2022-09-08 06:03:00-05:00 2022-09-08 18:33:00-05:00   
51   06:41:11 2022-09-08 06:10:00-05:00 2022-09-08 18:41:00-05:00   
343  09:54:20 2022-09-08 06:24:00-05:00 2022-09-07 19:05:00-05:00   
400  10:02:07 2022-09-08 06:19:00-05:00 2022-09-07 19:05:00-05:00   
466  08:44:36 2022-09-08 06:29:00-05:00 2022-09-07 19:18:00-05:00   

    highway_direction  Severity  \
28                  W         1   
51                  E         2   
343                 W         3   
400                 S         1   
466                 S         2   

                                           Description  
28   Crash on exit ramp from I-95 Southbound at Exi...  
51   Crash on I-4 Westbound near Thomas E. Weightma...  
343  Entry ramp to I-65 Westbound from W Trinity Ln...  
400  Crash on exit ramp from I-465 clockwise at Exi...  
466  Lane blocked due to crash on US-20 Ulysses S G...  


In [33]:
# define a function to filter out the accident that is +/- 1 hour from sunrise and sunset time
def filter_accidents(row):
    # Get the accident time, sunrise time, and sunset time
    accident_time = row['time']
    sunrise_time = row['sunrise'].time()
    sunset_time = row['sunset'].time()

    # Calculate time difference between the accident and sunrise
    sun_rise_diff = abs(datetime.datetime.combine(datetime.date.min, accident_time) - datetime.datetime.combine(datetime.date.min, sunrise_time))
    sun_set_diff = abs(datetime.datetime.combine(datetime.date.min, accident_time) - datetime.datetime.combine(datetime.date.min, sunset_time))
    
    # Check if the accident time is within +/- 1 hour of sunrise or sunset time
    if sun_rise_diff <= datetime.timedelta(hours=1):
        return "Sunrise"
    elif sun_set_diff <= datetime.timedelta(hours=1):
        return "Sunset"
    elif (accident_time > sunrise_time) & (accident_time < sunset_time):
        return "Daytime"
    else:
        return "Nighttime"
   

In [34]:
# filter out the data that is not during sunrise, sunset or day time
accidents_data['daytime'] = [filter_accidents(row) for index, row in accidents_data.iterrows()]

Here are some potential ways to compare the directions and sunrise_sunset time might affect the accident severity:

In [35]:
# show the count of the daytime column
print(accidents_data['daytime'].value_counts())

daytime
Daytime      500433
Nighttime    170848
Sunset        79221
Sunrise       69701
Name: count, dtype: int64


In [42]:
# check the correlation between daytime, highway_direction
print(accidents_data.groupby(['daytime', 'highway_direction']).size())

daytime    highway_direction
Daytime    E                    110900
           N                    143338
           S                    143348
           W                    102847
Nighttime  E                     36100
           N                     47216
           S                     47340
           W                     40192
Sunrise    E                     15076
           N                     18235
           S                     20281
           W                     16109
Sunset     E                     17033
           N                     22796
           S                     22504
           W                     16888
dtype: int64


In [43]:
# check the correlation between daytime and highway_direction(combine the direction to NS, EW)
accidents_data['highway_direction_combine'] = accidents_data['highway_direction'].replace({'N': 'NS', 'S': 'NS', 'E': 'EW', 'W': 'EW'})
print(accidents_data.groupby(['daytime', 'highway_direction_combine']).size())

daytime    highway_direction_combine
Daytime    EW                           213747
           NS                           286686
Nighttime  EW                            76292
           NS                            94556
Sunrise    EW                            31185
           NS                            38516
Sunset     EW                            33921
           NS                            45300
dtype: int64


In [44]:
# check the percentage of highway_direction_combine vs total accident count for each daytime
print(accidents_data.groupby(['daytime', 'highway_direction_combine']).size()/accidents_data.groupby(['daytime']).size())

daytime    highway_direction_combine
Daytime    EW                           0.427124
           NS                           0.572876
Nighttime  EW                           0.446549
           NS                           0.553451
Sunrise    EW                           0.447411
           NS                           0.552589
Sunset     EW                           0.428182
           NS                           0.571818
dtype: float64


now lets try to segment the data based on their time of the day on hourly basis, and 