In [0]:
#importing the libraries
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
import statsmodels.api as sm

In [0]:
# Load the raw collision records into `nyc_all_data`.
# NOTE(review): the path is relative, so the notebook has to be started from the
# directory that contains nyc_data.csv.
nyc_all_data = pd.read_csv('nyc_data.csv')

In [0]:
# Summary statistics of the raw data (describe() defaults to the numeric columns).
nyc_all_data.describe()

In [0]:
# Peek at the first rows of the raw data.
nyc_all_data.head()

In [0]:
# Duplicate check on COLLISION_ID: count how often each ID occurs (most frequent
# first, the value_counts default) and keep only IDs that occur more than once.
# An empty result means every COLLISION_ID is unique.
duplicates_collision_id = nyc_all_data['COLLISION_ID'].value_counts()
occurs_more_than_once = duplicates_collision_id.gt(1)
dv_collision_id = duplicates_collision_id.loc[occurs_more_than_once]
print(dv_collision_id)

Looks like the data doesn't have any duplicates for Collision ID.

In [0]:
# Summary of the BOROUGH column in the raw data.
nyc_all_data['BOROUGH'].describe()

In [0]:
# NOTE(review): this cell repeats the BOROUGH summary from the previous cell.
nyc_all_data['BOROUGH'].describe()

In [0]:
# Count missing values per column in the raw data.
nyc_all_data.isnull().sum()

In [0]:
# Restrict to the rows whose BOROUGH is missing and count, per column, how many
# other values are missing in those same rows.
nyc_all_data[nyc_all_data['BOROUGH'].isnull()].isnull().sum()

# Looks like dropping the rows with a missing Borough gives us cleaner data

In [0]:
# Keep only the collisions that have a BOROUGH value.
# .copy() makes `nyc` an independent DataFrame: later cells add and overwrite
# columns on it, and doing that on a boolean-mask slice of `nyc_all_data`
# raises pandas' SettingWithCopyWarning.
nyc = nyc_all_data[nyc_all_data['BOROUGH'].notnull()].copy()

# Remaining missing values per column after the filter.
nyc.isnull().sum()

In [0]:
# NUMBER OF PERSONS INJURED / KILLED contain NAs. They feed the dependent
# variable, so missing counts are treated as 0.
# fillna with a per-column dict returns a new DataFrame, so nothing is assigned
# into a filtered slice and the cell can be re-run safely.
nyc = nyc.fillna({
    'NUMBER OF PERSONS INJURED': 0,
    'NUMBER OF PERSONS KILLED': 0,
})

# Remaining missing values per column.
nyc.isnull().sum()

In [0]:
# Reconcile the overall injury total with its pedestrian / cyclist / motorist
# breakdown: sum each column, add up the three sub-totals, and compare.
injury_columns = {
    'Persons Injured': 'NUMBER OF PERSONS INJURED',
    'Pedestrians Injured': 'NUMBER OF PEDESTRIANS INJURED',
    'Cyclist Injured': 'NUMBER OF CYCLIST INJURED',
    'Motorist Injured': 'NUMBER OF MOTORIST INJURED',
}
injury_totals = {label: nyc[column].sum() for label, column in injury_columns.items()}
per_inj, ped_inj, cyc_inj, mot_inj = injury_totals.values()

# Sum of the three road-user groups, and whether it matches the overall total.
ped_cyc_mot_sum = ped_inj + cyc_inj + mot_inj
compare = per_inj == ped_cyc_mot_sum

# One-row table holding the four totals, the recomputed sum and the comparison.
summary_table = pd.DataFrame({
    **{label: [total] for label, total in injury_totals.items()},
    'ped + cyc + mot': [ped_cyc_mot_sum],
    'compare': [compare],
})
summary_table

In [0]:
# Reconcile the overall death total with its pedestrian / cyclist / motorist
# breakdown: sum each column, add up the three sub-totals, and compare.
death_columns = {
    'Persons Killed': 'NUMBER OF PERSONS KILLED',
    'Pedestrians Killed': 'NUMBER OF PEDESTRIANS KILLED',
    'Cyclist Killed': 'NUMBER OF CYCLIST KILLED',
    'Motorist Killed': 'NUMBER OF MOTORIST KILLED',
}
death_totals = {label: nyc[column].sum() for label, column in death_columns.items()}
per_kill, ped_kill, cyc_kill, mot_kill = death_totals.values()

# Sum of pedestrians, cyclists and motorists killed, and whether it matches the
# overall number of persons killed.
ped_cyc_mot_sum = ped_kill + cyc_kill + mot_kill
compare = per_kill == ped_cyc_mot_sum

# One-row table holding the four totals, the recomputed sum and the comparison.
summary_table = pd.DataFrame({
    **{label: [total] for label, total in death_totals.items()},
    'ped + cyc + mot': [ped_cyc_mot_sum],
    'compare': [compare],
})
summary_table

In [0]:
# We care about whether a collision caused an injury or a death, not how many
# people were affected, so the counts are collapsed into two 0/1 flags.
injury_count_columns = [
    'NUMBER OF PERSONS INJURED',
    'NUMBER OF PEDESTRIANS INJURED',
    'NUMBER OF CYCLIST INJURED',
    'NUMBER OF MOTORIST INJURED',
]
death_count_columns = [
    'NUMBER OF PERSONS KILLED',
    'NUMBER OF PEDESTRIANS KILLED',
    'NUMBER OF CYCLIST KILLED',
    'NUMBER OF MOTORIST KILLED',
]

# A flag is 1 when any of its count columns is > 0 (NaN compares as False).
# assign() returns a new DataFrame, so nothing is written into a filtered slice
# (no SettingWithCopyWarning) and the cell is safe to re-run.
nyc = nyc.assign(
    injured_flag=nyc[injury_count_columns].gt(0).any(axis=1).astype(int),
    killed_flag=nyc[death_count_columns].gt(0).any(axis=1).astype(int),
)

## Data Manipulation - Treatment of NAs, creating dataset with required fields only and creating additional fields

In [0]:
# Per-column data-quality overview of `nyc`:
#   NaN Count   - number of missing values
#   Total Count - number of non-missing values
nan_counts = nyc.isna().sum()
total_counts = nyc.count()

# Put both Series side by side (they share the column names as index).
summary_table = pd.concat(
    [nan_counts.rename('NaN Count'), total_counts.rename('Total Count')],
    axis=1,
)
summary_table

In [0]:
# Working dataset: only the fields needed for the analysis.
required_columns = [
    'COLLISION_ID',
    'CRASH DATE',
    'CRASH TIME',
    'BOROUGH',
    'ZIP CODE',
    'LOCATION',
    'injured_flag',
    'killed_flag',
    'CONTRIBUTING FACTOR VEHICLE 1',
    'CONTRIBUTING FACTOR VEHICLE 2',
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
]

# .copy() makes `nyc_data` independent of `nyc`, so the many column assignments
# in the following cells cannot raise SettingWithCopyWarning.
nyc_data = nyc.loc[:, required_columns].copy()

In [0]:
# Factor / vehicle-type columns whose NAs become the label 'Unspecified'.
columns_to_fill = [
    'CONTRIBUTING FACTOR VEHICLE 1',
    'CONTRIBUTING FACTOR VEHICLE 2',
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
]

# Placeholder used for every column that gets its NAs filled:
#   - 'Unspecified' for the factor / vehicle-type columns
#   - '(-999, -999)' for a missing LOCATION
#   - "00000" for a missing ZIP CODE
# NOTE(review): the ZIP CODE placeholder is a string; if the column was read as
# numeric this produces a mixed-type column - confirm the dtype.
placeholders = dict.fromkeys(columns_to_fill, 'Unspecified')
placeholders['LOCATION'] = '(-999, -999)'
placeholders['ZIP CODE'] = "00000"

for column, placeholder in placeholders.items():
    nyc_data[column] = nyc_data[column].fillna(placeholder)

In [0]:
# Per-column data-quality overview of `nyc_data` after the NA treatment:
#   NaN Count   - number of missing values
#   Total Count - number of non-missing values
nan_counts = nyc_data.isna().sum()
total_counts = nyc_data.count()

# Put both Series side by side (they share the column names as index).
summary_table = pd.concat(
    [nan_counts.rename('NaN Count'), total_counts.rename('Total Count')],
    axis=1,
)
summary_table

In [0]:
# Check the dtype of every field before the date/time columns are converted.
nyc_data.dtypes

In [0]:
# Parse CRASH DATE once and derive both calendar fields from the parsed values
# (the column itself is left as-is here; it is converted in a later cell).
crash_dates = pd.to_datetime(nyc_data['CRASH DATE'])
nyc_data['Year'] = crash_dates.dt.year
nyc_data['Month'] = crash_dates.dt.month

# NOTE: a filter that removed Year == 2024 used to live here and is disabled,
# so 2024 rows are kept in `nyc_data`.

In [0]:
# Inspect the first rows with the new Year / Month columns.
nyc_data.head()

In [0]:
# Convert 'CRASH DATE' to datetime
nyc_data['CRASH DATE'] = pd.to_datetime(nyc_data['CRASH DATE'])

# Convert 'CRASH TIME' ('HH:MM' text) to datetime; pandas attaches a default
# date to these values, only the time-of-day part is meaningful.
nyc_data['CRASH TIME'] = pd.to_datetime(nyc_data['CRASH TIME'], format='%H:%M')

# Combine date and time into a single 'CRASH DATETIME' column.
# This is done with vectorised datetime arithmetic instead of a row-wise
# apply(datetime.combine, axis=1): time since midnight is added to the crash
# date truncated to midnight, which gives the same timestamps without a
# Python-level loop over every row.
time_since_midnight = nyc_data['CRASH TIME'] - nyc_data['CRASH TIME'].dt.normalize()
nyc_data['CRASH DATETIME'] = nyc_data['CRASH DATE'].dt.normalize() + time_since_midnight

# Extract day of the week (name, e.g. 'Monday') & hour (0-23)
nyc_data['day_of_week'] = nyc_data['CRASH DATE'].dt.day_name()
nyc_data['hour'] = nyc_data['CRASH TIME'].dt.hour

# Define function to categorize time ranges
def categorize_time(hour):
    """Return the part-of-day label for an hour of the day.

    Half-open ranges are used: [5, 12) -> 'Morning', [12, 17) -> 'Afternoon',
    [17, 21) -> 'Evening'. Any value outside those ranges (including values
    that fail every comparison, such as NaN) -> 'Night'.
    """
    day_parts = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for first_hour, end_hour, label in day_parts:
        if first_hour <= hour < end_hour:
            return label
    return 'Night'

# Part-of-day label for every crash, from the hour of 'CRASH DATETIME'.
crash_hour = nyc_data['CRASH DATETIME'].dt.hour
nyc_data['time_category'] = crash_hour.map(categorize_time)

# Weekend flag: dayofweek is 0 for Monday ... 6 for Sunday, so >= 5 means
# Saturday or Sunday. Stored as 0/1.
crash_weekday_number = nyc_data['CRASH DATETIME'].dt.dayofweek
nyc_data['weekend'] = crash_weekday_number.ge(5).astype(int)

In [0]:
# Re-check the dtype of every field after the date/time conversion and the new
# derived columns.
nyc_data.dtypes

In [0]:
# Cleaning Contributing Factor fields for misspellings etc.
# Step 1: export every distinct (vehicle 1, vehicle 2) factor pair to a CSV so
# the labels can be reviewed by hand.
# (The same export for the VEHICLE TYPE CODE columns is currently disabled.)

# Show all rows when a Series/DataFrame is displayed (global pandas option).
pd.set_option('display.max_rows', None)

# Pair label "<factor 1>, <factor 2>" for every collision.
factor_vehicle_1 = nyc_data['CONTRIBUTING FACTOR VEHICLE 1'].astype(str)
factor_vehicle_2 = nyc_data['CONTRIBUTING FACTOR VEHICLE 2'].astype(str)
combined_factors = factor_vehicle_1 + ', ' + factor_vehicle_2

# One row per distinct pair, written without the index column.
combined_factors_unique = combined_factors.unique()
unique_factors_df = pd.DataFrame(combined_factors_unique, columns=['Combined Factors'])
unique_factors_df.to_csv('unique_combined_factors1.csv', index=False)

In [0]:
# Mapping from a raw "<factor 1>, <factor 2>" pair to its cleaned form.
# It covers three kinds of fixes:
#   * the codes '1' and '80'      -> 'Unspecified'
#   * the misspelling 'Illnes'    -> 'illness'
#   * 'Illness'                   -> 'illness' (case only; both columns are
#                                    lower-cased below anyway)
# One entry per line so that no string literal is broken across lines.
factor_mapping = {
    # --- code '1' ---
    '1, Unspecified': 'Unspecified, Unspecified',
    '1, Driver Inexperience': 'Unspecified, Driver Inexperience',
    '1, Unsafe Speed': 'Unspecified, Unsafe Speed',
    '1, 1': 'Unspecified, Unspecified',
    '1, Passing or Lane Usage Improper': 'Unspecified, Passing or Lane Usage Improper',
    # --- code '80' ---
    '80, Unspecified': 'Unspecified, Unspecified',
    '80, 80': 'Unspecified, Unspecified',
    '80, Oversized Vehicle': 'Unspecified, Oversized Vehicle',
    '80, Fell Asleep': 'Unspecified, Fell Asleep',
    '80, Aggressive Driving/Road Rage': 'Unspecified, Aggressive Driving/Road Rage',
    '80, Traffic Control Disregarded': 'Unspecified, Traffic Control Disregarded',
    '80, Pavement Slippery': 'Unspecified, Pavement Slippery',
    '80, Tire Failure/Inadequate': 'Unspecified, Tire Failure/Inadequate',
    # --- 'Illnes' as the second factor ---
    'Alcohol Involvement, Illnes': 'Alcohol Involvement, illness',
    'Animals Action, Illnes': 'Animals Action, illness',
    'Driver Inattention/Distraction, Illnes': 'Driver Inattention/Distraction, illness',
    'Driver Inexperience, Illnes': 'Driver Inexperience, illness',
    'Failure to Yield Right-of-Way, Illnes': 'Failure to Yield Right-of-Way, illness',
    'Following Too Closely, Illnes': 'Following Too Closely, illness',
    # --- 'Illnes' as the first factor ---
    'Illnes, Unspecified': 'illness, Unspecified',
    'Illnes, Driver Inattention/Distraction': 'illness, Driver Inattention/Distraction',
    'Illnes, Unsafe Lane Changing': 'illness, Unsafe Lane Changing',
    'Illnes, Other Vehicular': 'illness, Other Vehicular',
    'Illnes, Backing Unsafely': 'illness, Backing Unsafely',
    # Fix: the second factor used to stay misspelled ('illness, Illnes').
    'Illnes, Illnes': 'illness, illness',
    'Illnes, Driver Inexperience': 'illness, Driver Inexperience',
    'Illnes, Fatigued/Drowsy': 'illness, Fatigued/Drowsy',
    'Illnes, Alcohol Involvement': 'illness, Alcohol Involvement',
    'Illnes, Following Too Closely': 'illness, Following Too Closely',
    'Illnes, View Obstructed/Limited': 'illness, View Obstructed/Limited',
    'Illnes, Pavement Slippery': 'illness, Pavement Slippery',
    'Illnes, Passing or Lane Usage Improper': 'illness, Passing or Lane Usage Improper',
    # --- 'Illness' as the first factor (case normalisation only) ---
    'Illness, Unspecified': 'illness, Unspecified',
    'Illness, Illness': 'illness, Illness',
    'Illness, Failure to Yield Right-of-Way': 'illness, Failure to Yield Right-of-Way',
    'Illness, Driver Inattention/Distraction': 'illness, Driver Inattention/Distraction',
    'Illness, Passenger Distraction': 'illness, Passenger Distraction',
    'Illness, Physical Disability': 'illness, Physical Disability',
    'Illness, Other Vehicular': 'illness, Other Vehicular',
    'Illness, Traffic Control Disregarded': 'illness, Traffic Control Disregarded',
    'Illness, Lost Consciousness': 'illness, Lost Consciousness',
    'Illness, Fatigued/Drowsy': 'illness, Fatigued/Drowsy',
    'Illness, Oversized Vehicle': 'illness, Oversized Vehicle',
    'Illness, Backing Unsafely': 'illness, Backing Unsafely',
    'Illness, Outside Car Distraction': 'illness, Outside Car Distraction',
    'Illness, Driver Inexperience': 'illness, Driver Inexperience',
    'Illness, Prescription Medication': 'illness, Prescription Medication',
    'Illness, Glare': 'illness, Glare',
    'Illness, Unsafe Lane Changing': 'illness, Unsafe Lane Changing',
    'Illness, Turning Improperly': 'illness, Turning Improperly',
    'Illness, Other Electronic Device': 'illness, Other Electronic Device',
    'Illness, View Obstructed/Limited': 'illness, View Obstructed/Limited',
    'Illness, Lane Marking Improper/Inadequate': 'illness, Lane Marking Improper/Inadequate',
    'Illness, Aggressive Driving/Road Rage': 'illness, Aggressive Driving/Road Rage',
    'Illness, Failure to Keep Right': 'illness, Failure to Keep Right',
    'Illness, Alcohol Involvement': 'illness, Alcohol Involvement',
    'Illness, Pavement Slippery': 'illness, Pavement Slippery',
    # --- remaining 'Illnes' / code cases as the second factor ---
    'Other Vehicular, Illnes': 'Other Vehicular, illness',
    'Passing or Lane Usage Improper, Illnes': 'Passing or Lane Usage Improper, illness',
    'Passing Too Closely, Illnes': 'Passing Too Closely, illness',
    'Traffic Control Disregarded, Illnes': 'Traffic Control Disregarded, illness',
    'Turning Improperly, Illnes': 'Turning Improperly, illness',
    'Aggressive Driving/Road Rage, 1': 'Aggressive Driving/Road Rage, Unspecified',
    'Other Vehicular, 80': 'Other Vehicular, Unspecified',
}

# Apply the mapping to the combined series (whole-value replacement: only pairs
# listed above are changed).
combined_factors = combined_factors.replace(factor_mapping)

# Split the cleaned pairs back into the two factor columns.
# n=1 splits on the first ', ' only, so the result always has exactly two
# columns even if a factor label itself contained ', '.
factor_columns = ['CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2']
nyc_data[factor_columns] = combined_factors.str.split(', ', n=1, expand=True)

# Normalise whitespace and case so the labels match the lower-case category lists.
for factor_column in factor_columns:
    nyc_data[factor_column] = nyc_data[factor_column].str.strip().str.lower()


In [0]:
# Define categories (labels are lower-case, matching the cleaned factor columns)
vehicular_factors = [
    'steering failure', 'oversized vehicle', 'view obstructed/limited', 'other vehicular',
    'brakes defective', 'tinted windows', 'tire failure/inadequate', 'accelerator defective',
    'driverless/runaway vehicle', 'headlights defective', 'windshield inadequate',
    'vehicle vandalism', 'other electronic device', 'tow hitch defective',
    'other lighting defects',
]
human_factors = [
    'driver inexperience', 'passing too closely', 'turning improperly',
    'reaction to uninvolved vehicle', 'following too closely',
    'passing or lane usage improper', 'driver inattention/distraction',
    'unsafe lane changing', 'alcohol involvement', 'traffic control disregarded',
    'failure to yield right-of-way', 'aggressive driving/road rage', 'unsafe speed',
    'illness', 'lost consciousness', 'backing unsafely', 'passenger distraction',
    'fell asleep', 'pedestrian/bicyclist/other pedestrian error/confusion',
    'drugs (illegal)', 'outside car distraction', 'fatigued/drowsy',
    'physical disability', 'glare', 'eating or drinking', 'failure to keep right',
    'cell phone (hands-free)', 'cell phone (hand-held)',
    'using on board navigation device', 'prescription medication', 'texting',
    'shoulders defective/improper', 'listening/using headphones',
    'illnes',  # misspelled variant of 'illness', kept so it is still categorised
]
# NOTE(review): 'reaction to other uninvolved vehicle' is Environmental here while
# 'reaction to uninvolved vehicle' is Human above - confirm this split is intended.
environmental_factors = [
    'pavement slippery', 'obstruction/debris', 'animals action', 'pavement defective',
    'lane marking improper/inadequate', 'traffic control device improper/non-working',
    'reaction to other uninvolved vehicle',
]

# Single label -> category lookup built once from the three lists, replacing a
# linear scan of up to three lists for every row. Later entries override earlier
# ones, so the precedence is Vehicular > Human > Environmental, the same order
# in which the lists used to be checked.
_FACTOR_CATEGORY = {
    **dict.fromkeys(environmental_factors, 'Environmental'),
    **dict.fromkeys(human_factors, 'Human'),
    **dict.fromkeys(vehicular_factors, 'Vehicular'),
}


# Categorization function
def categorize_factor(factor):
    """Return the category of a contributing-factor label.

    Parameters
    ----------
    factor : object
        A factor label. Matching is exact, so the label must already be
        stripped and lower-cased.

    Returns
    -------
    str
        'Vehicular', 'Human' or 'Environmental' when the label is in the
        corresponding list, otherwise 'Unknown' (this includes 'unspecified',
        None and NaN).
    """
    try:
        return _FACTOR_CATEGORY.get(factor, 'Unknown')
    except TypeError:
        # Unhashable values (e.g. a list) cannot be a known label.
        return 'Unknown'

# Categorise the contributing factor of each vehicle.
category_source = {
    'FACTOR VEHICLE 1 CATEGORY': 'CONTRIBUTING FACTOR VEHICLE 1',
    'FACTOR VEHICLE 2 CATEGORY': 'CONTRIBUTING FACTOR VEHICLE 2',
}
for category_column, factor_column in category_source.items():
    nyc_data[category_column] = nyc_data[factor_column].map(categorize_factor)

# Pair label "<category 1>, <category 2>" for every collision.
first_category = nyc_data['FACTOR VEHICLE 1 CATEGORY'].astype(str)
second_category = nyc_data['FACTOR VEHICLE 2 CATEGORY'].astype(str)
nyc_data['COMBINED FACTOR CATEGORY'] = first_category + ', ' + second_category

# Number of collisions per category pair, most frequent first.
category_counts = nyc_data['COMBINED FACTOR CATEGORY'].value_counts()

# Print the results
print(category_counts)

In [0]:
# For each factor category, add a 0/1 column `any_factor_<Category>` that is 1
# when vehicle 1 or vehicle 2 has a contributing factor of that category.
vehicle_categories = nyc_data[['FACTOR VEHICLE 1 CATEGORY', 'FACTOR VEHICLE 2 CATEGORY']]

for category in ('Environmental', 'Human', 'Vehicular'):
    either_vehicle_matches = vehicle_categories.eq(category).any(axis=1)
    nyc_data[f'any_factor_{category}'] = either_vehicle_matches.astype(int)


In [0]:
# NOTE: the filter that removed Year == 2024 is disabled, so 2024 is included.

# Totals per year: number of collisions, and number of collisions flagged as
# injury / death (sum of the 0/1 flags).
yearly_totals = nyc_data.groupby('Year').agg(
    collisions=('COLLISION_ID', 'count'),
    injuries=('injured_flag', 'sum'),
    deaths=('killed_flag', 'sum'),
)

# Year-over-year change in percent for each total. The first year has no
# predecessor, so its YoY values are NaN (that row is deliberately kept).
yoy_source = {
    'Collision_YoY': 'collisions',
    'Injuries_YoY': 'injuries',
    'Deaths_YoY': 'deaths',
}
for yoy_column, total_column in yoy_source.items():
    yearly_totals[yoy_column] = yearly_totals[total_column].pct_change().mul(100)

# Sort by year (ascending) and show the table.
yearly_totals_sorted = yearly_totals.sort_values(by='Year', ascending=True)

print(yearly_totals_sorted)

## Visualization

In [0]:
def _plot_yearly_bar(yearly_counts, color, title, ylabel, y_min, y_max, y_step):
    """Draw one annotated bar chart of yearly counts.

    The y-axis is fixed to [y_min, y_max] with a tick every y_step, and every
    bar is labelled with its height.
    """
    plt.figure(figsize=(10, 6))
    bars = yearly_counts.plot(kind='bar', color=color)
    plt.title(title)
    plt.xlabel('Year')
    plt.ylabel(ylabel)
    plt.xticks(rotation=0)
    plt.grid(axis='y')

    # Fixed y-axis range and grid lines
    plt.ylim(y_min, y_max)
    plt.yticks(range(y_min, y_max + 1, y_step))

    # Value label at the top of each bar, shifted up by 5 points
    for bar in bars.patches:
        bar_top = bar.get_height()
        bar_centre = bar.get_x() + bar.get_width() / 2
        plt.annotate(format(bar_top, '.0f'),
                     (bar_centre, bar_top),
                     ha='center', va='center',
                     xytext=(0, 5), textcoords='offset points')

    plt.tight_layout()
    plt.show()


# Restrict to the years 2013 to 2023 (both inclusive)
crash_year = nyc_data['CRASH DATE'].dt.year
filtered_data = nyc_data[crash_year.between(2013, 2023)]

# Collisions flagged as injury / death per year (sum of the 0/1 flags)
by_year = filtered_data.groupby(filtered_data['CRASH DATE'].dt.year)
injuries_by_year = by_year['injured_flag'].sum()
deaths_by_year = by_year['killed_flag'].sum()

# Plotting injuries (y-axis 20000-32000, tick every 2000)
_plot_yearly_bar(injuries_by_year, 'skyblue', 'Yearly Count of Injuries (2013-2023)',
                 'Number of Injuries', 20000, 32000, 2000)

# Plotting deaths (y-axis 110-210, tick every 10)
_plot_yearly_bar(deaths_by_year, 'salmon', 'Yearly Count of Deaths (2013-2023)',
                 'Number of Deaths', 110, 210, 10)

In [0]:
def _plot_monthly_line(monthly_counts, color, title, ylabel):
    """Draw one line chart of monthly counts with the value printed at each point."""
    plt.figure(figsize=(14, 7))
    monthly_counts.plot(kind='line', color=color, marker='o')
    plt.title(title)
    plt.xlabel('Year-Month')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.xticks(rotation=0)
    plt.tight_layout()

    # Value label next to every point of the line
    for period, value in monthly_counts.items():
        plt.text(period.to_timestamp(), value, f'{value:.0f}', ha='right', va='bottom', fontsize=8)

    plt.show()


# Restrict to the years 2013 to 2023 (both inclusive) and add a year-month
# period column. assign() builds a new DataFrame, so the new column is not
# written into a filtered slice of `nyc_data` (no SettingWithCopyWarning).
crash_year = nyc_data['CRASH DATE'].dt.year
filtered_data = nyc_data[(crash_year >= 2013) & (crash_year <= 2023)].assign(
    YearMonth=lambda frame: frame['CRASH DATE'].dt.to_period('M')
)

# Collisions flagged as injury / death per month (sum of the 0/1 flags)
monthly_injuries = filtered_data.groupby('YearMonth')['injured_flag'].sum()
monthly_deaths = filtered_data.groupby('YearMonth')['killed_flag'].sum()

# Plotting monthly injuries
_plot_monthly_line(monthly_injuries, 'skyblue',
                   'Monthly Count of Injuries (Jan 2013 - Dec 2023)', 'Number of Injuries')

# Plotting monthly deaths
_plot_monthly_line(monthly_deaths, 'salmon',
                   'Monthly Count of Deaths (Jan 2013 - Dec 2023)', 'Number of Deaths')

In [0]:
# Injury / death flag totals per (Year, BOROUGH)
grouped_data = (
    nyc_data
    .groupby(['Year', 'BOROUGH'])
    .agg({'injured_flag': 'sum', 'killed_flag': 'sum'})
    .reset_index()
)

# One line per borough showing its yearly injury count
fig, ax = plt.subplots(figsize=(10, 6))

# sort=False keeps the boroughs in order of first appearance in grouped_data
for borough, borough_data in grouped_data.groupby('BOROUGH', sort=False):
    ax.plot(borough_data['Year'], borough_data['injured_flag'], label=f'{borough} Injuries', marker='o')

ax.set_title('Annual Injuries by Borough')
ax.set_xlabel('Year')
ax.set_ylabel('Count')
ax.legend()
ax.grid(True)
ax.set_xticks(grouped_data['Year'].unique())  # one tick per year
fig.tight_layout()
plt.show()

In [0]:
# Group data by Year and Borough and calculate sums of the 0/1 flags
grouped_data = nyc_data.groupby(['Year', 'BOROUGH']).agg({'injured_flag': 'sum', 'killed_flag': 'sum'}).reset_index()

# Plot line chart: one line per borough showing its yearly death count
plt.figure(figsize=(10, 6))

# Iterate over unique boroughs
for borough in grouped_data['BOROUGH'].unique():
    borough_data = grouped_data[grouped_data['BOROUGH'] == borough]
    # Fix: the legend entry used to read '<borough> Injuries' although the
    # plotted series is killed_flag.
    plt.plot(borough_data['Year'], borough_data['killed_flag'], label=f'{borough} Deaths', marker='o')

plt.title('Annual Deaths by Borough')
plt.xlabel('Year')
plt.ylabel('Count')
plt.legend()
plt.grid(True)
plt.xticks(grouped_data['Year'].unique())  # Set x-axis ticks to years
plt.tight_layout()
plt.show()

In [0]:
# Rows where the respective flag is set
killed_data = nyc_data.loc[nyc_data['killed_flag'] == 1]
injured_data = nyc_data.loc[nyc_data['injured_flag'] == 1]

# Number of flagged collisions per (day of week, hour)
day_hour = ['day_of_week', 'hour']
grouped_killed = killed_data.groupby(day_hour).size().reset_index(name='count')
grouped_injured = injured_data.groupby(day_hour).size().reset_index(name='count')

# Shared treemap settings: day of week as the outer level, hour as the inner
# level, tile size and colour both driven by the count.
treemap_kwargs = dict(path=['day_of_week', 'hour'], values='count', color='count')
treemap_margin = dict(t=50, l=25, r=25, b=25)

# Treemap for collisions resulting in death
fig_killed = px.treemap(
    grouped_killed,
    color_continuous_scale='Reds',
    title='Treemap of Collisions Resulting in Death by Day of Week and Hour',
    **treemap_kwargs,
)
fig_killed.update_layout(margin=treemap_margin)

# Treemap for collisions resulting in injury
fig_injured = px.treemap(
    grouped_injured,
    color_continuous_scale='Blues',
    title='Treemap of Collisions Resulting in Injury by Day of Week and Hour',
    **treemap_kwargs,
)
fig_injured.update_layout(margin=treemap_margin)

# Show the treemaps
fig_killed.show()
fig_injured.show()

In [0]:
# Fixed axis order: Monday..Sunday and hours 0..23
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
hours = range(24)


def _hour_by_day_counts(flag_column):
    """Count collisions with `flag_column` == 1 per (hour, day of week).

    The result has one row per hour in `hours` and one column per day in
    `days_order`; combinations without any collision are 0.
    """
    flagged = nyc_data[nyc_data[flag_column] == 1]
    counts = flagged.pivot_table(
        index='hour', columns='day_of_week', values='COLLISION_ID', aggfunc='count', fill_value=0
    )
    return counts.reindex(index=hours, columns=days_order, fill_value=0)


killed_pivot = _hour_by_day_counts('killed_flag')
injured_pivot = _hour_by_day_counts('injured_flag')

# One heatmap per outcome; the pivot is transposed so days are rows and hours
# are columns.
heatmap_specs = (
    (killed_pivot, 'Reds', 'Heatmap of Collisions Resulting in Death by Hour and Day of Week'),
    (injured_pivot, 'Blues', 'Heatmap of Collisions Resulting in Injury by Hour and Day of Week'),
)
for pivot, cmap, title in heatmap_specs:
    plt.figure(figsize=(14, 7))
    ax = sns.heatmap(pivot.T, cmap=cmap, annot=True, fmt='d', linewidths=.5, cbar_kws={'label': 'Number of Collisions'})
    ax.set_title(title)
    ax.set_xlabel('Hour of Day')
    ax.set_ylabel('Day of Week')
    # Keep the tick labels horizontal on both axes
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
    ax.set_yticklabels(ax.get_yticklabels(), rotation=0)
    plt.show()

In [0]:
# Fixed axis order for rows (time of day) and columns (day of week)
time_category_order = ['Morning', 'Afternoon', 'Evening', 'Night']
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']


def _time_by_day_totals(flag_column):
    """Sum the 0/1 `flag_column` per (time category, day of week).

    pivot_table orders its columns alphabetically (Friday, Monday, ...), so
    both axes are reindexed into the fixed order above. Previously only the
    tick labels were replaced by `days_order` while the data stayed in
    alphabetical order, which put the wrong day name over every column.
    Missing combinations are filled with 0 so the integer annotation format
    keeps working.
    """
    totals = nyc_data.pivot_table(
        index='time_category', columns='day_of_week', values=flag_column, aggfunc='sum', fill_value=0
    )
    return totals.reindex(index=time_category_order, columns=days_order, fill_value=0)


heatmap_data_killed = _time_by_day_totals('killed_flag')
heatmap_data_injured = _time_by_day_totals('injured_flag')

# Plot the heatmap for killed_flag (tick labels come from the reordered columns)
plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data_killed, cmap='Reds', annot=True, fmt='d', linewidths=.5)
plt.title('Heatmap of Collisions Resulting in Death by Time of Day and Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Time of Day')
plt.show()

# Plot the heatmap for injured_flag
plt.figure(figsize=(10, 6))
sns.heatmap(heatmap_data_injured, cmap='Blues', annot=True, fmt='d', linewidths=.5)
plt.title('Heatmap of Collisions Resulting in Injury by Time of Day and Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Time of Day')
plt.show()

In [0]:
# Sum of each 0/1 flag per (vehicle 1 factor category, vehicle 2 factor category)
killed_pivot, injured_pivot = (
    nyc_data.pivot_table(
        index='FACTOR VEHICLE 1 CATEGORY',
        columns='FACTOR VEHICLE 2 CATEGORY',
        values=flag_column,
        aggfunc='sum',
        fill_value=0,
    )
    for flag_column in ('killed_flag', 'injured_flag')
)

# Two heatmaps side by side: deaths on the left, injuries on the right
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

panels = (
    (axes[0], killed_pivot, "Reds", 'Heatmap of Killed Flag Counts'),
    (axes[1], injured_pivot, "Blues", 'Heatmap of Injured Flag Counts'),
)
for ax, pivot, cmap, title in panels:
    sns.heatmap(pivot, ax=ax, annot=True, fmt="d", cmap=cmap, cbar=False, annot_kws={"size": 18}, linewidths=.5)
    ax.set_title(title, fontsize=24)
    ax.set_xlabel('Vehicle 2 Factor Category', fontsize=20)
    ax.set_ylabel('Vehicle 1 Factor Category', fontsize=20)
    ax.tick_params(axis='both', which='major', labelsize=22)

plt.tight_layout()
plt.show()

In [0]:
# Sum of each 0/1 flag per (weekend flag, borough)
killed_pivot, injured_pivot = (
    nyc_data.pivot_table(
        index='weekend',
        columns='BOROUGH',
        values=flag_column,
        aggfunc='sum',
        fill_value=0,
    )
    for flag_column in ('killed_flag', 'injured_flag')
)

# Two heatmaps side by side: deaths on the left, injuries on the right
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

panels = (
    (axes[0], killed_pivot, "Reds", 'Heatmap of Killed Flag Counts'),
    (axes[1], injured_pivot, "Blues", 'Heatmap of Injured Flag Counts'),
)
for ax, pivot, cmap, title in panels:
    sns.heatmap(pivot, ax=ax, annot=True, fmt="d", cmap=cmap)
    ax.set_title(title)
    ax.set_xlabel('BOROUGH')
    ax.set_ylabel('Weekend Flag')

plt.tight_layout()
plt.show()

In [0]:
# Row order for the time-of-day axis
time_category_order = ['Morning', 'Afternoon', 'Evening', 'Night']

# Sum of each 0/1 flag per (time category, borough), rows put into the order above
killed_pivot, injured_pivot = (
    nyc_data.pivot_table(
        index='time_category',
        columns='BOROUGH',
        values=flag_column,
        aggfunc='sum',
        fill_value=0,
    ).reindex(time_category_order)
    for flag_column in ('killed_flag', 'injured_flag')
)

# Two heatmaps side by side: deaths on the left, injuries on the right
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

panels = (
    (axes[0], killed_pivot, "Reds", 'Heatmap of Killed Flag Counts'),
    (axes[1], injured_pivot, "Blues", 'Heatmap of Injured Flag Counts'),
)
for ax, pivot, cmap, title in panels:
    sns.heatmap(pivot, ax=ax, annot=True, fmt="d", cmap=cmap, cbar=False, annot_kws={"size": 12}, linewidths=.5)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('BOROUGH', fontsize=14)
    ax.set_ylabel('Time Category', fontsize=14)
    ax.tick_params(axis='both', which='major', labelsize=14)

plt.tight_layout()
plt.show()

In [0]:
# Row order for the day-of-week axis
day_of_week_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Sum of each 0/1 flag per (day of week, borough), rows put into the order above
killed_pivot, injured_pivot = (
    nyc_data.pivot_table(
        index='day_of_week',
        columns='BOROUGH',
        values=flag_column,
        aggfunc='sum',
        fill_value=0,
    ).reindex(day_of_week_order)
    for flag_column in ('killed_flag', 'injured_flag')
)

# Two heatmaps side by side: deaths on the left, injuries on the right
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

panels = (
    (axes[0], killed_pivot, "Reds", 'Heatmap of Killed Flag Counts'),
    (axes[1], injured_pivot, "Blues", 'Heatmap of Injured Flag Counts'),
)
for ax, pivot, cmap, title in panels:
    sns.heatmap(pivot, ax=ax, annot=True, fmt="d", cmap=cmap, cbar=False, annot_kws={"size": 12}, linewidths=.5)
    ax.set_title(title, fontsize=16)
    ax.set_xlabel('BOROUGH', fontsize=14)
    ax.set_ylabel('Day of Week', fontsize=14)
    ax.tick_params(axis='both', which='major', labelsize=14)

plt.tight_layout()
plt.show()

## Analyzing dependent and independent variables

In [0]:
# Overall event rates: how many collisions are flagged as injury / death, as a
# count and as a percentage of all records.
total_count = len(nyc_data)

# Injured: count of rows with injured_flag == 1 and its share in percent
injured_event_count = int((nyc_data['injured_flag'] == 1).sum())
injured_event_percent = injured_event_count * 100 / total_count

# Killed: count of rows with killed_flag == 1 and its share in percent
killed_event_count = int((nyc_data['killed_flag'] == 1).sum())
killed_event_percent = killed_event_count * 100 / total_count

# One-row summary table
summary_table = pd.DataFrame(
    [[total_count, injured_event_count, injured_event_percent, killed_event_count, killed_event_percent]],
    columns=['Total Count', 'Injured Count', 'Injured Percent', 'Killed Count', 'Killed Percent'],
)
summary_table

In [0]:
# Group by 'time_category'
grouped_data = nyc_data.groupby('time_category')

# Number of records in each time category
total_count = grouped_data['COLLISION_ID'].count()

# Injured count (sum of the 0/1 flag) and percent for each time category
injured_event_counts = grouped_data['injured_flag'].sum()
injured_event_percentages = injured_event_counts * 100 / total_count

# Killed count (sum of the 0/1 flag) and percent for each time category
killed_event_counts = grouped_data['killed_flag'].sum()
killed_event_percentages = killed_event_counts * 100 / total_count

# Combine the results into a single DataFrame.
# All five Series share the time-category index, so the category label is taken
# from that index (renamed and turned into the first column). This replaces the
# positional list built from `grouped_data.groups.keys()`, which materialised
# the row labels of every group just to obtain the category names, and avoids
# the in-place index reset.
summary_table = (
    pd.DataFrame({
        'Total Count': total_count,
        'Injured Count': injured_event_counts,
        'Injured Percent': injured_event_percentages,
        'Killed Count': killed_event_counts,
        'Killed Percent': killed_event_percentages,
    })
    .rename_axis('Time Category')
    .reset_index()
)
summary_table

In [0]:
# Number of collisions for every observed (killed_flag, injured_flag) combination.
grouped_counts = nyc_data.groupby(['killed_flag', 'injured_flag']).size().reset_index(name='count')

# Print the grouped counts
print(grouped_counts)

In [0]:
# any_casualty_flag: 1 when the collision is flagged as death or injury (or
# both), otherwise 0.
outcome_flags = nyc_data[['killed_flag', 'injured_flag']]
nyc_data['any_casualty_flag'] = outcome_flags.eq(1).any(axis=1).astype(int)


In [0]:
# Cross-check of the new flag: number of collisions for every observed
# (killed_flag, injured_flag, any_casualty_flag) combination.
grouped_counts = nyc_data.groupby(['killed_flag', 'injured_flag','any_casualty_flag']).size().reset_index(name='count')

# Print the grouped counts
print(grouped_counts)

# Data Prep for Logistic Regression Model

In [0]:
# Inspect the first rows of the prepared dataset before modelling.
nyc_data.head()

In [0]:
# Earliest crash date in the dataset.
nyc_data['CRASH DATE'].min()

In [0]:
# Latest crash date in the dataset.
nyc_data['CRASH DATE'].max()

In [0]:
# Categorical columns to one-hot encode, mapped to the prefix of their dummy
# columns (dummy names are "<prefix>_<level>").
dummy_prefixes = {
    'BOROUGH': 'BOROUGH',
    'CONTRIBUTING FACTOR VEHICLE 1': 'FACTOR1',
    'CONTRIBUTING FACTOR VEHICLE 2': 'FACTOR2',
    'day_of_week': 'DAY',
    'time_category': 'TIME',
    'FACTOR VEHICLE 1 CATEGORY': 'FACTORCAT1',
    'FACTOR VEHICLE 2 CATEGORY': 'FACTORCAT2',
    'Year': 'YEAR',
    'Month': 'Month',
}

# Encode all of them in one call: the listed columns are removed and their
# dummy columns are appended after the remaining columns, in the order above.
nyc_data_with_dummies = pd.get_dummies(
    nyc_data,
    columns=list(dummy_prefixes),
    prefix=dummy_prefixes,
)

# Drop one dummy per encoded column; that level becomes the reference category
# and the dummy variable trap (perfect multicollinearity) is avoided.
reference_levels = [
    'BOROUGH_BRONX',
    'FACTOR1_accelerator defective',
    'FACTOR2_accelerator defective',
    'DAY_Monday',
    'TIME_Morning',
    'FACTORCAT1_Unknown',
    'FACTORCAT2_Unknown',
    'YEAR_2012',
    'Month_1',
]
nyc_data_with_dummies = nyc_data_with_dummies.drop(columns=reference_levels)

nyc_data_with_dummies.dtypes

## Isolate X & Y

In [0]:
# Dependent variable for all models: the any_casualty_flag column as a NumPy array.
Y = nyc_data_with_dummies['any_casualty_flag'].to_numpy()

In [0]:
# Model 1: weekday dummies only (DAY_Monday was dropped earlier as the reference level)
predictors_weekday = [
    'DAY_Friday', 'DAY_Saturday', 'DAY_Sunday',
    'DAY_Thursday', 'DAY_Tuesday', 'DAY_Wednesday',
]

# Design matrix: dummy values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_weekday].values.astype(int))

# Fit the logistic regression
model_weekday = sm.Logit(Y, X).fit()

# Summary table labelled with the predictor names
model_weekday.summary(
    yname='any_casualty_flag',
    xname=('constant', *predictors_weekday),
    title='Impact of Weekday on Casualty - model Weekday',
)

In [0]:
# Model 2: time-of-day dummies only (TIME_Morning was dropped earlier as the reference level)
predictors_time = ['TIME_Afternoon', 'TIME_Evening', 'TIME_Night']

# Design matrix: dummy values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_time].values.astype(int))

# Fit the logistic regression
model_time_cat = sm.Logit(Y, X).fit()

# Summary table labelled with the predictor names
model_time_cat.summary(
    yname='any_casualty_flag',
    xname=('constant', *predictors_time),
    title='Model Time Category',
)

In [0]:
# Model 3: vehicle-1 factor category dummies (FACTORCAT1_Unknown was dropped earlier as the reference level)
predictors_factor1_cat = [
    'FACTORCAT1_Environmental', 'FACTORCAT1_Human', 'FACTORCAT1_Vehicular',
]

# Design matrix: dummy values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_factor1_cat].values.astype(int))

# Fit the logistic regression
model_factor1_cat = sm.Logit(Y, X).fit()

# Summary table labelled with the predictor names
model_factor1_cat.summary(
    yname='any_casualty_flag',
    xname=('constant', *predictors_factor1_cat),
    title='Model Factor1 Category',
)

In [0]:
# Model 4: vehicle-2 factor category dummies (FACTORCAT2_Unknown was dropped earlier as the reference level)
predictors_factor2_cat = [
    'FACTORCAT2_Environmental', 'FACTORCAT2_Human', 'FACTORCAT2_Vehicular',
]

# Design matrix: dummy values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_factor2_cat].values.astype(int))

# Fit the logistic regression
model_factor2_cat = sm.Logit(Y, X).fit()

# Summary table labelled with the predictor names
model_factor2_cat.summary(
    yname='any_casualty_flag',
    xname=('constant', *predictors_factor2_cat),
    title='Model Factor2 Category',
)

In [0]:
# Model 5: factor category dummies for both vehicle 1 and vehicle 2
predictors_factor12_cat = [
    'FACTORCAT1_Environmental', 'FACTORCAT1_Human', 'FACTORCAT1_Vehicular',
    'FACTORCAT2_Environmental', 'FACTORCAT2_Human', 'FACTORCAT2_Vehicular',
]

# Design matrix: dummy values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_factor12_cat].values.astype(int))

# Fit the logistic regression
model_factor12_cat = sm.Logit(Y, X).fit()

# Summary table labelled with the predictor names
model_factor12_cat.summary(
    yname='any_casualty_flag',
    xname=('constant', *predictors_factor12_cat),
    title='Model Factor1&2 Category',
)

In [0]:
# Model 6: borough dummies plus a subset of the factor category dummies
predictors_bfactor12 = [
    'BOROUGH_BROOKLYN', 'BOROUGH_MANHATTAN', 'BOROUGH_QUEENS', 'BOROUGH_STATEN ISLAND',
    'FACTORCAT1_Environmental', 'FACTORCAT1_Human',
    'FACTORCAT2_Human', 'FACTORCAT2_Vehicular',
]

# Design matrix: dummy values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_bfactor12].values.astype(int))

# Fit the logistic regression
model_Bfactor12_cat = sm.Logit(Y, X).fit()

# Summary table labelled with the predictor names
model_Bfactor12_cat.summary(
    yname='any_casualty_flag',
    xname=('constant', *predictors_bfactor12),
    title='Model Borough, Factor1&2 Category',
)

In [0]:
# Model 7: time-of-day and borough dummies plus a subset of the factor category dummies
predictors_tbfactor12 = [
    'TIME_Afternoon', 'TIME_Evening', 'TIME_Night',
    'BOROUGH_BROOKLYN', 'BOROUGH_MANHATTAN', 'BOROUGH_QUEENS', 'BOROUGH_STATEN ISLAND',
    'FACTORCAT1_Environmental', 'FACTORCAT1_Human',
    'FACTORCAT2_Human', 'FACTORCAT2_Vehicular',
]

# Design matrix: dummy values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_tbfactor12].values.astype(int))

# Fit the logistic regression
model_TBfactor12_cat = sm.Logit(Y, X).fit()

# Summary table labelled with the predictor names
model_TBfactor12_cat.summary(
    yname='any_casualty_flag',
    xname=('constant', *predictors_tbfactor12),
    title='Model Time Categories, Borough, Factor1&2 Category',
)

In [0]:
# Model 8: two weekday dummies, time-of-day and borough dummies,
# plus a subset of the factor category dummies
predictors_dtbfactor12 = [
    'DAY_Saturday', 'DAY_Wednesday',
    'TIME_Afternoon', 'TIME_Evening', 'TIME_Night',
    'BOROUGH_BROOKLYN', 'BOROUGH_MANHATTAN', 'BOROUGH_QUEENS', 'BOROUGH_STATEN ISLAND',
    'FACTORCAT1_Environmental', 'FACTORCAT1_Human',
    'FACTORCAT2_Human', 'FACTORCAT2_Vehicular',
]

# Design matrix: dummy values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_dtbfactor12].values.astype(int))

# Fit the logistic regression
model_DTBfactor12_cat = sm.Logit(Y, X).fit()

# Summary table labelled with the predictor names
model_DTBfactor12_cat.summary(
    yname='any_casualty_flag',
    xname=('constant', *predictors_dtbfactor12),
    title='Model Day, Time Categories, Borough, Factor1&2 Category',
)

In [0]:
# Model 9: the weekend column added to the weekday, time-of-day, borough
# and factor category predictors
predictors_wdtbfactor12 = [
    'weekend',
    'DAY_Saturday', 'DAY_Wednesday',
    'TIME_Afternoon', 'TIME_Evening', 'TIME_Night',
    'BOROUGH_BROOKLYN', 'BOROUGH_MANHATTAN', 'BOROUGH_QUEENS', 'BOROUGH_STATEN ISLAND',
    'FACTORCAT1_Environmental', 'FACTORCAT1_Human',
    'FACTORCAT2_Human', 'FACTORCAT2_Vehicular',
]

# Design matrix: column values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_wdtbfactor12].values.astype(int))

# Fit the logistic regression
model_WDTBfactor12_cat = sm.Logit(Y, X).fit()

# Summary table labelled with the predictor names
model_WDTBfactor12_cat.summary(
    yname='any_casualty_flag',
    xname=('constant', *predictors_wdtbfactor12),
    title='Model Weekend, Day, Time Categories, Borough, Factor1&2 Category',
)

In [0]:
# Model 10: any_factor_Human together with weekday, time-of-day, borough
# and two factor category dummies
predictors_adtbfactor12 = [
    'any_factor_Human',
    'DAY_Saturday', 'DAY_Wednesday',
    'TIME_Afternoon', 'TIME_Evening', 'TIME_Night',
    'BOROUGH_BROOKLYN', 'BOROUGH_MANHATTAN', 'BOROUGH_QUEENS', 'BOROUGH_STATEN ISLAND',
    'FACTORCAT1_Environmental', 'FACTORCAT2_Vehicular',
]

# Design matrix: column values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_adtbfactor12].values.astype(int))

# Fit the logistic regression
model_ADTBfactor12_cat = sm.Logit(Y, X).fit()

# Summary table labelled with the predictor names
model_ADTBfactor12_cat.summary(
    yname='any_casualty_flag',
    xname=('constant', *predictors_adtbfactor12),
    title='Model Any Factor, Day, Time Categories, Borough, Factor1&2 Category',
)

In [0]:
# Reference listing of candidate model columns.
# NOTE(review): this looks like a pasted copy of the nyc_data_with_dummies column
# names — confirm. Two names had been split across physical lines inside their quotes
# ('FACTOR2_aggressive driving/road rage' and 'FACTOR2_windshield inadequate'), which
# made the cell a SyntaxError; they are rejoined here.
# 'FACTOR2_illnes' is kept as listed: it is a distinct, misspelled level next to
# 'FACTOR2_illness'.
candidate_feature_columns = (
    # General predictors and the response flag
    'hour', 'weekend',
    'any_factor_Environmental', 'any_factor_Human', 'any_factor_Vehicular',
    'any_casualty_flag',
    # Borough dummies
    'BOROUGH_BROOKLYN', 'BOROUGH_MANHATTAN', 'BOROUGH_QUEENS', 'BOROUGH_STATEN ISLAND',
    # Contributing factor dummies, vehicle 1
    'FACTOR1_aggressive driving/road rage',
    'FACTOR1_alcohol involvement',
    'FACTOR1_animals action',
    'FACTOR1_backing unsafely',
    'FACTOR1_brakes defective',
    'FACTOR1_cell phone (hand-held)',
    'FACTOR1_cell phone (hands-free)',
    'FACTOR1_driver inattention/distraction',
    'FACTOR1_driver inexperience',
    'FACTOR1_driverless/runaway vehicle',
    'FACTOR1_drugs (illegal)',
    'FACTOR1_eating or drinking',
    'FACTOR1_failure to keep right',
    'FACTOR1_failure to yield right-of-way',
    'FACTOR1_fatigued/drowsy',
    'FACTOR1_fell asleep',
    'FACTOR1_following too closely',
    'FACTOR1_glare',
    'FACTOR1_headlights defective',
    'FACTOR1_illness',
    'FACTOR1_lane marking improper/inadequate',
    'FACTOR1_listening/using headphones',
    'FACTOR1_lost consciousness',
    'FACTOR1_obstruction/debris',
    'FACTOR1_other electronic device',
    'FACTOR1_other lighting defects',
    'FACTOR1_other vehicular',
    'FACTOR1_outside car distraction',
    'FACTOR1_oversized vehicle',
    'FACTOR1_passenger distraction',
    'FACTOR1_passing or lane usage improper',
    'FACTOR1_passing too closely',
    'FACTOR1_pavement defective',
    'FACTOR1_pavement slippery',
    'FACTOR1_pedestrian/bicyclist/other pedestrian error/confusion',
    'FACTOR1_physical disability',
    'FACTOR1_prescription medication',
    'FACTOR1_reaction to other uninvolved vehicle',
    'FACTOR1_reaction to uninvolved vehicle',
    'FACTOR1_shoulders defective/improper',
    'FACTOR1_steering failure',
    'FACTOR1_texting',
    'FACTOR1_tinted windows',
    'FACTOR1_tire failure/inadequate',
    'FACTOR1_tow hitch defective',
    'FACTOR1_traffic control device improper/non-working',
    'FACTOR1_traffic control disregarded',
    'FACTOR1_turning improperly',
    'FACTOR1_unsafe lane changing',
    'FACTOR1_unsafe speed',
    'FACTOR1_unspecified',
    'FACTOR1_using on board navigation device',
    'FACTOR1_vehicle vandalism',
    'FACTOR1_view obstructed/limited',
    'FACTOR1_windshield inadequate',
    # Contributing factor dummies, vehicle 2
    'FACTOR2_aggressive driving/road rage',
    'FACTOR2_alcohol involvement',
    'FACTOR2_animals action',
    'FACTOR2_backing unsafely',
    'FACTOR2_brakes defective',
    'FACTOR2_cell phone (hand-held)',
    'FACTOR2_cell phone (hands-free)',
    'FACTOR2_driver inattention/distraction',
    'FACTOR2_driver inexperience',
    'FACTOR2_driverless/runaway vehicle',
    'FACTOR2_drugs (illegal)',
    'FACTOR2_eating or drinking',
    'FACTOR2_failure to keep right',
    'FACTOR2_failure to yield right-of-way',
    'FACTOR2_fatigued/drowsy',
    'FACTOR2_fell asleep',
    'FACTOR2_following too closely',
    'FACTOR2_glare',
    'FACTOR2_headlights defective',
    'FACTOR2_illnes',
    'FACTOR2_illness',
    'FACTOR2_lane marking improper/inadequate',
    'FACTOR2_listening/using headphones',
    'FACTOR2_lost consciousness',
    'FACTOR2_obstruction/debris',
    'FACTOR2_other electronic device',
    'FACTOR2_other lighting defects',
    'FACTOR2_other vehicular',
    'FACTOR2_outside car distraction',
    'FACTOR2_oversized vehicle',
    'FACTOR2_passenger distraction',
    'FACTOR2_passing or lane usage improper',
    'FACTOR2_passing too closely',
    'FACTOR2_pavement defective',
    'FACTOR2_pavement slippery',
    'FACTOR2_pedestrian/bicyclist/other pedestrian error/confusion',
    'FACTOR2_physical disability',
    'FACTOR2_prescription medication',
    'FACTOR2_reaction to other uninvolved vehicle',
    'FACTOR2_reaction to uninvolved vehicle',
    'FACTOR2_shoulders defective/improper',
    'FACTOR2_steering failure',
    'FACTOR2_texting',
    'FACTOR2_tinted windows',
    'FACTOR2_tire failure/inadequate',
    'FACTOR2_tow hitch defective',
    'FACTOR2_traffic control device improper/non-working',
    'FACTOR2_traffic control disregarded',
    'FACTOR2_turning improperly',
    'FACTOR2_unsafe lane changing',
    'FACTOR2_unsafe speed',
    'FACTOR2_unspecified',
    'FACTOR2_using on board navigation device',
    'FACTOR2_vehicle vandalism',
    'FACTOR2_view obstructed/limited',
    'FACTOR2_windshield inadequate',
    # Weekday and time-of-day dummies
    'DAY_Friday', 'DAY_Saturday', 'DAY_Sunday',
    'DAY_Thursday', 'DAY_Tuesday', 'DAY_Wednesday',
    'TIME_Afternoon', 'TIME_Evening', 'TIME_Night',
    # Factor category dummies
    'FACTORCAT1_Environmental', 'FACTORCAT1_Human', 'FACTORCAT1_Vehicular',
    'FACTORCAT2_Environmental', 'FACTORCAT2_Human', 'FACTORCAT2_Vehicular',
)

# Keep the listing visible as the cell output
candidate_feature_columns

In [0]:
# Collapse the per-vehicle contributing-factor dummies into one "any_<factor>" column
# per factor: 1 when the factor was recorded for vehicle 1 OR vehicle 2, otherwise 0.
contributing_factors = [
    'aggressive driving/road rage',
    'alcohol involvement',
    'animals action',
    'backing unsafely',
    'brakes defective',
    'cell phone (hand-held)',
    'cell phone (hands-free)',
    'driver inattention/distraction',
    'driver inexperience',
    'driverless/runaway vehicle',
    'drugs (illegal)',
    'eating or drinking',
    'failure to keep right',
    'failure to yield right-of-way',
    'fatigued/drowsy',
    'fell asleep',
    'following too closely',
    'glare',
    'headlights defective',
    'illness',
    'lane marking improper/inadequate',
    'listening/using headphones',
    'lost consciousness',
    'obstruction/debris',
    'other electronic device',
    'other lighting defects',
    'other vehicular',
    'outside car distraction',
    'oversized vehicle',
    'passenger distraction',
    'passing or lane usage improper',
    'passing too closely',
    'pavement defective',
    'pavement slippery',
    'pedestrian/bicyclist/other pedestrian error/confusion',
    'physical disability',
    'prescription medication',
    'reaction to other uninvolved vehicle',
    'reaction to uninvolved vehicle',
    'shoulders defective/improper',
    'steering failure',
    'texting',
    'tinted windows',
    'tire failure/inadequate',
    'tow hitch defective',
    'traffic control device improper/non-working',
    'traffic control disregarded',
    'turning improperly',
    'unsafe lane changing',
    'unsafe speed',
    'unspecified',
    'using on board navigation device',
    'vehicle vandalism',
    'view obstructed/limited',
    'windshield inadequate',
]

# Misspelled dummy columns that belong to a correctly spelled factor. The column
# listing in the previous cell shows 'FACTOR2_illnes' next to 'FACTOR2_illness';
# without this mapping those rows would be left out of any_illness.
misspelled_factor_columns = {
    'illness': ['FACTOR2_illnes'],
}

combined_flags = {}
for factor in contributing_factors:
    flag = (
        nyc_data_with_dummies['FACTOR1_' + factor]
        | nyc_data_with_dummies['FACTOR2_' + factor]
    )
    # Fold in misspelled variants, but only when the column really exists in the data
    for alias_column in misspelled_factor_columns.get(factor, []):
        if alias_column in nyc_data_with_dummies.columns:
            flag = flag | nyc_data_with_dummies[alias_column]
    combined_flags['any_' + factor] = flag.astype(int)

any_factor_flags = pd.DataFrame(combined_flags)

# Attach all flags in a single concat instead of 55 separate column inserts.
# Dropping any existing any_<factor> columns first keeps the cell safe to re-run
# (no duplicated column names).
nyc_data_with_dummies = pd.concat(
    [
        nyc_data_with_dummies.drop(columns=list(any_factor_flags.columns), errors='ignore'),
        any_factor_flags,
    ],
    axis=1,
)

In [0]:
# Eleventh ("dirty") model: any_factor_Human, weekday, time-of-day and borough
# predictors plus the individual any_<factor> flags
predictors_dirty = [
    'any_factor_Human',
    'DAY_Saturday', 'DAY_Wednesday',
    'TIME_Afternoon', 'TIME_Evening', 'TIME_Night',
    'BOROUGH_BROOKLYN', 'BOROUGH_MANHATTAN', 'BOROUGH_QUEENS', 'BOROUGH_STATEN ISLAND',
    'any_aggressive driving/road rage', 'any_alcohol involvement', 'any_animals action',
    'any_backing unsafely', 'any_brakes defective', 'any_cell phone (hand-held)',
    'any_driver inattention/distraction', 'any_driver inexperience',
    'any_driverless/runaway vehicle', 'any_drugs (illegal)',
    'any_failure to yield right-of-way', 'any_fatigued/drowsy', 'any_fell asleep',
    'any_following too closely', 'any_glare', 'any_headlights defective', 'any_illness',
    'any_lane marking improper/inadequate', 'any_listening/using headphones',
    'any_lost consciousness', 'any_obstruction/debris', 'any_other lighting defects',
    'any_other vehicular', 'any_outside car distraction', 'any_oversized vehicle',
    'any_passenger distraction', 'any_passing or lane usage improper',
    'any_passing too closely', 'any_pavement defective', 'any_pavement slippery',
    'any_pedestrian/bicyclist/other pedestrian error/confusion',
    'any_physical disability', 'any_prescription medication',
    'any_reaction to other uninvolved vehicle', 'any_tinted windows',
    'any_tow hitch defective', 'any_traffic control device improper/non-working',
    'any_traffic control disregarded', 'any_turning improperly',
    'any_unsafe lane changing', 'any_unsafe speed', 'any_unspecified',
    'any_vehicle vandalism', 'any_view obstructed/limited',
]

# Design matrix: column values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_dirty].values.astype(int))

In [0]:
#Run Logistic Regression model
# Fits any_casualty_flag (Y) on whatever X the most recently executed cell built,
# so the design-matrix cell directly above must be run first.
model_dirty = sm.Logit(Y, X).fit()

In [0]:
# Row labels for the summary table: the intercept followed by the predictors,
# in the same order as the columns of the design matrix used to fit model_dirty
xname_dirty = (
    'constant',
    'any_factor_Human',
    'DAY_Saturday', 'DAY_Wednesday',
    'TIME_Afternoon', 'TIME_Evening', 'TIME_Night',
    'BOROUGH_BROOKLYN', 'BOROUGH_MANHATTAN', 'BOROUGH_QUEENS', 'BOROUGH_STATEN ISLAND',
    'any_aggressive driving/road rage', 'any_alcohol involvement', 'any_animals action',
    'any_backing unsafely', 'any_brakes defective', 'any_cell phone (hand-held)',
    'any_driver inattention/distraction', 'any_driver inexperience',
    'any_driverless/runaway vehicle', 'any_drugs (illegal)',
    'any_failure to yield right-of-way', 'any_fatigued/drowsy', 'any_fell asleep',
    'any_following too closely', 'any_glare', 'any_headlights defective', 'any_illness',
    'any_lane marking improper/inadequate', 'any_listening/using headphones',
    'any_lost consciousness', 'any_obstruction/debris', 'any_other lighting defects',
    'any_other vehicular', 'any_outside car distraction', 'any_oversized vehicle',
    'any_passenger distraction', 'any_passing or lane usage improper',
    'any_passing too closely', 'any_pavement defective', 'any_pavement slippery',
    'any_pedestrian/bicyclist/other pedestrian error/confusion',
    'any_physical disability', 'any_prescription medication',
    'any_reaction to other uninvolved vehicle', 'any_tinted windows',
    'any_tow hitch defective', 'any_traffic control device improper/non-working',
    'any_traffic control disregarded', 'any_turning improperly',
    'any_unsafe lane changing', 'any_unsafe speed', 'any_unspecified',
    'any_vehicle vandalism', 'any_view obstructed/limited',
)

# Labelled summary table for model_dirty
model_dirty.summary(
    yname='any_casualty_flag',
    xname=xname_dirty,
    title='Final Model for Insights and Recommendation',
)

In [0]:
# Twelfth model (model_dirty1): the eleventh model's predictors extended with
# year and month dummies
# NOTE(review): YEAR_2019 and Month_3 are not in this list although only YEAR_2012
# and Month_1 were dropped as reference levels — confirm the omission is intentional.
predictors_dirty1 = [
    'any_factor_Human',
    'DAY_Saturday', 'DAY_Wednesday',
    'TIME_Afternoon', 'TIME_Evening', 'TIME_Night',
    'BOROUGH_BROOKLYN', 'BOROUGH_MANHATTAN', 'BOROUGH_QUEENS', 'BOROUGH_STATEN ISLAND',
    'any_aggressive driving/road rage', 'any_alcohol involvement', 'any_animals action',
    'any_backing unsafely', 'any_brakes defective', 'any_cell phone (hand-held)',
    'any_driver inattention/distraction', 'any_driver inexperience',
    'any_driverless/runaway vehicle', 'any_drugs (illegal)',
    'any_failure to yield right-of-way', 'any_fatigued/drowsy', 'any_fell asleep',
    'any_following too closely', 'any_glare', 'any_headlights defective', 'any_illness',
    'any_lane marking improper/inadequate', 'any_listening/using headphones',
    'any_lost consciousness', 'any_obstruction/debris', 'any_other lighting defects',
    'any_other vehicular', 'any_outside car distraction', 'any_oversized vehicle',
    'any_passenger distraction', 'any_passing or lane usage improper',
    'any_passing too closely', 'any_pavement defective', 'any_pavement slippery',
    'any_pedestrian/bicyclist/other pedestrian error/confusion',
    'any_physical disability', 'any_prescription medication',
    'any_reaction to other uninvolved vehicle', 'any_tinted windows',
    'any_tow hitch defective', 'any_traffic control device improper/non-working',
    'any_traffic control disregarded', 'any_turning improperly',
    'any_unsafe lane changing', 'any_unsafe speed', 'any_unspecified',
    'any_vehicle vandalism', 'any_view obstructed/limited',
    'YEAR_2013', 'YEAR_2014', 'YEAR_2015', 'YEAR_2016', 'YEAR_2017', 'YEAR_2018',
    'YEAR_2020', 'YEAR_2021', 'YEAR_2022', 'YEAR_2023', 'YEAR_2024',
    'Month_2', 'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8',
    'Month_9', 'Month_10', 'Month_11', 'Month_12',
]

# Design matrix: column values cast to int, plus a constant column for the intercept
X = sm.add_constant(nyc_data_with_dummies[predictors_dirty1].values.astype(int))

In [0]:
#Run Logistic Regression model
# Fits any_casualty_flag (Y) on whatever X the most recently executed cell built,
# so the design-matrix cell directly above (with the year/month dummies) must be run first.
model_dirty1 = sm.Logit(Y, X).fit()

In [0]:
# Row labels for the summary table: the intercept followed by the predictors,
# in the same order as the columns of the design matrix used to fit model_dirty1
xname_dirty1 = (
    'constant',
    'any_factor_Human',
    'DAY_Saturday', 'DAY_Wednesday',
    'TIME_Afternoon', 'TIME_Evening', 'TIME_Night',
    'BOROUGH_BROOKLYN', 'BOROUGH_MANHATTAN', 'BOROUGH_QUEENS', 'BOROUGH_STATEN ISLAND',
    'any_aggressive driving/road rage', 'any_alcohol involvement', 'any_animals action',
    'any_backing unsafely', 'any_brakes defective', 'any_cell phone (hand-held)',
    'any_driver inattention/distraction', 'any_driver inexperience',
    'any_driverless/runaway vehicle', 'any_drugs (illegal)',
    'any_failure to yield right-of-way', 'any_fatigued/drowsy', 'any_fell asleep',
    'any_following too closely', 'any_glare', 'any_headlights defective', 'any_illness',
    'any_lane marking improper/inadequate', 'any_listening/using headphones',
    'any_lost consciousness', 'any_obstruction/debris', 'any_other lighting defects',
    'any_other vehicular', 'any_outside car distraction', 'any_oversized vehicle',
    'any_passenger distraction', 'any_passing or lane usage improper',
    'any_passing too closely', 'any_pavement defective', 'any_pavement slippery',
    'any_pedestrian/bicyclist/other pedestrian error/confusion',
    'any_physical disability', 'any_prescription medication',
    'any_reaction to other uninvolved vehicle', 'any_tinted windows',
    'any_tow hitch defective', 'any_traffic control device improper/non-working',
    'any_traffic control disregarded', 'any_turning improperly',
    'any_unsafe lane changing', 'any_unsafe speed', 'any_unspecified',
    'any_vehicle vandalism', 'any_view obstructed/limited',
    'YEAR_2013', 'YEAR_2014', 'YEAR_2015', 'YEAR_2016', 'YEAR_2017', 'YEAR_2018',
    'YEAR_2020', 'YEAR_2021', 'YEAR_2022', 'YEAR_2023', 'YEAR_2024',
    'Month_2', 'Month_4', 'Month_5', 'Month_6', 'Month_7', 'Month_8',
    'Month_9', 'Month_10', 'Month_11', 'Month_12',
)

# Labelled summary table for model_dirty1 (the title string is the same one
# used for the model_dirty summary)
model_dirty1.summary(
    yname='any_casualty_flag',
    xname=xname_dirty1,
    title='Final Model for Insights and Recommendation',
)