# NYC Cyclist Casualty Analysis, 2017-2024

## Load Data

In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap

In [None]:
# Optional: mount Google Drive when the CSV is stored there.
# This cell relies on the google.colab package.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Load the collisions CSV into a dataframe
csv_path = '/content/drive/MyDrive/TDSP Project/Motor_Vehicle_Collisions_-_Crashes_20250202.csv'
data = pd.read_csv(csv_path)

# Preview the column headers together with the first five rows
data.head(n=5)

## Check Dataset for Missing Values

In [None]:
# Count the null entries in every column
missing_values = data.isna().sum()

# Express each count as a percentage of the total number of rows
row_count = len(data)
missing_values_percentage = missing_values.div(row_count).mul(100)

# Put counts and percentages side by side, one row per column of the dataset
missing_data = pd.concat(
    [missing_values, missing_values_percentage],
    axis=1,
    keys=['Missing Values', 'Percentage (%)'],
)

# Show the columns with the largest share of missing values first
missing_data.sort_values(by='Percentage (%)', ascending=False)

## Summarize Data to Analyze Annual Cyclist Casualty Rates

In [None]:
# Convert 'CRASH DATE' to datetime format so the year can be extracted
data['CRASH DATE'] = pd.to_datetime(data['CRASH DATE'])

# Create a dataframe of collisions, cyclist injuries, and cyclist fatalities
cyc = data[['CRASH DATE','COLLISION_ID','NUMBER OF CYCLIST INJURED','NUMBER OF CYCLIST KILLED']]

# Group by year: count the rows (crashes) and total the cyclist injuries and fatalities
annual_cyc = cyc.groupby(cyc['CRASH DATE'].dt.to_period("Y")).agg({
    'COLLISION_ID': 'size',
    'NUMBER OF CYCLIST INJURED': 'sum',
    'NUMBER OF CYCLIST KILLED': 'sum'
}).rename(columns={'COLLISION_ID': 'Number of Crashes'})

# Filter the dataframe to only include the years 2017 - 2024.
# An explicit copy is taken so that the rate columns added below are written
# to an independent dataframe instead of a filtered slice (this avoids
# pandas' SettingWithCopyWarning).
in_study_period = (annual_cyc.index >= '2017') & (annual_cyc.index <= '2024')
annual_cyc = annual_cyc.loc[in_study_period].copy()

# Calculate cyclist injury and fatality rates (casualties per crash, all crashes)
# for each year and add them to the dataframe
annual_cyc['Cyclist Injury Rate'] = annual_cyc['NUMBER OF CYCLIST INJURED']/annual_cyc['Number of Crashes']
annual_cyc['Cyclist Fatality Rate'] = annual_cyc['NUMBER OF CYCLIST KILLED']/annual_cyc['Number of Crashes']

# Display the annual summary table
annual_cyc

## Plot Crashes and Cyclist Casualties

### Crashes per Year

In [None]:
# Plot the number of crashes per year
plt.figure(figsize=(16, 6)) # Adjust figure size to meet your needs

# Use string year labels on the x-axis, consistent with the casualty plots
# in this notebook, rather than passing the raw period index
sns.barplot(x=annual_cyc.index.astype(str), y=annual_cyc['Number of Crashes'].values)
plt.title('Total Crashes per Year', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Number of Crashes', fontsize=14)
plt.tight_layout()

# plt.savefig("total_crashes_year.svg", format='svg') # Optional: Save the figure

plt.show()

### Combo Plots - Cyclist Casualty Rates

#### Cyclist Injury Rate

In [None]:
# Year labels used on the x-axis of both the bars and the line
year_labels = annual_cyc.index.astype(str)

# Set up the figure with its primary (left) axis
fig, ax1 = plt.subplots(figsize=(16,6)) # Adjust figure size to meet your needs

# Bars: number of cyclists injured per year
injured_counts = annual_cyc['NUMBER OF CYCLIST INJURED'].values
sns.barplot(x=year_labels, y=injured_counts, ax=ax1)
ax1.set_xlabel('Year', fontsize=14)
ax1.set_ylabel('Number of Cyclists Injured', fontsize=14)

# Add a secondary y-axis for the rate
ax2 = ax1.twinx()

# Line: cyclist injury rate per year, drawn on the secondary axis
injury_rates = annual_cyc['Cyclist Injury Rate'].values
sns.lineplot(x=year_labels, y=injury_rates, ax=ax2, color='red')
ax2.set_ylabel('Cyclist Injury Rate (all crashes)', fontsize=14)

# Title for the combined chart
plt.title('Cyclists Injured in Crashes', fontsize=16)

# plt.savefig("cyc_inj_rate_year.svg", format='svg') # Optional: Save the figure

plt.show()

#### Cyclist Fatality Rate

In [None]:
# Year labels used on the x-axis of both the bars and the line
year_labels = annual_cyc.index.astype(str)

# Set up the figure with its primary (left) axis
fig, ax1 = plt.subplots(figsize=(16, 6)) # Adjust figure size to meet your needs

# Bars: number of cyclists killed per year
killed_counts = annual_cyc['NUMBER OF CYCLIST KILLED'].values
sns.barplot(x=year_labels, y=killed_counts, ax=ax1)
ax1.set_xlabel('Year', fontsize=14)
ax1.set_ylabel('Number of Cyclists Killed', fontsize=14)

# Add a secondary y-axis for the rate
ax2 = ax1.twinx()

# Line: cyclist fatality rate per year, drawn on the secondary axis
fatality_rates = annual_cyc['Cyclist Fatality Rate'].values
sns.lineplot(x=year_labels, y=fatality_rates, ax=ax2, color='red')
ax2.set_ylabel('Cyclist Fatality Rate (all crashes)', fontsize=14)

# Title for the combined chart
plt.title('Cyclists Killed in Crashes', fontsize=16)

# plt.savefig("cyc_fatality_rate_year.svg", format='svg') # Optional: Save the figure
plt.show()

## Create Heatmaps

---

### Heatmap for Cyclist Casualties, 2017-2019

In [None]:
# Drop rows with missing latitude and longitude values
data_geo = data.dropna(subset=['LATITUDE', 'LONGITUDE'])

# Keep only crashes in which at least one cyclist was killed or injured
data_cyc = data_geo[(data_geo['NUMBER OF CYCLIST KILLED'] > 0) | (data_geo['NUMBER OF CYCLIST INJURED'] > 0)]

# Filter rows for desired years (both bounds inclusive)
# NOTE(review): the upper bound compares against 2019-12-31 00:00; this covers
# the whole day only if 'CRASH DATE' carries no time-of-day - confirm.
data_cyc_17_19 = data_cyc[(data_cyc['CRASH DATE'] >= '2017-01-01') & (data_cyc['CRASH DATE'] <= '2019-12-31')]

# Create a base map
cyc_17_19 = folium.Map(location=[40.730610, -73.935242], zoom_start=10)

# Add heatmap: build the [latitude, longitude] pairs directly from the two
# columns instead of iterating over the dataframe row by row
heat_cyc = data_cyc_17_19[['LATITUDE', 'LONGITUDE']].values.tolist()
HeatMap(heat_cyc, radius=8, max_zoom=13).add_to(cyc_17_19)

# Save the map
cyc_17_19.save("cyc_heatmap_2017_2019.html")

### Heatmap for Cyclist Casualties, 2022-2024

In [None]:
# IF NOT PERFORMED ABOVE: Drop rows with missing latitude and longitude values
# data_geo = data.dropna(subset=['LATITUDE', 'LONGITUDE'])

# IF NOT PERFORMED ABOVE: Drop rows where no cyclist was killed or injured
# data_cyc = data_geo[(data_geo['NUMBER OF CYCLIST KILLED'] > 0) | (data_geo['NUMBER OF CYCLIST INJURED'] > 0)]

# Filter rows for desired years (both bounds inclusive)
# NOTE(review): the upper bound compares against 2024-12-31 00:00; this covers
# the whole day only if 'CRASH DATE' carries no time-of-day - confirm.
data_cyc_22_24 = data_cyc[(data_cyc['CRASH DATE'] >= '2022-01-01') & (data_cyc['CRASH DATE'] <= '2024-12-31')]

# Create a base map
cyc_22_24 = folium.Map(location=[40.730610, -73.935242], zoom_start=10)

# Add heatmap: build the [latitude, longitude] pairs directly from the two
# columns instead of iterating over the dataframe row by row
heat_cyc = data_cyc_22_24[['LATITUDE', 'LONGITUDE']].values.tolist()
HeatMap(heat_cyc, radius=8, max_zoom=13).add_to(cyc_22_24)

# Save the map
cyc_22_24.save("cyc_heatmap_2022_2024.html")

## Create Cyclist Casualty and Fatality Maps

### Cyclist Casualty Map, 2022-2024

In [None]:
# IF NOT PERFORMED ABOVE: Drop rows with missing latitude and longitude values
# data_geo = data.dropna(subset=['LATITUDE', 'LONGITUDE'])

# IF NOT PERFORMED ABOVE: Drop rows where no cyclist was killed or injured
# data_cyc = data_geo[(data_geo['NUMBER OF CYCLIST KILLED'] > 0) | (data_geo['NUMBER OF CYCLIST INJURED'] > 0)]

# IF NOT PERFORMED ABOVE: Filter rows for desired years
# data_cyc_22_24 = data_cyc[(data_cyc['CRASH DATE'] >= '2022-01-01') & (data_cyc['CRASH DATE'] <= '2024-12-31')]

# Create a base map
cyc_casualty_22_24 = folium.Map(location=[40.730610, -73.935242], zoom_start=10)

# Add one marker per crash: a red triangle when a cyclist was killed,
# otherwise a yellow circle when a cyclist was injured.
# Markers are added to the map created above (cyc_casualty_22_24).
for index, row in data_cyc_22_24.iterrows():
    location = [row['LATITUDE'], row['LONGITUDE']]
    if row['NUMBER OF CYCLIST KILLED'] > 0:
        color = "red"
        folium.features.RegularPolygonMarker(
            location=location,
            number_of_sides=3,
            radius=5,
            gradient=False,
            color=color,
            fill=True,
            fill_color=color
        ).add_to(cyc_casualty_22_24)
    elif row['NUMBER OF CYCLIST INJURED'] > 0:
        color = "yellow"
        folium.CircleMarker(
            location=location,
            radius=5,
            color=color,
            fill=True,
            fill_color=color
        ).add_to(cyc_casualty_22_24)

# Save the map under its own file name so it does not share an output file
# with the fatality-only map ("cyc_fatalities_2022_2024.html")
cyc_casualty_22_24.save("cyc_casualties_2022_2024.html")

### Cyclist Fatality Map, 2022-2024

In [None]:
# IF NOT PERFORMED ABOVE: Drop rows with missing latitude and longitude values
# data_geo = data.dropna(subset=['LATITUDE', 'LONGITUDE'])

# IF NOT PERFORMED ABOVE: Drop rows where no cyclist was killed or injured
# data_cyc = data_geo[(data_geo['NUMBER OF CYCLIST KILLED'] > 0) | (data_geo['NUMBER OF CYCLIST INJURED'] > 0)]

# IF NOT PERFORMED ABOVE: Filter rows for desired years
# data_cyc_22_24 = data_cyc[(data_cyc['CRASH DATE'] >= '2022-01-01') & (data_cyc['CRASH DATE'] <= '2024-12-31')]

# Base map that the fatality markers are drawn on
cyc_fatality_22_24 = folium.Map(location=[40.730610, -73.935242], zoom_start=10)

# Draw a red triangle for every crash in which a cyclist was killed
for _, crash in data_cyc_22_24.iterrows():
    # Skip crashes without a cyclist fatality
    if not crash['NUMBER OF CYCLIST KILLED'] > 0:
        continue
    color = "red"
    fatality_marker = folium.features.RegularPolygonMarker(
        location=[crash['LATITUDE'], crash['LONGITUDE']],
        number_of_sides=3,
        radius=5,
        gradient=False,
        color=color,
        fill=True,
        fill_color=color,
    )
    fatality_marker.add_to(cyc_fatality_22_24)

# Write the finished map to an HTML file
cyc_fatality_22_24.save("cyc_fatalities_2022_2024.html")