# NYC Traffic Collision Analysis

This notebook contains the detailed technical analysis of the NYC Motor Vehicle Collisions dataset.

## 1. Library Imports


In [12]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import folium
from folium.plugins import HeatMap, MarkerCluster
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import os

# Show every column when a DataFrame is displayed (no truncation with "...").
pd.set_option('display.max_columns', None)

# Plot appearance: start from matplotlib's built-in 'default' style, then apply
# seaborn's 'husl' colour palette on top of it.
# (plt.style.available lists the style names that can be passed to plt.style.use.)
plt.style.use('default')
sns.set_palette('husl')

## 2. Data Loading

In [None]:
# Only these columns are read from the CSV; skipping the rest keeps memory down.
columns_to_load = [
    'CRASH DATE',
    'CRASH TIME',
    'BOROUGH',
    'LATITUDE',
    'LONGITUDE',
    'NUMBER OF PERSONS INJURED',
    'NUMBER OF PERSONS KILLED',
    'VEHICLE TYPE CODE 1',
    'VEHICLE TYPE CODE 2',
    'CONTRIBUTING FACTOR VEHICLE 1',
    'CONTRIBUTING FACTOR VEHICLE 2',
]

# dtypes applied while reading: text columns as 'category', coordinates as
# 32-bit floats. The integer count columns are deliberately left out here and
# are converted after loading instead.
safe_dtypes = {
    **dict.fromkeys(
        [
            'BOROUGH',
            'VEHICLE TYPE CODE 1',
            'VEHICLE TYPE CODE 2',
            'CONTRIBUTING FACTOR VEHICLE 1',
            'CONTRIBUTING FACTOR VEHICLE 2',
        ],
        'category',
    ),
    'LATITUDE': 'float32',
    'LONGITUDE': 'float32',
}

# Locate the collisions CSV. If it is not at the default location, print where
# we looked and ask for another path. If no usable file is found, build a small
# synthetic dataset so the rest of the notebook can still run.
file_path = '../Motor_Vehicle_Collisions_-_Crashes_20250512.csv'
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
    print("Current directory:", os.getcwd())
    print("Parent directory contents:", os.listdir('..'))
    # Let user choose a different path
    try:
        file_path = input("Enter the correct path to the CSV file: ").strip()
    except (EOFError, OSError, RuntimeError):
        # No usable stdin, e.g. a headless "Run All" where prompting is not
        # supported (Jupyter signals this with a NotImplementedError subclass,
        # which RuntimeError covers). Keep the missing default path so the
        # sample-data branch below is taken instead of aborting the notebook.
        print("No interactive input available; falling back to sample data.")

if os.path.exists(file_path):
    print("Loading the dataset...")
    df = pd.read_csv(
        file_path,
        usecols=columns_to_load,
        dtype=safe_dtypes,
        parse_dates=['CRASH DATE'],
    )

    # Convert the casualty counts after loading: anything that is not a number
    # becomes NaN (errors='coerce'), NaN becomes 0, and the result is stored
    # as int8.
    for count_col in ('NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED'):
        df[count_col] = (
            pd.to_numeric(df[count_col], errors='coerce').fillna(0).astype('int8')
        )
else:
    print("Creating sample data for testing...")
    sample_rows = 1000
    # Fixed seed so the synthetic data (and every figure built from it) is the
    # same on each run.
    rng = np.random.default_rng(42)

    vehicle_types = ['PASSENGER VEHICLE', 'TAXI', 'TRUCK', 'BUS', 'BICYCLE']
    factors = ['Driver Inattention', 'Following Too Closely', 'Failure to Yield', 'Speeding']

    # Random 'HH:MM' strings. A daily date_range(...).time would make every
    # crash happen at midnight, and datetime.time objects cannot be parsed by
    # the pd.to_datetime conversion applied to this column afterwards.
    sample_hours = rng.integers(0, 24, sample_rows).tolist()
    sample_minutes = rng.integers(0, 60, sample_rows).tolist()
    sample_times = [f"{h:02d}:{m:02d}" for h, m in zip(sample_hours, sample_minutes)]

    # Create dummy data
    df = pd.DataFrame({
        'CRASH DATE': pd.date_range('2020-01-01', periods=sample_rows),
        'CRASH TIME': sample_times,
        'BOROUGH': rng.choice(['MANHATTAN', 'BROOKLYN', 'QUEENS', 'BRONX', 'STATEN ISLAND'], sample_rows),
        'LATITUDE': rng.uniform(40.5, 40.9, sample_rows),
        'LONGITUDE': rng.uniform(-74.03, -73.7, sample_rows),
        'NUMBER OF PERSONS INJURED': rng.integers(0, 5, sample_rows),
        'NUMBER OF PERSONS KILLED': rng.integers(0, 2, sample_rows),
        'VEHICLE TYPE CODE 1': rng.choice(vehicle_types, sample_rows),
        # None stands for "no second vehicle / factor recorded".
        'VEHICLE TYPE CODE 2': rng.choice(vehicle_types + [None], sample_rows),
        'CONTRIBUTING FACTOR VEHICLE 1': rng.choice(factors, sample_rows),
        'CONTRIBUTING FACTOR VEHICLE 2': rng.choice(factors + [None], sample_rows),
    })

# Convert time strings to datetime.time objects. Values that cannot be parsed
# become missing instead of raising (errors='coerce').
# NOTE(review): no explicit format is passed, so pandas has to work it out
# itself. If the column is uniformly 'HH:MM', passing format='%H:%M' would
# likely be faster — confirm against the data before changing.
df['CRASH TIME'] = pd.to_datetime(df['CRASH TIME'], errors='coerce').dt.time

# Classify each crash by its worst outcome. np.select applies the first
# condition that matches, so the list order is the priority order:
# any death -> 'Fatal', more than two injured -> 'Severe',
# at least one injured -> 'Minor', otherwise 'Property Damage Only'.
conditions = [
    df['NUMBER OF PERSONS KILLED'] > 0,
    df['NUMBER OF PERSONS INJURED'] > 2,
    df['NUMBER OF PERSONS INJURED'] > 0
]
choices = ['Fatal', 'Severe', 'Minor']
df['SEVERITY'] = np.select(conditions, choices, default='Property Damage Only')

# Drop rows with missing coordinates
df = df.dropna(subset=['LATITUDE', 'LONGITUDE'])

print("Dataset Shape:", df.shape)
# deep=True measures the contents of object columns (CRASH TIME holds Python
# time objects, SEVERITY holds strings). Without it the reported figure is
# lower than what the frame really occupies.
print("\nMemory usage:", df.memory_usage(deep=True).sum() / 1024**2, "MB")
df.head()