# Mapping Location to Accident 

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.pandas import read_excel
import geopandas as gpd
import numpy as np
import gc
import os 
import re
import datetime
from fuzzywuzzy import process
import pandas as pd
import rasterio
from rasterio.transform import rowcol
from functools import reduce
from pyspark.sql import DataFrame



In [0]:
# Initialize Spark Session
spark = SparkSession.builder.appName("Accident Data Location Mapping").getOrCreate()

In [0]:
# Read in Accident Data "/FileStore/intermediate_output/accidents_wth_location")
accidents = spark.read.parquet("/FileStore/intermediate_output/accidents_wth_location")

In [0]:
monthly_claims_insurance_pandas = pd.read_csv("/dbfs/FileStore/tables/monthly_claims_insurance_driverid.csv")
# make lowercase 
monthly_claims_insurance_pandas["Claim Number"] = monthly_claims_insurance_pandas["Claim Number"].str.lower()

In [0]:
state_crash_monthly_county_counts = pd.read_csv("/dbfs/FileStore/tables/state_crash_monthly_county_counts.csv")

In [0]:
us_city_states_pandas = pd.read_csv("/dbfs/FileStore/tables/uscities.csv")
# Extract first two digits of county fips for state fips 
us_city_states_pandas["state_fips"] = us_city_states_pandas["county_fips"].apply(lambda x: str(x)[:2])
us_city_states_pandas["county_code"] = us_city_states_pandas["county_fips"].apply(lambda x: str(x)[-3:])

In [0]:
us_city_states_pandas.columns

Index(['city', 'city_ascii', 'state_id', 'state_name', 'county_fips',
       'county_name', 'lat', 'lng', 'population', 'density', 'source',
       'military', 'incorporated', 'timezone', 'ranking', 'zips', 'id',
       'state_fips', 'county_code'],
      dtype='object')

In [0]:
state_crash_monthly_county_counts.columns

Index(['Crash Year', 'Crash Month', 'State', 'County Code', 'crash_count',
       'total_fatalities', 'total_injuries', 'total_vehicles'],
      dtype='object')

In [0]:
# For us_city_states_pandas:
# First, make sure county_code can be converted to numeric, replacing errors with NaN
us_city_states_pandas["county_code"] = pd.to_numeric(us_city_states_pandas["county_code"], errors='coerce')
# Fill NaN values (if any) with a default like 0 or handle as needed
us_city_states_pandas["county_code"] = us_city_states_pandas["county_code"].fillna(0)
# Convert to integer then string with zero-padding to ensure 3 characters
us_city_states_pandas["county_code"] = us_city_states_pandas["county_code"].astype(int).astype(str).str.zfill(3)

# For state_crash_monthly_county_counts:
# Same approach for consistency
state_crash_monthly_county_counts["County Code"] = pd.to_numeric(state_crash_monthly_county_counts["County Code"], errors='coerce')
state_crash_monthly_county_counts["County Code"] = state_crash_monthly_county_counts["County Code"].fillna(0)
# Convert to integer then string with zero-padding to ensure 3 characters
state_crash_monthly_county_counts["County Code"] = state_crash_monthly_county_counts["County Code"].astype(int).astype(str).str.zfill(3)

# Convert state IDs to lowercase
us_city_states_pandas["state_id"] = us_city_states_pandas["state_id"].str.lower()
state_crash_monthly_county_counts["State"] = state_crash_monthly_county_counts["State"].str.lower()

In [0]:
state_crash_monthly_county_counts = state_crash_monthly_county_counts.merge(
    us_city_states_pandas[["state_id", "state_fips", "county_code", "county_name", "zips"]],
    how="left",
    left_on=["State", "County Code"],
    right_on=["state_id", "county_code"]
)

In [0]:
# First, create a grouping key from state_fips and county_code
grouped = state_crash_monthly_county_counts.groupby(['State', 'County Code'])

# Initialize list to hold our combined data
combined_data = []

# Process each group
for (state_fips, county_code), group in grouped:
    # Get the first row for all columns except zips
    # (assuming all other values should be identical)
    first_row = group.iloc[0].to_dict()
    
    # Combine all zip codes into a single list, removing duplicates
    all_zips = []
    for zips_val in group['zips']:
        if isinstance(zips_val, str):
            if ' ' in zips_val.strip():
                zips_list = zips_val.strip().split()
            else:
                zips_list = [zips_val.strip()]
            all_zips.extend(zips_list)
        elif isinstance(zips_val, list):
            # If zips are already stored as lists
            all_zips.extend(zips_val)
        elif pd.notna(zips_val):
            # Handle single values
            all_zips.append(str(zips_val))
    
    # Remove duplicates and join with commas
    unique_zips = sorted(set(z for z in all_zips if z))
    first_row['zips'] = ','.join(unique_zips) if unique_zips else ""
    
    # Add to our results
    combined_data.append(first_row)

In [0]:
state_crash_full = pd.DataFrame(combined_data)

In [0]:
accidents = accidents.filter(
    (F.col("Date of Accident").isNotNull() | F.col("Date Reported").isNotNull())
)

In [0]:
# Convert Date of Accident to DateTime with the correct format
accidents = accidents.withColumn(
    "Date of Accident", 
    F.to_timestamp("Date of Accident", "yyyy-MM-dd HH:mm:ss")
)

# Convert Date Reported to DateTime with the correct format
accidents = accidents.withColumn(
    "Date Reported", 
    F.to_timestamp("Date Reported", "yyyy-MM-dd HH:mm:ss")
)

# Create a Reference_Date column that uses Date of Accident if available, otherwise Date Reported
accidents = accidents.withColumn(
    "Reference_Date", 
    F.when(F.col("Date of Accident").isNotNull(), F.col("Date of Accident"))
    .otherwise(F.col("Date Reported"))
)

# Extract Year, Month, and Day from Reference_Date
accidents = accidents.withColumn("Year", F.year("Reference_Date"))
accidents = accidents.withColumn("Month", F.month("Reference_Date"))
accidents = accidents.withColumn("Day", F.dayofmonth("Reference_Date"))

In [0]:
accidents_pd = accidents.toPandas()

In [0]:
accidents = accidents_pd
state_crash = state_crash_full

In [0]:
# Ensure columns are of the right type for joining
accidents_pd['Year'] = accidents_pd['Year'].astype(int)
accidents_pd['Month'] = accidents_pd['Month'].astype(int)
state_crash_full['Crash Year'] = state_crash_full['Crash Year'].astype(int)
state_crash_full['Crash Month'] = state_crash_full['Crash Month'].astype(int)

In [0]:
# lower case values in state_crash df
state_crash_full['State'] = state_crash_full['State'].str.lower()
accidents_pd['state'] = accidents_pd['state'].str.lower()

In [0]:
us_city_states_pandas["state_id"] = us_city_states_pandas["state_id"].str.lower()
state_crash_full['state_id'] = state_crash_full['state_id'].str.lower()

In [0]:
zip_to_location = {}

# Process each row in state_crash
for _, row in us_city_states_pandas.iterrows():
    if pd.notna(row['zips']):
        # Split comma-separated zips
        zips = [z.strip() for z in str(row['zips']).split(' ')]
        state, county_code = row[["state_id", "county_code"]]



        # Add each zip to the dictionary with this row's location info
        for zip_code in zips:
            if zip_code not in zip_to_location:
                zip_to_location[zip_code] = ()
            
            # Store the row index for later retrieval
            zip_to_location[zip_code] = (state, county_code)

In [0]:
zip_to_location["37203"]

('tn', '037')

In [0]:
accidents_pd.loc[accidents_pd['zip'] == '18977', 'city'] = 'Washington Crossing'
accidents_pd.loc[accidents_pd['zip'] == '18977', 'state_id'] = 'pa'
accidents_pd.loc[accidents_pd['zip'] == '18977', 'county_code'] = '017'

# Set 11600 to null
accidents_pd.loc[accidents_pd['zip'] == '11600', 'zip_code'] = None

# Fix 32694 (city name issue)
accidents_pd.loc[accidents_pd['city'] == '32694', 'city'] = None  # First clear incorrect city
accidents_pd.loc[accidents_pd['zip'] == '32694', 'city'] = 'Williston'  # Set correct city
accidents_pd.loc[accidents_pd['zip'] == '32694', 'state_id'] = 'fl'

# Fix Leicester (28748)
accidents_pd.loc[accidents_pd['zip'] == '28748', 'city'] = 'Leicester'
accidents_pd.loc[accidents_pd['zip'] == '28748', 'state_id'] = 'nc'
accidents_pd.loc[accidents_pd['zip'] == '28748', 'county_code'] = '021'

# Fix Aiken (29805)
accidents_pd.loc[accidents_pd['zip'] == '29805', 'city'] = 'Aiken'
accidents_pd.loc[accidents_pd['zip'] == '29805', 'state_id'] = 'sc'
accidents_pd.loc[accidents_pd['zip'] == '29805', 'county_code'] = '005'

# Fix Naples (34112)
accidents_pd.loc[accidents_pd['zip'] == '34112', 'city'] = 'Naples'
accidents_pd.loc[accidents_pd['zip'] == '34112', 'state_id'] = 'fl'
accidents_pd.loc[accidents_pd['zip'] == '34112', 'county_code'] = '625'

# Fix Live Oak (32060)
accidents_pd.loc[accidents_pd['zip'] == '32060', 'city'] = 'Live Oak'
accidents_pd.loc[accidents_pd['zip'] == '32060', 'state_id'] = 'fl'
accidents_pd.loc[accidents_pd['zip'] == '32060', 'county_code'] = '121'

In [0]:
inexact_match_count = 0

In [0]:
# Initialize columns in accidents_pd to hold county crash statistics
accidents_pd['county_crash_count'] = None
accidents_pd['county_total_fatalities'] = None
accidents_pd['county_total_injuries'] = None
accidents_pd['county_total_vehicles'] = None
accidents_pd['dba_location'] = 0  # Initialize the new indicator variable to 0

inexact_match_count = 0
total_processed = 0
matches_found = 0

for idx, accident in accidents_pd.iterrows():
    total_processed += 1
    
    # First try the primary zip
    accident_zip = accident.get('zip')
    used_dba = False
    
    # If primary zip is missing, try dba_zip and mark the indicator
    if accident_zip is None:
        accident_zip = accident.get('dba_zip')
        if accident_zip is not None:
            used_dba = True
    
    # Skip if still no ZIP code
    if accident_zip is None:
        continue
        
    month = accident.get('Month') 
    date = accident.get('Day')
    year = accident.get('Year')
    
    # Truncate ZIP if longer than 5 digits
    if isinstance(accident_zip, str) and len(accident_zip) > 5:
        accident_zip = accident_zip[:5]
    elif isinstance(accident_zip, (int, float)):
        accident_zip = str(int(accident_zip)).zfill(5)
    
    # Try to find the ZIP code, then try +1, -1, +2, -2, +3, -3
    zip_info = None
    found_zip = None
    
    # Try the original ZIP first
    zip_info = zip_to_location.get(accident_zip)
    if zip_info:
        found_zip = accident_zip
    else:
        # Try nearby ZIPs up to 3 away
        for offset in [1, -1, 2, -2, 3, -3]:
            try:
                # Convert to int, add offset, then back to string
                nearby_zip = str(int(accident_zip) + offset).zfill(5)
                zip_info = zip_to_location.get(nearby_zip)
                if zip_info:
                    found_zip = nearby_zip
                    break
            except (ValueError, TypeError):
                # Handle cases where ZIP can't be converted to int
                continue
    
    if zip_info:
        state_id, county_code = zip_info
        
        # First try exact match for month, year, state_id, and county_code
        row = state_crash_full[(state_crash_full["Crash Month"] == month) &
                              (state_crash_full["Crash Year"] == year) &
                              (state_crash_full["state_id"] == state_id.lower()) & 
                              (state_crash_full["county_code"] == county_code)]
        
        # If no exact match, try with just month, year, and state
        if len(row) == 0:
            row = state_crash_full[(state_crash_full["Crash Month"] == month) &
                                  (state_crash_full["Crash Year"] == year) &
                                  (state_crash_full["state_id"] == state_id.lower())]
            
            # If we found rows for this state/month/year but not this county, 
            # find the closest county
            if len(row) > 0:
                target_county_int = None
                try:
                    target_county_int = int(county_code)
                    row = row.copy()
                    row['county_code_int'] = row['county_code'].apply(
                        lambda x: abs(int(x) - target_county_int) if str(x).isdigit() else float('inf')
                    )
                    row = row.loc[row['county_code_int'].idxmin()].to_frame().T
                    inexact_match_count += 1
                except (ValueError, TypeError):
                    # If county code can't be converted to int, just use first row
                    row = row.iloc[[0]]
                    inexact_match_count += 1
            else:
                # Try with just year and state (different month)
                row = state_crash_full[(state_crash_full["Crash Year"] == year) &
                                      (state_crash_full["state_id"] == state_id.lower())]
                
                if len(row) > 0:
                    # Find closest month
                    row = row.copy()
                    row['month_diff'] = abs(row['Crash Month'] - month)
                    row = row.loc[row['month_diff'].idxmin()].to_frame().T
                    inexact_match_count += 1
                else:
                    # As a last resort, try just the state with any year/month
                    row = state_crash_full[state_crash_full["state_id"] == state_id.lower()]
                    
                    if len(row) > 0:
                        # Sort by year and month to get most recent
                        row = row.sort_values(by=['Crash Year', 'Crash Month'], ascending=False)
                        row = row.iloc[[0]]
                        inexact_match_count += 1
        
        # Copy data from state_crash_data to the accident row if we found a match
        if len(row) > 0:
            matches_found += 1
            
            # Set the dba_location indicator if we used dba_zip
            if used_dba:
                accidents_pd.at[idx, 'dba_location'] = 1
            
            # Map column names
            column_mapping = {
                'county_crash_count': 'crash_count',
                'county_total_fatalities': 'total_fatalities',
                'county_total_injuries': 'total_injuries',
                'county_total_vehicles': 'total_vehicles'
            }
            
            # Update values from the state crash data
            for dest_col, source_col in column_mapping.items():
                if source_col in row.columns:
                    value = row[source_col].values[0]
                    accidents_pd.at[idx, dest_col] = value

In [0]:
# Print summary
print(f"Total records processed: {total_processed}")
print(f"Matches found: {matches_found} ({matches_found/total_processed*100:.2f}%)")
print(f"Inexact matches: {inexact_match_count}")
print(f"DBA locations used: {accidents_pd['dba_location'].sum()} ({accidents_pd['dba_location'].sum()/total_processed*100:.2f}%)")

# Check non-null percentages
for col in ['county_crash_count', 'county_total_fatalities', 'county_total_injuries', 'county_total_vehicles']:
    non_null_count = accidents_pd[col].count()
    non_null_pct = (non_null_count / len(accidents_pd)) * 100
    print(f"{col}: {non_null_pct:.2f}% non-null values ({non_null_count}/{len(accidents_pd)})")

Total records processed: 335
Matches found: 335 (100.00%)
Inexact matches: 322
DBA locations used: 30 (8.96%)
county_crash_count: 100.00% non-null values (335/335)
county_total_fatalities: 100.00% non-null values (335/335)
county_total_injuries: 100.00% non-null values (335/335)
county_total_vehicles: 100.00% non-null values (335/335)


In [0]:
inexact_match_count

322

There are 322 rows in the accident dataset where we did not have an exact match for the county in the state crash data. For now, we will match to the closest county we have data, but we may need to use the crash data at the state level instead. Note that we are currently matching the month that the crash happened, which does not make sense from a modeling perspective and would be data leakage as the crashed in the dataset are probably a part of that count. In the feature engineering stage, we will experiment with different moving average and lags for the location based variables. 

## Mapping precipitation data 

In [0]:
# Read in 2025 Precipitation Data /FileStore/intermediate_output/2025_precipitation_data
df_precipitation_2024 = spark.read.format("parquet").option("header", "true").load("/FileStore/intermediate_output/2024_precipitation_data")

In [0]:
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql.types import StringType
import datetime

In [0]:
# Step 1: Convert PySpark dataframe to Pandas and prepare the precipitation data
def prepare_precipitation_data(df_precipitation_2025):
    # Add debug print statements
    print("DEBUG: Preparing precipitation data")
    print(f"DEBUG: Column names: {df_precipitation_2025.columns}")
    
    # Sample a few values to understand data format
    sample_df = df_precipitation_2025.limit(2).toPandas()
    print(f"DEBUG: Sample precipitation data (first 2 rows):")
    print(sample_df[["ZCTA5CE20"]].head())
    
    # Check for a specific ZIP code mentioned in the error
    if "ZCTA5CE20" in df_precipitation_2025.columns:
        sample_34984 = df_precipitation_2025.filter("ZCTA5CE20 = '34984'").limit(1).toPandas()
        if not sample_34984.empty:
            print(f"DEBUG: Sample data for ZIP 34984:")
            print(sample_34984[["ZCTA5CE20"]].head())
    
    # Convert PySpark dataframe to Pandas
    print("DEBUG: Converting PySpark dataframe to Pandas")
    precipitation_pd = df_precipitation_2025.select(
        "ZCTA5CE20",  # This is the zip code column
        *[col for col in df_precipitation_2025.columns if col.startswith("PRISM_ppt_")]
    ).toPandas()
    
    print(f"DEBUG: Converted to pandas, shape: {precipitation_pd.shape}")
    
    # Rename the zip code column for easier joining
    precipitation_pd = precipitation_pd.rename(columns={"ZCTA5CE20": "zip_code"})
    
    # Print a sample of ZIP codes to verify format
    print(f"DEBUG: Sample ZIP codes: {precipitation_pd['zip_code'].head(5).tolist()}")
    
    # Reshape the precipitation data from wide to long format
    print("DEBUG: Reshaping from wide to long format")
    precipitation_long = pd.melt(
        precipitation_pd,
        id_vars=["zip_code"],
        value_vars=[col for col in precipitation_pd.columns if col.startswith("PRISM_ppt_")],
        var_name="date_col",
        value_name="precipitation"
    )
    
    print(f"DEBUG: After melt, shape: {precipitation_long.shape}")
    
    # Extract the date from the column name, show example of what we're parsing
    print(f"DEBUG: Example date column name: {precipitation_long['date_col'].iloc[0] if not precipitation_long.empty else 'No data'}")
    
    # Extract year from one of the date columns to validate
    if not precipitation_long.empty:
        sample_date_col = precipitation_long['date_col'].iloc[0]
        date_part = sample_date_col.split("_")[-2]
        year_part = date_part[:4]
        print(f"DEBUG: Extracting year from sample date column: {year_part}")
    
    # Extract the date from the column name
    precipitation_long["date"] = precipitation_long["date_col"].apply(
        lambda x: datetime.datetime.strptime(x.split("_")[-2], "%Y%m%d").date()
    )
    
    # Print sample dates after extraction
    print(f"DEBUG: Sample extracted dates: {precipitation_long['date'].head(5).tolist() if not precipitation_long.empty else 'No data'}")
    
    # Drop the original date column name
    precipitation_long = precipitation_long.drop("date_col", axis=1)
    
    return precipitation_long

# Step 2: Prepare the accidents dataframe
def prepare_accidents_data(accidents_pd):
    print("DEBUG: Preparing accident data")
    
    # Create a new column for the zip code to use for joining
    accidents_pd["joining_zip"] = accidents_pd.apply(
        lambda row: row["dba_zip"] if row["dba_location"] == 1 else row["zip"], 
        axis=1
    )
    
    # Handle multiple zip codes in a single field - with SPACE separation
    # Some zip fields may contain multiple values (e.g., "72032 72035 72034")
    accidents_pd["zip_list"] = accidents_pd["joining_zip"].apply(
        lambda z: str(z).split() if pd.notna(z) else []
    )
    
    # Flag records with multiple zip codes
    accidents_pd["has_multiple_zips"] = accidents_pd["zip_list"].apply(len) > 1
    
    # Convert to string type in case it's not already
    accidents_pd["joining_zip"] = accidents_pd["joining_zip"].astype(str)
    
    # Make sure date fields are proper datetime
    # First try Date of Accident, then fall back to Date Reported if Date of Accident is null
    if "Date of Accident" in accidents_pd.columns and "Date Reported" in accidents_pd.columns:
        # Convert both date fields to datetime
        accidents_pd["accident_date"] = pd.to_datetime(accidents_pd["Date of Accident"], errors='coerce')
        reported_date = pd.to_datetime(accidents_pd["Date Reported"], errors='coerce')
        
        # Use Date Reported where Date of Accident is null
        accidents_pd.loc[accidents_pd["accident_date"].isna(), "accident_date"] = reported_date
        
        # Convert to date object (without time)
        accidents_pd["accident_date"] = accidents_pd["accident_date"].dt.date
        
        # Print statistics on date coverage
        date_count = accidents_pd["accident_date"].notna().sum()
        print(f"DEBUG: Records with date (after using Date Reported as fallback): {date_count} / {len(accidents_pd)}")
    
    return accidents_pd

# Step 3: Join the datasets
def join_precipitation_with_accidents(precipitation_long, accidents_prepared):
    print("DEBUG: Starting join operation")
    
    # Convert zip codes to string for joining
    precipitation_long["zip_code"] = precipitation_long["zip_code"].astype(str)
    
    # Print some stats about the datasets before joining
    print(f"DEBUG: Precipitation data shape: {precipitation_long.shape}")
    print(f"DEBUG: Accident data shape: {accidents_prepared.shape}")
    
    # Check ZIP code formats in both datasets
    precip_zip_samples = precipitation_long["zip_code"].dropna().head(5).tolist()
    accident_zip_samples = accidents_prepared["joining_zip"].dropna().head(5).tolist()
    print(f"DEBUG: Sample precipitation ZIP codes: {precip_zip_samples}")
    print(f"DEBUG: Sample accident ZIP codes: {accident_zip_samples}")
    
    # Check for specific ZIP mentioned in the error
    if "34984" in precipitation_long["zip_code"].values:
        print("DEBUG: ZIP 34984 found in precipitation data")
        precip_34984 = precipitation_long[precipitation_long["zip_code"] == "34984"].shape[0]
        print(f"DEBUG: Number of records for ZIP 34984: {precip_34984}")
        
        # Sample dates for this ZIP
        dates_34984 = precipitation_long[precipitation_long["zip_code"] == "34984"]["date"].head(5).tolist()
        print(f"DEBUG: Sample dates for ZIP 34984: {dates_34984}")
    else:
        print("DEBUG: ZIP 34984 NOT found in precipitation data")
    
    # Check date ranges in both datasets
    if not precipitation_long.empty and "date" in precipitation_long.columns:
        precip_min_date = precipitation_long["date"].min()
        precip_max_date = precipitation_long["date"].max()
        print(f"DEBUG: Precipitation date range: {precip_min_date} to {precip_max_date}")
    
    if "accident_date" in accidents_prepared.columns:
        acc_min_date = accidents_prepared["accident_date"].dropna().min() if accidents_prepared["accident_date"].notna().any() else None
        acc_max_date = accidents_prepared["accident_date"].dropna().max() if accidents_prepared["accident_date"].notna().any() else None
        print(f"DEBUG: Accident date range: {acc_min_date} to {acc_max_date}")
    
    # For records with single zip code, do a direct join
    single_zip_accidents = accidents_prepared[~accidents_prepared["has_multiple_zips"]]
    print(f"DEBUG: Records with single ZIP code: {len(single_zip_accidents)}")
    
    # Before joining, check if we have any matching date-ZIP combinations
    if not precipitation_long.empty and "accident_date" in single_zip_accidents.columns:
        sample_accidents = single_zip_accidents.dropna(subset=["accident_date"]).head(5)
        for _, acc in sample_accidents.iterrows():
            matching = precipitation_long[
                (precipitation_long["date"] == acc["accident_date"]) & 
                (precipitation_long["zip_code"] == acc["joining_zip"])
            ]
            print(f"DEBUG: For accident date {acc['accident_date']} and ZIP {acc['joining_zip']}, found {len(matching)} matching precipitation records")
    
    merged_data = pd.merge(
        single_zip_accidents,
        precipitation_long,
        left_on=["accident_date", "joining_zip"],
        right_on=["date", "zip_code"],
        how="left"
    )
    
    print(f"DEBUG: After joining single-ZIP records, shape: {merged_data.shape}")
    print(f"DEBUG: Missing precipitation values: {merged_data['precipitation'].isna().sum()}")
    
    # For records with multiple zip codes, we need to handle differently
    multi_zip_accidents = accidents_prepared[accidents_prepared["has_multiple_zips"]]
    print(f"DEBUG: Records with multiple ZIP codes: {len(multi_zip_accidents)}")
    
    multi_zip_results = []
    
    for _, accident in multi_zip_accidents.iterrows():
        # Get all precipitation values for each zip in the list and average them
        zip_list = accident["zip_list"]
        accident_date = accident["accident_date"]
        
        print(f"DEBUG: Processing multi-ZIP record with date {accident_date} and ZIPs {zip_list}")
        
        if pd.isna(accident_date):
            print("DEBUG: Skipping record with missing date")
            accident_copy = accident.copy()
            accident_copy["precipitation"] = None
            accident_copy["date"] = None
            accident_copy["zip_code"] = ",".join(zip_list) if zip_list else None
            accident_copy["multiple_zip_avg"] = True
            multi_zip_results.append(accident_copy)
            continue
        
        # Find precipitation data for each zip code on the accident date
        matching_precip = precipitation_long[
            (precipitation_long["date"] == accident_date) & 
            (precipitation_long["zip_code"].isin(zip_list))
        ]
        
        print(f"DEBUG: Found {len(matching_precip)} matching precipitation records for this multi-ZIP accident")
        if not matching_precip.empty:
            for _, p in matching_precip.head(3).iterrows():
                print(f"DEBUG: Sample match - ZIP: {p['zip_code']}, Date: {p['date']}, Precip: {p['precipitation']}")
        
        # Calculate average precipitation across all matching zip codes
        avg_precip = matching_precip["precipitation"].mean() if not matching_precip.empty else None
        print(f"DEBUG: Average precipitation: {avg_precip}")
        
        # Create a copy of the accident record with the average precipitation
        accident_copy = accident.copy()
        
        # Add precipitation data
        if pd.notna(avg_precip):
            accident_copy["precipitation"] = avg_precip
            accident_copy["date"] = accident_date
            accident_copy["zip_code"] = ",".join(zip_list)
            accident_copy["multiple_zip_avg"] = True
        else:
            # No matching precipitation data found for any zip
            accident_copy["precipitation"] = None
            accident_copy["date"] = accident_date
            accident_copy["zip_code"] = ",".join(zip_list)
            accident_copy["multiple_zip_avg"] = True
            
        multi_zip_results.append(accident_copy)
    
    # Convert multi-zip results to DataFrame if there are any
    if multi_zip_results:
        multi_zip_df = pd.DataFrame(multi_zip_results)
        print(f"DEBUG: Multi-ZIP results shape: {multi_zip_df.shape}")
        
        # Combine with single-zip results
        merged_data = pd.concat([merged_data, multi_zip_df], ignore_index=True)
    
    print(f"DEBUG: Final merged data shape: {merged_data.shape}")
    print(f"DEBUG: Final missing precipitation values: {merged_data['precipitation'].isna().sum()}")
    
    return merged_data

# Step 4: Main function to orchestrate the process
def map_precipitation_to_accidents(df_precipitation_2025, accidents_pd):
    # Prepare both datasets
    precipitation_long = prepare_precipitation_data(df_precipitation_2025)
    accidents_prepared = prepare_accidents_data(accidents_pd)
    
    # Join the datasets
    result = join_precipitation_with_accidents(precipitation_long, accidents_prepared)
    
    # Check for missing precipitation data
    missing_precip_count = result["precipitation"].isna().sum()
    missing_precip_percent = (missing_precip_count / len(result)) * 100 if len(result) > 0 else 0
    
    print(f"RESULTS SUMMARY:")
    print(f"Total accident records: {len(result)}")
    print(f"Records with precipitation data: {len(result) - missing_precip_count} ({100 - missing_precip_percent:.2f}%)")
    print(f"Records missing precipitation data: {missing_precip_count} ({missing_precip_percent:.2f}%)")
    
    # Report on successful matches
    if len(result) - missing_precip_count > 0:
        print("\nSample of successful matches:")
        matched_sample = result[result["precipitation"].notna()].head(5)
        for _, row in matched_sample.iterrows():
            if row.get("multiple_zip_avg", False):
                print(f"Date: {row['accident_date']} | Multiple ZIPs: {row['zip_code']} | Precipitation: {row['precipitation']:.2f}")
            else:
                print(f"Date: {row['accident_date']} | ZIP: {row['zip_code']} | Precipitation: {row['precipitation']:.2f}")
    
    # Add some summary statistics (works with NaN values too)
    result["has_precipitation"] = result["precipitation"] > 0
    
    return result
    
    return result

# Sample usage:
# merged_data = map_precipitation_to_accidents(df_precipitation_2025, accidents_pd)

# Display some summary statistics
def analyze_precipitation_accidents(merged_data):
    # Calculate average precipitation on accident days
    avg_precip = merged_data["precipitation"].mean()
    
    # Count accidents that occurred on rainy days
    rainy_day_accidents = merged_data[merged_data["precipitation"] > 0].shape[0]
    total_accidents = merged_data.shape[0]
    rainy_day_percentage = (rainy_day_accidents / total_accidents) * 100 if total_accidents > 0 else 0
    
    # Group by precipitation ranges
    precip_ranges = [0, 0.1, 0.5, 1, 5, float('inf')]
    precip_labels = ['No rain', 'Light (0-0.1)', 'Moderate (0.1-0.5)', 'Heavy (0.5-1)', 'Extreme (>1)']
    
    merged_data['precip_category'] = pd.cut(
        merged_data['precipitation'], 
        bins=precip_ranges, 
        labels=precip_labels, 
        right=False
    )
    
    # Count accidents by precipitation category
    accident_counts_by_precip = merged_data['precip_category'].value_counts().sort_index()
    
    # Severity analysis if the column exists
    severity_by_precip = None
    if 'Severity' in merged_data.columns:
        severity_by_precip = merged_data.groupby('precip_category')['Severity'].value_counts()
    
    return {
        'average_precipitation': avg_precip,
        'rainy_day_accidents': rainy_day_accidents,
        'rainy_day_percentage': rainy_day_percentage,
        'accidents_by_precipitation': accident_counts_by_precip,
        'severity_by_precipitation': severity_by_precip
    }

In [0]:
accidents_pd_2024 = accidents_pd[accidents_pd["Year"] == 2024].copy()

In [0]:
merged_data = map_precipitation_to_accidents(df_precipitation_2024, accidents_pd_2024)

DEBUG: Preparing precipitation data
DEBUG: Column names: ['AWATER20', 'MTFCC20', 'CLASSFP20', 'INTPTLON20', 'latitude', 'FUNCSTAT20', 'GEOID20', 'ZCTA5CE20', 'INTPTLAT20', 'longitude', 'ALAND20', 'PRISM_ppt_provisional_4kmD2_20240901_bil', 'PRISM_ppt_provisional_4kmD2_20240902_bil', 'PRISM_ppt_provisional_4kmD2_20240911_bil', 'PRISM_ppt_provisional_4kmD2_20241210_bil', 'PRISM_ppt_provisional_4kmD2_20241211_bil', 'PRISM_ppt_provisional_4kmD2_20241212_bil', 'PRISM_ppt_provisional_4kmD2_20241213_bil', 'PRISM_ppt_provisional_4kmD2_20241214_bil', 'PRISM_ppt_provisional_4kmD2_20241215_bil', 'PRISM_ppt_provisional_4kmD2_20241216_bil', 'PRISM_ppt_provisional_4kmD2_20241217_bil', 'PRISM_ppt_provisional_4kmD2_20241218_bil', 'PRISM_ppt_provisional_4kmD2_20241219_bil', 'PRISM_ppt_provisional_4kmD2_20240912_bil', 'PRISM_ppt_provisional_4kmD2_20241220_bil', 'PRISM_ppt_provisional_4kmD2_20241221_bil', 'PRISM_ppt_provisional_4kmD2_20241222_bil', 'PRISM_ppt_provisional_4kmD2_20241223_bil', 'PRISM_ppt_p

In [0]:
# Display some summary statistics
def analyze_precipitation_accidents(merged_data):
    # Calculate average precipitation on accident days
    avg_precip = merged_data["precipitation"].mean()
    
    # Count accidents that occurred on rainy days
    rainy_day_accidents = merged_data[merged_data["precipitation"] > 0].shape[0]
    total_accidents = merged_data.shape[0]
    rainy_day_percentage = (rainy_day_accidents / total_accidents) * 100 if total_accidents > 0 else 0
    
    # Group by precipitation ranges
    precip_ranges = [0, 0.1, 0.5, 1, 5, float('inf')]
    precip_labels = ['No rain', 'Light (0-0.1)', 'Moderate (0.1-0.5)', 'Heavy (0.5-1)', 'Extreme (>1)']
    
    merged_data['precip_category'] = pd.cut(
        merged_data['precipitation'], 
        bins=precip_ranges, 
        labels=precip_labels, 
        right=False
    )
    
    # Count accidents by precipitation category
    accident_counts_by_precip = merged_data['precip_category'].value_counts().sort_index()
    
    # Severity analysis if the column exists
    severity_by_precip = None
    if 'Severity' in merged_data.columns:
        severity_by_precip = merged_data.groupby('precip_category')['Severity'].value_counts()
    
    return {
        'average_precipitation': avg_precip,
        'rainy_day_accidents': rainy_day_accidents,
        'rainy_day_percentage': rainy_day_percentage,
        'accidents_by_precipitation': accident_counts_by_precip,
        'severity_by_precipitation': severity_by_precip
    }

Starting with precipitation data that had separate columns for each day, we first transformed it into a long format where each row contains a zip code, date, and the corresponding precipitation amount. For the accident data, we created a "joining_zip" column that  selects either the regular "zip" or "dba_zip" based on the "dba_location" value, and ensured proper date formatting. We then joined these datasets using composite keys - matching both the accident date with the precipitation date, and the joining_zip with the precipitation zip code. We preserved all accident records regardless of whether matching precipitation data existed. This approach ensures that each accident is accurately associated with the weather conditions at its specific location and time.  

In [0]:
analysis_results = analyze_precipitation_accidents(merged_data)
print(analysis_results)

{'average_precipitation': 4.507265493146995, 'rainy_day_accidents': 112, 'rainy_day_percentage': 46.47302904564315, 'accidents_by_precipitation': precip_category
No rain               129
Light (0-0.1)          14
Moderate (0.1-0.5)     14
Heavy (0.5-1)          29
Extreme (>1)           48
Name: count, dtype: int64, 'severity_by_precipitation': precip_category     Severity
No rain             sev3        106
                    sev2         17
                    sev1          3
Light (0-0.1)       sev3          8
                    sev2          5
                    sev1          0
Moderate (0.1-0.5)  sev3         13
                    sev2          1
                    sev1          0
Heavy (0.5-1)       sev3         22
                    sev2          2
                    sev1          1
Extreme (>1)        sev3         39
                    sev2          9
                    sev1          0
Name: count, dtype: int64}
