# Combining Accident Data 

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, BooleanType
from shapely.geometry import shape
import geopandas as gpd
import json
import os
import re

from pyspark.sql.functions import col, format_string,length, expr, when, array, struct, udf, explode, split, array_intersect, size, lit, year, month, sum, count, collect_set,  radians, sin, cos, asin, sqrt, lit
from pyspark.sql.types import ArrayType, StructType, StructField, StringType
from functools import partial
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec
import numpy as np
from pyspark.sql import functions as F
from pyspark.sql.window import Window



In [0]:
spark = SparkSession.builder.appName("Mapping Insurance Data to Accidents").getOrCreate()

In [0]:
registered_accidents = spark.read.csv("/FileStore/tables/accident_register_combined_driverid.csv", header=True, inferSchema=True)

In [0]:
# Read in data 
claims_df = spark.read.csv("/FileStore/tables/monthly_claims_insurance_driverid.csv", header=True, inferSchema=True)

In [0]:

# FIlter our rows of registered accidentd where 'Date Reported' and 'Date of Accident' are null
registered_accidents = registered_accidents.filter(registered_accidents['Date Reported'].isNotNull() | registered_accidents['Date of Accident'].isNotNull())

# FIlter our rows of claims df where loss date and reported date are null 
claims_df = claims_df.filter(claims_df['Loss Date'].isNotNull() | claims_df['Reported Date'].isNotNull())

In [0]:
# For claims_df: Use 'Loss Date' when possible, otherwise 'Reported Date'
claims_df = claims_df.withColumn(
    "Date", 
    F.when(F.col("Loss Date").isNotNull(), F.col("Loss Date"))
     .otherwise(F.col("Reported Date"))
)

In [0]:
# Make a final column Date that uses the value of 'Date of Accident' and if that is not null, if that is null use Date Reported
# Create a new Date column that prioritizes 'Date of Accident' and falls back to 'Date Reported'
registered_accidents = registered_accidents.withColumn(
    "Date", 
    F.when(F.col("Date of Accident").isNotNull(), F.col("Date of Accident"))
     .otherwise(F.col("Date Reported"))
)

In [0]:
# Create subset of claims_df with common columns
claims_subset = claims_df.select(
    "driver_id", 
    "Claim Number", 
    "insured_driver_name", 
    "insured_first_name", 
    "insured_last_name", 
    "driver_first_name", 
    "driver_last_name",
    "Date",
    "year"
)

# Create subset of registered_accidents with common columns
accidents_subset = registered_accidents.select(
    "driver_id", 
    "Claim Number", 
    "insured_driver_name", 
    "insured_first_name", 
    "insured_last_name", 
    "driver_first_name", 
    "driver_last_name",
    "Date",
    "Year"
)

In [0]:
# Make sure column names match exactly (Year/year)
claims_subset = claims_subset.withColumnRenamed("year", "Year")

# Combine the two dataframes
combined_df = claims_subset.union(accidents_subset)

In [0]:
# Group by the key columns and count occurrences
duplicate_check = combined_df.groupBy("driver_id", "Claim Number", "Date") \
    .agg(F.count("*").alias("occurrence_count")) \
    .filter(F.col("occurrence_count") > 1) \
    .orderBy(F.col("occurrence_count").desc())

# Show the duplicate records
print("Number of duplicate key combinations:", duplicate_check.count())
duplicate_check.show(10)

Number of duplicate key combinations: 91
+---------+------------------+-------------------+----------------+
|driver_id|      Claim Number|               Date|occurrence_count|
+---------+------------------+-------------------+----------------+
|  4447484|              NULL|2024-09-12 13:40:00|               4|
|  4794313|              NULL|2024-06-17 00:00:00|               4|
|  4447484|              NULL|2024-05-02 11:15:00|               4|
|  4447484|              NULL|2025-01-29 00:00:00|               4|
|  4794313|              NULL|2024-12-19 13:58:15|               4|
|  4447484|              NULL|2024-06-27 10:27:00|               4|
|  4794313|              NULL|2024-07-11 15:32:07|               4|
|  4794313|              NULL|2025-03-06 00:00:00|               4|
|  4183324|              NULL|2024-12-13 17:00:00|               3|
|  4036831|09 PC 000000338613|2024-02-08 00:00:00|               3|
+---------+------------------+-------------------+----------------+
only sh

In [0]:
# To see the actual duplicate records
if duplicate_check.count() > 0:
    # Get the keys of duplicate records
    duplicate_keys = duplicate_check.select("driver_id", "Claim Number", "Date")
    
    # Join back to get full duplicate records
    duplicate_records = combined_df.join(
        duplicate_keys,
        on=["driver_id", "Claim Number", "Date"],
        how="inner"
    )
    
    # Show the full duplicate records
    print("Sample of duplicate records:")
    duplicate_records.show(10)

In [0]:
duplicate_records.count()

27

In [0]:
# Remove duplicates by keeping only the first occurrence
unique_df = combined_df.dropDuplicates(["driver_id", "Claim Number", "Date"])

# Verify the result
print(f"Original combined dataframe row count: {combined_df.count()}")
print(f"Unique dataframe row count: {unique_df.count()}")
print(f"Number of duplicates removed: {combined_df.count() - unique_df.count()}")

Original combined dataframe row count: 591
Unique dataframe row count: 469
Number of duplicates removed: 122


In [0]:
# Remove rows where driver_id is null
unique_df = unique_df.filter(F.col("driver_id").isNotNull())

# Remove Claim Number Column
unique_df = unique_df.drop("Claim Number")


from pyspark.sql import functions as F
from pyspark.sql.types import TimestampType

# Function to standardize date formats
def standardize_dates(df):
    # First, convert any timestamp columns to string in YYYY-MM-DD format
    if df.schema["Date"].dataType == TimestampType():
        df = df.withColumn("Date", F.date_format("Date", "yyyy-MM-dd"))
    
    # Then handle various string formats
    return df.withColumn(
        "Date",
        F.when(
            # Match pattern like "1/13/2025"
            F.regexp_extract(F.col("Date"), "^(\\d{1,2})/(\\d{1,2})/(\\d{4})$", 0) != "",
            # Convert to YYYY-MM-DD
            F.date_format(F.to_date(F.col("Date"), "M/d/yyyy"), "yyyy-MM-dd")
        ).when(
            # Match pattern like "2024-12-31 16:22:37"
            F.regexp_extract(F.col("Date"), "^(\\d{4})-(\\d{1,2})-(\\d{1,2}).+$", 0) != "",
            # Extract just the date part
            F.regexp_extract(F.col("Date"), "^(\\d{4}-\\d{1,2}-\\d{1,2})", 1)
        ).otherwise(F.col("Date"))
    )

# Apply to our combined dataframe
unique_df = standardize_dates(unique_df)

# Verify the results

# Check if there are any non-standard formats left
non_standard = unique_df.filter(~F.regexp_extract(F.col("Date"), "^\\d{4}-\\d{1,2}-\\d{1,2}$", 0).rlike("^\\d{4}-\\d{1,2}-\\d{1,2}$"))
print(f"Number of non-standard dates remaining: {non_standard.count()}")

if non_standard.count() > 0:
    print("Sample of remaining non-standard dates:")
    non_standard.select("Date").distinct().show(10)

Number of non-standard dates remaining: 0


In [0]:
# Check for driver id and date duplicates 
driver_id_date_duplicates = unique_df.groupBy("driver_id", "Date") 

In [0]:
# Check for driver_id and Date duplicates
driver_id_date_duplicates = unique_df.groupBy("driver_id", "Date") \
    .agg(F.count("*").alias("occurrence_count")) \
    .filter(F.col("occurrence_count") > 1)

# Show the duplicate records
print("Number of driver_id and Date duplicates:", driver_id_date_duplicates.count())

if driver_id_date_duplicates.count() > 0:
    print("Sample of driver_id and Date duplicates:")
    driver_id_date_duplicates.show(10)
    
    # Get the duplicate keys
    duplicate_keys = driver_id_date_duplicates.select("driver_id", "Date")
    
    # Join back to see the full duplicate records
    full_duplicates = unique_df.join(
        duplicate_keys,
        on=["driver_id", "Date"],
        how="inner"
    )
    
    print("Sample of full duplicate records:")
    full_duplicates.show(10)

# Remove duplicates by keeping only one row per driver_id and Date
# We'll keep the first occurrence 
final_df = unique_df.dropDuplicates(["driver_id", "Date"])

# Verify the result
print(f"Original unique dataframe row count: {unique_df.count()}")
print(f"Final dataframe row count: {final_df.count()}")
print(f"Number of duplicates removed: {unique_df.count() - final_df.count()}")

# Show sample of the final da

In [0]:
display(final_df)

driver_id,insured_driver_name,insured_first_name,insured_last_name,driver_first_name,driver_last_name,Date,Year
67876,Kent Lucas,Kent,Lucas,Kent,Lucas,2023-07-29,2023
67876,Kent Lucas,Kent,Lucas,Kent,Lucas,2023-08-16,2023
3601228,Derrick Benjamin,Derrick,Benjamin,Derrick,Benjamin,2024-08-02,2024
3601235,Patrick Cortlessa,Patrick,Cortlessa,Patrick,Cortlessa,2023-06-06,2023
3601235,Patrick Cortlessa,Patrick,Cortlessa,Patrick,Cortlessa,2023-10-17,2023
3601235,Patrick Cortlessa,Patrick,Cortlessa,Patrick,Cortlessa,2025-02-04,2025
3602920,Mike Brown,Mike,Brown,Michael,Brown,2024-05-21,2024
3602920,Michael Brown,Michael,Brown,Michael,Brown,2024-06-09,2024
3602923,Romero Perez,Romero,Perez,Romero,Perez,2024-04-15,2024
3606675,Neal Fisher,Neal,Fisher,Neal,Fisher,2023-08-29,2023
