# Aggregating Location Information

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.pandas import read_excel
import geopandas as gpd
import numpy as np
import gc
import os 
import re
import datetime
from fuzzywuzzy import process
import pandas as pd
import rasterio
from rasterio.transform import rowcol
from functools import reduce
from pyspark.sql import DataFrame



### Read in Data

In [0]:
# Initialize Spark Session
spark = SparkSession.builder.appName("Accident Data Preprocessing").getOrCreate()

In [0]:
fusionsite_regional_df_pandas =  pd.read_excel("/dbfs/FileStore/tables/FusionSite_Services_Regional_Footprint_January_2025.xlsx")
accident_df_pandas = pd.read_csv("/dbfs/FileStore/tables/accident_register_combined_driverid.csv")
monthly_claims_insurance_pandas = pd.read_csv("/dbfs/FileStore/tables/monthly_claims_insurance_driverid.csv")
us_city_states_pandas = pd.read_csv("/dbfs/FileStore/tables/uscities.csv")
full_driver_details = pd.read_csv("/dbfs/FileStore/tables/full_driver_details.csv")

In [0]:
accident_df_pandas.drop_duplicates(inplace=True)

### Parsing Accident Location into city, state, zip

In [0]:
# Make a dictionary that maps stat acronyms to full names
acronym_full_name = us_city_states_pandas[["state_id", "state_name"]].drop_duplicates()

state_abbrevs = us_city_states_pandas["state_id"].unique()
full_state_names = us_city_states_pandas["state_name"].unique()
state_names  = dict(zip(acronym_full_name["state_id"], acronym_full_name["state_name"]))

# Lowercase all strings in state_abbrevs and 
# Lower case key and value in state_names dict
state_abbrevs = [abbrev.lower() for abbrev in state_abbrevs]
state_names = {key.lower(): value.lower() for key, value in state_names.items()}

accident_df_pandas["Location of Accident (street, city, state)"] = accident_df_pandas["Location of Accident (street, city, state)"].str.lower()

In [0]:
street_types = ['st',
 'ave',
 'blvd',
 'rd',
 'dr',
 'ln',
 'ct',
 'pkwy',
 'cir',
 'way',
 'street',
 'avenue',
 'boulevard',
 'road',
 'drive',
 'lane',
 'court',
 'parkway',
 'circle',
 'highway',
 'hwy',
 'route',
 'rt']

In [0]:
non_city_terms = ['block of',
 'junction of',
 'intersection',
 'corner of',
 'box',
 'rural route',
 'rr',
 'mile marker']

In [0]:
def parse_location(location):
    if not isinstance(location, str) or not location.strip():
        return None, None, None
    
    # Clean and standardize the location string
    location = location.strip()
    
    # Extract ZIP code using regex - find all matches and take the last one
    zip_pattern = r'\b(\d{5}(?:-\d{4})?)\b'
    zip_matches = re.findall(zip_pattern, location)
    zip_code = zip_matches[-1] if zip_matches else None
    
    # Extract state
    state = None
    state_index = -1
    
    # First check for state abbreviations
    for abbrev in state_abbrevs:
        pattern = r'\b' + abbrev + r'\b'
        match = re.search(pattern, location)
        if match:
            state = abbrev
            state_index = match.start()
            break
    
    # If no abbreviation found, check for full state names
    if state is None:
        for state_name, abbrev in state_names.items():
            if state_name in location:
                state = abbrev
                state_index = location.find(state_name)
                break
    
    return state, zip_code


In [0]:
# Create new columns for the extracted information
accident_df_pandas['state'] = None
accident_df_pandas['zip'] = None

# Process each row
for idx, row in accident_df_pandas.iterrows():
    location = row["Location of Accident (street, city, state)"]
    
    if location:
        state, zip_code, *other_values = parse_location(location)
        accident_df_pandas.at[idx, 'state'] = state
        accident_df_pandas.at[idx, 'zip'] = zip_code

In [0]:
known_cities = list(fusionsite_regional_df_pandas[fusionsite_regional_df_pandas["City"].notnull()]["City"].str.lower())

In [0]:
known_cities.append("austin")
known_cities.append("black mountain")

In [0]:
# Use word boundaries to ensure we match complete words, not substrings
pattern = '|'.join([fr"\b{re.escape(street_type)}\b[\s\.,]" for street_type in street_types])
city_names_with_street_name = list(us_city_states_pandas[us_city_states_pandas["city"].str.contains(pattern, na=False, case=False)]["city"].unique())

In [0]:
known_cities = known_cities + city_names_with_street_name
known_cities = [city for city in known_cities if all(term not in city.lower() for term in ["college", "court house", "mobile home park", "addition"])]

In [0]:
unique_city_names = us_city_states_pandas.groupby("city").size()[us_city_states_pandas.groupby("city").size() == 1]
# lowercase unique_city_names
unique_city_names = [name.lower() for name in list(unique_city_names.index)]

city_names = [name.lower() for name in us_city_states_pandas[us_city_states_pandas["city"].notnull()]["city"].unique()]

In [0]:
def extract_city(location):
    if not isinstance(location, str) or not location.strip():
        return None
    
    # Clean the location string
    location = location.strip().lower()
    
    # Special case handling for problematic patterns
    if "lavaca st" in location and "austin" in location:
        return "Austin"
    
    # Handle "place in [city]" pattern
    place_in_match = re.search(r'(place|pl)\s+in\s+([a-z]+)', location, re.IGNORECASE)
    if place_in_match:
        potential_city = place_in_match.group(2)
        # Check if this isn't a state and is followed by a state
        if potential_city not in state_abbrevs:
            # Check if a state follows
            after_city = location[place_in_match.end():]
            if any(state in after_city for state in state_abbrevs):
                return potential_city.title()
    
    # If it's just a state abbreviation, return None
    if location in state_abbrevs:
        return None
    
    # Check for known city-state pairs
    for city in known_cities:
        if city.lower() in location:
            return city.title()
    
    # Split by commas
    parts = [p.strip() for p in location.split(',')]
    
    # Case 1: Find city before state in comma format (city, state)
    if len(parts) >= 2:
        for i in range(len(parts) - 1):
            current_part = parts[i]
            next_part = parts[i + 1]
            
            # Check if next part contains a state code
            has_state = False
            for state in state_abbrevs:
                if state in next_part.split():
                    has_state = True
                    break
            
            # Check if current part has street type (with or without period)
            words = current_part.split()
            has_street_type = False
            for word in words:
                word_no_period = word.replace('.', '')
                if word_no_period in street_types:
                    has_street_type = True
                    break
            
            if has_state and not has_street_type:
                # Remove any leading street numbers
                city = re.sub(r'^\d+\s+', '', current_part)
                return city.title()
    
    # Pattern for street-type + city + state format (no commas)
    for street_type in street_types:
        # Try with and without period
        for st in [street_type, street_type + '.']:
            # Pattern like "lavaca st. austin texas" or "6th st. brownsville tx"
            pattern = r'(?:\d+\s+)?(?:\w+\s+)?' + re.escape(st) + r'\s+([a-z]+)\s+(?:' + '|'.join(state_abbrevs + full_state_names) + r')\b'
            match = re.search(pattern, location, re.IGNORECASE)
            if match:
                return match.group(1).title()
    
    # Pattern for road names + city + state (e.g., "3709 cove mountain rd sevierville, tn")
    for street_type in street_types:
        pattern = r'\b' + re.escape(street_type) + r'\s+([a-z]+)(?:\s*,\s*|\s+)(?:' + '|'.join(state_abbrevs) + r')\b'
        match = re.search(pattern, location, re.IGNORECASE)
        if match:
            potential_city = match.group(1)
            if potential_city not in street_types and not any(c.isdigit() for c in potential_city):
                return potential_city.title()
    
    # Case for any city immediately before state
    for state in state_abbrevs:
        pattern = r'([a-z]+)\s*,?\s*\b' + state + r'\b'
        match = re.search(pattern, location)
        if match:
            potential_city = match.group(1)
            # Verify this isn't a street type
            if potential_city not in street_types and not potential_city.isdigit():
                return potential_city.title()
    
    # Fallback to original behavior for other cases
    return None

def extract_cities_from_df(df, location_column):
    return df[location_column].apply(extract_city)

In [0]:
# Test examples
test_locations = [
    "3709 cove mountain rd sevierville, tn" ,
    "norriswood ave, memphis ,tn",
    "411 woodycrest nashville tn",
    "6631 old settler roads, waxhaw, nc 28173",
    "28 wendy lane asheville, nc 28805",
    "lavaca st. austin texas",
    "SC",
    "198 e grover street, shelby, nc",
    "prospect place in carrboro, nc",
    "Hwy 9 Black Mountain NC",
    "6th st. brownsville tx", 
    "9630 university city blvd, charlotte, nc"
]

# Test each example individually
print("Testing individual examples:")
for loc in test_locations:
    result = extract_city(loc)
    print(f"\nLocation: '{loc}'")
    print(f"Extracted city: {result}")

Testing individual examples:

Location: '3709 cove mountain rd sevierville, tn'
Extracted city: Sevierville

Location: 'norriswood ave, memphis ,tn'
Extracted city: Memphis

Location: '411 woodycrest nashville tn'
Extracted city: Nashville

Location: '6631 old settler roads, waxhaw, nc 28173'
Extracted city: Waxhaw

Location: '28 wendy lane asheville, nc 28805'
Extracted city: Asheville

Location: 'lavaca st. austin texas'
Extracted city: Austin

Location: 'SC'
Extracted city: None

Location: '198 e grover street, shelby, nc'
Extracted city: Shelby

Location: 'prospect place in carrboro, nc'
Extracted city: Carrboro

Location: 'Hwy 9 Black Mountain NC'
Extracted city: Black Mountain

Location: '6th st. brownsville tx'
Extracted city: Brownsville

Location: '9630 university city blvd, charlotte, nc'
Extracted city: Charlotte


In [0]:
accident_df_pandas['city'] = extract_cities_from_df(accident_df_pandas, 'Location of Accident (street, city, state)')

In [0]:
def clean_city_name(city):
    if city is None:
        return city
    return re.sub(r'^\D*\d+\s*', '', city)

accident_df_pandas['city'] = accident_df_pandas['city'].apply(clean_city_name)


## Using DBA to map accident to its corresponding site location

In [0]:
multiple_location_dbas = list(fusionsite_regional_df_pandas.groupby("Brand").size()[fusionsite_regional_df_pandas.groupby("Brand").size() > 1].index)

In [0]:
accident_df_pandas = accident_df_pandas[~accident_df_pandas["DBA"].str.contains("Home Office", na = False)]
fusionsite_regional_df_pandas.dropna(subset=["Zip"], inplace = True)

In [0]:
accident_df_pandas["DBA"] = accident_df_pandas["DBA"].ffill()

In [0]:
# Get unique DBA values that are not in the 'Brand' column
dba_values = accident_df_pandas[~accident_df_pandas["DBA"].isin(fusionsite_regional_df_pandas["Brand"])]["DBA"].unique()

# Create a list of possible 'Brand' values from fusionsite_regional_df_location_info
brand_values = fusionsite_regional_df_pandas["Brand"].tolist()

# Map DBA values to the closest Brand using fuzzy matching
mapped_dba_to_brand = {}
for dba in dba_values:
    closest_match, score = process.extractOne(dba, brand_values)
    mapped_dba_to_brand[dba] = closest_match


In [0]:
accident_df_pandas[accident_df_pandas["DBA"] == 'Nashville']

Unnamed: 0,"Location of Accident (street, city, state)",DBA,General Manager,Date of Accident,Date Reported,Fatality,insured_driver_name,Copy of Sate or Insurance Report,Severity,Summary of Accident,Claim Number,Total Incurred,Year,Region,Time of Accident,APMM Recordable,Notes,insured_first_name,insured_last_name,driver_first_name,driver_last_name,driver_id,state,zip,city
52,tn,Nashville,,2023-09-01 00:00:00,,,Shaun Huffman,,,,09 PC 000000289012,,2023,,,,,Shaun,Huffman,Shaun,Huffman,4041709.0,tn,,


In [0]:
del mapped_dba_to_brand['West Florida']
del mapped_dba_to_brand['Nashville']

In [0]:
cities_in_dba_names = ["Fayetteville", "Augusta", "Columbia", "Little Rock"]

def assign_city(dba):
    if "Woodycrest" in dba:
        return "Nashville"
    for city in cities_in_dba_names:
        if city in dba:
            return city
    return None

# Only assign city if current city is null

accident_df_pandas['city'] = accident_df_pandas.apply(
    lambda row: assign_city(row['DBA']) if pd.isnull(row['city']) else row['city'], axis=1
)

In [0]:
# Fix fuzzy wuzzy mappings after manual review
mapped_dba_to_brand['Arkansas Portables'] = 'Arkansas Portable Toilets'
mapped_dba_to_brand['ACP'] = 'A Clean Portoco'
mapped_dba_to_brand['ASC'] =  'A Sani-Can'
mapped_dba_to_brand['Fayetteville'] = 'Arkansas Portable Toilets'
mapped_dba_to_brand['AR Fayeteville'] = 'Arkansas Portable Toilets'
mapped_dba_to_brand['Littlejohn Portables'] = 'Premier Portables/ Prestigious Restrooms'

In [0]:
accident_df_pandas['DBA'] = accident_df_pandas['DBA'].apply(
    lambda dba: mapped_dba_to_brand[dba] if dba in mapped_dba_to_brand and dba not in fusionsite_regional_df_pandas["Brand"].values else dba
)

In [0]:
multiple_location_dbas

['A Clean Portoco',
 'A Sani-Can',
 'Arkansas Portable Toilets',
 'East Tennessee Portables',
 'Forza Site Services',
 'Portable Services',
 'Stamback Services']

In [0]:
# Prepare a mapping dataframe from fusionsite data
# Using Brand as the equivalent of DBA
location_mapping = fusionsite_regional_df_pandas[~fusionsite_regional_df_pandas["Brand"].isin(multiple_location_dbas)][['Brand', 'State', 'City', 'Zip', 'County']].copy()

# Handle potential duplicates by keeping first occurrence for each Brand
# This assumes that each Brand has consistent location info
location_mapping = location_mapping.drop_duplicates(subset=['Brand'])

# Rename columns for the merge
location_mapping = location_mapping.rename(columns={
    'Brand': 'DBA',
    'State': 'dba_state',
    'City': 'dba_city',
    'Zip': 'dba_zip',
    'County': 'dba_county'
})

# Merge the location data with the accident dataframe
accident_df_pandas = pd.merge(
    accident_df_pandas,
    location_mapping,
    on='DBA',
    how='left'  # Keep all rows from accident_df even if no match found
)

## Using Driver Name to get DBA site location

In [0]:
accident_df_pandas["driver_id"] = accident_df_pandas["driver_id"].apply(
    lambda x: str(int(x)) if pd.notna(x) else None
)

In [0]:
non_matched_names = accident_df_pandas[(accident_df_pandas["DBA"].isin(multiple_location_dbas)) &
                   (accident_df_pandas["zip"].isnull()) &
                   (accident_df_pandas["city"].isnull()) &
                   (~accident_df_pandas["driver_id"].isin(full_driver_details["Driver ID"]))]["insured_driver_name"]

In [0]:
driver_detail_names = full_driver_details[full_driver_details["Driver"].notnull()]["Driver"].unique()

# Get unique DBA values that are not in the 'Brand' column
unmatched_driver_names = accident_df_pandas[(~accident_df_pandas["insured_driver_name"].isin(driver_detail_names))& 
                                            (accident_df_pandas["insured_driver_name"].notnull())]["insured_driver_name"].unique()


In [0]:
# Map DBA values to the closest Brand using fuzzy matching
mapped_names = {}
for name in unmatched_driver_names:
    closest_match, score = process.extractOne(str(name), driver_detail_names)
    mapped_names[name] = closest_match

In [0]:
# Deleting some mappings after manual check
del mapped_names['Bostick Delvin Miguel Question']
del mapped_names['Micheal Roberts, Stephen Storti']
del mapped_names['Larry Woodley']
del mapped_names['unknown ']
del mapped_names['unknown']
del mapped_names['Javories Abraham']
del mapped_names['Henry Tony']
del mapped_names['none']
del mapped_names['Raul Sauceda']
del mapped_names['Edward Hart']
del mapped_names['na']
del mapped_names['Multiple drivers were at this site. ']
del mapped_names["It happened back in May, they didn't tell us a price till August. No idea what day, time, or driver did this."]
del mapped_names['Various Technicians']
del mapped_names['Jonathan Thompson']

In [0]:
accident_df_pandas['insured_driver_name'] = accident_df_pandas['insured_driver_name'].apply(lambda name: mapped_names.get(name, None) if name in unmatched_driver_names else name)

In [0]:
fs_cities = fusionsite_regional_df_pandas["City"].unique()

In [0]:
# Create a copy to avoid modifying the original
updated_df = accident_df_pandas.copy()

# Identify rows that need city updates based on the three conditions
mask = (
    (updated_df["DBA"].isin(multiple_location_dbas)) &
    (updated_df["zip"].isnull()) &
    (updated_df["city"].isnull())
)

# Get the indices of rows that need updating
rows_to_update = updated_df[mask].index
print(f"Found {len(rows_to_update)} rows that need city updates")

# Create a mapping from driver names to cities based on Groups column
driver_city_map = {}

for _, row in full_driver_details[["Driver", "Groups"]].iterrows():
    driver = row["Driver"]
    groups = str(row["Groups"])  # Convert to string in case it's not
    
    # Skip if Groups is empty or null
    if pd.isna(groups) or groups == "":
        continue
    
    # Look for city in groups using the pattern "XX - CityName"
    city_match = re.search(r'\w+ - (.+)', groups)
    if city_match:
        city = city_match.group(1).strip().lower()
        
        # Handle special case "Fay" -> "Fayetteville"
        if "fay" in city:
            city = "Fayetteville"
        
            
        # Check if this city is in the known cities list
        if any(fs_city.lower() == city.lower() for fs_city in fs_cities):
            driver_city_map[driver] = city
        
        # If city not in fs_cities, check if it contains any fs_cities
        else:
            for fs_city in fs_cities:
                if fs_city.lower() in city.lower():
                    driver_city_map[driver] = fs_city
                    break
# Update the city information
cities_found = 0

for idx in rows_to_update:
    driver_name = updated_df.loc[idx, "insured_driver_name"]
    
    # Check if we have a city for this driver
    if driver_name in driver_city_map:
        updated_df.loc[idx, "dba_city"] = driver_city_map[driver_name]
        cities_found += 1

print(f"Updated city information for {cities_found} out of {len(rows_to_update)} rows")
    

Found 28 rows that need city updates
Updated city information for 11 out of 28 rows


In [0]:
updated_df.drop_duplicates(inplace=True)

### Mapping DBA location using General Manager Info

In [0]:
updated_df[updated_df["dba_city"].isnull() &
            updated_df["dba_zip"].isnull() &
             updated_df["dba_county"].isnull()][["DBA", "General Manager "]].value_counts()

DBA                        General Manager               
A Sani-Can                 Howard Salters                    31
Portable Services          Andrew Munekata                   15
East Tennessee Portables   Charlie Seivers                    8
Arkansas Portable Toilets  Chris Greganti                     7
                           Brix Byers                         6
Forza Site Services        Albert Bernal                      4
A Clean Portoco            michael perez                      3
                           Michael Perez                      3
Arkansas Portable Toilets  Bubba Wood                         2
                           Bubba Wood                         2
                           Devin Dauel                        2
A Clean Portoco            MICHAEL PEREZ                      1
Forza Site Services        Albert Bernal                      1
Stamback Services          Joseph Schmuker                    1
Arkansas Portable Toilets  Devin Dauel        

In [0]:
# Define the mapping from manager names to locations (found these through online research)
manager_location_map = {
    "michael perez": "Harlingen, Texas",
    "Howard Salters": "Denver, NC",
    "Andrew Munekata": "Augusta, GA",
    "Charlie Seivers": "Knoxville, TN",
    "Chris Greganti": "Little Rock, AK",
    "Brix Byers": "Little Rock",
    "Albert Bernal": "Lubbock, TX",
    "Joseph Schmuker": "Wilcox, AZ",
    "Bubba Wood": "Little Rock, AK", 
    "Devin Dauel": "Little Rock, AK"
}

In [0]:
#update_location_by_manager 
updated_df['gm_lower'] = updated_df['General Manager '].astype(str).str.lower().str.strip()


# Track how many updates we make
city_updates = 0
state_updates = 0

# Process each row
for idx, row in updated_df.iterrows():
    gm_name = row['gm_lower']
    
    # Check if the GM name is in our mapping
    for manager, location in manager_location_map.items():
        if manager.lower() in gm_name:
            # Parse city and state from location
            if ',' in location:
                city, state = [part.strip() for part in location.split(',', 1)]
            
            # Update city if it's missing
            if pd.isna(row['dba_city']):
                updated_df.loc[idx, 'dba_city'] = city
                city_updates += 1
            
            # Update state if it's missing and we have state info
            if state and pd.isna(row['dba_state']):
                updated_df.loc[idx, 'dba_state'] = state
                state_updates += 1
            
            break  # Stop checking once we find a match

# Drop the temporary column
#updated_df = updated_df.drop(columns=['gm_lower'])
print(f"Updated {city_updates} city values and {state_updates} state values based on General Manager names")


Updated 90 city values and 93 state values based on General Manager names


### Filling in missing location information based on existing columns

In [0]:
# Step 1: Fill in missing values for dba_state, dba_city, dba_county based on dba_zip
updated_df['dba_state'] = updated_df.groupby('dba_zip')['dba_state'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
updated_df['dba_county'] = updated_df.groupby('dba_zip')['dba_county'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))

# Step 2: Fill in missing values for dba_state and dba_city based on dba_city
updated_df['dba_state'] = updated_df.groupby('dba_city')['dba_state'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))
updated_df['dba_zip'] = updated_df.groupby('dba_city')['dba_zip'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))

In [0]:
# Make sure the columns you want to use in us_city_states_pandas are non-null
fusionsite_regional_df_pandas_clean = fusionsite_regional_df_pandas.dropna(subset=['State.1', 'City', 'County', 'Zip'])

# lower case all columns 
fusionsite_regional_df_pandas_clean = fusionsite_regional_df_pandas.dropna(subset=['State.1', 'City', 'County', 'Zip']).applymap(lambda x: x.lower() if isinstance(x, str) else x)

# lower case dba_state, dba_city, dba_county in updated_df
# Lower case dba_state, dba_city, dba_county in updated_df
updated_df['dba_state'] = updated_df['dba_state'].str.lower()
updated_df['dba_city'] = updated_df['dba_city'].str.lower()
updated_df['dba_county'] = updated_df['dba_county'].str.lower()

# Iterate over the rows of updated_df to fill in the missing values
for idx, row in updated_df.iterrows():
    # Check if the value for dba_state, dba_city, or dba_county is null
    if pd.isnull(row['dba_state']) or pd.isnull(row['dba_city']) or pd.isnull(row['dba_county']):
        
        # Find matching row(s) in us_city_states_pandas_clean based on city, county, or zip
        matches = fusionsite_regional_df_pandas_clean[
            (fusionsite_regional_df_pandas_clean['County'] == row['dba_county']) |
            (fusionsite_regional_df_pandas_clean['City'] == row['dba_city']) |
            (fusionsite_regional_df_pandas_clean['Zip'] == row['dba_zip'])
        ]
        
        # If we find any match, use the first match to fill the missing values
        if not matches.empty:
            # Fill missing values with matched data
            updated_df.at[idx, 'dba_state'] = matches.iloc[0]['State.1'] 
            updated_df.at[idx, 'dba_city'] = matches.iloc[0]['City']
            updated_df.at[idx, 'dba_county'] = matches.iloc[0]['County']
            updated_df.at[idx, 'dba_zip'] = matches.iloc[0]['Zip']


In [0]:
# lowercase all columns 
updated_df = updated_df.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [0]:
# Iterate over the rows of updated_df to fill in the missing values
for idx, row in updated_df.iterrows():
    # Check if the value for dba_state, dba_city, or dba_county is null
    if pd.isnull(row['state']) or pd.isnull(row['city']):
        
        # Find matching row(s) in us_city_states_pandas_clean based on city, county, or zip
        matches = fusionsite_regional_df_pandas_clean[
            ((fusionsite_regional_df_pandas_clean['City'] == row['city']) & 
             (fusionsite_regional_df_pandas_clean['State.1'] == row['state'])) |
            (fusionsite_regional_df_pandas_clean['Zip'] == row['dba_zip'])
        ]
        
        # If we find any match, use the first match to fill the missing values
        if not matches.empty:
            # Fill missing values with matched data
            updated_df.at[idx, 'state'] = matches.iloc[0]['State.1'] 
            updated_df.at[idx, 'city'] = matches.iloc[0]['City']
            updated_df.at[idx, 'zip'] = matches.iloc[0]['Zip']

In [0]:
# lower case all columns 
us_city_states_pandas = us_city_states_pandas.applymap(lambda x: x.lower() if isinstance(x, str) else x)

In [0]:
# Iterate over the rows of updated_df to fill in the missing values
for idx, row in updated_df.iterrows():
    # Check if the value for dba_state, dba_city, or dba_county is null
    if pd.isnull(row['state']) or pd.isnull(row['city']) or pd.isnull(""):
        
        # Find matching row(s) in us_city_states_pandas_clean based on city, county, or zip
        matches = us_city_states_pandas[
            ((us_city_states_pandas['city'] == row['city']) & 
             (us_city_states_pandas['state_id'] == row['state'])) 
        ]
        
        # If we find any match, use the first match to fill the missing values
        if not matches.empty:
            # Fill missing values with matched data
            updated_df.at[idx, 'state'] = matches.iloc[0]['state_id'] 
            updated_df.at[idx, 'city'] = matches.iloc[0]['city']

In [0]:
for idx, row in updated_df.iterrows():
    if pd.isna(row['zip']):
        match = us_city_states_pandas[
            (us_city_states_pandas['state_id'] == row['state']) & 
            (us_city_states_pandas['city'] == row['city'])
        ]
        if not match.empty:
            updated_df.at[idx, 'zip'] = match.iloc[0]['zips']

                      

In [0]:
updated_df[(updated_df['zip'].isnull()) & (updated_df['dba_zip'].isnull())].shape[0]

17

There are 17 accidents we cannot map to a location using the address of the accident or the address of the DBA. 

In [0]:
full_driver_details[full_driver_details["Driver ID"].notnull()]["Driver ID"].unique()

array(['1471', '1144', 'Old South Auto Mechanic', ..., '0897', '2359',
       '894'], dtype=object)

In [0]:
updated_df[((updated_df['zip'].isnull()) & 
            (updated_df['dba_zip'].isnull()) &
            (~updated_df["driver_id"].isin(full_driver_details[full_driver_details["Driver ID"].notnull()]["Driver ID"].unique())))].shape

(17, 30)

In [0]:
updated_df = updated_df[~((updated_df['zip'].isnull()) & 
            (updated_df['dba_zip'].isnull()) &
            (~updated_df["driver_id"].isin(full_driver_details[full_driver_details["Driver ID"].notnull()]["Driver ID"].unique())))]

In [0]:
# Define the schema for the DataFrame
schema = StructType([
    StructField("Location of Accident (street, city, state)", StringType(), True),
    StructField("DBA", StringType(), True),
    StructField("General Manager", StringType(), True),
    StructField("Date of Accident", StringType(), True),
    StructField("Date Reported", StringType(), True),
    StructField("Fatality", StringType(), True),
    StructField("insured_driver_name", StringType(), True),
    StructField("Copy of Sate or Insurance Report", StringType(), True),
    StructField("Severity", StringType(), True),
    StructField("Summary of Accident", StringType(), True),
    StructField("Claim Number", StringType(), True),
    StructField("Total Incurred", FloatType(), True),
    StructField("Year", IntegerType(), True),
    StructField("Region", StringType(), True),
    StructField("Time of Accident", StringType(), True),
    StructField("APMM Recordable", StringType(), True),
    StructField("Notes", StringType(), True),
    StructField("insured_first_name", StringType(), True),
    StructField("insured_last_name", StringType(), True),
    StructField("driver_first_name", StringType(), True),
    StructField("driver_last_name", StringType(), True),
    StructField("driver_id", StringType(), True),
    StructField("state", StringType(), True),
    StructField("zip", StringType(), True),  # Changed from float to string to preserve leading zeros
    StructField("city", StringType(), True),
    StructField("dba_state", StringType(), True),
    StructField("dba_city", StringType(), True),
    StructField("dba_zip", StringType(), True),  # Changed from float to string
    StructField("dba_county", StringType(), True),
    StructField("gm_lower", StringType(), True)
])


def cleanup_zip_codes(df):
    # Handle 'zip' column
    def clean_zip(x):
        if pd.isna(x):
            return None
        
        # Convert to string first to handle both floats and strings
        x_str = str(x)
        
        # Check if it contains spaces (multiple zip codes)
        if ' ' in x_str:
            return x_str  # Keep multiple zip codes as is
        
        # For single zip codes, remove decimal part if it's a float
        try:
            # Try to convert to int to remove decimal, but only if it's a number without spaces
            return str(int(float(x_str)))
        except ValueError:
            # If conversion fails, return as is
            return x_str
    
    # Apply the function to both zip columns
    df['zip'] = df['zip'].apply(clean_zip)
    df['dba_zip'] = df['dba_zip'].apply(clean_zip)
    
    return df

# Apply the cleanup function
updated_df = cleanup_zip_codes(updated_df)

# Create Spark DataFrame with the defined schema
updated_df_spark = spark.createDataFrame(updated_df, schema=schema)

# Write to parquet
updated_df_spark.write.mode("overwrite").parquet("/FileStore/intermediate_output/accidents_wth_location")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['zip'] = df['zip'].apply(clean_zip)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['dba_zip'] = df['dba_zip'].apply(clean_zip)
