In [1]:
import numpy as np
import pandas as pd

In [2]:
cols = [
    "objectid",
    "year",
    "district",
    "date_",
    "primary_st",
    "age",
    "sex",
    "crash_type",
    "hit_____ru",
    "veh1",
    "veh2",
    "arrest_yes",
    "investigat",
    "time_of_day__24hrs_",
    "lat",
    "lng",
]

In [3]:
df = pd.read_csv("./fatal_crashes_main.csv", usecols=cols, index_col=False)
column_mapping = {
    "objectid": "crash_id",
    "district": "police_district",
    "date_": "crash_date",
    "primary_st": "primary_street",
    "sex": "victim_gender",
    "crash_type": "collision_type",
    "hit_____ru": "hit_and_run",
    "veh1": "primary_vehicle",
    "veh2": "secondary_vehicle",
    "arrest_yes": "arrest_made",
    "investigat": "investigation_status",
    "time_of_day__24hrs_": "time",
    "lat": "latitude",
    "lng": "longitude",
}
df = df.rename(columns=column_mapping)

In [4]:
df.sample(3)

Unnamed: 0,crash_id,year,police_district,crash_date,primary_street,age,victim_gender,collision_type,hit_and_run,primary_vehicle,secondary_vehicle,arrest_made,investigation_status,time,latitude,longitude
703,48615,2024,39,2024-08-05 04:00:00+00,2022 W. Hunting Park Ave.,42.0,F,Unit #1 was traveling east on Hunting Park Ave...,Yes,Auto,Pedestrian,No,Active investigation,00:11:00,40.012657,-75.160945
187,48101,2020,19,2020-09-09 04:00:00+00,Parkside Ave.,45.0,M,"Unit #1 operator was intoxicated, left roadway...",No,Auto,Pedestrian,Yes,"Inv. Mans. HBV while DUI, DUI, Simp. Assault, ...",,39.991408,-75.219698
660,48573,2024,2,2024-05-08 04:00:00+00,215 Comly St.,2.0,F,Unit #1 was double parked westbound at 215 W. ...,No,Auto,Pedestrian,No,Active investigation,17:35:00,40.048376,-75.105653


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 796 entries, 0 to 795
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   crash_id              796 non-null    int64  
 1   year                  796 non-null    int64  
 2   police_district       796 non-null    int64  
 3   crash_date            796 non-null    object 
 4   primary_street        796 non-null    object 
 5   age                   791 non-null    float64
 6   victim_gender         795 non-null    object 
 7   collision_type        796 non-null    object 
 8   hit_and_run           796 non-null    object 
 9   primary_vehicle       796 non-null    object 
 10  secondary_vehicle     793 non-null    object 
 11  arrest_made           780 non-null    object 
 12  investigation_status  792 non-null    object 
 13  time                  291 non-null    object 
 14  latitude              782 non-null    float64
 15  longitude             7

In [6]:
df.isnull().sum()

crash_id                  0
year                      0
police_district           0
crash_date                0
primary_street            0
age                       5
victim_gender             1
collision_type            0
hit_and_run               0
primary_vehicle           0
secondary_vehicle         3
arrest_made              16
investigation_status      4
time                    505
latitude                 14
longitude                14
dtype: int64

In [7]:
df.describe()

Unnamed: 0,crash_id,year,police_district,age,latitude,longitude
count,796.0,796.0,796.0,791.0,782.0,782.0
mean,48311.5,2021.708543,18.410804,43.342604,40.011418,-75.156395
std,229.929699,1.746292,10.887109,18.776303,0.145467,0.338102
min,47914.0,2019.0,1.0,0.0,39.884438,-80.502372
25%,48112.75,2020.0,9.0,29.0,39.974932,-75.179845
50%,48311.5,2022.0,17.0,40.0,40.005547,-75.144203
75%,48510.25,2023.0,25.0,58.0,40.033044,-75.097381
max,48709.0,2025.0,77.0,99.0,42.224336,-74.961302


In [8]:
df["crash_date"] = df["crash_date"].str.split().str[0]
df["crash_date"] = pd.to_datetime(df["crash_date"])

In [9]:
df.insert(
    loc=df.columns.get_loc("crash_date") + 1,
    column="day_name",
    value=df["crash_date"].dt.day_name(),
)

In [10]:
df["age_group"] = pd.cut(
    df["age"],
    bins=[0, 18, 25, 35, 45, 55, 65, 75, 100],
    labels=["0-18", "19-25", "26-35", "36-45", "46-55", "56-65", "66-75", "75+"],
)

In [11]:
df["hit_and_run"].value_counts()

hit_and_run
No     600
Yes    194
Unk      1
??       1
Name: count, dtype: int64

In [12]:
df["hit_and_run"] = np.where(
    df["hit_and_run"] == "Yes", 1, np.where(df["hit_and_run"] == "No", 0, np.nan)
)

In [13]:
df["arrest_made"].value_counts()

arrest_made
No               549
Pending          147
Yes               73
No                 4
TBD                2
Pending ID         1
Unk.               1
Arrested           1
None expected      1
N                  1
Name: count, dtype: int64

In [14]:
day_order = [
    "Monday",
    "Tuesday",
    "Wednesday",
    "Thursday",
    "Friday",
    "Saturday",
    "Sunday",
]
df["day_name"] = pd.Categorical(df["day_name"], categories=day_order, ordered=True)

In [15]:
df["victim_gender"].value_counts()

victim_gender
M     577
F     214
Mn      1
MN      1
f       1
M.      1
Name: count, dtype: int64

In [16]:
gender_mapping = {
    "M": "M",
    "F": "F",
    "MN": "M",
    "M.": "M",
    "Mn": "M",
    "f": "F",
}

df["victim_gender"] = df["victim_gender"].map(gender_mapping)
df["victim_gender"].value_counts()

victim_gender
M    580
F    215
Name: count, dtype: int64

In [17]:
df["primary_vehicle"].value_counts()

primary_vehicle
Auto                  623
M/C                    78
Motorcycle             29
Auto                   10
Train                   9
Dirtbike                9
ATV                     4
Bus                     4
Bicyclist               2
Dirt Bike               2
SEPTA Bus               2
Motor Scooter           2
Quad                    2
Bike                    2
unk                     1
Van                     1
T/T                     1
Moped                   1
Septa bus               1
Truck                   1
Trolley Car             1
Auto (Police)           1
Ambulance               1
Tow truck               1
Can Am Slingshot        1
Pedestrian              1
PFD Ladder TK           1
SEPTA Trolley           1
SEPTA Subway Train      1
Tree                    1
School Bus              1
Auto (Mack TK)          1
Name: count, dtype: int64

In [18]:
vehicle_mapping = {
    "Dirt Bike": "Dirtbike",
    "SEPTA Bus": "Bus",
    "Septa bus": "Bus",
    "Motor Scooter": "Moped",
    "T/T": "Truck",
    "Auto (Police)": "Auto",
    "Auto (Mack TK)": "Auto",
    "SEPTA Trolley": "Trolley",
    "SEPTA Subway Train": "Train",
    "PFD Ladder TK": "Truck",
    "Can Am Slingshot": "ATV",
}
df["primary_vehicle"] = df["primary_vehicle"].replace(vehicle_mapping)
vehicle_categories = {
    "M/C": "Motorcycle",
    "Auto": "Auto",
    "Motorcycle": "Motorcycle",
    "Dirtbike": "Motorcycle",
    "Bike": "Bicycle",
    "Bicyclist": "Bicycle",
    "Bus": "Commercial",
    "School Bus": "Commercial",
    "Truck": "Commercial",
    "Tow truck": "Commercial",
    "Ambulance": "Commercial",
    "Train": "Train",
    "Trolley": "Trolley",
    "ATV": "Off-road",
    "Quad": "Off-road",
    "Moped": "Motorcycle",
    "Van": "Auto",
    "Pedestrian": "Pedestrian",
    "Tree": "Object",
}
df["primary_vehicle"] = df["primary_vehicle"].map(vehicle_categories)


unmapped = df[df["primary_vehicle"].isna()]["primary_vehicle"].unique()


df["primary_vehicle"] = df["primary_vehicle"].fillna("Others")
print(df["primary_vehicle"].value_counts())

primary_vehicle
Auto          626
Motorcycle    121
Commercial     13
Others         12
Train          10
Off-road        7
Bicycle         4
Pedestrian      1
Trolley         1
Object          1
Name: count, dtype: int64


In [19]:
df["secondary_vehicle"].value_counts()

secondary_vehicle
Pedestrian               315
Auto                     234
Fixed Object             137
Bicycle                   17
Bicyclist                 10
M/C                        9
Parked veh.                8
Tree                       6
Scooter                    4
Parked                     4
Bus                        4
E-Scooter                  3
Ground                     3
Parked Trailer             3
Fixed object               3
Dirt-bike                  3
Motorcycle                 3
Tractor-Trailer            2
M/C and PED                2
Parked auto                2
Trolley car                2
Mini-bike                  2
Front-end loader           2
Auto (Parked)              2
Dirtbike                   1
Parked TT                  1
T/T                        1
School Bus                 1
Ped on skateboard          1
Parked autos               1
Pedestrian on scooter      1
ATV                        1
Truck                      1
E-Bicycle                

In [20]:
standardization_map = {
    "Pedestrian on scooter": "Pedestrian",
    "Ped on skateboard": "Pedestrian",
    "Pedestrians": "Pedestrian",
    "Parked veh.": "Parked Vehicle",
    "Parked auto": "Parked Vehicle",
    "Parked autos": "Parked Vehicle",
    "Auto (Parked)": "Parked Vehicle",
    "Parked": "Parked Vehicle",
    "Parked TT": "Parked Vehicle",
    "T/T": "Truck",
    "Tractor-Trailer": "Truck",
    "Dirt-bike": "Dirtbike",
    "Mini-bike": "Dirtbike",
    "E-Scooter": "Scooter",
    "E-Bicycle": "Bicycle",
    "Bicyclist": "Bicycle",
    "Fixed object": "Fixed Object",
    "Trolley car": "Trolley",
    "M/C and PED": "Motorcycle + Pedestrian",
    "School Bus": "Bus",
    "Front-end loader": "Construction Equipment",
}

df["secondary_vehicle"] = df["secondary_vehicle"].replace(standardization_map)
main_categories = {
    "Pedestrian": "Pedestrian",
    "Auto": "Auto",
    "Fixed Object": "Fixed Object",
    "Bicycle": "Bicycle",
    "M/C": "Motorcycle",
    "Parked Vehicle": "Parked Vehicle",
    "Bus": "Bus",
    "Dirtbike": "Motorcycle",
    "Scooter": "Motorcycle",
    "Motorcycle": "Motorcycle",
    "Truck": "Truck",
    "Trolley": "Trolley",
    "Ground": "Ground",
    "Parked Trailer": "Trailer",
    "ATV": "Off-road",
    "Moped": "Motorcycle",
    "Construction Equipment": "Commercial Vehicle",
    "Motorcycle + Pedestrian": "Mixed",
}
df["secondary_vehicle"] = df["secondary_vehicle"].map(main_categories)
df["secondary_vehicle"].value_counts()

secondary_vehicle
Pedestrian            318
Auto                  234
Fixed Object          140
Bicycle                28
Motorcycle             26
Parked Vehicle         18
Bus                     5
Truck                   4
Ground                  3
Trailer                 3
Trolley                 2
Mixed                   2
Commercial Vehicle      2
Off-road                1
Name: count, dtype: int64

In [21]:
schema = pd.read_csv("schema.csv", index_col=False)
data_types = dict(zip(schema["column_name"], schema["data_type"]))
df = df.astype(data_types)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 796 entries, 0 to 795
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   crash_id              796 non-null    Int64         
 1   year                  796 non-null    int32         
 2   police_district       796 non-null    category      
 3   crash_date            796 non-null    datetime64[ns]
 4   day_name              796 non-null    category      
 5   primary_street        796 non-null    string        
 6   age                   791 non-null    Float64       
 7   victim_gender         795 non-null    category      
 8   collision_type        796 non-null    string        
 9   hit_and_run           796 non-null    bool          
 10  primary_vehicle       796 non-null    category      
 11  secondary_vehicle     786 non-null    category      
 12  arrest_made           780 non-null    category      
 13  investigation_status

In [22]:
df.to_csv("./philadelphia_fatal_crashes_clean.csv")