## Load Data

In [15]:
# Imports used by this notebook.
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell

# Echo the value of every top-level expression in a cell, not only the last
# one, so intermediate results appear without print(). Cells that list a value
# mid-cell (e.g. the distinct injury severities) depend on this setting.
InteractiveShell.ast_node_interactivity = 'all'

# Never elide columns when a wide frame is displayed.
pd.options.display.max_columns = None

# Load the raw records; the path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="data.csv")

## Clean up and rename

In [16]:
# Source CSV header -> snake_case column name. Every pair is written out by
# hand (no automatic case conversion); pairs are grouped by topic.
explicit_column_map = {
    # coordinates and record identifiers
    "X": "x", "Y": "y",
    "OBJECTID": "object_id", "Incidentid": "incident_id",
    # when and where
    "DateTime": "date_time", "Year": "year",
    "StreetName": "street_name", "CrossStreet": "cross_street",
    "Distance": "distance", "JunctionRelation": "junction_relation",
    # outcome of the collision
    "Totalinjuries": "total_injuries", "Totalfatalities": "total_fatalities",
    "Injuryseverity": "injury_severity", "Collisionmanner": "collision_manner",
    # environment
    "Lightcondition": "light_condition", "Weather": "weather",
    "SurfaceCondition": "surface_condition",
    # first unit / driver
    "Unittype_One": "unit_type_1",
    "Age_Drv1": "age_1", "Gender_Drv1": "gender_1",
    "Traveldirection_One": "travel_direction_1", "Unitaction_One": "unit_action_1",
    "Violation1_Drv1": "violation_1",
    "AlcoholUse_Drv1": "alcohol_use_1", "DrugUse_Drv1": "drug_use_1",
    # second unit / driver
    "Unittype_Two": "unit_type_2",
    "Age_Drv2": "age_2", "Gender_Drv2": "gender_2",
    "Traveldirection_Two": "travel_direction_2", "Unitaction_Two": "unit_action_2",
    "Violation1_Drv2": "violation_2",
    "AlcoholUse_Drv2": "alcohol_use_2", "DrugUse_Drv2": "drug_use_2",
    # latitude / longitude
    "Latitude": "lat", "Longitude": "long",
}

# Relabel the columns on the existing frame. Labels that are not keys of the
# map are left as they are, so running this cell again is harmless.
df.rename(explicit_column_map, axis="columns", inplace=True)

# Peek at the last two rows to confirm the new names.
df.tail(2)

Unnamed: 0,x,y,object_id,incident_id,date_time,year,street_name,cross_street,distance,junction_relation,total_injuries,total_fatalities,injury_severity,collision_manner,light_condition,weather,surface_condition,unit_type_1,age_1,gender_1,travel_direction_1,unit_action_1,violation_1,alcohol_use_1,drug_use_1,unit_type_2,age_2,gender_2,travel_direction_2,unit_action_2,violation_2,alcohol_use_2,drug_use_2,lat,long
51303,-111.926404,33.435576,51304,4155885.0,2024/03/14 13:20:00+00,2024.0,SR-202 Exit 7 T-Ramp,,0.0,Entrance Exit Ramp 205,0.0,0.0,No Injury,Rear End,Daylight,Clear,Dry,Driver,40.0,Male,East,Making Right Turn,Followed Too Closely,No Apparent Influence,No Apparent Influence,Driver,25.0,Female,West,Making Right Turn,No Improper Action,No Apparent Influence,No Apparent Influence,33.435576,-111.926404
51304,-111.909869,33.436621,51305,4155890.0,2024/03/15 07:11:00+00,2024.0,SR-202 Exit 8 J-Ramp,,0.0,Entrance Exit Ramp 205,0.0,0.0,No Injury,Single Vehicle,Dark Lighted,Clear,Dry,Driver,26.0,Male,North,Making Left Turn,Speed To Fast For Conditions,No Apparent Influence,No Apparent Influence,,,,,,,,,33.436621,-111.909869


In [19]:
# Parse the timestamp column and strip its timezone so it becomes a naive
# datetime64 column (the wall-clock time is kept). Values that cannot be parsed
# are coerced to NaT rather than raising.
df['date_time'] = pd.to_datetime(df['date_time'], errors='coerce').dt.tz_localize(None)

# Target dtype for every other column. The nullable 'Int64' and 'string'
# extension dtypes are used so that missing values survive the conversion.
# NOTE(review): the displayed rows include age values of 255 next to a missing
# gender - this looks like an "unknown" sentinel; confirm before analysing ages.
dtype_map = {
    'x': 'float',
    'y': 'float',
    'object_id': 'Int64',
    'incident_id': 'Int64',
    'year': 'Int64',
    'street_name': 'string',
    'cross_street': 'string',
    'distance': 'float',
    'junction_relation': 'string',
    'total_injuries': 'Int64',
    'total_fatalities': 'Int64',
    'injury_severity': 'string',
    'collision_manner': 'string',
    'light_condition': 'string',
    'weather': 'string',
    'surface_condition': 'string',
    'unit_type_1': 'string',
    'age_1': 'Int64',
    'gender_1': 'string',
    'travel_direction_1': 'string',
    'unit_action_1': 'string',
    'violation_1': 'string',
    'alcohol_use_1': 'string',
    'drug_use_1': 'string',
    'unit_type_2': 'string',
    'age_2': 'Int64',
    'gender_2': 'string',
    'travel_direction_2': 'string',
    'unit_action_2': 'string',
    'violation_2': 'string',
    'alcohol_use_2': 'string',
    'drug_use_2': 'string',
    'lat': 'float',
    'long': 'float'
}

# Convert column by column so one bad column does not block the others.
# Previously astype(..., errors='ignore') hid failed casts: the column kept its
# old dtype with no signal. The same failures (ValueError / TypeError) are still
# tolerated, but they are now recorded and reported instead of being swallowed.
conversion_failures = {}
for col, dtype in dtype_map.items():
    try:
        df[col] = df[col].astype(dtype)
    except (ValueError, TypeError) as exc:
        conversion_failures[col] = f"{type(exc).__name__}: {exc}"

if conversion_failures:
    print("Columns left unconverted:")
    for failed_col, reason in conversion_failures.items():
        print(f"  {failed_col}: {reason}")

# Show the resulting dtypes for a visual check.
df.dtypes

x                            float64
y                            float64
object_id                      Int64
incident_id                    Int64
date_time             datetime64[ns]
year                           Int64
street_name                   string
cross_street                  string
distance                     float64
junction_relation             string
total_injuries                 Int64
total_fatalities               Int64
injury_severity               string
collision_manner              string
light_condition               string
weather                       string
surface_condition             string
unit_type_1                   string
age_1                          Int64
gender_1                      string
travel_direction_1            string
unit_action_1                 string
violation_1                   string
alcohol_use_1                 string
drug_use_1                    string
unit_type_2                   string
age_2                          Int64
g

In [20]:
# Display the frame; pandas abbreviates a long frame to its first and last rows.
df

Unnamed: 0,x,y,object_id,incident_id,date_time,year,street_name,cross_street,distance,junction_relation,total_injuries,total_fatalities,injury_severity,collision_manner,light_condition,weather,surface_condition,unit_type_1,age_1,gender_1,travel_direction_1,unit_action_1,violation_1,alcohol_use_1,drug_use_1,unit_type_2,age_2,gender_2,travel_direction_2,unit_action_2,violation_2,alcohol_use_2,drug_use_2,lat,long
0,-111.952210,33.421910,1,2598369,2012-01-01 07:25:00,2012,University Dr,Hardy Dr,0.000,Intersection Non Interchange,6,0,Non Incapacitating Injury,ANGLE (Front To Side)(Other Than Left Turn),Dark Lighted,Clear,Dry,Driver,23,Female,West,Making Left Turn,Inattention Distraction,Alcohol,No Apparent Influence,Driver,20,Female,East,Going Straight Ahead,No Improper Action,No Apparent Influence,No Apparent Influence,33.421910,-111.952210
1,-111.937690,33.423190,2,2593420,2012-01-01 11:45:00,2012,7th St,Myrtle Ave,139.920,Not Junction Related,0,0,No Injury,Single Vehicle,Dark Lighted,Clear,Dry,Driver,255,,East,Backing,Unknown,No Apparent Influence,No Apparent Influence,,,,,,,,,33.423190,-111.937690
2,-111.891960,33.418420,3,2588493,2012-01-01 18:07:00,2012,Price Rd,Orange St,0.000,Intersection Interchange,0,0,No Injury,Left Turn,Daylight,Clear,Dry,Driver,26,Male,South,Making Left Turn,Made Improper Turn,No Apparent Influence,No Apparent Influence,Driver,31,Male,South,Going Straight Ahead,No Improper Action,No Apparent Influence,No Apparent Influence,33.418420,-111.891960
3,-111.947910,33.417790,4,2584449,2012-01-02 00:03:00,2012,11th St,Judd St,200.112,Not Junction Related,0,0,No Injury,Sideswipe Opposite Direction,Daylight,Clear,Dry,Driver,255,,Unknown,Unknown,Unknown,No Apparent Influence,No Apparent Influence,Driverless,,,East,Properly Parked,,,,33.417790,-111.947910
4,-111.928470,33.349560,5,2592211,2012-01-02 00:12:00,2012,Rural Rd,Elliot Rd,135.168,Intersection Related Non Interchange,1,0,Possible Injury,Sideswipe Same Direction,Dusk,Clear,Dry,Driver,59,Male,South,Changing Lanes,Unsafe Lane Change,No Apparent Influence,No Apparent Influence,Driver,20,Female,South,Going Straight Ahead,No Improper Action,No Apparent Influence,No Apparent Influence,33.349560,-111.928470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51300,-111.939524,33.386046,51301,4155893,2024-03-09 07:44:00,2024,US-60 Exit 173 J-Ramp,,0.000,Entrance Exit Ramp 205,0,0,No Injury,Single Vehicle,Dark Lighted,Clear,Dry,Driver,25,Male,South,Making Right Turn,Speed To Fast For Conditions,Alcohol,No Apparent Influence,,,,,,,,,33.386046,-111.939524
51301,-111.972540,33.409359,51302,4154752,2024-03-11 20:30:00,2024,I-10 Exit 153 C1-Ramp,,0.000,Not Junction Related,0,0,No Injury,Sideswipe Same Direction,Daylight,Clear,Dry,Driver,20,Male,East,Making Right Turn,Failed To Yield Right Of Way,No Apparent Influence,No Apparent Influence,Driver,31,Female,East,Making Right Turn,Unsafe Lane Change,No Apparent Influence,No Apparent Influence,33.409359,-111.972540
51302,-111.926385,33.435558,51303,4155884,2024-03-14 13:15:00,2024,SR-202 Exit 7 T-Ramp,,0.000,Not Junction Related,1,0,Possible Injury,Rear End,Daylight,Clear,Dry,Driver,35,Male,East,Making U Turn,Speed To Fast For Conditions,No Apparent Influence,No Apparent Influence,Driver,22,Female,East,Stopped In Trafficway,No Improper Action,No Apparent Influence,No Apparent Influence,33.435558,-111.926385
51303,-111.926404,33.435576,51304,4155885,2024-03-14 13:20:00,2024,SR-202 Exit 7 T-Ramp,,0.000,Entrance Exit Ramp 205,0,0,No Injury,Rear End,Daylight,Clear,Dry,Driver,40,Male,East,Making Right Turn,Followed Too Closely,No Apparent Influence,No Apparent Influence,Driver,25,Female,West,Making Right Turn,No Improper Action,No Apparent Influence,No Apparent Influence,33.435576,-111.926404


In [25]:
# Distinct, non-missing labels found in 'injury_severity'. The bare .tolist()
# expression is shown as cell output only because the notebook echoes every
# top-level expression, not just the last one.
distinct_injury_severity = df['injury_severity'].dropna().unique()
distinct_injury_severity.tolist()

# Coarse severity class -> the raw labels that fall into it.
# Note that the first class is the string "None", not a missing value.
_labels_by_class = {
    "None": ["No Injury"],
    "Minor": [
        "Possible Injury",
        "Non Incapacitating Injury",
        "Suspected Minor Injury",
    ],
    "Severe": [
        "Incapacitating Injury",
        "Suspected Serious Injury",
        "Fatal",
    ],
}

# Flatten to the raw-label -> class lookup used for the mapping.
severity_mapping = {
    raw_label: severity
    for severity, raw_labels in _labels_by_class.items()
    for raw_label in raw_labels
}

# Derive the new column. A label that is not a key of severity_mapping (and any
# missing value) comes out as a missing value in 'severity_class'.
df["severity_class"] = df["injury_severity"].map(severity_mapping)


['Non Incapacitating Injury',
 'No Injury',
 'Possible Injury',
 'Incapacitating Injury',
 'Fatal',
 'Suspected Serious Injury',
 'Suspected Minor Injury']