## Load Data

In [1]:
# enable many obj dumps in a cell w/out print
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

import pandas as pd
pd.set_option('display.max_columns', None)

df = pd.read_csv("data.csv")
#print(df)
#df.tail(2)

## Clean up and rename

In [2]:
# Explicit dictionary mapping: original column name → desired column name
explicit_column_map = {
    "X": "x",
    "Y": "y",
    "OBJECTID": "object_id",
    "Incidentid": "incident_id",
    "DateTime": "date_time",
    "Year": "year",
    "StreetName": "street_name",
    "CrossStreet": "cross_street",
    "Distance": "distance",
    "JunctionRelation": "junction_relation",
    "Totalinjuries": "total_injuries",
    "Totalfatalities": "total_fatalities",
    "Injuryseverity": "injury_severity",
    "Collisionmanner": "collision_manner",
    "Lightcondition": "light_condition",
    "Weather": "weather",
    "SurfaceCondition": "surface_condition",
    "Unittype_One": "unit_type_1",
    "Age_Drv1": "age_1",
    "Gender_Drv1": "gender_1",
    "Traveldirection_One": "travel_direction_1",
    "Unitaction_One": "unit_action_1",
    "Violation1_Drv1": "violation_1",
    "AlcoholUse_Drv1": "alcohol_use_1",
    "DrugUse_Drv1": "drug_use_1",
    "Unittype_Two": "unit_type_2",
    "Age_Drv2": "age_2",
    "Gender_Drv2": "gender_2",
    "Traveldirection_Two": "travel_direction_2",
    "Unitaction_Two": "unit_action_2",
    "Violation1_Drv2": "violation_2",
    "AlcoholUse_Drv2": "alcohol_use_2",
    "DrugUse_Drv2": "drug_use_2",
    "Latitude": "lat",
    "Longitude": "long"
}

# Rename using the dictionary
df.rename(columns=explicit_column_map, inplace=True)
df.tail(2)

Unnamed: 0,x,y,object_id,incident_id,date_time,year,street_name,cross_street,distance,junction_relation,total_injuries,total_fatalities,injury_severity,collision_manner,light_condition,weather,surface_condition,unit_type_1,age_1,gender_1,travel_direction_1,unit_action_1,violation_1,alcohol_use_1,drug_use_1,unit_type_2,age_2,gender_2,travel_direction_2,unit_action_2,violation_2,alcohol_use_2,drug_use_2,lat,long
51303,-111.926404,33.435576,51304,4155885.0,2024/03/14 13:20:00+00,2024.0,SR-202 Exit 7 T-Ramp,,0.0,Entrance Exit Ramp 205,0.0,0.0,No Injury,Rear End,Daylight,Clear,Dry,Driver,40.0,Male,East,Making Right Turn,Followed Too Closely,No Apparent Influence,No Apparent Influence,Driver,25.0,Female,West,Making Right Turn,No Improper Action,No Apparent Influence,No Apparent Influence,33.435576,-111.926404
51304,-111.909869,33.436621,51305,4155890.0,2024/03/15 07:11:00+00,2024.0,SR-202 Exit 8 J-Ramp,,0.0,Entrance Exit Ramp 205,0.0,0.0,No Injury,Single Vehicle,Dark Lighted,Clear,Dry,Driver,26.0,Male,North,Making Left Turn,Speed To Fast For Conditions,No Apparent Influence,No Apparent Influence,,,,,,,,,33.436621,-111.909869


In [3]:
# Convert types with error handling and timezone removal for datetime
df['date_time'] = pd.to_datetime(df['date_time'], errors='coerce').dt.tz_localize(None)

# Type conversion map
dtype_map = {
    'x': 'float',
    'y': 'float',
    'object_id': 'Int64',
    'incident_id': 'Int64',
    'year': 'Int64',
    'street_name': 'string',
    'cross_street': 'string',
    'distance': 'float',
    'junction_relation': 'string',
    'total_injuries': 'Int64',
    'total_fatalities': 'Int64',
    'injury_severity': 'string',
    'collision_manner': 'string',
    'light_condition': 'string',
    'weather': 'string',
    'surface_condition': 'string',
    'unit_type_1': 'string',
    'age_1': 'Int64',
    'gender_1': 'string',
    'travel_direction_1': 'string',
    'unit_action_1': 'string',
    'violation_1': 'string',
    'alcohol_use_1': 'string',
    'drug_use_1': 'string',
    'unit_type_2': 'string', 
    'age_2': 'Int64',
    'gender_2': 'string',
    'travel_direction_2': 'string',
    'unit_action_2': 'string',
    'violation_2': 'string',
    'alcohol_use_2': 'string',
    'drug_use_2': 'string',
    'lat': 'float',
    'long': 'float'
}

# Apply conversions
for col, dtype in dtype_map.items():
    df[col] = df[col].astype(dtype, errors='ignore')

df.dtypes

x                            float64
y                            float64
object_id                      Int64
incident_id                    Int64
date_time             datetime64[ns]
year                           Int64
street_name           string[python]
cross_street          string[python]
distance                     float64
junction_relation     string[python]
total_injuries                 Int64
total_fatalities               Int64
injury_severity       string[python]
collision_manner      string[python]
light_condition       string[python]
weather               string[python]
surface_condition     string[python]
unit_type_1           string[python]
age_1                          Int64
gender_1              string[python]
travel_direction_1    string[python]
unit_action_1         string[python]
violation_1           string[python]
alcohol_use_1         string[python]
drug_use_1            string[python]
unit_type_2           string[python]
age_2                          Int64
g

In [4]:
df

Unnamed: 0,x,y,object_id,incident_id,date_time,year,street_name,cross_street,distance,junction_relation,total_injuries,total_fatalities,injury_severity,collision_manner,light_condition,weather,surface_condition,unit_type_1,age_1,gender_1,travel_direction_1,unit_action_1,violation_1,alcohol_use_1,drug_use_1,unit_type_2,age_2,gender_2,travel_direction_2,unit_action_2,violation_2,alcohol_use_2,drug_use_2,lat,long
0,-111.952210,33.421910,1,2598369,2012-01-01 07:25:00,2012,University Dr,Hardy Dr,0.000,Intersection Non Interchange,6,0,Non Incapacitating Injury,ANGLE (Front To Side)(Other Than Left Turn),Dark Lighted,Clear,Dry,Driver,23,Female,West,Making Left Turn,Inattention Distraction,Alcohol,No Apparent Influence,Driver,20,Female,East,Going Straight Ahead,No Improper Action,No Apparent Influence,No Apparent Influence,33.421910,-111.952210
1,-111.937690,33.423190,2,2593420,2012-01-01 11:45:00,2012,7th St,Myrtle Ave,139.920,Not Junction Related,0,0,No Injury,Single Vehicle,Dark Lighted,Clear,Dry,Driver,255,,East,Backing,Unknown,No Apparent Influence,No Apparent Influence,,,,,,,,,33.423190,-111.937690
2,-111.891960,33.418420,3,2588493,2012-01-01 18:07:00,2012,Price Rd,Orange St,0.000,Intersection Interchange,0,0,No Injury,Left Turn,Daylight,Clear,Dry,Driver,26,Male,South,Making Left Turn,Made Improper Turn,No Apparent Influence,No Apparent Influence,Driver,31,Male,South,Going Straight Ahead,No Improper Action,No Apparent Influence,No Apparent Influence,33.418420,-111.891960
3,-111.947910,33.417790,4,2584449,2012-01-02 00:03:00,2012,11th St,Judd St,200.112,Not Junction Related,0,0,No Injury,Sideswipe Opposite Direction,Daylight,Clear,Dry,Driver,255,,Unknown,Unknown,Unknown,No Apparent Influence,No Apparent Influence,Driverless,,,East,Properly Parked,,,,33.417790,-111.947910
4,-111.928470,33.349560,5,2592211,2012-01-02 00:12:00,2012,Rural Rd,Elliot Rd,135.168,Intersection Related Non Interchange,1,0,Possible Injury,Sideswipe Same Direction,Dusk,Clear,Dry,Driver,59,Male,South,Changing Lanes,Unsafe Lane Change,No Apparent Influence,No Apparent Influence,Driver,20,Female,South,Going Straight Ahead,No Improper Action,No Apparent Influence,No Apparent Influence,33.349560,-111.928470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51300,-111.939524,33.386046,51301,4155893,2024-03-09 07:44:00,2024,US-60 Exit 173 J-Ramp,,0.000,Entrance Exit Ramp 205,0,0,No Injury,Single Vehicle,Dark Lighted,Clear,Dry,Driver,25,Male,South,Making Right Turn,Speed To Fast For Conditions,Alcohol,No Apparent Influence,,,,,,,,,33.386046,-111.939524
51301,-111.972540,33.409359,51302,4154752,2024-03-11 20:30:00,2024,I-10 Exit 153 C1-Ramp,,0.000,Not Junction Related,0,0,No Injury,Sideswipe Same Direction,Daylight,Clear,Dry,Driver,20,Male,East,Making Right Turn,Failed To Yield Right Of Way,No Apparent Influence,No Apparent Influence,Driver,31,Female,East,Making Right Turn,Unsafe Lane Change,No Apparent Influence,No Apparent Influence,33.409359,-111.972540
51302,-111.926385,33.435558,51303,4155884,2024-03-14 13:15:00,2024,SR-202 Exit 7 T-Ramp,,0.000,Not Junction Related,1,0,Possible Injury,Rear End,Daylight,Clear,Dry,Driver,35,Male,East,Making U Turn,Speed To Fast For Conditions,No Apparent Influence,No Apparent Influence,Driver,22,Female,East,Stopped In Trafficway,No Improper Action,No Apparent Influence,No Apparent Influence,33.435558,-111.926385
51303,-111.926404,33.435576,51304,4155885,2024-03-14 13:20:00,2024,SR-202 Exit 7 T-Ramp,,0.000,Entrance Exit Ramp 205,0,0,No Injury,Rear End,Daylight,Clear,Dry,Driver,40,Male,East,Making Right Turn,Followed Too Closely,No Apparent Influence,No Apparent Influence,Driver,25,Female,West,Making Right Turn,No Improper Action,No Apparent Influence,No Apparent Influence,33.435576,-111.926404


In [5]:
# List distinct values in the 'injury_severity' column
distinct_injury_severity = df['injury_severity'].dropna().unique()
distinct_injury_severity.tolist()

# Define the standardization map for injury severity
severity_mapping = {
    "No Injury": "None",
    "Possible Injury": "Minor",
    "Non Incapacitating Injury": "Minor",
    "Suspected Minor Injury": "Minor",
    "Incapacitating Injury": "Severe",
    "Suspected Serious Injury": "Severe",
    "Fatal": "Severe"
}

# Apply the mapping to create a new column
df["severity_class"] = df["injury_severity"].map(severity_mapping)


['Non Incapacitating Injury',
 'No Injury',
 'Possible Injury',
 'Incapacitating Injury',
 'Fatal',
 'Suspected Serious Injury',
 'Suspected Minor Injury']

In [None]:
# Focus on intersection-level severity patterns
severity_df = (
    df.loc[df['severity_class'].notna(), [
        'object_id', 'street_name', 'cross_street', 'total_injuries',
        'total_fatalities', 'severity_class', 'collision_manner',
        'light_condition', 'weather', 'surface_condition'
    ]]
    .copy()
)

severity_df['street_name'] = severity_df['street_name'].fillna('').str.strip().str.title()
severity_df['cross_street'] = severity_df['cross_street'].fillna('').str.strip().str.title()
intersection_mask = severity_df['street_name'].ne('') & severity_df['cross_street'].ne('')
severity_df = severity_df[intersection_mask].copy()
severity_df['intersection'] = severity_df['street_name'] + ' & ' + severity_df['cross_street']

def most_common(series: pd.Series):
    cleaned = series.dropna().astype(str).str.title()
    if cleaned.empty:
        return pd.NA
    return cleaned.mode().iat[0]

intersection_summary = (
    severity_df.groupby('intersection').agg(
        total_crashes=('object_id', 'count'),
        severe_crashes=('severity_class', lambda s: (s == 'Severe').sum()),
        minor_crashes=('severity_class', lambda s: (s == 'Minor').sum()),
        no_injury_crashes=('severity_class', lambda s: (s == 'None').sum()),
        avg_injuries=('total_injuries', 'mean'),
        avg_fatalities=('total_fatalities', 'mean'),
        common_collision=('collision_manner', most_common),
        common_light=('light_condition', most_common),
        common_weather=('weather', most_common),
        common_surface=('surface_condition', most_common)
    )
    .assign(
        severe_rate=lambda d: d['severe_crashes'] / d['total_crashes'],
        minor_rate=lambda d: d['minor_crashes'] / d['total_crashes'],
        none_rate=lambda d: d['no_injury_crashes'] / d['total_crashes']
    )
    .sort_values(['total_crashes', 'severe_rate'], ascending=[False, False])
)

top_by_volume = (
    intersection_summary.head(10)
    .assign(
        severe_rate_pct=lambda d: (d['severe_rate'] * 100).round(1),
        minor_rate_pct=lambda d: (d['minor_rate'] * 100).round(1),
        none_rate_pct=lambda d: (d['none_rate'] * 100).round(1)
    )
)

top_by_severe_share = (
    intersection_summary.query('total_crashes >= 10')
    .sort_values('severe_rate', ascending=False)
    .head(10)
    .assign(severe_rate_pct=lambda d: (d['severe_rate'] * 100).round(1))
    .rename(columns={'severe_rate_pct': 'Severe %'})
)

print('display top by severe share')
display(top_by_severe_share[[
    'total_crashes', 'severe_crashes', 'Severe %', 'common_collision', 'common_light'
]])

print('display top by volume')
display(top_by_volume[[
    'total_crashes', 'severe_crashes', 'minor_crashes', 'no_injury_crashes',
    'severe_rate_pct', 'common_collision', 'common_light'
]].rename(columns={'severe_rate_pct': 'Severe %'}))

def share_table(frame: pd.DataFrame, column: str, min_crashes: int = 50):
    counts = frame[column].value_counts()
    rate = (
        frame.groupby(column)['severity_class']
        .value_counts(normalize=True)
        .rename('share')
        .reset_index()
        .pivot(index=column, columns='severity_class', values='share')
    )
    for level in ['Severe', 'Minor', 'None']:
        if level not in rate:
            rate[level] = 0.0
    rate = rate[['Severe', 'Minor', 'None']]
    rate = rate.join(counts.rename('total_crashes')).query('total_crashes >= @min_crashes')
    rate[['Severe', 'Minor', 'None']] = (rate[['Severe', 'Minor', 'None']] * 100).round(1)
    return rate.rename(columns={'Severe': 'Severe %', 'Minor': 'Minor %', 'None': 'None %'}) \
               .sort_values('Severe %', ascending=False)


severity_by_collision = share_table(severity_df, 'collision_manner')
severity_by_light = share_table(severity_df, 'light_condition')
severity_by_surface = share_table(severity_df, 'surface_condition')

print('severity by collison manner - top 10')
display(severity_by_collision.head(10))

print('severity by light condition')
display(severity_by_light)

print('severity by surface condition')
display(severity_by_surface)


display top by severe share


Unnamed: 0_level_0,total_crashes,severe_crashes,Severe %,common_collision,common_light
intersection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Apache Blvd & 13Th St,12,3,25.0,Left Turn,Daylight
Broadway Rd & Industrial Park Ave,14,3,21.4,Rear End,Daylight
Hardy Dr & 9Th St,10,2,20.0,Angle (Front To Side)(Other Than Left Turn),Daylight
Country Club Way & Southern Ave,18,3,16.7,Other,Daylight
University Dr & Melody Ln,12,2,16.7,Rear End,Daylight
Country Club Way & Broadway Rd,12,2,16.7,Sideswipe Same Direction,Daylight
Priest Dr & State Route 202 Exit 5 J-Ramp,19,3,15.8,Left Turn,Daylight
Dorsey Ln & Apache Blvd,13,2,15.4,Other,Daylight
Broadway Rd & Club Way,15,2,13.3,Left Turn,Daylight
Mill Ave & El Parque Dr,15,2,13.3,Left Turn,Daylight


display top by volume


Unnamed: 0_level_0,total_crashes,severe_crashes,minor_crashes,no_injury_crashes,Severe %,common_collision,common_light
intersection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Mcclintock Dr & Rio Salado Pkwy,442,4,137,301,0.9,Rear End,Daylight
Price Rd & Broadway Rd,435,3,95,337,0.7,Rear End,Daylight
University Dr & Rural Rd,418,4,116,298,1.0,Rear End,Daylight
Baseline Rd & Kyrene Rd,408,9,117,282,2.2,Rear End,Daylight
Broadway Rd & Mcclintock Dr,388,4,112,272,1.0,Rear End,Daylight
Mill Ave & Southern Ave,378,3,97,278,0.8,Rear End,Daylight
Rural Rd & Southern Ave,377,4,91,282,1.1,Rear End,Daylight
Southern Ave & Rural Rd,370,14,106,250,3.8,Left Turn,Daylight
University Dr & Mcclintock Dr,368,1,103,264,0.3,Rear End,Daylight
Rural Rd & Broadway Rd,362,7,98,257,1.9,Rear End,Daylight


severity by collison manner - top 10


Unnamed: 0_level_0,Severe %,Minor %,None %,total_crashes
collision_manner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Other,10.3,56.0,33.6,2136
Head On,5.9,36.9,57.2,624
Single Vehicle,4.3,27.1,68.6,3324
ANGLE (Front To Side)(Other Than Left Turn),2.8,36.5,60.7,5254
Left Turn,2.6,39.9,57.5,9441
Angle - Other Than Left Turn 2,2.3,41.0,56.8,3498
Sideswipe Opposite Direction,1.5,15.2,83.3,407
U Turn,1.3,21.9,76.8,155
Unknown,1.2,10.3,88.4,648
Rear End,0.7,27.4,71.9,18114


severity by light condition


Unnamed: 0_level_0,Severe %,Minor %,None %,total_crashes
light_condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unknown 51,4.8,17.7,77.5,417
Dark Lighted,3.0,28.5,68.4,11790
Dawn,2.9,28.4,68.7,454
Dark Unknown Lighting,2.3,9.4,88.3,256
Dusk,2.1,29.7,68.2,1612
Daylight,1.7,31.0,67.3,35852
Dark Not Lighted,1.6,25.3,73.2,380
51,,19.6,80.4,56


severity by surface condition


Unnamed: 0_level_0,Severe %,Minor %,None %,total_crashes
surface_condition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Unknown,3.7,15.5,80.7,747
Dry,2.0,30.5,67.5,48481
Wet,1.9,25.8,72.3,1558
