In [1]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn import preprocessing
pd.set_option('display.max_columns', None)

In [20]:
df_raw = pd.read_csv('data/atx_crash_data_2018-2026.csv')

  df_raw = pd.read_csv('data/atx_crash_data_2018-2026.csv')


In [21]:
df_raw.head()

Unnamed: 0,ID,Crash ID,crash_fatal_fl,case_id,rpt_block_num,rpt_street_name,rpt_street_sfx,crash_speed_limit,road_constr_zone_fl,latitude,longitude,crash_sev_id,units_involved,point,onsys_fl,private_dr_fl,Crash timestamp (US/Central),Estimated Total Comprehensive Cost,Location group,Address
0,246606,16209332.0,False,180010160,12300,HARRIS BRANCH,PKWY,45.0,False,30.368405,-97.613322,5,Passenger car,POINT (-97.61332249 30.36840475),True,False,1/1/2018 0:36,60000,1.0,N HARRIS BRANCH PKWY & E PARMER LN
1,245425,16182657.0,False,180010354,14300,IH 35,,50.0,False,30.433927,-97.670054,3,Passenger car,POINT (-97.67005447 30.43392675),False,False,1/1/2018 1:36,440000,1.0,14300 N IH 35
2,245426,16182659.0,False,180010462,8300,BEN WHITE,BLVD,55.0,False,30.222877,-97.6791,0,Passenger car,POINT (-97.67910026 30.2228774),True,False,1/1/2018 2:26,20000,2.0,8300 E STATE HIGHWAY 71
3,245514,16185982.0,False,180010524,8400,SH 71 WB,,45.0,False,30.247853,-97.889753,5,Passenger car,POINT (-97.889753 30.247853),True,False,1/1/2018 2:43,20000,1.0,8400 E STATE HIGHWAY 71
4,245513,16185981.0,False,180010550,5800,LARK CREEK DR,DR,30.0,False,30.192214,-97.731125,5,Passenger car,POINT (-97.73112546 30.1922137),False,False,1/1/2018 2:51,20000,1.0,5800 LARK CREEK DR


Notes:
- rpt_street_name, rpt_block_num, latitude&longitude -> pick one
- units_involved -> break up into distinct flags e.g. passenger_car_involved, large_passenger_vehicle_involved, motorcycle_involved, etc.
- Crash timestamp (US/Central) -> convert to day_of_week, week/month_of_year, time_of_day cols (time of day can be bucketed to hour if needed)
- Location group -> may overlap with onsys_fl but can test with/without

In [51]:
def extract_units(s):
    a_list = []
    for units in s.split('&'):
        for unit in units.split('â€“'):
            a_list.append(unit.strip())

    return a_list

unit_cats = list(set(i for s in df_raw['units_involved'].str.lower().map(extract_units).to_list() for i in s))
unit_cats.remove('other/unknown') # combine with "other"
unit_cats

['train',
 'pedestrian',
 'motorcycle',
 'micromobility device',
 'e-scooter',
 'motor vehicle',
 'large passenger vehicle',
 'other',
 'passenger car',
 'bicycle']

In [114]:
drop_cols = ['ID','Crash ID','case_id','rpt_street_sfx','point','Address','crash_sev_id','crash_fatal_fl']
df = df_raw.drop(columns=drop_cols)

df['day_of_week'] = pd.to_datetime(df['Crash timestamp (US/Central)']).dt.weekday
df['week_of_year'] = pd.to_datetime(df['Crash timestamp (US/Central)']).dt.isocalendar().week
df['hour_of_day'] = pd.to_datetime(df['Crash timestamp (US/Central)']).dt.hour

df['units_involved'] = df['units_involved'].str.lower()
for cat in unit_cats:
    df[f'{cat}_involved'] = df['units_involved'].str.contains(cat)


df.drop(columns=['Crash timestamp (US/Central)','units_involved'],inplace=True)
df.dropna(inplace=True)

In [115]:
df.head()

Unnamed: 0,rpt_block_num,rpt_street_name,crash_speed_limit,road_constr_zone_fl,latitude,longitude,onsys_fl,private_dr_fl,Estimated Total Comprehensive Cost,Location group,day_of_week,week_of_year,hour_of_day,train_involved,pedestrian_involved,motorcycle_involved,micromobility device_involved,e-scooter_involved,motor vehicle_involved,large passenger vehicle_involved,other_involved,passenger car_involved,bicycle_involved
0,12300,HARRIS BRANCH,45.0,False,30.368405,-97.613322,True,False,60000,1.0,0,1,0,False,False,False,False,False,False,False,False,True,False
1,14300,IH 35,50.0,False,30.433927,-97.670054,False,False,440000,1.0,0,1,1,False,False,False,False,False,False,False,False,True,False
2,8300,BEN WHITE,55.0,False,30.222877,-97.6791,True,False,20000,2.0,0,1,2,False,False,False,False,False,False,False,False,True,False
3,8400,SH 71 WB,45.0,False,30.247853,-97.889753,True,False,20000,1.0,0,1,2,False,False,False,False,False,False,False,False,True,False
4,5800,LARK CREEK DR,30.0,False,30.192214,-97.731125,False,False,20000,1.0,0,1,2,False,False,False,False,False,False,False,False,True,False


In [121]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
import pickle as pkl

In [122]:
target = 'Estimated Total Comprehensive Cost'

In [134]:
df_x = df.drop(columns=['rpt_block_num','rpt_street_name',target]) # keep lat & long (test with others)
df_y = df[target]

x_train,x_test,y_train,y_test = train_test_split(df_x,df_y,train_size=35000,test_size=5000,random_state=42)

In [135]:
params = {
    'n_estimators':[25,50,100,200],
    'criterion': ['squared_error'],
    'max_depth': [3,5,10,None],
    'min_samples_leaf':[1,2,4],
    'max_features': ['sqrt','log2',None]
}

clf = GridSearchCV(RandomForestRegressor(),params,n_jobs=-1,cv=5)
#clf = RandomizedSearchCV(RandomForestRegressor(),params,n_iter=30,n_jobs=-1,cv=5,random_state=42)

In [136]:
clf.fit(x_train,y_train)

with open('GridSearchTreeRegressor.pkl','wb') as f:
    pkl.dump(clf,f)

In [137]:
print(clf.best_params_)
print(clf.best_score_)

{'criterion': 'squared_error', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'n_estimators': 100}
0.08436305178518615


In [131]:
clf.score(x_test,y_test)

0.07771988058684953

In [132]:
feature_importance = pd.Series(clf.best_estimator_.feature_importances_,index=df_x.columns)
feature_importance.sort_values(ascending=False)

pedestrian_involved                 0.304059
motorcycle_involved                 0.182893
longitude                           0.131649
hour_of_day                         0.094977
latitude                            0.061320
crash_speed_limit                   0.060280
Location group                      0.044014
bicycle_involved                    0.040045
week_of_year                        0.034559
day_of_week                         0.010536
onsys_fl                            0.009532
large passenger vehicle_involved    0.009494
e-scooter_involved                  0.007923
passenger car_involved              0.003672
road_constr_zone_fl                 0.001917
motor vehicle_involved              0.001866
other_involved                      0.001263
private_dr_fl                       0.000000
train_involved                      0.000000
micromobility device_involved       0.000000
dtype: float64