In [1]:
# Linear regression
# Date

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [3]:
# Import data
precovid_df = pd.read_csv('../python_etl_processing/precovid_data.csv')
precovid_df.head()

Unnamed: 0,crash_id,crash_fatal_fl,crash_date,crash_time,crash_speed_limit,road_constr_zone_fl,latitude,longitude,crash_sev_id,sus_serious_injry_cnt,...,motor_vehicle_death_count,motor_vehicle_serious_injury_count,bicycle_death_count,bicycle_serious_injury_count,pedestrian_death_count,pedestrian_serious_injury_count,motorcycle_death_count,motorcycle_serious_injury_count,other_death_count,other_serious_injury_count
0,12992764,0,2012-11-06,04:51:00,65.0,0.0,30.377406,-97.734442,3,0,...,0,0,0,0,0,0,0,0,0,0
1,12979184,0,2012-10-27,05:01:00,0.0,0.0,30.421736,-97.665935,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12963865,0,2012-10-22,12:42:00,30.0,0.0,30.438155,-97.785708,3,0,...,0,0,0,0,0,0,0,0,0,0
3,12979325,0,2012-10-29,03:00:00,25.0,0.0,30.20565,-97.85483,5,0,...,0,0,0,0,0,0,0,0,0,0
4,12979569,0,2012-10-18,02:04:00,65.0,0.0,30.417802,-97.67278,2,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Create 'serious' flag as outcome for fatalities or serious injuries

precovid_df['serious_fl'] = precovid_df.apply(lambda row: 1 if row['crash_fatal_fl'] == 1 or row['sus_serious_injry_cnt'] == 1 else 0, axis=1)
precovid_df.head()

Unnamed: 0,crash_id,crash_fatal_fl,crash_date,crash_time,crash_speed_limit,road_constr_zone_fl,latitude,longitude,crash_sev_id,sus_serious_injry_cnt,...,motor_vehicle_serious_injury_count,bicycle_death_count,bicycle_serious_injury_count,pedestrian_death_count,pedestrian_serious_injury_count,motorcycle_death_count,motorcycle_serious_injury_count,other_death_count,other_serious_injury_count,serious_fl
0,12992764,0,2012-11-06,04:51:00,65.0,0.0,30.377406,-97.734442,3,0,...,0,0,0,0,0,0,0,0,0,0
1,12979184,0,2012-10-27,05:01:00,0.0,0.0,30.421736,-97.665935,0,0,...,0,0,0,0,0,0,0,0,0,0
2,12963865,0,2012-10-22,12:42:00,30.0,0.0,30.438155,-97.785708,3,0,...,0,0,0,0,0,0,0,0,0,0
3,12979325,0,2012-10-29,03:00:00,25.0,0.0,30.20565,-97.85483,5,0,...,0,0,0,0,0,0,0,0,0,0
4,12979569,0,2012-10-18,02:04:00,65.0,0.0,30.417802,-97.67278,2,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Create year column
precovid_df['year'] = pd.DatetimeIndex(precovid_df['crash_date']).year
precovid_df['month'] = pd.DatetimeIndex(precovid_df['crash_date']).month
precovid_df['day'] = pd.DatetimeIndex(precovid_df['crash_date']).day
precovid_df['crash_time'] = pd.to_datetime(precovid_df['crash_time'])
precovid_df['hour'] = pd.DatetimeIndex(precovid_df['crash_time']).hour
precovid_df.head()

Unnamed: 0,crash_id,crash_fatal_fl,crash_date,crash_time,crash_speed_limit,road_constr_zone_fl,latitude,longitude,crash_sev_id,sus_serious_injry_cnt,...,pedestrian_serious_injury_count,motorcycle_death_count,motorcycle_serious_injury_count,other_death_count,other_serious_injury_count,serious_fl,year,month,day,hour
0,12992764,0,2012-11-06,2022-10-25 04:51:00,65.0,0.0,30.377406,-97.734442,3,0,...,0,0,0,0,0,0,2012,11,6,4
1,12979184,0,2012-10-27,2022-10-25 05:01:00,0.0,0.0,30.421736,-97.665935,0,0,...,0,0,0,0,0,0,2012,10,27,5
2,12963865,0,2012-10-22,2022-10-25 12:42:00,30.0,0.0,30.438155,-97.785708,3,0,...,0,0,0,0,0,0,2012,10,22,12
3,12979325,0,2012-10-29,2022-10-25 03:00:00,25.0,0.0,30.20565,-97.85483,5,0,...,0,0,0,0,0,0,2012,10,29,3
4,12979569,0,2012-10-18,2022-10-25 02:04:00,65.0,0.0,30.417802,-97.67278,2,0,...,0,0,0,0,0,0,2012,10,18,2


In [6]:
# Import population csv
pop_df = pd.read_csv('../python_etl_processing/austin_pop.csv')
pop_df.head()

Unnamed: 0,Year,Population,Growth Rate
0,2022,2176000,0.0279
1,2021,2117000,0.0312
2,2020,2053000,0.0343
3,2019,1985000,0.0366
4,2018,1915000,0.0419


In [7]:
merged_df = precovid_df.merge(pop_df, left_on="year", right_on="Year", how="left")
merged_df = merged_df.drop(columns="Year")
merged_df.head()

Unnamed: 0,crash_id,crash_fatal_fl,crash_date,crash_time,crash_speed_limit,road_constr_zone_fl,latitude,longitude,crash_sev_id,sus_serious_injry_cnt,...,motorcycle_serious_injury_count,other_death_count,other_serious_injury_count,serious_fl,year,month,day,hour,Population,Growth Rate
0,12992764,0,2012-11-06,2022-10-25 04:51:00,65.0,0.0,30.377406,-97.734442,3,0,...,0,0,0,0,2012,11,6,4,1495000,0.0425
1,12979184,0,2012-10-27,2022-10-25 05:01:00,0.0,0.0,30.421736,-97.665935,0,0,...,0,0,0,0,2012,10,27,5,1495000,0.0425
2,12963865,0,2012-10-22,2022-10-25 12:42:00,30.0,0.0,30.438155,-97.785708,3,0,...,0,0,0,0,2012,10,22,12,1495000,0.0425
3,12979325,0,2012-10-29,2022-10-25 03:00:00,25.0,0.0,30.20565,-97.85483,5,0,...,0,0,0,0,2012,10,29,3,1495000,0.0425
4,12979569,0,2012-10-18,2022-10-25 02:04:00,65.0,0.0,30.417802,-97.67278,2,0,...,0,0,0,0,2012,10,18,2,1495000,0.0425


In [8]:
merged_df.columns

Index(['crash_id', 'crash_fatal_fl', 'crash_date', 'crash_time',
       'crash_speed_limit', 'road_constr_zone_fl', 'latitude', 'longitude',
       'crash_sev_id', 'sus_serious_injry_cnt', 'nonincap_injry_cnt',
       'poss_injry_cnt', 'non_injry_cnt', 'unkn_injry_cnt', 'tot_injry_cnt',
       'death_cnt', 'pedestrian_fl', 'motor_vehicle_fl', 'motorcycle_fl',
       'bicycle_fl', 'other_fl', 'point', 'apd_confirmed_death_count',
       'motor_vehicle_death_count', 'motor_vehicle_serious_injury_count',
       'bicycle_death_count', 'bicycle_serious_injury_count',
       'pedestrian_death_count', 'pedestrian_serious_injury_count',
       'motorcycle_death_count', 'motorcycle_serious_injury_count',
       'other_death_count', 'other_serious_injury_count', 'serious_fl', 'year',
       'month', 'day', 'hour', 'Population', 'Growth Rate'],
      dtype='object')

In [10]:
factor_columns = ['month', 'day', 'hour', 'Population']
X = merged_df[factor_columns]
y = merged_df['serious_fl']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [11]:
clf = RandomForestClassifier(n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

Training Score: 0.9782503121964756
Testing Score: 0.9601082281115582


In [12]:
X.head()

Unnamed: 0,month,day,hour,Population
0,11,6,4,1495000
1,10,27,5,1495000
2,10,22,12,1495000
3,10,29,3,1495000
4,10,18,2,1495000


In [13]:
postcovid_df = pd.read_csv('../python_etl_processing/postcovid_data.csv')
postcovid_df.head()

Unnamed: 0,crash_id,crash_fatal_fl,crash_date,crash_time,crash_speed_limit,road_constr_zone_fl,latitude,longitude,crash_sev_id,sus_serious_injry_cnt,...,motor_vehicle_death_count,motor_vehicle_serious_injury_count,bicycle_death_count,bicycle_serious_injury_count,pedestrian_death_count,pedestrian_serious_injury_count,motorcycle_death_count,motorcycle_serious_injury_count,other_death_count,other_serious_injury_count
0,17634258,0,2020-03-22,09:10:00,-1.0,0.0,30.22256,-97.835531,5,0,...,0,0,0,0,0,0,0,0,0,0
1,17634302,0,2020-03-23,00:00:00,70.0,0.0,30.140395,-97.795955,5,0,...,0,0,0,0,0,0,0,0,0,0
2,17624617,0,2020-03-17,07:57:00,55.0,0.0,30.245247,-97.807403,5,0,...,0,0,0,0,0,0,0,0,0,0
3,17634273,0,2020-03-22,20:26:00,45.0,0.0,30.345173,-97.620114,2,0,...,0,0,0,0,0,0,0,0,0,0
4,17630060,0,2020-03-19,13:39:00,-1.0,0.0,30.210261,-97.816585,5,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
postcovid_df = postcovid_df[postcovid_df['crash_date'] < '2020-12-31']
postcovid_df.sort_values(by=['crash_date'])

Unnamed: 0,crash_id,crash_fatal_fl,crash_date,crash_time,crash_speed_limit,road_constr_zone_fl,latitude,longitude,crash_sev_id,sus_serious_injry_cnt,...,pedestrian_serious_injury_count,motorcycle_death_count,motorcycle_serious_injury_count,other_death_count,other_serious_injury_count,serious_fl,year,month,day,hour
185,17624397,0,2020-03-16,2022-10-26 13:53:00,40.0,0.0,30.307535,-97.706562,2,0,...,0,0,0,0,0,0,2020,3,16,13
281,17628920,0,2020-03-16,2022-10-26 10:45:00,50.0,0.0,30.234797,-97.824159,5,0,...,0,0,0,0,0,0,2020,3,16,10
4211,17854845,0,2020-03-16,2022-10-26 15:33:00,65.0,0.0,30.428475,-97.757837,5,0,...,0,0,0,0,0,0,2020,3,16,15
104,17624374,0,2020-03-16,2022-10-26 02:00:00,30.0,0.0,30.230720,-97.853150,0,0,...,0,0,0,0,0,0,2020,3,16,2
228,17624392,0,2020-03-16,2022-10-26 15:01:00,45.0,0.0,30.210087,-97.755178,5,0,...,0,0,0,0,0,0,2020,3,16,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8970,18037631,0,2020-12-30,2022-10-26 13:07:00,35.0,0.0,30.244285,-97.765112,5,0,...,0,0,0,0,0,0,2020,12,30,13
8962,18037761,0,2020-12-30,2022-10-26 18:42:00,-1.0,0.0,30.308259,-97.741093,2,0,...,0,0,0,0,0,0,2020,12,30,18
8954,18042455,0,2020-12-30,2022-10-26 07:21:00,50.0,0.0,30.235217,-97.857515,5,0,...,0,0,0,0,0,0,2020,12,30,7
8908,18047402,0,2020-12-30,2022-10-26 15:02:00,65.0,0.0,30.181704,-97.900038,5,0,...,0,0,0,0,0,0,2020,12,30,15


In [26]:
postcovid_df['serious_fl'] = postcovid_df.apply(lambda row: 1 if row['crash_fatal_fl'] == 1 or row['sus_serious_injry_cnt'] == 1 else 0, axis=1)
postcovid_df.head()

Unnamed: 0,crash_id,crash_fatal_fl,crash_date,crash_time,crash_speed_limit,road_constr_zone_fl,latitude,longitude,crash_sev_id,sus_serious_injry_cnt,...,pedestrian_serious_injury_count,motorcycle_death_count,motorcycle_serious_injury_count,other_death_count,other_serious_injury_count,serious_fl,year,month,day,hour
0,17634258,0,2020-03-22,2022-10-26 09:10:00,-1.0,0.0,30.22256,-97.835531,5,0,...,0,0,0,0,0,0,2020,3,22,9
1,17634302,0,2020-03-23,2022-10-26 00:00:00,70.0,0.0,30.140395,-97.795955,5,0,...,0,0,0,0,0,0,2020,3,23,0
2,17624617,0,2020-03-17,2022-10-26 07:57:00,55.0,0.0,30.245247,-97.807403,5,0,...,0,0,0,0,0,0,2020,3,17,7
3,17634273,0,2020-03-22,2022-10-26 20:26:00,45.0,0.0,30.345173,-97.620114,2,0,...,0,0,0,0,0,0,2020,3,22,20
4,17630060,0,2020-03-19,2022-10-26 13:39:00,-1.0,0.0,30.210261,-97.816585,5,0,...,0,0,0,0,0,0,2020,3,19,13


In [27]:
postcovid_df['year'] = pd.DatetimeIndex(postcovid_df['crash_date']).year
postcovid_df['month'] = pd.DatetimeIndex(postcovid_df['crash_date']).month
postcovid_df['day'] = pd.DatetimeIndex(postcovid_df['crash_date']).day
postcovid_df['crash_time'] = pd.to_datetime(postcovid_df['crash_time'])
postcovid_df['hour'] = pd.DatetimeIndex(postcovid_df['crash_time']).hour
postcovid_df.head()

Unnamed: 0,crash_id,crash_fatal_fl,crash_date,crash_time,crash_speed_limit,road_constr_zone_fl,latitude,longitude,crash_sev_id,sus_serious_injry_cnt,...,pedestrian_serious_injury_count,motorcycle_death_count,motorcycle_serious_injury_count,other_death_count,other_serious_injury_count,serious_fl,year,month,day,hour
0,17634258,0,2020-03-22,2022-10-26 09:10:00,-1.0,0.0,30.22256,-97.835531,5,0,...,0,0,0,0,0,0,2020,3,22,9
1,17634302,0,2020-03-23,2022-10-26 00:00:00,70.0,0.0,30.140395,-97.795955,5,0,...,0,0,0,0,0,0,2020,3,23,0
2,17624617,0,2020-03-17,2022-10-26 07:57:00,55.0,0.0,30.245247,-97.807403,5,0,...,0,0,0,0,0,0,2020,3,17,7
3,17634273,0,2020-03-22,2022-10-26 20:26:00,45.0,0.0,30.345173,-97.620114,2,0,...,0,0,0,0,0,0,2020,3,22,20
4,17630060,0,2020-03-19,2022-10-26 13:39:00,-1.0,0.0,30.210261,-97.816585,5,0,...,0,0,0,0,0,0,2020,3,19,13


In [28]:
mergedcovid_df = postcovid_df.merge(pop_df, left_on="year", right_on="Year", how="left")
mergedcovid_df = mergedcovid_df.drop(columns="Year")
mergedcovid_df.head()

Unnamed: 0,crash_id,crash_fatal_fl,crash_date,crash_time,crash_speed_limit,road_constr_zone_fl,latitude,longitude,crash_sev_id,sus_serious_injry_cnt,...,motorcycle_serious_injury_count,other_death_count,other_serious_injury_count,serious_fl,year,month,day,hour,Population,Growth Rate
0,17634258,0,2020-03-22,2022-10-26 09:10:00,-1.0,0.0,30.22256,-97.835531,5,0,...,0,0,0,0,2020,3,22,9,2053000,0.0343
1,17634302,0,2020-03-23,2022-10-26 00:00:00,70.0,0.0,30.140395,-97.795955,5,0,...,0,0,0,0,2020,3,23,0,2053000,0.0343
2,17624617,0,2020-03-17,2022-10-26 07:57:00,55.0,0.0,30.245247,-97.807403,5,0,...,0,0,0,0,2020,3,17,7,2053000,0.0343
3,17634273,0,2020-03-22,2022-10-26 20:26:00,45.0,0.0,30.345173,-97.620114,2,0,...,0,0,0,0,2020,3,22,20,2053000,0.0343
4,17630060,0,2020-03-19,2022-10-26 13:39:00,-1.0,0.0,30.210261,-97.816585,5,0,...,0,0,0,0,2020,3,19,13,2053000,0.0343


In [29]:
X_covid = mergedcovid_df[factor_columns]
y_covid = mergedcovid_df['serious_fl']

In [30]:
predictions = clf.predict(X_covid)
pd.DataFrame({"Prediction": predictions, "Actual": y_covid})

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
8830,0,0
8831,0,0
8832,0,0
8833,0,0


In [31]:
from sklearn.metrics import accuracy_score
accuracy_score(y_covid, predictions)

0.9568760611205432