## Are there specific states with higher or lower fatality rates? What factors might contribute to regional variations?

In [None]:
# Authenticate to BigQuery: build a service-account credential from the JSON
# key file at key_path. Both key_path and credential are read by later cells.
from google.oauth2 import service_account

key_path = './cs145-project2-406000-9a59fc7c0b3d.json'
credential = service_account.Credentials.from_service_account_file(
    filename=key_path,
)

In [None]:
# Initialize BigQuery client
from google.cloud import bigquery
# Load the IPython extension shipped with google.cloud.bigquery (the cells
# below rely on the %%bigquery cell magic).
%load_ext google.cloud.bigquery
# Export the key file path (defined in the authentication cell) as an
# environment variable for this kernel.
%env GOOGLE_APPLICATION_CREDENTIALS=$key_path
# Project used both for the client below and for every %%bigquery / read_gbq call.
project_id = "cs145-project2-406000"
# Explicit client built from the service-account credential created earlier.
client = bigquery.Client(credentials=credential, project=project_id)

## Taste of Dataset

In [None]:
%%bigquery --project $project_id
-- Preview five arbitrary rows of the 2016 accident table to see its columns.
-- SELECT * is deliberate here: the point of the cell is to inspect the schema.
-- The table path is written without the stray space that followed the dot.
SELECT *
FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2016`
LIMIT 5

## Aggregate our Data into one Table

In [None]:
%%bigquery --project $project_id
-- Build traffic_fatalities.traffic_features by stacking the 2016-2020 yearly
-- accident tables (UNION ALL, so no de-duplication) and deriving:
--   id               : CONCAT(consecutive_number, year_of_crash)
--   time_to_scene    : minutes from notification to arrival at the scene
--   time_to_hospital : minutes from notification to EMS arrival at hospital
--   severity         : 2 * vehicles in transport + parked vehicles + 4 * fatalities
--   label            : 1 when severity > 6, else 0
--
-- Elapsed minutes: both clock times are converted to minutes since midnight
-- and subtracted; MOD(... + 1440, 1440) wraps an arrival that falls after
-- midnight into the range 0..1439. (The earlier formula wrapped the hour and
-- minute parts independently, so 10:50 -> 11:05 came out as 75, not 15.)
-- Any hour > 23 or minute > 59 -- including minute_of_notification, which
-- was previously unchecked -- yields the sentinel 9999 that later cells
-- filter out. Presumably those out-of-range values are "unknown" /
-- "not applicable" codes -- TODO confirm against the dataset documentation.
--
-- NOTE(review): later cells query columns such as driver_distracted_by_name
-- and violations_charged_name that this statement does not produce; confirm
-- whether additional joins were intended here.
CREATE OR REPLACE TABLE traffic_fatalities.traffic_features AS
WITH accidents AS (
  -- Raw columns only; every derived field is computed once in the final SELECT.
  SELECT
    consecutive_number,
    state_name,
    number_of_motor_vehicles_in_transport_mvit,
    number_of_parked_working_vehicles,
    number_of_persons_in_motor_vehicles_in_transport_mvit,
    number_of_persons_not_in_motor_vehicles_in_transport_mvit,
    city_name,
    day_name,
    month_of_crash_name,
    day_of_week_name,
    hour_of_crash_name,
    year_of_crash,
    route_signing_name,
    land_use_name,
    first_harmful_event_name,
    work_zone_name,
    relation_to_trafficway_name,
    light_condition_name,
    atmospheric_conditions_name,
    hour_of_notification,
    minute_of_notification,
    hour_of_arrival_at_scene,
    minute_of_arrival_at_scene,
    hour_of_ems_arrival_at_hospital,
    minute_of_ems_arrival_at_hospital,
    number_of_drunk_drivers,
    number_of_fatalities
  FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2016`
  UNION ALL
  SELECT
    consecutive_number,
    state_name,
    number_of_motor_vehicles_in_transport_mvit,
    number_of_parked_working_vehicles,
    number_of_persons_in_motor_vehicles_in_transport_mvit,
    number_of_persons_not_in_motor_vehicles_in_transport_mvit,
    city_name,
    day_name,
    month_of_crash_name,
    day_of_week_name,
    hour_of_crash_name,
    year_of_crash,
    route_signing_name,
    land_use_name,
    first_harmful_event_name,
    work_zone_name,
    relation_to_trafficway_name,
    light_condition_name,
    atmospheric_conditions_name,
    hour_of_notification,
    minute_of_notification,
    hour_of_arrival_at_scene,
    minute_of_arrival_at_scene,
    hour_of_ems_arrival_at_hospital,
    minute_of_ems_arrival_at_hospital,
    number_of_drunk_drivers,
    number_of_fatalities
  FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2017`
  UNION ALL
  SELECT
    consecutive_number,
    state_name,
    number_of_motor_vehicles_in_transport_mvit,
    number_of_parked_working_vehicles,
    number_of_persons_in_motor_vehicles_in_transport_mvit,
    number_of_persons_not_in_motor_vehicles_in_transport_mvit,
    city_name,
    day_name,
    month_of_crash_name,
    day_of_week_name,
    hour_of_crash_name,
    year_of_crash,
    route_signing_name,
    land_use_name,
    first_harmful_event_name,
    work_zone_name,
    relation_to_trafficway_name,
    light_condition_name,
    atmospheric_conditions_name,
    hour_of_notification,
    minute_of_notification,
    hour_of_arrival_at_scene,
    minute_of_arrival_at_scene,
    hour_of_ems_arrival_at_hospital,
    minute_of_ems_arrival_at_hospital,
    number_of_drunk_drivers,
    number_of_fatalities
  FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2018`
  UNION ALL
  SELECT
    consecutive_number,
    state_name,
    number_of_motor_vehicles_in_transport_mvit,
    number_of_parked_working_vehicles,
    number_of_persons_in_motor_vehicles_in_transport_mvit,
    number_of_persons_not_in_motor_vehicles_in_transport_mvit,
    city_name,
    day_name,
    month_of_crash_name,
    day_of_week_name,
    hour_of_crash_name,
    year_of_crash,
    route_signing_name,
    land_use_name,
    first_harmful_event_name,
    work_zone_name,
    relation_to_trafficway_name,
    light_condition_name,
    atmospheric_conditions_name,
    hour_of_notification,
    minute_of_notification,
    hour_of_arrival_at_scene,
    minute_of_arrival_at_scene,
    hour_of_ems_arrival_at_hospital,
    minute_of_ems_arrival_at_hospital,
    number_of_drunk_drivers,
    number_of_fatalities
  FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2019`
  UNION ALL
  SELECT
    consecutive_number,
    state_name,
    number_of_motor_vehicles_in_transport_mvit,
    number_of_parked_working_vehicles,
    number_of_persons_in_motor_vehicles_in_transport_mvit,
    number_of_persons_not_in_motor_vehicles_in_transport_mvit,
    city_name,
    day_name,
    month_of_crash_name,
    day_of_week_name,
    hour_of_crash_name,
    year_of_crash,
    route_signing_name,
    land_use_name,
    first_harmful_event_name,
    work_zone_name,
    relation_to_trafficway_name,
    light_condition_name,
    -- The 2020 table is read through atmospheric_conditions_1_name; it is
    -- mapped onto the name used for the other years.
    atmospheric_conditions_1_name AS atmospheric_conditions_name,
    hour_of_notification,
    minute_of_notification,
    hour_of_arrival_at_scene,
    minute_of_arrival_at_scene,
    hour_of_ems_arrival_at_hospital,
    minute_of_ems_arrival_at_hospital,
    number_of_drunk_drivers,
    number_of_fatalities
  FROM `bigquery-public-data.nhtsa_traffic_fatalities.accident_2020`
)
SELECT
  CONCAT(consecutive_number, year_of_crash) AS id,
  state_name,
  number_of_motor_vehicles_in_transport_mvit,
  number_of_parked_working_vehicles,
  number_of_persons_in_motor_vehicles_in_transport_mvit,
  number_of_persons_not_in_motor_vehicles_in_transport_mvit,
  city_name,
  day_name,
  month_of_crash_name,
  day_of_week_name,
  hour_of_crash_name,
  year_of_crash,
  route_signing_name,
  land_use_name,
  first_harmful_event_name,
  work_zone_name,
  relation_to_trafficway_name,
  light_condition_name,
  atmospheric_conditions_name,
  IF(
    hour_of_notification > 23
      OR minute_of_notification > 59
      OR hour_of_arrival_at_scene > 23
      OR minute_of_arrival_at_scene > 59,
    9999,
    MOD(
      (hour_of_arrival_at_scene - hour_of_notification) * 60
        + (minute_of_arrival_at_scene - minute_of_notification)
        + 1440,
      1440
    )
  ) AS time_to_scene,
  IF(
    hour_of_notification > 23
      OR minute_of_notification > 59
      OR hour_of_ems_arrival_at_hospital > 23
      OR minute_of_ems_arrival_at_hospital > 59,
    9999,
    MOD(
      (hour_of_ems_arrival_at_hospital - hour_of_notification) * 60
        + (minute_of_ems_arrival_at_hospital - minute_of_notification)
        + 1440,
      1440
    )
  ) AS time_to_hospital,
  number_of_drunk_drivers,
  number_of_fatalities,
  2 * number_of_motor_vehicles_in_transport_mvit
    + number_of_parked_working_vehicles
    + 4 * number_of_fatalities AS severity,
  IF(
    2 * number_of_motor_vehicles_in_transport_mvit
      + number_of_parked_working_vehicles
      + 4 * number_of_fatalities > 6,
    1,
    0
  ) AS label
FROM accidents

In [None]:
%%bigquery --project $project_id

-- How many distinct crash ids does the feature table contain?
SELECT
  COUNT(DISTINCT features.id)
FROM `traffic_fatalities.traffic_features` AS features

In [None]:
%%bigquery --project $project_id

-- Total number of rows in the feature table (duplicates of an id included).
SELECT
  COUNT(*)
FROM `traffic_fatalities.traffic_features` AS features

In [None]:
%%bigquery --project $project_id

-- Row count per driver-distraction value, least frequent first.
-- NOTE(review): driver_distracted_by_name is not among the columns selected
-- by the CREATE TABLE cell in this notebook -- confirm the table was built
-- with that column before running.
SELECT
  features.driver_distracted_by_name,
  COUNT(*) AS count
FROM `traffic_fatalities.traffic_features` AS features
GROUP BY features.driver_distracted_by_name
ORDER BY count

In [None]:
%%bigquery --project $project_id

-- Row count per avoidance-maneuver value, least frequent first.
-- NOTE(review): driver_maneuvered_to_avoid_name is not among the columns
-- selected by the CREATE TABLE cell in this notebook -- confirm the table
-- was built with that column before running.
SELECT
  features.driver_maneuvered_to_avoid_name,
  COUNT(*) AS count
FROM `traffic_fatalities.traffic_features` AS features
GROUP BY features.driver_maneuvered_to_avoid_name
ORDER BY count

## Investigate how we should create our car severity equation

Below is a query that investigates the number of accidents by the number of vehicles in transport. As we can see, most accidents involve one car, and the number of accidents increases as the number of motor vehicles in transport decreases. Ultimately, we want to see whether the number of vehicles in transport involved in an accident should be used to determine crash severity.

In [None]:
%%bigquery --project $project_id

-- Distribution of crashes by number of vehicles in transport.
-- Rows are de-duplicated on (id, vehicle count) before counting so that a
-- crash appearing on several rows is counted once per distinct value.
WITH per_crash AS (
  SELECT DISTINCT
    id,
    number_of_motor_vehicles_in_transport_mvit
  FROM `traffic_fatalities.traffic_features`
)
SELECT
  number_of_motor_vehicles_in_transport_mvit,
  COUNT(number_of_motor_vehicles_in_transport_mvit) AS count
FROM per_crash
GROUP BY number_of_motor_vehicles_in_transport_mvit
ORDER BY count

Below is a query that aims to investigate the number of accidents with different numbers of vehicles that are parked.

In [None]:
%%bigquery --project $project_id

-- Distribution of crashes by number of parked/working vehicles involved,
-- counted over distinct (id, value) pairs.
WITH per_crash AS (
  SELECT DISTINCT
    id,
    number_of_parked_working_vehicles
  FROM `traffic_fatalities.traffic_features`
)
SELECT
  number_of_parked_working_vehicles,
  COUNT(number_of_parked_working_vehicles) AS count
FROM per_crash
GROUP BY number_of_parked_working_vehicles
ORDER BY count

We also wanted to see how accidents were stratified by number of fatalities. Below we noticed that like number of vehicles involved (which makes sense), number of fatalities in an accident is mostly 1. As such we see that most fatal accidents in the US do not involve more than just the driver.

In [None]:
%%bigquery --project $project_id

-- Distribution of crashes by number of fatalities, counted over distinct
-- (id, value) pairs and listed from rarest to most common.
WITH per_crash AS (
  SELECT DISTINCT
    id,
    number_of_fatalities
  FROM `traffic_fatalities.traffic_features`
)
SELECT
  number_of_fatalities,
  COUNT(number_of_fatalities) AS count
FROM per_crash
GROUP BY number_of_fatalities
ORDER BY count

When determining the label, we were disappointed to find that the dataset only includes accidents with fatalities. We could do much more with a dataset that included accidents both with and without fatalities. Nevertheless, we decided to construct an artificial label variable that estimates the severity of a crash. Based on the above data, we let severity = 1 * (number of parked vehicles) + 2 * (number of vehicles in transport) + 4 * (number of fatalities). Parked vehicles get the lowest weight because they are less likely to have occupants. Moreover, we think the number of injuries and the damage caused will increase greatly with the number of moving vehicles in the crash. Finally, we weight the number of fatalities in an accident the most, for obvious reasons. Below we see that the data is biased towards "low severity fatal accidents". Interestingly, we see many more severity 8 than severity 7 accidents, which points towards situations where there are two cars but only one of the drivers is killed.

In [None]:
%%bigquery --project $project_id

-- Histogram of the derived severity score over distinct (id, severity)
-- pairs, listed in increasing severity.
WITH per_crash AS (
  SELECT DISTINCT
    id,
    severity
  FROM `traffic_fatalities.traffic_features`
)
SELECT
  severity,
  COUNT(severity) AS count
FROM per_crash
GROUP BY severity
ORDER BY severity

We see that there are almost 5000 "severe" fatal crashes in the US according to our severity metric. By contrast, about 31,000 crashes are labelled as not severe. We would have liked to find a metric with closer to a 50/50 split, but were unable to add metrics or justify changing any of the metrics that make up our severity label.

In [None]:
%%bigquery --project $project_id

-- Distinct crash ids whose label is 1 (the "severe" class).
SELECT
  COUNT(DISTINCT features.id) AS num_severe
FROM `traffic_fatalities.traffic_features` AS features
WHERE features.label = 1

In [None]:
%%bigquery --project $project_id

-- Distinct crash ids whose label is 0 (the "not severe" class).
SELECT
  COUNT(DISTINCT features.id) AS num_not_severe
FROM `traffic_fatalities.traffic_features` AS features
WHERE features.label = 0

## Number of Accidents

In [None]:
%%bigquery --project $project_id

-- Number of accidents, taken as the number of distinct crash ids.
SELECT
  COUNT(DISTINCT features.id)
FROM `traffic_fatalities.traffic_features` AS features

## Number of Cars

NOT SURE IF THIS IS CORRECT

In [None]:
%%bigquery --project $project_id

-- Total vehicles (in transport + parked/working) summed over every row.
-- NOTE(review): this is a per-row sum, so it only equals the number of cars
-- if each crash occupies exactly one row of the table -- confirm.
SELECT
  SUM(
    features.number_of_motor_vehicles_in_transport_mvit
    + features.number_of_parked_working_vehicles
  )
FROM `traffic_fatalities.traffic_features` AS features

## Number of Reports

NOT SURE IF CORRECT

In [None]:
%%bigquery --project $project_id

-- Number of reports, taken as the raw row count of the feature table.
SELECT
  COUNT(*)
FROM `traffic_fatalities.traffic_features` AS features

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

## Crashes by Day of week

In [None]:
# Distinct crash ids per weekday name, ascending by count, drawn as a bar chart.
query = "SELECT COUNT(DISTINCT(id)) as num_crashes, day_of_week_name FROM `traffic_fatalities.traffic_features` GROUP BY day_of_week_name ORDER BY num_crashes"
day = pd.read_gbq(query, dialect='standard', project_id=project_id)
plt.bar(x=day["day_of_week_name"], height=day["num_crashes"])

## Number of Crashes by Hour

In [None]:
# Distinct crash ids per hour-of-day bucket, ordered chronologically.
# The CASE maps each of the 24 known hour labels to 0..23. ELSE 24 sends any
# other label (or NULL) to the end; without it the CASE yields NULL for such
# a row, which sorts first in ascending order and would shift every bar.
query = "SELECT COUNT(DISTINCT(id)) as num_crashes, hour_of_crash_name FROM `traffic_fatalities.traffic_features` GROUP BY hour_of_crash_name ORDER BY CASE WHEN hour_of_crash_name = '0:00am-0:59am' THEN 0 WHEN hour_of_crash_name = '1:00am-1:59am' THEN 1 WHEN hour_of_crash_name = '2:00am-2:59am' THEN 2 WHEN hour_of_crash_name = '3:00am-3:59am' THEN 3 WHEN hour_of_crash_name = '4:00am-4:59am' THEN 4 WHEN hour_of_crash_name = '5:00am-5:59am' THEN 5 WHEN hour_of_crash_name = '6:00am-6:59am' THEN 6 WHEN hour_of_crash_name = '7:00am-7:59am' THEN 7 WHEN hour_of_crash_name = '8:00am-8:59am' THEN 8 WHEN hour_of_crash_name = '9:00am-9:59am' THEN 9 WHEN hour_of_crash_name = '10:00am-10:59am' THEN 10 WHEN hour_of_crash_name = '11:00am-11:59am' THEN 11 WHEN hour_of_crash_name = '12:00pm-12:59pm' THEN 12 WHEN hour_of_crash_name = '1:00pm-1:59pm' THEN 13 WHEN hour_of_crash_name = '2:00pm-2:59pm' THEN 14 WHEN hour_of_crash_name = '3:00pm-3:59pm' THEN 15 WHEN hour_of_crash_name = '4:00pm-4:59pm' THEN 16 WHEN hour_of_crash_name = '5:00pm-5:59pm' THEN 17 WHEN hour_of_crash_name = '6:00pm-6:59pm' THEN 18 WHEN hour_of_crash_name = '7:00pm-7:59pm' THEN 19 WHEN hour_of_crash_name = '8:00pm-8:59pm' THEN 20 WHEN hour_of_crash_name = '9:00pm-9:59pm' THEN 21 WHEN hour_of_crash_name = '10:00pm-10:59pm' THEN 22 WHEN hour_of_crash_name = '11:00pm-11:59pm' THEN 23 ELSE 24 END"

hours = pd.read_gbq(query, project_id=project_id, dialect='standard')

# One bar per returned row. With exactly the 24 known labels this is 0..23 as
# before; a hard-coded range(24) would make plt.bar fail on a length mismatch
# if the data also contains an extra (e.g. unknown-hour) bucket.
x = list(range(len(hours)))
plt.bar(x, hours["num_crashes"])
hours

## Top 10 Cities with highest number of Crashes

In [None]:
# Ten city names with the most distinct crash ids, excluding the two
# placeholder values filtered in the WHERE clause.
# NOTE(review): grouping is by city_name alone, so same-named cities in
# different states would be merged -- confirm this is intended.
query = "SELECT COUNT(DISTINCT(id)) as num_crashes, city_name FROM `traffic_fatalities.traffic_features` WHERE city_name != 'NOT APPLICABLE' AND city_name != 'Not Reported' GROUP BY city_name ORDER BY num_crashes DESC LIMIT 10"
city = pd.read_gbq(query, dialect='standard', project_id=project_id)
plt.bar(x=city["city_name"], height=city["num_crashes"])
city

## Percent of Crashes where Driver is drunk by Hour

In [None]:
# Per hour-of-day bucket: drunk drivers as a percentage of vehicles in
# transport (SUM over SUM, times 100), plus the number of rows in the bucket.
# The inner SELECT DISTINCT collapses repeated rows of the same crash.
# ELSE 24 sends any label outside the 24 known ones (or NULL) to the end;
# without it that row sorts first (NULL sort key) and shifts every bar.
query = "SELECT SUM(number_of_drunk_drivers)/SUM(number_of_motor_vehicles_in_transport_mvit) * 100 as percent_drunk, COUNT(id) AS num_crashes, hour_of_crash_name FROM (SELECT DISTINCT id, number_of_drunk_drivers, hour_of_crash_name, number_of_motor_vehicles_in_transport_mvit FROM `traffic_fatalities.traffic_features`) GROUP BY hour_of_crash_name ORDER BY CASE WHEN hour_of_crash_name = '0:00am-0:59am' THEN 0 WHEN hour_of_crash_name = '1:00am-1:59am' THEN 1 WHEN hour_of_crash_name = '2:00am-2:59am' THEN 2 WHEN hour_of_crash_name = '3:00am-3:59am' THEN 3 WHEN hour_of_crash_name = '4:00am-4:59am' THEN 4 WHEN hour_of_crash_name = '5:00am-5:59am' THEN 5 WHEN hour_of_crash_name = '6:00am-6:59am' THEN 6 WHEN hour_of_crash_name = '7:00am-7:59am' THEN 7 WHEN hour_of_crash_name = '8:00am-8:59am' THEN 8 WHEN hour_of_crash_name = '9:00am-9:59am' THEN 9 WHEN hour_of_crash_name = '10:00am-10:59am' THEN 10 WHEN hour_of_crash_name = '11:00am-11:59am' THEN 11 WHEN hour_of_crash_name = '12:00pm-12:59pm' THEN 12 WHEN hour_of_crash_name = '1:00pm-1:59pm' THEN 13 WHEN hour_of_crash_name = '2:00pm-2:59pm' THEN 14 WHEN hour_of_crash_name = '3:00pm-3:59pm' THEN 15 WHEN hour_of_crash_name = '4:00pm-4:59pm' THEN 16 WHEN hour_of_crash_name = '5:00pm-5:59pm' THEN 17 WHEN hour_of_crash_name = '6:00pm-6:59pm' THEN 18 WHEN hour_of_crash_name = '7:00pm-7:59pm' THEN 19 WHEN hour_of_crash_name = '8:00pm-8:59pm' THEN 20 WHEN hour_of_crash_name = '9:00pm-9:59pm' THEN 21 WHEN hour_of_crash_name = '10:00pm-10:59pm' THEN 22 WHEN hour_of_crash_name = '11:00pm-11:59pm' THEN 23 ELSE 24 END"

drunk = pd.read_gbq(query, project_id=project_id, dialect='standard')

# One bar per returned row (0..23 when only the known labels are present);
# a fixed range(24) would break plt.bar if an extra bucket came back.
x = list(range(len(drunk)))
plt.bar(x, drunk["percent_drunk"])
drunk

## Contributing Circumstances to Crashes Ranked

In [None]:
# Row count per vehicle contributing-circumstance value, most frequent first.
# NOTE(review): contributing_circumstances_motor_vehicle_name is not among the
# columns selected by the CREATE TABLE cell in this notebook -- confirm the
# table was built with that column before running.
query = "SELECT COUNT(id) as num_crashes, contributing_circumstances_motor_vehicle_name FROM `traffic_fatalities.traffic_features` GROUP BY contributing_circumstances_motor_vehicle_name ORDER BY num_crashes DESC"
factor = pd.read_gbq(query, dialect='standard', project_id=project_id)
factor

## Driver Impairments Contributing to Crashes Ranked

In [None]:
# Row count per driver impairment/condition value, most frequent first.
# NOTE(review): condition_impairment_at_time_of_crash_driver_name is not among
# the columns selected by the CREATE TABLE cell in this notebook -- confirm
# the table was built with that column before running.
query = "SELECT COUNT(id) as num_crashes, condition_impairment_at_time_of_crash_driver_name FROM `traffic_fatalities.traffic_features` GROUP BY condition_impairment_at_time_of_crash_driver_name ORDER BY num_crashes DESC"
drimpair = pd.read_gbq(query, dialect='standard', project_id=project_id)
drimpair

## Atmospheric Conditions contributing to Crashes Ranked

In [None]:
# Row count per atmospheric-condition value, most frequent first.
query = "SELECT COUNT(id) as num_crashes, atmospheric_conditions_name FROM `traffic_fatalities.traffic_features` GROUP BY atmospheric_conditions_name ORDER BY num_crashes DESC"
atmos = pd.read_gbq(query, dialect='standard', project_id=project_id)
atmos

## Percent of Severe Crashes Given Atmospheric Condition Present

Interestingly, in the query below we see that the atmospheric condition with the highest number of high-severity crashes is clear weather. This is probably because most driving within a given year happens on clear days as opposed to other conditions. The same reasoning is consistent with the count for cloudy weather. However, the percentage of severe crashes increases with weather conditions such as freezing rain, snow and hail (despite small sample sizes), which makes a lot of sense.

In [None]:
%%bigquery --project $project_id

-- Share of severe crashes per atmospheric condition, lowest share first.
-- label is 0/1, so SUM(label) counts the severe crashes and percent_severe
-- is a fraction between 0 and 1 despite its name.
WITH per_crash AS (
  SELECT DISTINCT
    id,
    label,
    atmospheric_conditions_name
  FROM `traffic_fatalities.traffic_features`
)
SELECT
  SUM(label) / COUNT(*) AS percent_severe,
  SUM(label) AS num_severe,
  atmospheric_conditions_name
FROM per_crash
GROUP BY atmospheric_conditions_name
ORDER BY percent_severe

## Driver Distractions contributing to Crashes Ranked

In [None]:
# Distinct crash ids per driver-distraction value, most frequent first.
# NOTE(review): driver_distracted_by_name is not among the columns selected by
# the CREATE TABLE cell in this notebook -- confirm the table has it.
# NOTE(review): the result is bound to `drimpair`, the same name the driver
# impairment cell uses, so running this cell overwrites that frame.
query = "SELECT COUNT(DISTINCT(id)) as num_crashes, driver_distracted_by_name FROM `traffic_fatalities.traffic_features` GROUP BY driver_distracted_by_name ORDER BY num_crashes DESC"
drimpair = pd.read_gbq(query, dialect='standard', project_id=project_id)
drimpair

## Whether Driver Maneuvered to Avoid an Object and What that Object was Ranked by Average Severity

NUMBER OF REPORTS IN ERROR - Below we see the number of reports and the average severity, stratified by the object a driver maneuvered to avoid. We also notice that the number of reports seems a little outlandish given the number of accidents.

In [None]:
%%bigquery --project $project_id

-- Report count and mean severity per avoidance-maneuver value, lowest mean
-- severity first. COUNT(column) skips rows where the value is NULL.
-- NOTE(review): driver_maneuvered_to_avoid_name is not among the columns
-- selected by the CREATE TABLE cell in this notebook -- confirm the table
-- was built with that column before running.
SELECT
  COUNT(features.driver_maneuvered_to_avoid_name) AS num_reports,
  AVG(features.severity) AS avg_severity,
  features.driver_maneuvered_to_avoid_name
FROM `traffic_fatalities.traffic_features` AS features
GROUP BY features.driver_maneuvered_to_avoid_name
ORDER BY avg_severity

## Severity Ranked by Average Time to Scene

In [None]:
%%bigquery --project $project_id

-- Mean minutes-to-scene per severity score, in increasing severity.
-- Rows carrying the 9999 sentinel (no usable time) are dropped before the
-- distinct (id, severity, time_to_scene) triples are formed.
WITH timed_crashes AS (
  SELECT DISTINCT
    id,
    severity,
    time_to_scene
  FROM `traffic_fatalities.traffic_features`
  WHERE time_to_scene != 9999
)
SELECT
  severity,
  COUNT(severity) AS count,
  AVG(time_to_scene) AS avg_time_to_scene
FROM timed_crashes
GROUP BY severity
ORDER BY severity

## Number of Fatalities Ranked by Average time to Scene

In [None]:
%%bigquery --project $project_id

-- Mean minutes-to-scene per fatality count, in increasing fatality count.
-- Rows carrying the 9999 sentinel are excluded before de-duplication.
WITH timed_crashes AS (
  SELECT DISTINCT
    id,
    number_of_fatalities,
    time_to_scene
  FROM `traffic_fatalities.traffic_features`
  WHERE time_to_scene != 9999
)
SELECT
  number_of_fatalities,
  COUNT(number_of_fatalities) AS count,
  AVG(time_to_scene) AS avg_time_to_scene
FROM timed_crashes
GROUP BY number_of_fatalities
ORDER BY number_of_fatalities

## Violations given to Driver Ranked by Average Severity

In [None]:
%%bigquery --project $project_id

-- Row count and mean severity per violation charged, lowest mean first.
-- COUNT(column) skips rows where the violation value is NULL.
-- NOTE(review): violations_charged_name is not among the columns selected by
-- the CREATE TABLE cell in this notebook -- confirm the table was built with
-- that column before running.
SELECT
  features.violations_charged_name,
  COUNT(features.violations_charged_name) AS count,
  AVG(features.severity) AS avg_severity
FROM `traffic_fatalities.traffic_features` AS features
GROUP BY features.violations_charged_name
ORDER BY avg_severity

## Number of Fatalities per state Ranked by Average Time to Scene

In [None]:
%%bigquery --project $project_id

-- Per state: total fatalities, number of crashes and mean minutes-to-scene,
-- fastest states first. Only crashes with a usable time_to_scene (not the
-- 9999 sentinel) contribute, so the totals cover that subset only.
WITH timed_crashes AS (
  SELECT DISTINCT
    id,
    state_name,
    number_of_fatalities,
    time_to_scene
  FROM `traffic_fatalities.traffic_features`
  WHERE time_to_scene != 9999
)
SELECT
  state_name,
  SUM(number_of_fatalities) AS total_fatalities,
  COUNT(state_name) AS num_crashes,
  AVG(time_to_scene) AS avg_time_to_scene
FROM timed_crashes
GROUP BY state_name
ORDER BY avg_time_to_scene

## Number of Fatalities per State Ranked by Average Time to Hospital

In [None]:
%%bigquery --project $project_id

-- Per state: total fatalities, number of crashes and mean minutes-to-hospital,
-- fastest states first. Only crashes with a usable time_to_hospital (not the
-- 9999 sentinel) contribute, so the totals cover that subset only.
WITH timed_crashes AS (
  SELECT DISTINCT
    id,
    state_name,
    number_of_fatalities,
    time_to_hospital
  FROM `traffic_fatalities.traffic_features`
  WHERE time_to_hospital != 9999
)
SELECT
  state_name,
  SUM(number_of_fatalities) AS total_fatalities,
  COUNT(state_name) AS num_crashes,
  AVG(time_to_hospital) AS avg_time_to_hospital
FROM timed_crashes
GROUP BY state_name
ORDER BY avg_time_to_hospital

## Training Time

In [None]:
%%bigquery --project $project_id

-- Train (or retrain) traffic_fatalities.traffic_model as a logistic
-- regression over the columns listed below, read from traffic_features.
-- NOTE(review): no input_label_cols option is given; presumably the column
-- named `label` is used as the target by default -- confirm.
-- NOTE(review): in the CREATE TABLE cell, label is derived from
-- number_of_motor_vehicles_in_transport_mvit, number_of_parked_working_vehicles
-- and number_of_fatalities; the first two are also features here, so the
-- model sees part of the label's own inputs.
CREATE OR REPLACE MODEL `traffic_fatalities.traffic_model`
OPTIONS(model_type='logistic_reg') AS
SELECT
    state_name,
    number_of_motor_vehicles_in_transport_mvit,
    number_of_parked_working_vehicles,
    number_of_persons_in_motor_vehicles_in_transport_mvit,
    number_of_persons_not_in_motor_vehicles_in_transport_mvit,
    city_name,
    day_name,
    month_of_crash_name,
    day_of_week_name,
    hour_of_crash_name,
    route_signing_name,
    land_use_name,
    first_harmful_event_name,
    work_zone_name,
    relation_to_trafficway_name,
    light_condition_name,
    atmospheric_conditions_name,
    -- Both time columns may hold the 9999 sentinel; it is not filtered here.
    time_to_scene,
    time_to_hospital,
    number_of_drunk_drivers,
    label,
    -- NOTE(review): the eight columns below are not among those selected by
    -- the CREATE TABLE cell in this notebook -- confirm the table has them.
    driver_distracted_by_name,
    condition_impairment_at_time_of_crash_driver_name,
    contributing_circumstances_motor_vehicle_name,
    driver_maneuvered_to_avoid_name,
    non_motorist_contributing_circumstances_name,
    condition_impairment_at_time_of_crash_non_motorist_name,
    violations_charged_name,
    drivers_vision_obscured_by_name
FROM `traffic_fatalities.traffic_features`
-- LIMIT without ORDER BY: if the table holds more rows than this, which rows
-- are used for training is not specified. TODO confirm where this number
-- comes from.
LIMIT 31799040