# The data used for this project's exploratory data analysis has been sourced from:

1) National Highway Traffic Safety Administration (NHTSA) API (US data) (https://crashviewer.nhtsa.dot.gov/CrashAPI). All NHTSA files are also available to download in CSV form (https://www.nhtsa.gov/file-downloads?p=nhtsa/downloads/FARS/2021/National/).

## Data cleansing of the National Highway Traffic Safety Administration API (US data):

In [1]:
# importing dependencies
import pandas as pd
import numpy as np
import scipy as st
import matplotlib.pyplot as plt
import requests
import json
import csv 

In [2]:
# Load the 2020 road accident records from the local CSV file into a DataFrame.
csv_path = "source_data/Accidents_2020_2020.csv"
Accidents_2020 = pd.read_csv(filepath_or_buffer=csv_path)
# Preview the first five rows to sanity-check the load.
Accidents_2020.head(5)

Unnamed: 0,caseyear,state,st_case,statename,ve_total,ve_forms,pvh_invl,peds,pernotmvit,permvit,...,cf2,cf2name,cf3,cf3name,fatals,drunk_dr,age,speed_limit,race,vehicle_make
0,2020,1,10001,Alabama,1,1,0,0,0,4,...,,,,,3,1,24.0,45.0,Black or African American,Lexus
1,2020,1,10002,Alabama,4,4,0,0,0,6,...,,,,,1,0,40.0,35.0,Not a Fatality (not Applicable),Chevrolet
2,2020,1,10003,Alabama,2,2,0,0,0,2,...,,,,,1,0,34.0,65.0,White,Toyota
3,2020,1,10004,Alabama,1,1,0,0,0,5,...,,,,,1,0,17.0,35.0,Not a Fatality (not Applicable),Hyundai
4,2020,1,10005,Alabama,1,1,0,0,0,1,...,,,,,1,0,47.0,45.0,White,GMC


In [6]:
# The raw DataFrame carries many columns that are redundant for this exploration stage,
# so a new DataFrame is built from only the columns of focus.
focus_columns = [
    "caseyear",
    "st_case",
    "statename",
    "persons",
    "countyname",
    "monthname",
    "day_week",
    "hour",
    "rur_urbname",
    "routename",
    "latitude",
    "longitud",
    "harm_evname",
    "lgt_condname",
    "weathername",
    "fatals",
    "drunk_dr",
    "age",
    "speed_limit",
    "race",
    "vehicle_make",
]
Cleaned_df = Accidents_2020[focus_columns]
Cleaned_df.head()

Unnamed: 0,caseyear,st_case,statename,persons,countyname,monthname,day_week,hour,rur_urbname,routename,...,longitud,harm_evname,lgt_condname,weathername,fatals,drunk_dr,age,speed_limit,race,vehicle_make
0,2020,10001,Alabama,4,ELMORE (51),January,4,2,Rural,County Road,...,-86.09485,Tree (Standing Only),Dark - Not Lighted,Clear,3,1,24.0,45.0,Black or African American,Lexus
1,2020,10002,Alabama,6,JEFFERSON (73),January,5,17,Urban,Local Street - Municipality,...,-86.839544,Motor Vehicle In-Transport,Dark - Lighted,Rain,1,0,40.0,35.0,Not a Fatality (not Applicable),Chevrolet
2,2020,10003,Alabama,2,SHELBY (117),January,5,14,Rural,State Highway,...,-86.369642,Ditch,Daylight,Rain,1,0,34.0,65.0,White,Toyota
3,2020,10004,Alabama,5,CALHOUN (15),January,6,15,Rural,County Road,...,-85.883486,Tree (Standing Only),Daylight,Cloudy,1,0,17.0,35.0,Not a Fatality (not Applicable),Hyundai
4,2020,10005,Alabama,1,COOSA (37),January,7,0,Rural,County Road,...,-86.083547,Tree (Standing Only),Dark - Not Lighted,Rain,1,0,47.0,45.0,White,GMC


In [7]:
# Discard every row that has a missing (NaN) value in at least one column.
Cleaned_df = Cleaned_df.dropna(axis="index", how="any")
Cleaned_df.head()

Unnamed: 0,caseyear,st_case,statename,persons,countyname,monthname,day_week,hour,rur_urbname,routename,...,longitud,harm_evname,lgt_condname,weathername,fatals,drunk_dr,age,speed_limit,race,vehicle_make
0,2020,10001,Alabama,4,ELMORE (51),January,4,2,Rural,County Road,...,-86.09485,Tree (Standing Only),Dark - Not Lighted,Clear,3,1,24.0,45.0,Black or African American,Lexus
1,2020,10002,Alabama,6,JEFFERSON (73),January,5,17,Urban,Local Street - Municipality,...,-86.839544,Motor Vehicle In-Transport,Dark - Lighted,Rain,1,0,40.0,35.0,Not a Fatality (not Applicable),Chevrolet
2,2020,10003,Alabama,2,SHELBY (117),January,5,14,Rural,State Highway,...,-86.369642,Ditch,Daylight,Rain,1,0,34.0,65.0,White,Toyota
3,2020,10004,Alabama,5,CALHOUN (15),January,6,15,Rural,County Road,...,-85.883486,Tree (Standing Only),Daylight,Cloudy,1,0,17.0,35.0,Not a Fatality (not Applicable),Hyundai
4,2020,10005,Alabama,1,COOSA (37),January,7,0,Rural,County Road,...,-86.083547,Tree (Standing Only),Dark - Not Lighted,Rain,1,0,47.0,45.0,White,GMC


In [8]:
# Inspect the dtype pandas assigned to each column of Cleaned_df, to spot
# columns (such as float-typed age and speed_limit) that need converting.
Cleaned_df.dtypes

caseyear          int64
st_case           int64
statename        object
persons           int64
countyname       object
monthname        object
day_week          int64
hour              int64
rur_urbname      object
routename        object
latitude        float64
longitud        float64
harm_evname      object
lgt_condname     object
weathername      object
fatals            int64
drunk_dr          int64
age             float64
speed_limit     float64
race             object
vehicle_make     object
dtype: object

In [9]:
# Cast both the age and speed_limit columns from float to integer.
Cleaned_df = Cleaned_df.astype(dict.fromkeys(["age", "speed_limit"], "int"))
Cleaned_df.head()

Unnamed: 0,caseyear,st_case,statename,persons,countyname,monthname,day_week,hour,rur_urbname,routename,...,longitud,harm_evname,lgt_condname,weathername,fatals,drunk_dr,age,speed_limit,race,vehicle_make
0,2020,10001,Alabama,4,ELMORE (51),January,4,2,Rural,County Road,...,-86.09485,Tree (Standing Only),Dark - Not Lighted,Clear,3,1,24,45,Black or African American,Lexus
1,2020,10002,Alabama,6,JEFFERSON (73),January,5,17,Urban,Local Street - Municipality,...,-86.839544,Motor Vehicle In-Transport,Dark - Lighted,Rain,1,0,40,35,Not a Fatality (not Applicable),Chevrolet
2,2020,10003,Alabama,2,SHELBY (117),January,5,14,Rural,State Highway,...,-86.369642,Ditch,Daylight,Rain,1,0,34,65,White,Toyota
3,2020,10004,Alabama,5,CALHOUN (15),January,6,15,Rural,County Road,...,-85.883486,Tree (Standing Only),Daylight,Cloudy,1,0,17,35,Not a Fatality (not Applicable),Hyundai
4,2020,10005,Alabama,1,COOSA (37),January,7,0,Rural,County Road,...,-86.083547,Tree (Standing Only),Dark - Not Lighted,Rain,1,0,47,45,White,GMC


In [10]:
# Replace the raw column names in Cleaned_df with self-explanatory labels.
# NOTE(review): "Visability" (sic) is kept exactly as spelled because later cells
# may look the column up by this name - confirm before correcting the spelling.
# NOTE(review): "Day_Name" receives day_week, which holds integer codes rather
# than day names.
readable_names = {
    "caseyear": "Year",
    "st_case": "Case_ID",
    "statename": "State",
    "persons": "Person_Count",
    "countyname": "County",
    "monthname": "Month",
    "day_week": "Day_Name",
    "hour": "Hour",
    "rur_urbname": "Rural_Urban",
    "routename": "Route_Name",
    "latitude": "Latitude",
    "longitud": "Longitude",
    "harm_evname": "Harm_Event",
    "lgt_condname": "Visability",
    "weathername": "Weather_Condition",
    "fatals": "Fatality_Count",
    "drunk_dr": "Drunk_Driver_Count",
    "age": "Age",
    "speed_limit": "Speed_Limit",
    "race": "Race",
    "vehicle_make": "Vehicle_Make",
}
Cleaned_df = Cleaned_df.rename(columns=readable_names)
Cleaned_df.head()

Unnamed: 0,Year,Case_ID,State,Person_Count,County,Month,Day_Name,Hour,Rural_Urban,Route_Name,...,Longitude,Harm_Event,Visability,Weather_Condition,Fatality_Count,Drunk_Driver_Count,Age,Speed_Limit,Race,Vehicle_Make
0,2020,10001,Alabama,4,ELMORE (51),January,4,2,Rural,County Road,...,-86.09485,Tree (Standing Only),Dark - Not Lighted,Clear,3,1,24,45,Black or African American,Lexus
1,2020,10002,Alabama,6,JEFFERSON (73),January,5,17,Urban,Local Street - Municipality,...,-86.839544,Motor Vehicle In-Transport,Dark - Lighted,Rain,1,0,40,35,Not a Fatality (not Applicable),Chevrolet
2,2020,10003,Alabama,2,SHELBY (117),January,5,14,Rural,State Highway,...,-86.369642,Ditch,Daylight,Rain,1,0,34,65,White,Toyota
3,2020,10004,Alabama,5,CALHOUN (15),January,6,15,Rural,County Road,...,-85.883486,Tree (Standing Only),Daylight,Cloudy,1,0,17,35,Not a Fatality (not Applicable),Hyundai
4,2020,10005,Alabama,1,COOSA (37),January,7,0,Rural,County Road,...,-86.083547,Tree (Standing Only),Dark - Not Lighted,Rain,1,0,47,45,White,GMC


## API Consideration