# The data used for this project's exploratory data analysis has been sourced from:

1) National Highway Traffic Safety Administration (NHTSA) API (US data): https://crashviewer.nhtsa.dot.gov/CrashAPI. All NHTSA files are also available to download in CSV form at https://www.nhtsa.gov/file-downloads?p=nhtsa/downloads/FARS/2021/National/.

## Data cleansing of the National Highway Traffic Safety Administration API (US data):

In [1]:
# importing dependencies
import pandas as pd
import numpy as np
import scipy as st
import matplotlib.pyplot as plt
import requests
import json
import csv 

In [2]:
# Load the 2020 road-accident records from the project's local CSV file into a DataFrame.
csv_path = "source_data/Accidents_2020_2020.csv"
Accidents_2020 = pd.read_csv(filepath_or_buffer=csv_path)
Accidents_2020

Unnamed: 0,caseyear,state,st_case,statename,ve_total,ve_forms,pvh_invl,peds,pernotmvit,permvit,...,cf1name,cf2,cf2name,cf3,cf3name,fatals,drunk_dr,age,race,vehicle_make
0,2020,1,10001,Alabama,1,1,0,0,0,4,...,,,,,,3,1,31.0,Black or African American,Jeep / Kaiser-Jeep / Willys- Jeep
1,2020,1,10002,Alabama,4,4,0,0,0,6,...,,,,,,1,0,34.0,Black or African American,Lexus
2,2020,1,10003,Alabama,2,2,0,0,0,2,...,,,,,,1,0,32.0,White,Toyota
3,2020,1,10004,Alabama,1,1,0,0,0,5,...,,,,,,1,0,32.0,Black or African American,Infiniti
4,2020,1,10005,Alabama,1,1,0,0,0,1,...,,,,,,1,0,30.0,White,Dodge
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36159,2020,56,560111,Wyoming,2,2,0,0,0,2,...,,,,,,1,0,,,
36160,2020,56,560112,Wyoming,3,3,0,0,0,8,...,,,,,,2,0,,,
36161,2020,56,560113,Wyoming,3,3,0,0,0,9,...,,,,,,1,0,,,
36162,2020,56,560114,Wyoming,2,2,0,1,1,3,...,,,,,,1,0,,,


In [3]:
# The raw DataFrame has many redundant columns; build a new DataFrame that keeps
# only the columns of focus for this exploration stage.
Cleaned_df = Accidents_2020.loc[
    :,
    [
        "caseyear",
        "st_case",
        "statename",
        "persons",
        "countyname",
        "monthname",
        "day_week",
        "hour",
        "rur_urbname",
        "routename",
        "latitude",
        "longitud",
        "harm_evname",
        "lgt_condname",
        "weathername",
        "fatals",
        "drunk_dr",
        "age",
        "race",
        "vehicle_make",
    ],
]
Cleaned_df

Unnamed: 0,caseyear,st_case,statename,persons,countyname,monthname,day_week,hour,rur_urbname,routename,latitude,longitud,harm_evname,lgt_condname,weathername,fatals,drunk_dr,age,race,vehicle_make
0,2020,10001,Alabama,4,ELMORE (51),January,4,2,Rural,County Road,32.433133,-86.094850,Tree (Standing Only),Dark - Not Lighted,Clear,3,1,31.0,Black or African American,Jeep / Kaiser-Jeep / Willys- Jeep
1,2020,10002,Alabama,6,JEFFERSON (73),January,5,17,Urban,Local Street - Municipality,33.484658,-86.839544,Motor Vehicle In-Transport,Dark - Lighted,Rain,1,0,34.0,Black or African American,Lexus
2,2020,10003,Alabama,2,SHELBY (117),January,5,14,Rural,State Highway,33.299942,-86.369642,Ditch,Daylight,Rain,1,0,32.0,White,Toyota
3,2020,10004,Alabama,5,CALHOUN (15),January,6,15,Rural,County Road,33.795072,-85.883486,Tree (Standing Only),Daylight,Cloudy,1,0,32.0,Black or African American,Infiniti
4,2020,10005,Alabama,1,COOSA (37),January,7,0,Rural,County Road,32.848414,-86.083547,Tree (Standing Only),Dark - Not Lighted,Rain,1,0,30.0,White,Dodge
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36159,2020,560111,Wyoming,2,LARAMIE (21),December,7,16,Rural,County Road,41.211981,-105.123781,Motor Vehicle In-Transport,Dark - Not Lighted,Snow,1,0,,,
36160,2020,560112,Wyoming,8,LINCOLN (23),December,6,18,Rural,U.S. Highway,42.814131,-110.957272,Motor Vehicle In-Transport,Dark - Not Lighted,Clear,2,0,,,
36161,2020,560113,Wyoming,9,FREMONT (13),December,6,17,Rural,State Highway,42.993033,-108.406514,Motor Vehicle In-Transport,Dark - Not Lighted,Snow,1,0,,,
36162,2020,560114,Wyoming,3,LARAMIE (21),December,3,20,Urban,U.S. Highway,41.138478,-104.787133,Motor Vehicle In-Transport,Dark - Not Lighted,Clear,1,0,,,


In [4]:
# Discard every row that has a missing value in at least one of the selected columns.
Cleaned_df = Cleaned_df.dropna(axis="index", how="any")
Cleaned_df

Unnamed: 0,caseyear,st_case,statename,persons,countyname,monthname,day_week,hour,rur_urbname,routename,latitude,longitud,harm_evname,lgt_condname,weathername,fatals,drunk_dr,age,race,vehicle_make
0,2020,10001,Alabama,4,ELMORE (51),January,4,2,Rural,County Road,32.433133,-86.094850,Tree (Standing Only),Dark - Not Lighted,Clear,3,1,31.0,Black or African American,Jeep / Kaiser-Jeep / Willys- Jeep
1,2020,10002,Alabama,6,JEFFERSON (73),January,5,17,Urban,Local Street - Municipality,33.484658,-86.839544,Motor Vehicle In-Transport,Dark - Lighted,Rain,1,0,34.0,Black or African American,Lexus
2,2020,10003,Alabama,2,SHELBY (117),January,5,14,Rural,State Highway,33.299942,-86.369642,Ditch,Daylight,Rain,1,0,32.0,White,Toyota
3,2020,10004,Alabama,5,CALHOUN (15),January,6,15,Rural,County Road,33.795072,-85.883486,Tree (Standing Only),Daylight,Cloudy,1,0,32.0,Black or African American,Infiniti
4,2020,10005,Alabama,1,COOSA (37),January,7,0,Rural,County Road,32.848414,-86.083547,Tree (Standing Only),Dark - Not Lighted,Rain,1,0,30.0,White,Dodge
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36148,2020,560100,Wyoming,8,CAMPBELL (5),October,5,9,Rural,State Highway,43.843164,-105.464778,Motor Vehicle In-Transport,Daylight,Snow,2,0,34.0,White,GMC
36149,2020,560101,Wyoming,2,ALBANY (1),October,3,6,Rural,Interstate,41.138225,-105.355258,"Post, Pole or Other Supports",Dark - Not Lighted,Clear,1,0,84.0,White,Chevrolet
36150,2020,560102,Wyoming,1,ALBANY (1),October,7,15,Rural,County Road,41.379339,-105.543508,Rollover/Overturn,Daylight,Clear,1,0,61.0,White,Freightliner
36151,2020,560103,Wyoming,5,ALBANY (1),October,5,17,Rural,U.S. Highway,41.844653,-106.033519,Motor Vehicle In-Transport,Dusk,Snow,1,0,62.0,White,BMW


In [5]:
# Inspect the dtype pandas assigned to each column of Cleaned_df before converting any of them.
Cleaned_df.dtypes

caseyear          int64
st_case           int64
statename        object
persons           int64
countyname       object
monthname        object
day_week          int64
hour              int64
rur_urbname      object
routename        object
latitude        float64
longitud        float64
harm_evname      object
lgt_condname     object
weathername      object
fatals            int64
drunk_dr          int64
age             float64
race             object
vehicle_make     object
dtype: object

In [6]:
# Convert the age column from float to a fixed-width integer.
# "int64" is spelled out (rather than "int") so the result matches the other integer
# columns on every platform; a bare "int" can resolve to 32-bit on Windows with NumPy < 2.
# The cast relies on the rows with missing values having been dropped already:
# astype raises if the column still contains NaN.
Cleaned_df = Cleaned_df.astype({"age": "int64"})
Cleaned_df

Unnamed: 0,caseyear,st_case,statename,persons,countyname,monthname,day_week,hour,rur_urbname,routename,latitude,longitud,harm_evname,lgt_condname,weathername,fatals,drunk_dr,age,race,vehicle_make
0,2020,10001,Alabama,4,ELMORE (51),January,4,2,Rural,County Road,32.433133,-86.094850,Tree (Standing Only),Dark - Not Lighted,Clear,3,1,31,Black or African American,Jeep / Kaiser-Jeep / Willys- Jeep
1,2020,10002,Alabama,6,JEFFERSON (73),January,5,17,Urban,Local Street - Municipality,33.484658,-86.839544,Motor Vehicle In-Transport,Dark - Lighted,Rain,1,0,34,Black or African American,Lexus
2,2020,10003,Alabama,2,SHELBY (117),January,5,14,Rural,State Highway,33.299942,-86.369642,Ditch,Daylight,Rain,1,0,32,White,Toyota
3,2020,10004,Alabama,5,CALHOUN (15),January,6,15,Rural,County Road,33.795072,-85.883486,Tree (Standing Only),Daylight,Cloudy,1,0,32,Black or African American,Infiniti
4,2020,10005,Alabama,1,COOSA (37),January,7,0,Rural,County Road,32.848414,-86.083547,Tree (Standing Only),Dark - Not Lighted,Rain,1,0,30,White,Dodge
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36148,2020,560100,Wyoming,8,CAMPBELL (5),October,5,9,Rural,State Highway,43.843164,-105.464778,Motor Vehicle In-Transport,Daylight,Snow,2,0,34,White,GMC
36149,2020,560101,Wyoming,2,ALBANY (1),October,3,6,Rural,Interstate,41.138225,-105.355258,"Post, Pole or Other Supports",Dark - Not Lighted,Clear,1,0,84,White,Chevrolet
36150,2020,560102,Wyoming,1,ALBANY (1),October,7,15,Rural,County Road,41.379339,-105.543508,Rollover/Overturn,Daylight,Clear,1,0,61,White,Freightliner
36151,2020,560103,Wyoming,5,ALBANY (1),October,5,17,Rural,U.S. Highway,41.844653,-106.033519,Motor Vehicle In-Transport,Dusk,Snow,1,0,62,White,BMW


In [8]:
# Give every column of Cleaned_df a readable, self-explanatory name.
# NOTE(review): "Visability" (sic) and "Day_Name" (the day_week column appears to hold
# numbers, not names) are kept exactly as originally written — confirm whether later
# cells reference these names before correcting them.
Cleaned_df = Cleaned_df.rename(
    columns={
        "caseyear": "Year",
        "st_case": "Case_ID",
        "statename": "State",
        "persons": "Person_Count",
        "countyname": "County",
        "monthname": "Month",
        "day_week": "Day_Name",
        "hour": "Hour",
        "rur_urbname": "Rural_Urban",
        "routename": "Route_Name",
        "latitude": "Latitude",
        "longitud": "Longitude",
        "harm_evname": "Harm_Event",
        "lgt_condname": "Visability",
        "weathername": "Weather_Condition",
        "fatals": "Fatality_Count",
        "drunk_dr": "Drunk_Driver_Count",
        "age": "Age",
        "race": "Race",
        "vehicle_make": "Vehicle_Make",
    }
)
Cleaned_df

Unnamed: 0,Year,Case_ID,State,Person_Count,County,Month,Day_Name,Hour,Rural_Urban,Route_Name,Latitude,Longitude,Harm_Event,Visability,Weather_Condition,Fatality_Count,Drunk_Driver_Count,Age,Race,Vehicle_Make
0,2020,10001,Alabama,4,ELMORE (51),January,4,2,Rural,County Road,32.433133,-86.094850,Tree (Standing Only),Dark - Not Lighted,Clear,3,1,31,Black or African American,Jeep / Kaiser-Jeep / Willys- Jeep
1,2020,10002,Alabama,6,JEFFERSON (73),January,5,17,Urban,Local Street - Municipality,33.484658,-86.839544,Motor Vehicle In-Transport,Dark - Lighted,Rain,1,0,34,Black or African American,Lexus
2,2020,10003,Alabama,2,SHELBY (117),January,5,14,Rural,State Highway,33.299942,-86.369642,Ditch,Daylight,Rain,1,0,32,White,Toyota
3,2020,10004,Alabama,5,CALHOUN (15),January,6,15,Rural,County Road,33.795072,-85.883486,Tree (Standing Only),Daylight,Cloudy,1,0,32,Black or African American,Infiniti
4,2020,10005,Alabama,1,COOSA (37),January,7,0,Rural,County Road,32.848414,-86.083547,Tree (Standing Only),Dark - Not Lighted,Rain,1,0,30,White,Dodge
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36148,2020,560100,Wyoming,8,CAMPBELL (5),October,5,9,Rural,State Highway,43.843164,-105.464778,Motor Vehicle In-Transport,Daylight,Snow,2,0,34,White,GMC
36149,2020,560101,Wyoming,2,ALBANY (1),October,3,6,Rural,Interstate,41.138225,-105.355258,"Post, Pole or Other Supports",Dark - Not Lighted,Clear,1,0,84,White,Chevrolet
36150,2020,560102,Wyoming,1,ALBANY (1),October,7,15,Rural,County Road,41.379339,-105.543508,Rollover/Overturn,Daylight,Clear,1,0,61,White,Freightliner
36151,2020,560103,Wyoming,5,ALBANY (1),October,5,17,Rural,U.S. Highway,41.844653,-106.033519,Motor Vehicle In-Transport,Dusk,Snow,1,0,62,White,BMW
