<h1>Dependencies and Load Data</h1>

In [1]:
#import dependencies
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
#Load CSV Files
# Each source below is read into its own raw DataFrame (df_accidents, df_dl,
# df_statepop, df_countypop, df_citypop); the cleaning cells further down start from these.

#Source: Sobhan Moosavi - "US Accidents (2.25 million records) - A Countrywide Traffic Accident Dataset (2016-2019)"
# https://www.kaggle.com/sobhanmoosavi/us-accidents
# NOTE(review): the path sits under an "ignore" folder - presumably excluded from version control
# because of the file size; confirm the file is present locally before running.
file = 'Resources/ignore/US_Accidents_May19.csv'
df_accidents = pd.read_csv(file)

#Source: US Department of Transportation - "Highway Statistic 2017 - Licensed Drivers by State"
# https://www.fhwa.dot.gov/policyinformation/statistics/2017/
dl_file = 'Resources/US_DLCount.csv'
# thousands=',' lets pandas parse comma-grouped figures (e.g. 1,234,567) as numbers
df_dl = pd.read_csv(dl_file, thousands=',')

#Source: American Community Survey - "Annual Estimates of the Resident Population..."
#https://www.census.gov/content/census/en/data/tables/time-series/demo/popest/2010s-state-total.html#par_textimage_1574439295
statepop_file = 'Resources/US_State_Population.csv'
# read as ISO-8859-1 (Latin-1) instead of the default UTF-8; comma-grouped figures parsed as numbers
df_statepop = pd.read_csv(statepop_file, encoding='iso-8859-1', thousands=',')

#Source: American Community Survey - "County Population Totals and Components of Change"
# https://www.census.gov/content/census/en/data/tables/time-series/demo/popest/2010s-counties-total.html#par_textimage_242301767

countypop_file = 'Resources/US_County_Population.csv'
# NOTE(review): unlike the state file, no thousands=',' is passed here - confirm the
# population columns in this CSV are not comma-grouped.
df_countypop = pd.read_csv(countypop_file, encoding='iso-8859-1')

#Source: American Community Survey - "Annual Estimates of the Resident Population for Incorporated Places of 50,000 or More"
#https://www.census.gov/content/census/en/data/tables/time-series/demo/popest/2010s-total-cities-and-towns.html
citypop_file = 'Resources/US_Cities_Population.csv'
# NOTE(review): no thousands=',' here either - confirm against the CSV.
df_citypop = pd.read_csv(citypop_file, encoding='iso-8859-1')

# TODO(Diana): review and share the CSVs below; the loads stay disabled until then.
# #Source:
# file1 = 'Resources/TMC_decoding.csv'
# TMC_decoding = pd.read_csv(file1)

# #Source:
# file2 = 'Resources/denver_weather_num_days.csv'
# denver_weather_days = pd.read_csv(file2)
# denver_weather_days.set_index('weather_denver', inplace = True)

# #Source:
# file3 = 'Resources/rhode_island_weather_num_days_calc.csv'
# ri_weather_days = pd.read_csv(file3)
# ri_weather_days.set_index('weather_ri', inplace = True)

In [3]:
# Lookup table: full state / territory name -> two-letter postal code.
# The cleaning cells pass it to DataFrame.replace to turn name columns into codes.
# Built from (name, code) pairs; the pair order is the dictionary's insertion order.
us_state_abbrev = dict([
    ("Alabama", "AL"), ("Alaska", "AK"), ("Arizona", "AZ"), ("Arkansas", "AR"),
    ("California", "CA"), ("Colorado", "CO"), ("Connecticut", "CT"), ("Delaware", "DE"),
    ("District of Columbia", "DC"), ("Florida", "FL"), ("Georgia", "GA"), ("Hawaii", "HI"),
    ("Idaho", "ID"), ("Illinois", "IL"), ("Indiana", "IN"), ("Iowa", "IA"),
    ("Kansas", "KS"), ("Kentucky", "KY"), ("Louisiana", "LA"), ("Maine", "ME"),
    ("Maryland", "MD"), ("Massachusetts", "MA"), ("Michigan", "MI"), ("Minnesota", "MN"),
    ("Mississippi", "MS"), ("Missouri", "MO"), ("Montana", "MT"), ("Nebraska", "NE"),
    ("Nevada", "NV"), ("New Hampshire", "NH"), ("New Jersey", "NJ"), ("New Mexico", "NM"),
    ("New York", "NY"), ("North Carolina", "NC"), ("North Dakota", "ND"),
    ("Northern Mariana Islands", "MP"), ("Ohio", "OH"), ("Oklahoma", "OK"),
    ("Oregon", "OR"), ("Palau", "PW"), ("Pennsylvania", "PA"), ("Puerto Rico", "PR"),
    ("Rhode Island", "RI"), ("South Carolina", "SC"), ("South Dakota", "SD"),
    ("Tennessee", "TN"), ("Texas", "TX"), ("Utah", "UT"), ("Vermont", "VT"),
    ("Virgin Islands", "VI"), ("Virginia", "VA"), ("Washington", "WA"),
    ("West Virginia", "WV"), ("Wisconsin", "WI"), ("Wyoming", "WY"),
])

<h1>Accidents Data</h1>

In [4]:
# Derive year / month / hour text columns from "Start_Time", drop the 2015 and
# 2019 records, and write the result to disk.
#
# The pieces are pulled out with the vectorised .str accessor rather than
# Series.agg(lambda ...): agg is meant for reductions, recent pandas releases
# deprecate handing it an element-wise function, and the .str methods propagate
# a missing Start_Time as NaN instead of raising AttributeError.
start_time = df_accidents["Start_Time"]

# .assign() returns a new frame, so the raw df_accidents is no longer modified
# through an alias and this cell gives the same result when re-run.
df_clean_accidents = df_accidents.assign(
    # first and second "-"-separated fields of the timestamp text
    Start_Year=start_time.str.split("-").str[0],
    Start_Month=start_time.str.split("-").str[1],
    # second whitespace-separated token is the clock time; keep its first ":" field
    Start_Hr=start_time.str.split().str[1].str.split(":").str[0],
)

#Remove 2015 and 2019
# Start_Year holds strings, so the comparison values are strings as well.
excluded_years = ["2015", "2019"]
df_clean_accidents = df_clean_accidents[~df_clean_accidents["Start_Year"].isin(excluded_years)]

# The default index is written as the first CSV column, as before.
df_clean_accidents.to_csv("Clean_Data/ignore/accidents.csv")

<h1>Licensed Drivers Data</h1>

In [5]:
#DL by State
# Clean the licensed-driver table: normalise the STATE labels, convert them to
# postal codes, rename the column to "State", and save.

# Trim whitespace FIRST. DataFrame.replace with a plain dict matches whole cell
# values exactly, so a padded label (e.g. "Alaska 2/ ") would otherwise slip
# past the fix-ups below and then never be converted to its abbreviation.
state_list = df_dl["STATE"].str.strip()

df_clean_dl = (
    df_dl
    .assign(STATE=state_list)
    # drop the footnote markers and expand the shortened DC label (applied frame-wide, as before)
    .replace({"Alaska 2/": "Alaska",
              "Hawaii 2/": "Hawaii",
              "Dist. of Col.": "District of Columbia"})
    # full name -> two-letter code, STATE column only; unmatched labels are left unchanged
    .replace({"STATE": us_state_abbrev})
    .rename(columns={'STATE': 'State'})
)
df_clean_dl.to_csv("Clean_Data/licensed_drivers.csv")

<h1>Population Data</h1>

<h2>State</h2>

In [6]:
# Clean the state population table: remove the "." characters from the
# "Geographic Area" labels, trim them, convert names to postal codes, and save.

# regex=False makes "." a literal dot on every pandas version; as a regular
# expression it would match any character and blank the whole label.
pop_state_list = (
    df_statepop["Geographic Area"]
    .str.replace(".", "", regex=False)
    .str.strip()
)

# .assign() builds a new frame, so the raw df_statepop is left untouched
# (the previous alias wrote the cleaned labels back into it).
df_clean_statepop = (
    df_statepop
    .assign(**{"Geographic Area": pop_state_list})
    # labels with no entry in us_state_abbrev are left unchanged
    .replace({"Geographic Area": us_state_abbrev})
)
df_clean_statepop.to_csv("Clean_Data/population_state.csv")

<h2>County</h2>

In [7]:
# Clean the county population table: shorten the column headers, split
# "Geography" into County / State, convert states to postal codes, and save.

# Header fragments to remove. Order matters and each pass strips the label
# before replacing, so this reproduces the original five passes exactly
# (there is deliberately no strip after the last replacement).
county_header_noise = (
    'Population Estimate (as of July 1) - ',
    'April 1, ',
    'Census',
    '-',
    'Estimates Base',
)
county_headers = list(df_countypop.columns)
for fragment in county_header_noise:
    county_headers = [name.strip().replace(fragment, '') for name in county_headers]

# Work on a copy so the raw df_countypop keeps its original headers and columns.
df_clean_countypop = df_countypop.copy()
df_clean_countypop.columns = county_headers

# "Geography" is split on commas: first piece -> County, second piece -> State.
# The .str accessor replaces Series.agg(lambda ...), which recent pandas
# deprecates for element-wise functions. A value without a comma now yields a
# missing State instead of raising IndexError.
geography_parts = df_clean_countypop["Geography"].str.split(",")
df_clean_countypop["County"] = geography_parts.str[0]

state_list = geography_parts.str[1].str.strip()
df_clean_countypop["State"] = state_list
# full name -> two-letter code; names missing from us_state_abbrev are left unchanged
df_clean_countypop = df_clean_countypop.replace({"State": us_state_abbrev})
# literal removal of the " County" suffix (regex=False keeps it literal on every pandas version)
df_clean_countypop['County'] = df_clean_countypop['County'].str.replace(" County", "", regex=False)

df_clean_countypop.to_csv("Clean_Data/population_county.csv")

<h2>City</h2>

In [8]:
# Clean the city population table: shorten the column headers, split
# "Geography.2" into City / State, convert states to postal codes, and save.

# Header fragments to remove. Order matters and each pass strips the label
# before replacing, so this reproduces the original five passes exactly
# (there is deliberately no strip after the last replacement).
city_header_noise = (
    'Population Estimate (as of July 1) - ',
    'April 1, ',
    'Census',
    '-',
    'Estimates Base',
)
city_headers = list(df_citypop.columns)
for fragment in city_header_noise:
    city_headers = [name.strip().replace(fragment, '') for name in city_headers]

# Work on a copy so the raw df_citypop keeps its original headers and columns.
df_clean_citypop = df_citypop.copy()
df_clean_citypop.columns = city_headers

# "Geography.2" is split on commas: first piece -> City, second piece -> State.
# The .str accessor replaces Series.agg(lambda ...), which recent pandas
# deprecates for element-wise functions. A value without a comma now yields a
# missing State instead of raising IndexError.
place_parts = df_clean_citypop["Geography.2"].str.split(",")
df_clean_citypop["City"] = place_parts.str[0]

state_list = place_parts.str[1].str.strip()
df_clean_citypop["State"] = state_list
# full name -> two-letter code; names missing from us_state_abbrev are left unchanged
df_clean_citypop = df_clean_citypop.replace({"State": us_state_abbrev})
# literal removal of the " city" suffix (regex=False keeps it literal on every pandas version)
df_clean_citypop['City'] = df_clean_citypop['City'].str.replace(" city", "", regex=False)

df_clean_citypop.to_csv("Clean_Data/population_city.csv")

<h1>TMC Decoding Data</h1>

In [9]:
# TODO(Diana): needs review - no TMC decoding cleanup has been added to this cell yet.

<h1>Denver Weather Data</h1>

In [10]:
# TODO(Diana): needs review - no Denver weather cleanup has been added to this cell yet.

<h1>Rhode Island Weather Data</h1>

In [11]:
# TODO(Diana): needs review - no Rhode Island weather cleanup has been added to this cell yet.