In [2]:
import pandas as pd
import numpy as np
import copy
import holoviews as hv
import plotly.figure_factory as ff
import time
import matplotlib.pyplot as plt
from collections import Counter
import pickle

import geopandas as gpd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter


In [3]:
def fill_chart(time_of_day,tod_name,df,prob = False):
    weather_types = list(df.columns)
    if prob:
        div = np.shape(time_of_day)[0]
    else:
        div = 1
    acc_data = [np.shape(time_of_day[time_of_day[:,2] == weather])[0]/div for weather in weather_types]
    df.loc[tod_name] = acc_data 
    return df

In [4]:
filename = './archive/US_Accidents_Dec20.csv'
data = pd.read_csv(filename,usecols = ['ID','Start_Time','Weather_Condition','Start_Lat','Start_Lng','State','County'])
data = data.dropna()

In [5]:
#Remove data entries that cannot be accounted for
#Fix Weird Dona Ana
a = np.array(data.loc[data['County'].str.contains('Ana'),'County'])
weird_ana = a[a != 'Doña Ana'][0]
correct_ana = 'Doña Ana'
print(weird_ana)

clean_data = copy.deepcopy(data)
clean_data_np = clean_data.to_numpy()
county_names = clean_data_np[:,4]
county_names[county_names == weird_ana] = correct_ana
clean_data['County'] = county_names

#Drop unused counties
drop_counties = ['Dewitt IL','Harford PA','Kenosha IL', 'Oglala Lakota SD', 'Walworth IL']
clean_data_np = np.array([entry for entry in clean_data_np if entry[4]+' '+entry[5] not in drop_counties])
clean_data = pd.DataFrame(clean_data_np,columns = clean_data.columns)

Do�a Ana


In [6]:
#county_names = pd.read_csv(filename,usecols = ['County'])

In [7]:
file = open("mod_fip_dict.pkl", "rb")
fip_dict = pickle.load(file)

In [8]:
fip_dict

{'Abbeville SC': '45001',
 'Acadia LA': '22001',
 'Accomack VA': '51001',
 'Ada ID': '16001',
 'Adair IA': '19001',
 'Adair KY': '21001',
 'Adair MO': '29001',
 'Adams CO': '08001',
 'Adams IA': '19003',
 'Adams ID': '16003',
 'Adams IL': '17001',
 'Adams IN': '18001',
 'Adams MS': '28001',
 'Adams ND': '38001',
 'Adams NE': '31001',
 'Adams OH': '39001',
 'Adams PA': '42001',
 'Adams WA': '53001',
 'Adams WI': '55001',
 'Addison VT': '50001',
 'Aiken SC': '45003',
 'Aitkin MN': '27001',
 'Alachua FL': '12001',
 'Alamance NC': '37001',
 'Alameda CA': '06001',
 'Alamosa CO': '08003',
 'Albany NY': '36001',
 'Albany WY': '56001',
 'Albemarle VA': '51003',
 'Alcona MI': '26001',
 'Alcorn MS': '28003',
 'Alexander IL': '17003',
 'Alexander NC': '37003',
 'Alexandria City VA': '51510',
 'Alexandria VA': '51510',
 'Allamakee IA': '19005',
 'Allegan MI': '26005',
 'Allegany MD': '24001',
 'Allegany NY': '36003',
 'Alleghany NC': '37005',
 'Alleghany VA': '51005',
 'Allegheny PA': '42003',
 'A

In [9]:
state_dict = {}
state = 'AL'
for key in fip_dict:
    if key.split(' ')[-1] == state:
        state_dict[key] = fip_dict[key]
state_dict

{'Autauga AL': '01001',
 'Baldwin AL': '01003',
 'Barbour AL': '01005',
 'Bibb AL': '01007',
 'Blount AL': '01009',
 'Bullock AL': '01011',
 'Butler AL': '01013',
 'Calhoun AL': '01015',
 'Chambers AL': '01017',
 'Cherokee AL': '01019',
 'Chilton AL': '01021',
 'Choctaw AL': '01023',
 'Clarke AL': '01025',
 'Clay AL': '01027',
 'Cleburne AL': '01029',
 'Coffee AL': '01031',
 'Colbert AL': '01033',
 'Conecuh AL': '01035',
 'Coosa AL': '01037',
 'Covington AL': '01039',
 'Crenshaw AL': '01041',
 'Cullman AL': '01043',
 'Dale AL': '01045',
 'Dallas AL': '01047',
 'De Kalb AL': '01049',
 'DeKalb AL': '01049',
 'Elmore AL': '01051',
 'Escambia AL': '01053',
 'Etowah AL': '01055',
 'Fayette AL': '01057',
 'Franklin AL': '01059',
 'Geneva AL': '01061',
 'Greene AL': '01063',
 'Hale AL': '01065',
 'Henry AL': '01067',
 'Houston AL': '01069',
 'Jackson AL': '01071',
 'Jefferson AL': '01073',
 'Lamar AL': '01075',
 'Lauderdale AL': '01077',
 'Lawrence AL': '01079',
 'Lee AL': '01081',
 'Limeston

In [20]:
date_timestamps = np.array([data.split() for data in clean_data['Start_Time'].to_numpy()])
dates = date_timestamps[:,0]
timestamps = date_timestamps[:,1]

In [21]:
timestamps

array(['05:46:00', '06:07:59', '06:49:27', ..., '19:00:21', '19:00:21',
       '18:52:06'], dtype='<U18')

In [22]:
time_of_day = []
for time in timestamps:
    hour = int(time.split(':')[0])
    if hour >= 6 and hour <= 11:
        time_of_day.append('day')
    elif hour >= 12 and hour <= 17:
        time_of_day.append('noon')
    else:
        time_of_day.append('night')
time_of_day = np.array(time_of_day)

In [23]:
time_of_day

array(['night', 'day', 'day', ..., 'night', 'night', 'night'], dtype='<U5')

In [16]:
FIPS = []
for county,state in zip(clean_data['County'],clean_data['State']):
    name = county + ' ' + state
    FIPS.append(fip_dict[name])
FIPS = np.array(FIPS)

In [17]:
FIPS

array(['39113', '39049', '39025', ..., '06059', '06037', '06071'],
      dtype='<U5')

In [24]:
res = 1
processed_data = np.array([clean_data['ID'].to_numpy(),timestamps,clean_data['Weather_Condition'],\
                           clean_data['Start_Lat'],clean_data['Start_Lng'],\
                           clean_data['County'],time_of_day,clean_data['State'],FIPS,dates],\
                           dtype = object).T
#processed_data = processed_data[np.arange(0,len(processed_data),res)]

In [25]:
processed_data.shape

(4134150, 10)

In [26]:
rain_terms = ['Rain','Precipitation','Drizzle','Hail','Squalls','Ice','Thunder','Thunderstorm','T-Storm','Showers in the Vicinity','Rain and Sleet']
cloudy_terms = ['Cloudy','Cloud','Clouds','Thunder in the Vicinity']
clear_terms = ['Clear','Fair']
snow_terms = ['Snow','Wintry','Sleet']
dust_terms = ['Dust','Ash','Sand','Tornado']
fog_terms = ['Mist','Fog','Smoke','Haze','Overcast']

In [27]:
update = True
if update:
    grouped_weather = []
    for ind,w_string in enumerate(processed_data[:,2]):
        weather = np.array(w_string.split())
        if w_string in fog_terms or any(word in fog_terms for word in weather):
            group = 'Fog'
        elif w_string in snow_terms or any(word in snow_terms for word in weather):
            group = 'Snow'
        elif w_string in cloudy_terms or any(word in cloudy_terms for word in weather):
            group = 'Cloudy'
        elif w_string in rain_terms or any(word in rain_terms for word in weather):
            group = 'Rain'
        elif w_string in clear_terms or any(word in clear_terms for word in weather):
            group = 'Clear'
        elif w_string in dust_terms or any(word in dust_terms for word in weather):
            group = 'Dust'
        else:
            group = w_string
        processed_data[ind,2] = group


In [29]:
df = pd.DataFrame(processed_data,columns =  ['ID','Start_Time','Weather_Condition','Start_Lat','Start_Lng','County',\
                                            'Time of Day','State','FIP','Date'])
df.to_csv('processed_data.csv')

In [30]:
processed_data = pd.read_csv('processed_data.csv',\
                             usecols = ['ID','Start_Time','Weather_Condition','Start_Lat','Start_Lng','County',\
                                       'Time of Day','State','FIP','Date']).to_numpy()
processed_data

array([['A-1', '05:46:00', 'Rain', ..., 'OH', 39113, '2016-02-08'],
       ['A-2', '06:07:59', 'Rain', ..., 'OH', 39049, '2016-02-08'],
       ['A-3', '06:49:27', 'Fog', ..., 'OH', 39025, '2016-02-08'],
       ...,
       ['A-4239404', '19:00:21', 'Cloudy', ..., 'CA', 6059, '2019-08-23'],
       ['A-4239405', '19:00:21', 'Clear', ..., 'CA', 6037, '2019-08-23'],
       ['A-4239406', '18:52:06', 'Clear', ..., 'CA', 6071, '2019-08-23']],
      dtype=object)