In [34]:
import numpy as np
import pandas as pd
import requests

from plotnine import *

# source: https://gist.github.com/rogerallen/1583593
# Full state name -> postal abbreviation, fetched as JSON. requests applies no
# timeout by default, so set one, and surface HTTP errors instead of trying to
# decode an error page as JSON.
resp = requests.get('https://raw.githubusercontent.com/jwhendy/covid19/master/lib/state_abbrevs.json',
                    timeout=30)
resp.raise_for_status()
us_state_abbrev = resp.json()
# Inverse lookup (abbreviation -> full name), used to expand the ", XX" suffix
# of the census area names.
abbrev_us_state = {abbrev: name for name, abbrev in us_state_abbrev.items()}

# Census "USA Counties" workbooks. Each frame is cut down to
# (area name, STCOU code, one measure column); later cells rename these three
# columns positionally, so the column order here matters.
census_url = 'https://www2.census.gov/library/publications/2011/compendia/usa-counties/excel/'

pop = pd.read_excel(census_url + 'POP01.xls')
pop = pop[['Area_name', 'STCOU', 'POP010210D']]

land = pd.read_excel(census_url + 'LND01.xls')
land = land[['Areaname', 'STCOU', 'LND110210D']]

# The age column used here lives on the second sheet of the workbook.
age = pd.read_excel(census_url + 'AGE01.xls', sheet_name='Sheet2')
age = age[['Areaname', 'STCOU', 'AGE050210D']]

inc = pd.read_excel(census_url + 'INC01.xls')
inc = inc[['Area_name', 'STCOU', 'INC110209D']]

In [64]:
# Preview the first five rows of the trimmed age table.
age.head(n=5)

Unnamed: 0,Areaname,STCOU,AGE050210D
0,UNITED STATES,0,37.2
1,ALABAMA,1000,37.9
2,"Autauga, AL",1001,37.0
3,"Baldwin, AL",1003,41.1
4,"Barbour, AL",1005,39.0


In [49]:
### population data: saves out state, county, fips, and population
df = pop.copy()
df.columns = ['area', 'fips', 'pop']

# County rows look like "Autauga, AL". Split once: part 0 is the county name,
# part 1 the state abbreviation (missing for rows without a ", XX" suffix).
area_parts = df['area'].str.split(', ', expand=True)
df['state'] = area_parts[1]
df['county'] = area_parts[0]
df = df[['state', 'county', 'fips', 'pop']]

# Expand the abbreviation with the abbrev_us_state dict built in the first
# cell (the notebook never imports a `state_abbrevs` module). Anything that is
# not a known abbreviation maps to NaN and is dropped.
df['state'] = df['state'].map(abbrev_us_state)
df = df[df['state'].notna()]

df.to_csv('data/population.csv', index=False)
df.head()

Unnamed: 0,state,county,fips,pop
2,Alabama,Autauga,1001,54571
3,Alabama,Baldwin,1003,182265
4,Alabama,Barbour,1005,27457
5,Alabama,Bibb,1007,22915
6,Alabama,Blount,1009,57322


In [50]:
### land area: saves out state, county, fips, and land in square miles
df = land.copy()
df.columns = ['area', 'fips', 'land_sqm']

# County rows look like "Autauga, AL". Split once: part 0 is the county name,
# part 1 the state abbreviation (missing for rows without a ", XX" suffix).
area_parts = df['area'].str.split(', ', expand=True)
df['state'] = area_parts[1]
df['county'] = area_parts[0]
df = df[['state', 'county', 'fips', 'land_sqm']]

# Expand the abbreviation with the abbrev_us_state dict built in the first
# cell (the notebook never imports a `state_abbrevs` module). Anything that is
# not a known abbreviation maps to NaN and is dropped.
df['state'] = df['state'].map(abbrev_us_state)
df = df[df['state'].notna()]

df.to_csv('data/land_sqm.csv', index=False)
df.head()

Unnamed: 0,state,county,fips,land_sqm
2,Alabama,Autauga,1001,594.44
3,Alabama,Baldwin,1003,1589.78
4,Alabama,Barbour,1005,884.88
5,Alabama,Bibb,1007,622.58
6,Alabama,Blount,1009,644.78


In [62]:
### age: saves out state, county, fips, and median age
df = age.copy()
df.columns = ['area', 'fips', 'age_med']

# County rows look like "Autauga, AL". Split once: part 0 is the county name,
# part 1 the state abbreviation (missing for rows without a ", XX" suffix,
# e.g. "UNITED STATES" and "ALABAMA").
area_parts = df['area'].str.split(', ', expand=True)
df['state'] = area_parts[1]
df['county'] = area_parts[0]
df = df[['state', 'county', 'fips', 'age_med']]

# Expand the abbreviation with the abbrev_us_state dict built in the first
# cell (the notebook never imports a `state_abbrevs` module). Anything that is
# not a known abbreviation maps to NaN and is dropped.
df['state'] = df['state'].map(abbrev_us_state)
df = df[df['state'].notna()]

df.to_csv('data/age_median.csv', index=False)
df.head()

Unnamed: 0,state,county,fips,age_med
2,Alabama,Autauga,1001,37.0
3,Alabama,Baldwin,1003,41.1
4,Alabama,Barbour,1005,39.0
5,Alabama,Bibb,1007,37.8
6,Alabama,Blount,1009,39.0


In [63]:
### income: saves out state, county, fips, and median income 2005-2009
df = inc.copy()
df.columns = ['area', 'fips', 'inc_med']

# County rows look like "Autauga, AL". Split once: part 0 is the county name,
# part 1 the state abbreviation (missing for rows without a ", XX" suffix).
area_parts = df['area'].str.split(', ', expand=True)
df['state'] = area_parts[1]
df['county'] = area_parts[0]
df = df[['state', 'county', 'fips', 'inc_med']]

# Expand the abbreviation with the abbrev_us_state dict built in the first
# cell (the notebook never imports a `state_abbrevs` module). Anything that is
# not a known abbreviation maps to NaN and is dropped.
df['state'] = df['state'].map(abbrev_us_state)
df = df[df['state'].notna()]

df.to_csv('data/income_median.csv', index=False)
df.head()

In [34]:
### pulls all mobility data from google
# - google: https://www.google.com/covid19/mobility/
# - quasi api used below: https://github.com/datasciencecampus/mobility-report-data-extractor
import datetime
import pandas as pd
import os
import re
import subprocess
import time

# The six segment names, repeated twice (12 entries). Used positionally for
# the six summary values and as the whitelist of valid segment labels.
seg_list = [x for _ in range(2)
            for x in ['Retail & recreation', 'Grocery & pharmacy', 'Parks',
                      'Transit stations', 'Workplace', 'Residential']]
# Project root; set COVID19_ROOT to run on another machine. The default is the
# original hard-coded location.
path = os.environ.get('COVID19_ROOT', '/home/jwhendy/vault/personal/covid19/')
dir_mob = 'mobility-report-data-extractor'
### run to re-download and process reports
#subprocess.call(['./lib/mobility-script.sh'])
# One entry per extractor output directory whose name starts with 'US'.
areas = [d for d in os.listdir(os.path.join(path, dir_mob, 'output'))
         if d.startswith('US')]

start = time.time()
data_all = []
for area_dir in areas:
    ### re-process pdfs to text
    report_stem = os.path.join(path, dir_mob, 'pdfs', area_dir)
    rc = subprocess.call(['/usr/bin/pdftotext', '-layout', '-raw',
                          f'{report_stem}.pdf', f'{report_stem}.txt'])
    if rc != 0:
        # Without this check a failed conversion would silently parse a stale
        # .txt left over from an earlier run.
        raise RuntimeError(f'pdftotext exited with status {rc} for {report_stem}.pdf')
    with open(f'{report_stem}.txt') as fh:
        # Whitespace-only lines are dropped, so every offset below counts
        # non-blank lines only.
        lines = [l for l in fh.read().split('\n') if l.strip()]

    # The second non-blank line is parsed as "<area name> <Month> <day>, <year>".
    header = re.split(', | ', lines[1])
    date = f'{header[-1]}-{header[-3]}-{header[-2]}'
    # Parsed (and thereby validated) but not stored in the rows below.
    date = datetime.datetime.strptime(date, '%Y-%B-%d').strftime('%Y-%m-%d')
    state_name = ' '.join(header[:-3])

    data = []
    for i, line in enumerate(lines):
        # Summary block: the heading must occur within the first 20 lines.
        if 'Retail & recreation' in line and i < 20:
            # The six headline percentages sit at fixed offsets from the heading.
            vals = [re.sub(r'%|\+', '', lines[i + x]) for x in [1, 13, 26, 38, 49, 59]]
            data.extend(
                {'state': state_name, 'county': 'summary', 'seg': seg_list[k],
                 'conf': None, 'value': vals[k]}
                for k in range(6)
            )
        # A form feed past line 50 is treated as the start of a page holding
        # two locations with six segments each.
        if '\f' in line and i > 50:
            locs = [x.strip() for x in [lines[i], lines[i + 13]] for _ in range(6)]
            # Heuristic: keep only names of at most three space-separated words.
            # NOTE(review): asts/vals/segs below are indexed by position in the
            # *filtered* list, so dropping the first location but keeping the
            # second would pair it with the first location's values -- confirm
            # this cannot occur in the reports.
            locs = [l for l in locs if len(l.split(' ')) < 4]
            # The line just before each line starting with 'Sun' is taken as
            # the asterisk marker: '*' -> conf 0, anything else -> conf 1.
            asts = [lines[i + n - 1] for n, x in enumerate(lines[i:i + 110]) if x.startswith('Sun')]
            asts = [0 if ast == '*' else 1 for ast in asts]
            vals = [re.sub(r'%|\+|compared to baseline', '', lines[i + x])
                    for x in [2, 4, 6, 8, 10, 12, 15, 17, 19, 21, 23, 25]]
            vals = [val.strip(' ') if val != 'Not enough data for this date' else None for val in vals]
            segs = [lines[i + n + 1] for n in [0, 2, 4, 6, 8, 10, 13, 15, 17, 19, 21, 23]]
            for j, loc in enumerate(locs):
                # Skip slots whose label line is not a known segment name.
                if segs[j] not in seg_list:
                    continue
                data.append({'state': state_name, 'county': loc, 'seg': segs[j],
                             'conf': asts[j], 'value': vals[j]})

    # Number this report's rows from 1 and derive the relative path of the
    # per-row time-series CSV (spaces in the state name become underscores).
    for seq, d in enumerate(data, start=1):
        d['i'] = seq
        d['path'] = f"output/US-{d['state'].replace(' ', '_')}/csv/{seq}.csv"

    data_all.extend(data)
end = time.time()
print(end-start)

df = pd.DataFrame(data_all)
# Keep only the text before " County" in each county name.
df['county'] = df['county'].str.split(' County', expand=True)[0]
# Keep county-level rows only: drop the per-report summaries and the national report.
df = df[(df['county'] != 'summary') & (df['state'] != 'United States')].copy()
df['value'] = pd.to_numeric(df['value'])
df['conf'] = pd.to_numeric(df['conf'])

8.991184711456299


In [35]:
# Write the aggregate mobility table, keeping only the five analysis columns.
save_cols = ['state', 'county', 'seg', 'conf', 'value']
df_save = df[save_cols].copy()
df_save.to_csv(path_or_buf='data/mobility-data-agg.csv', index=False)
#pd.set_option('display.width', 1000)
#print(df_save[(df_save.state=='Ohio') & (df_save.county=='Lucas')])

In [36]:
# Load the per-row time-series CSV referenced by each aggregate row and tag it
# with that row's segment, state and county.
# NOTE(review): three states are skipped with no recorded reason -- confirm
# whether that is still needed.
skipped_states = ('Texas', 'Virginia', 'Wisconsin')

ts_all = []
df2 = df.copy()
for i, row in df2.iterrows():
    if row.state in skipped_states:
        continue
    csv_file = os.path.join(path, dir_mob, row.path)
    ts = pd.read_csv(csv_file)[['value', 'date']]
    ts = ts.assign(seg=row.seg, state=row['state'], county=row['county'])
    ts_all.append(ts)

# Stack all series and put the identifying columns first.
df_ts = pd.concat(ts_all)[['state', 'county', 'seg', 'date', 'value']]
df_ts

Unnamed: 0,state,county,seg,date,value
0,Alabama,Autauga,Retail & recreation,23/02/2020,-0.172
1,Alabama,Autauga,Retail & recreation,24/02/2020,-2.062
2,Alabama,Autauga,Retail & recreation,25/02/2020,9.883
3,Alabama,Autauga,Retail & recreation,26/02/2020,17.471
4,Alabama,Autauga,Retail & recreation,27/02/2020,13.809
...,...,...,...,...,...
38,Wyoming,Weston,Residential,01/04/2020,
39,Wyoming,Weston,Residential,02/04/2020,
40,Wyoming,Weston,Residential,03/04/2020,
41,Wyoming,Weston,Residential,04/04/2020,


In [38]:
#df_ts[(df_ts.state=='Ohio') & (df_ts.county=='Mahoning')]
#print(df_ts[(df_ts.state=='Ohio') & (df_ts.county=='Lucas') & (df_ts.seg=='Parks')])
# Write the combined time-series table without the (repeating) row index.
df_ts.to_csv(path_or_buf='data/mobility-data-ts.csv', index=False)

In [184]:
# Stay-at-home order date per state as 'YYYY-MM-DD'; None is used where no
# date is recorded.
# NOTE(review): Mississippi has no entry -- confirm whether that is intentional.
sah_dates = {
 'Alabama': '2020-04-04',
 'Alaska': '2020-03-28',
 'Arizona': '2020-03-31',
 'Arkansas': None,
 'California': '2020-03-19',
 'Colorado': '2020-03-26',
 'Connecticut': '2020-03-23',
 'Delaware': '2020-03-24',
 'District of Columbia': '2020-04-01',
 'Florida': '2020-04-03',
 'Georgia': '2020-04-03',
 'Hawaii': '2020-03-25',
 'Iowa': None,
 'Idaho': '2020-03-25',
 'Illinois': '2020-03-21',
 'Indiana': '2020-03-24',
 'Kansas': '2020-03-30',
 'Kentucky': '2020-03-26',
 'Louisiana': '2020-03-23',
 'Maine': '2020-04-02',
 # Was the truncated string '2020-'. The statewide advisory took effect on
 # 2020-03-24 -- verify against the state order.
 'Massachusetts': '2020-03-24',
 'Maryland': '2020-03-30',
 'Michigan': '2020-03-24',
 'Minnesota': '2020-03-27',
 'Missouri': '2020-04-06',
 'Montana': '2020-03-28',
 'New Hampshire': '2020-03-27',
 'New Jersey': '2020-03-21',
 'New Mexico': '2020-03-24',
 'New York': '2020-03-22',
 'North Carolina': '2020-03-30',
 'North Dakota': None,
 'Nebraska': None,
 'Nevada': None,
 'Ohio': '2020-03-23',
 'Oklahoma': '2020-03-28', # no statewide order, mean of implementing cities
 'Oregon': '2020-03-23',
 'Pennsylvania': '2020-04-01',
 'Puerto Rico': '2020-03-15',
 'Rhode Island': '2020-03-28',
 'South Carolina': '2020-04-07',
 'South Dakota': None,
 'Tennessee': '2020-03-31',
 'Texas': '2020-04-02',
 'Utah': '2020-03-30', # no statewide order, mean of implementing cities
 'Vermont': '2020-03-25',
 'Virginia': '2020-03-30',
 'Washington': '2020-03-23',
 'West Virginia': '2020-03-24',
 'Wisconsin': '2020-03-25',
 'Wyoming': '2020-03-28'
}

import json
import os

# Make sure the output directory exists so the write cannot fail on a fresh
# checkout.
os.makedirs('data', exist_ok=True)
with open('data/stay-at-home-dates.json', 'w') as f:
    json.dump(sah_dates, f)

Unnamed: 0,state,county,seg,date,value
0,Alabama,Autauga,Retail & recreation,2020-02-16,0.173
1,Alabama,Autauga,Retail & recreation,2020-02-17,7.732
2,Alabama,Autauga,Retail & recreation,2020-02-18,-1.675
3,Alabama,Autauga,Retail & recreation,2020-02-19,-1.663
4,Alabama,Autauga,Retail & recreation,2020-02-20,-7.739
...,...,...,...,...,...
38,Wyoming,Weston,Residential,2020-03-25,
39,Wyoming,Weston,Residential,2020-03-26,
40,Wyoming,Weston,Residential,2020-03-27,
41,Wyoming,Weston,Residential,2020-03-28,


In [159]:
# scale_x_datetime() needs a datetime x column, but the notebook builds
# df_ts['date'] from CSV text. Parse it here so the plot does not depend on a
# conversion done outside the saved cells. ISO dates are tried first, then the
# dd/mm/YYYY form; a column that is already datetime is expected to pass
# through the first call as is.
try:
    plot_dates = pd.to_datetime(df_ts['date'], format='%Y-%m-%d')
except ValueError:
    plot_dates = pd.to_datetime(df_ts['date'], format='%d/%m/%Y')
df_plot = df_ts.assign(date=plot_dates)

# One faint line per state+county, one panel per mobility segment.
p = (
    ggplot(df_plot, aes(x='date', y='value', group='state+county'))
    + geom_line(alpha=0.05, size=0.3)
    + facet_wrap('~seg', nrow=3)
    + scale_x_datetime()
    + theme_bw()
    + theme(axis_text_x=element_text(angle=315, hjust=0))
)

In [160]:
# Render the faceted mobility figure to a PNG file.
p.save(
    filename='mobility-by-segment.png',
    width=10,
    height=6,
    dpi=150,
)

In [176]:
# Plain-text view of the last ten time-series rows.
last_rows = df_ts.iloc[-10:]
print(last_rows)


      state  county          seg        date  value
33  Wyoming  Weston  Residential  2020-03-20    NaN
34  Wyoming  Weston  Residential  2020-03-21    NaN
35  Wyoming  Weston  Residential  2020-03-22    NaN
36  Wyoming  Weston  Residential  2020-03-23    NaN
37  Wyoming  Weston  Residential  2020-03-24    NaN
38  Wyoming  Weston  Residential  2020-03-25    NaN
39  Wyoming  Weston  Residential  2020-03-26    NaN
40  Wyoming  Weston  Residential  2020-03-27    NaN
41  Wyoming  Weston  Residential  2020-03-28    NaN
42  Wyoming  Weston  Residential  2020-03-29    NaN
