In [1]:
import numpy as np
import pandas as pd
import requests

from plotnine import *

# State name <-> postal abbreviation lookups.
# source: https://gist.github.com/rogerallen/1583593
abbrev_resp = requests.get('https://raw.githubusercontent.com/jwhendy/covid19/master/lib/state_abbrevs.json',
                           timeout=30)
# Surface a failed download as an HTTP error here instead of an opaque JSON decode error below.
abbrev_resp.raise_for_status()
us_state_abbrev = abbrev_resp.json()
# Inverted lookup (values of `us_state_abbrev` become the keys).
abbrev_us_state = {abbrev: name for name, abbrev in us_state_abbrev.items()}

# Census "USA Counties" workbooks; each frame keeps only the area name, the STCOU code
# and the single measure column used by the cells below.
pop = pd.read_excel('https://www2.census.gov/library/publications/2011/compendia/usa-counties/excel/POP01.xls')
pop = pop[['Area_name', 'STCOU', 'POP010210D']]
land = pd.read_excel('https://www2.census.gov/library/publications/2011/compendia/usa-counties/excel/LND01.xls')
land = land[['Areaname', 'STCOU', 'LND110210D']]
# AGE050210D is taken from the second sheet of the age workbook.
age = pd.read_excel('https://www2.census.gov/library/publications/2011/compendia/usa-counties/excel/AGE01.xls',
                    sheet_name='Sheet2')
age = age[['Areaname', 'STCOU', 'AGE050210D']]
inc = pd.read_excel('https://www2.census.gov/library/publications/2011/compendia/usa-counties/excel/INC01.xls')
inc = inc[['Area_name', 'STCOU', 'INC110209D']]

In [64]:
# Preview the first rows of the median-age table (area name, STCOU code, AGE050210D).
age.head()

Unnamed: 0,Areaname,STCOU,AGE050210D
0,UNITED STATES,0,37.2
1,ALABAMA,1000,37.9
2,"Autauga, AL",1001,37.0
3,"Baldwin, AL",1003,41.1
4,"Barbour, AL",1005,39.0


In [49]:
### population data: saves out state, county, fips, and population
df = pop.copy()
df.columns = ['area', 'fips', 'pop']
# Split "<county>, <abbrev>" once; areas without ', ' get a missing second part.
area_parts = df['area'].str.split(', ', expand=True)
df['county'] = area_parts[0]
# Translate the abbreviation with the `abbrev_us_state` dict built in the first cell
# (no `state_abbrevs` module is imported in this notebook).
df['state'] = area_parts[1].map(abbrev_us_state)
# Keep only rows whose state resolved; rows with no or unknown abbreviation are dropped.
df = df.loc[df['state'].notna(), ['state', 'county', 'fips', 'pop']]
df.to_csv('data/population.csv', index=False)
df.head()

Unnamed: 0,state,county,fips,pop
2,Alabama,Autauga,1001,54571
3,Alabama,Baldwin,1003,182265
4,Alabama,Barbour,1005,27457
5,Alabama,Bibb,1007,22915
6,Alabama,Blount,1009,57322


In [50]:
### land area: saves out state, county, fips, and land in square miles
df = land.copy()
df.columns = ['area', 'fips', 'land_sqm']
# Split "<county>, <abbrev>" once; areas without ', ' get a missing second part.
area_parts = df['area'].str.split(', ', expand=True)
df['county'] = area_parts[0]
# Translate the abbreviation with the `abbrev_us_state` dict built in the first cell
# (no `state_abbrevs` module is imported in this notebook).
df['state'] = area_parts[1].map(abbrev_us_state)
# Keep only rows whose state resolved; rows with no or unknown abbreviation are dropped.
df = df.loc[df['state'].notna(), ['state', 'county', 'fips', 'land_sqm']]
df.to_csv('data/land_sqm.csv', index=False)
df.head()

Unnamed: 0,state,county,fips,land_sqm
2,Alabama,Autauga,1001,594.44
3,Alabama,Baldwin,1003,1589.78
4,Alabama,Barbour,1005,884.88
5,Alabama,Bibb,1007,622.58
6,Alabama,Blount,1009,644.78


In [62]:
### age: saves out state, county, fips, and median age
df = age.copy()
df.columns = ['area', 'fips', 'age_med']
# Split "<county>, <abbrev>" once; areas without ', ' (e.g. "UNITED STATES", "ALABAMA")
# get a missing second part.
area_parts = df['area'].str.split(', ', expand=True)
df['county'] = area_parts[0]
# Translate the abbreviation with the `abbrev_us_state` dict built in the first cell
# (no `state_abbrevs` module is imported in this notebook).
df['state'] = area_parts[1].map(abbrev_us_state)
# Keep only rows whose state resolved; rows with no or unknown abbreviation are dropped.
df = df.loc[df['state'].notna(), ['state', 'county', 'fips', 'age_med']]
df.to_csv('data/age_median.csv', index=False)
df.head()

Unnamed: 0,state,county,fips,age_med
2,Alabama,Autauga,1001,37.0
3,Alabama,Baldwin,1003,41.1
4,Alabama,Barbour,1005,39.0
5,Alabama,Bibb,1007,37.8
6,Alabama,Blount,1009,39.0


In [63]:
### income: saves out state, county, fips, and median income 2005-2009
df = inc.copy()
df.columns = ['area', 'fips', 'inc_med']
# Split "<county>, <abbrev>" once; areas without ', ' get a missing second part.
area_parts = df['area'].str.split(', ', expand=True)
df['county'] = area_parts[0]
# Translate the abbreviation with the `abbrev_us_state` dict built in the first cell
# (no `state_abbrevs` module is imported in this notebook).
df['state'] = area_parts[1].map(abbrev_us_state)
# Keep only rows whose state resolved; rows with no or unknown abbreviation are dropped.
df = df.loc[df['state'].notna(), ['state', 'county', 'fips', 'inc_med']]
df.to_csv('data/income_median.csv', index=False)
df.head()

In [281]:
### pulls all mobility data from google
# - google: https://www.google.com/covid19/mobility/
# - quasi api used below: https://github.com/datasciencecampus/mobility-report-data-extractor
import datetime
import pandas as pd
import os
import re
import subprocess
import time

# Report date to load; only extractor CSVs whose file name contains it are read.
#data_date = '2020-04-05'
data_date = '2020-03-29'
# The six segment names, repeated twice back to back.
seg_list = ['Retail & recreation', 'Grocery & pharmacy', 'Parks',
            'Transit stations', 'Workplace', 'Residential'] * 2
path = '/home/jwhendy/vault/personal/covid19/'
dir_mob = 'mobility-report-data-extractor'
### run to re-download and process reports
#subprocess.call(['./lib/mobility-script.sh'])
# Extractor output folder, resolved once and reused for listing and reading.
mob_out_dir = os.path.join(path, dir_mob, 'output')
areas = [fname for fname in os.listdir(mob_out_dir)
         if fname.startswith('US') and fname.endswith('.csv') and data_date in fname]
# Stack every matching per-area CSV into one long frame.
df_ts = pd.concat([pd.read_csv(os.path.join(mob_out_dir, fname)) for fname in areas])
df_ts

Unnamed: 0,country,page_num,plot_num,region,plot_name,asterisk,date,value,headline
0,California,1,1,California,Retail & recreation,False,2020-02-16,5.235,-50%
1,California,1,1,California,Retail & recreation,False,2020-02-17,8.644,-50%
2,California,1,1,California,Retail & recreation,False,2020-02-18,-0.209,-50%
3,California,1,1,California,Retail & recreation,False,2020-02-19,0.425,-50%
4,California,1,1,California,Retail & recreation,False,2020-02-20,2.027,-50%
...,...,...,...,...,...,...,...,...,...
5929,Wyoming,13,138,Weston County,Residential,True,2020-03-25,,Not enough data for this date
5930,Wyoming,13,138,Weston County,Residential,True,2020-03-26,,Not enough data for this date
5931,Wyoming,13,138,Weston County,Residential,True,2020-03-27,,Not enough data for this date
5932,Wyoming,13,138,Weston County,Residential,True,2020-03-28,,Not enough data for this date


In [308]:
# Tidy the raw mobility time series into state/county/segment rows.
# NOTE(review): `df_ts2` is not assigned in any cell visible here -- presumably the frame
# produced by the loading cell when run with data_date = '2020-04-05'; confirm before a fresh run.
df_mod = df_ts2[['country', 'region', 'plot_name', 'asterisk', 'date', 'value', 'headline']].copy()
df_mod.columns = ['state', 'county', 'seg', 'conf', 'date', 'value', 'headline']
# Drop rows where the region is the state itself, leaving only sub-state regions.
df_mod = df_mod[df_mod['county'] != df_mod['state']]
# Drop these region names outright (presumably names that are ambiguous between a county
# and an independent city -- TODO confirm). `~` is the boolean negation of the mask.
df_mod = df_mod[~df_mod['county'].isin(['Baltimore', 'St. Louis', 'Fairfax', 'Franklin', 'Richmond', 'Roanoke'])].copy()
# Keep only the text before ' County'.
df_mod['county'] = df_mod['county'].str.split(' County', expand=True)[0]
# Strip a trailing report-date label from both name columns: two-space variant first,
# then the one-space variant, county before state each time.
for date_suffix in ('  April 5, 2020', ' April 5, 2020'):
    for name_col in ('county', 'state'):
        df_mod[name_col] = df_mod[name_col].str.split(date_suffix, expand=True)[0]
# conf is 1 when the source row had no asterisk, 0 when it did.
df_mod['conf'] = 1 - df_mod['conf'].astype(int)
# Reduce headline text such as '-50%' to a number. regex=False makes every token a literal
# on all pandas versions ('+' is a regex metacharacter); the "not enough data" message
# becomes '' and therefore NaN after to_numeric.
for token in ('%', '+', 'Not enough data for this date'):
    df_mod['headline'] = df_mod['headline'].str.replace(token, '', regex=False)
df_mod['headline'] = pd.to_numeric(df_mod['headline'])
df_mod
# Disabled export/inspection steps, kept for reference:
#df_mod = df_mod[['state', 'county', 'seg', 'conf', 'date', 'value']]
#df_mod.to_csv('data/mobility-data-ts_2020-04-05.csv', index=False)
#df_mod[(df_mod.state=='Colorado') & (df_mod.county=='Adams') & (df_mod.seg=='Parks')]

Unnamed: 0,state,county,seg,conf,date,value,headline
258,Alabama,Autauga,Retail & recreation,1,2020-02-23,-0.172,-53.0
259,Alabama,Autauga,Retail & recreation,1,2020-02-24,-2.062,-53.0
260,Alabama,Autauga,Retail & recreation,1,2020-02-25,9.883,-53.0
261,Alabama,Autauga,Retail & recreation,1,2020-02-26,17.471,-53.0
262,Alabama,Autauga,Retail & recreation,1,2020-02-27,13.809,-53.0
...,...,...,...,...,...,...,...
739939,Wyoming,Weston,Residential,0,2020-04-01,,
739940,Wyoming,Weston,Residential,0,2020-04-02,,
739941,Wyoming,Weston,Residential,0,2020-04-03,,
739942,Wyoming,Weston,Residential,0,2020-04-04,,


In [310]:
# Errata: for each state/county/segment take the last value and last headline of the group,
# and keep the series where the two differ by more than 1.
df_mod1 = (
    df_mod
    .groupby(['state', 'county', 'seg'], as_index=False)
    .agg({'value': 'last', 'headline': 'last'})
    .assign(abs_delta=lambda last_rows: (last_rows['value'] - last_rows['headline']).abs())
    .loc[lambda last_rows: last_rows['abs_delta'] > 1]
)
df_mod1.to_csv('data/mobility-errata_2020-04-05.csv', index=False)
df_mod1

Unnamed: 0,state,county,seg,value,headline,abs_delta
107,Alabama,Conecuh,Workplace,-44.184,-26.0,18.184
127,Alabama,Cullman,Parks,63.491,65.0,1.509
145,Alabama,DeKalb,Parks,9.439,-4.0,13.439
148,Alabama,DeKalb,Transit stations,-28.572,-49.0,20.428
163,Alabama,Etowah,Parks,19.227,-23.0,42.227
...,...,...,...,...,...,...
15100,Virginia,Bland,Transit stations,-26.832,-30.0,3.168
15245,Virginia,Emporia,Workplace,-31.580,-16.0,15.580
15340,Virginia,Greensville,Transit stations,-27.918,-44.0,16.082
15786,Virginia,Wythe,Grocery & pharmacy,-42.896,-16.0,26.896


In [309]:
# Display the tidied mobility frame again for inspection.
df_mod

Unnamed: 0,state,county,seg,conf,date,value,headline
258,Alabama,Autauga,Retail & recreation,1,2020-02-23,-0.172,-53.0
259,Alabama,Autauga,Retail & recreation,1,2020-02-24,-2.062,-53.0
260,Alabama,Autauga,Retail & recreation,1,2020-02-25,9.883,-53.0
261,Alabama,Autauga,Retail & recreation,1,2020-02-26,17.471,-53.0
262,Alabama,Autauga,Retail & recreation,1,2020-02-27,13.809,-53.0
...,...,...,...,...,...,...,...
739939,Wyoming,Weston,Residential,0,2020-04-01,,
739940,Wyoming,Weston,Residential,0,2020-04-02,,
739941,Wyoming,Weston,Residential,0,2020-04-03,,
739942,Wyoming,Weston,Residential,0,2020-04-04,,


In [291]:
# One row per state/county/segment: the mean of the 0/1 `conf` flag and the group's last
# headline figure, which is exported under the column name `value`.
df_agg = (
    df_mod
    .groupby(['state', 'county', 'seg'], as_index=False)
    .agg({'conf': 'mean', 'headline': 'last'})
    .rename(columns={'headline': 'value'})
)
df_agg.to_csv('data/mobility-data-agg_2020-04-05.csv', index=False)
df_agg

Unnamed: 0,state,county,seg,conf,value
0,Alabama,Autauga,Grocery & pharmacy,1,-8.0
1,Alabama,Autauga,Parks,0,-14.0
2,Alabama,Autauga,Residential,0,17.0
3,Alabama,Autauga,Retail & recreation,1,-53.0
4,Alabama,Autauga,Transit stations,0,
...,...,...,...,...,...
16861,Wyoming,Weston,Parks,0,
16862,Wyoming,Weston,Residential,0,
16863,Wyoming,Weston,Retail & recreation,0,-57.0
16864,Wyoming,Weston,Transit stations,0,


In [311]:
# Reload the saved mobility exports for both report dates (2020-03-29 first, then 2020-04-05).
# Raw aggregate tables.
mob1_raw, mob2_raw = (
    pd.read_csv(csv_path)
    for csv_path in ('data_raw/mobility-data-agg-raw_2020-03-29.csv',
                     'data_raw/mobility-data-agg-raw_2020-04-05.csv')
)
# Raw time-series tables.
mob_ts1_raw, mob_ts2_raw = (
    pd.read_csv(csv_path)
    for csv_path in ('data_raw/mobility-data-ts-raw_2020-03-29.csv',
                     'data_raw/mobility-data-ts-raw_2020-04-05.csv')
)
# Tidied time-series tables.
mob_ts1, mob_ts2 = (
    pd.read_csv(csv_path)
    for csv_path in ('data/mobility-data-ts_2020-03-29.csv',
                     'data/mobility-data-ts_2020-04-05.csv')
)

In [314]:
# Stitch the two reports into one time series: keep the older report only for dates before
# the newer report starts, so every overlapping date comes from the newer report.
# (Alternative not used: keep all of mob_ts1 and add only mob_ts2 rows dated after
#  mob_ts1.date.max().)
# The comparison is on the `date` column as loaded from CSV -- presumably ISO 'YYYY-MM-DD'
# strings, for which string order matches date order; confirm if the format changes.
older_only = mob_ts1[mob_ts1.date < mob_ts2.date.min()]
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
mob_ts4 = pd.concat([older_only, mob_ts2])
mob_ts4 = mob_ts4.sort_values(['state', 'county', 'seg', 'date'])
#print(mob_ts4[(mob_ts4.state=='Colorado') & (mob_ts4.county=='Adams') & (mob_ts4.seg=='Parks')])
mob_ts4.to_csv('data/mobility-data-ts_all.csv', index=False)

In [297]:
# Let pandas use up to 1000 characters of width when printing frames as text.
pd.set_option('display.width', 1000)

In [74]:
# Load the table of state/date pairs from sah_dates.csv and show it ordered by the date column.
sah = pd.read_csv('data/sah_dates.csv')
sah.sort_values(by='date')

Unnamed: 0,state,date
38,Puerto Rico,2020-03-15
4,California,2020-03-19
27,New Jersey,2020-03-21
14,Illinois,2020-03-21
29,New York,2020-03-22
47,Washington,2020-03-23
36,Oregon,2020-03-23
6,Connecticut,2020-03-23
34,Ohio,2020-03-23
18,Louisiana,2020-03-23


In [89]:
# One faint line per state+county, faceted by mobility segment.
# NOTE(review): `df_ts2` and a `df` carrying a `conf` column are not produced by any cell
# visible here -- confirm where they come from before running on a fresh kernel.
confident_counties = df[df.conf==1].county.unique()
df_ts_sub = df_ts2[df_ts2.county.isin(confident_counties)]
p = (
    ggplot(df_ts_sub, aes(x='date', y='value', group='state+county'))
    + geom_line(alpha=0.05, size=0.1)
    + facet_wrap('~seg', nrow=3)
    + scale_x_datetime()
    + theme_bw()
    + theme(axis_text_x=element_text(angle=315, hjust=0))
)

In [90]:
# Write the plot held in `p` to a PNG file with the given dpi, width and height.
p.save('mobility-by-segment_2020-04-05.png', dpi=150, width=10, height=6)

  warn("Saving {0} x {1} {2} image.".format(


In [94]:
# Same line plot restricted to four states, faceted by state and segment, then saved to PNG.
# NOTE(review): `df_ts4` and a `df` carrying a `conf` column are not produced by any cell
# visible here -- confirm where they come from before running on a fresh kernel.
confident_counties = df[df.conf==1].county.unique()
compare_states = ['California', 'New Jersey', 'Florida', 'Georgia']
df_ts_sub = df_ts4[df_ts4.county.isin(confident_counties)]
df_ts_sub = df_ts_sub[df_ts_sub['state'].isin(compare_states)]
p = (
    ggplot(df_ts_sub, aes(x='date', y='value', group='state+county'))
    + geom_line(alpha=0.05, size=0.3)
    + facet_wrap('~state+seg', nrow=4)
    + scale_x_datetime()
    + theme_bw()
    + theme(axis_text_x=element_text(angle=315, hjust=0))
)
p.save('mobility-by-segment_ca-nj-vs-fl-ga_all_2020-04-05.png', dpi=150, width=12, height=6)
#p

  warn("Saving {0} x {1} {2} image.".format(
