In [1]:
import numpy as np
import pandas as pd
import requests

from plotnine import *

# source: https://gist.github.com/rogerallen/1583593
us_state_abbrev = requests.get('https://raw.githubusercontent.com/jwhendy/covid19/master/lib/state_abbrevs.json').json()
abbrev_us_state = dict(map(reversed, us_state_abbrev.items()))

pop = pd.read_excel('https://www2.census.gov/library/publications/2011/compendia/usa-counties/excel/POP01.xls')
pop = pop[['Area_name', 'STCOU', 'POP010210D']]
land = pd.read_excel('https://www2.census.gov/library/publications/2011/compendia/usa-counties/excel/LND01.xls')
land = land[['Areaname', 'STCOU', 'LND110210D']]
age = pd.read_excel('https://www2.census.gov/library/publications/2011/compendia/usa-counties/excel/AGE01.xls',
                    sheet_name='Sheet2')
age = age[['Areaname', 'STCOU', 'AGE050210D']]
inc = pd.read_excel('https://www2.census.gov/library/publications/2011/compendia/usa-counties/excel/INC01.xls')
inc = inc[['Area_name', 'STCOU', 'INC110209D']]

In [64]:
age.head()

Unnamed: 0,Areaname,STCOU,AGE050210D
0,UNITED STATES,0,37.2
1,ALABAMA,1000,37.9
2,"Autauga, AL",1001,37.0
3,"Baldwin, AL",1003,41.1
4,"Barbour, AL",1005,39.0


In [49]:
### population data: saves out state, county, fips, and population
df = pop.copy()
df.columns = ['area', 'fips', 'pop']
df['state'] = df['area'].str.split(', ', expand=True)[1]
df['county'] = df['area'].str.split(', ', expand=True)[0]
df = df[['state', 'county', 'fips', 'pop']]
df['state'] = df['state'].map(state_abbrevs.abbrev_us_state)
df = df[-df.state.isna()]
df.to_csv('data/population.csv', index=False)
df.head()

Unnamed: 0,state,county,fips,pop
2,Alabama,Autauga,1001,54571
3,Alabama,Baldwin,1003,182265
4,Alabama,Barbour,1005,27457
5,Alabama,Bibb,1007,22915
6,Alabama,Blount,1009,57322


In [50]:
### land area: saves out state, county, fips, and land in square miles
df = land.copy()
df.columns = ['area', 'fips', 'land_sqm']
df['state'] = df['area'].str.split(', ', expand=True)[1]
df['county'] = df['area'].str.split(', ', expand=True)[0]
df = df[['state', 'county', 'fips', 'land_sqm']]
df['state'] = df['state'].map(state_abbrevs.abbrev_us_state)
df = df[-df.state.isna()]
df.to_csv('data/land_sqm.csv', index=False)
df.head()

Unnamed: 0,state,county,fips,land_sqm
2,Alabama,Autauga,1001,594.44
3,Alabama,Baldwin,1003,1589.78
4,Alabama,Barbour,1005,884.88
5,Alabama,Bibb,1007,622.58
6,Alabama,Blount,1009,644.78


In [62]:
### age: saves out state, county, fips, and median age
df = age.copy()
df.columns = ['area', 'fips', 'age_med']
df['state'] = df['area'].str.split(', ', expand=True)[1]
df['county'] = df['area'].str.split(', ', expand=True)[0]
df = df[['state', 'county', 'fips', 'age_med']]
df['state'] = df['state'].map(state_abbrevs.abbrev_us_state)
df = df[-df.state.isna()]
df.to_csv('data/age_median.csv', index=False)
df.head()

Unnamed: 0,state,county,fips,age_med
2,Alabama,Autauga,1001,37.0
3,Alabama,Baldwin,1003,41.1
4,Alabama,Barbour,1005,39.0
5,Alabama,Bibb,1007,37.8
6,Alabama,Blount,1009,39.0


In [63]:
### income: saves out state, county, fips, and median income 2005-2009
df = inc.copy()
df.columns = ['area', 'fips', 'inc_med']
df['state'] = df['area'].str.split(', ', expand=True)[1]
df['county'] = df['area'].str.split(', ', expand=True)[0]
df = df[['state', 'county', 'fips', 'inc_med']]
df['state'] = df['state'].map(state_abbrevs.abbrev_us_state)
df = df[-df.state.isna()]
df.to_csv('data/income_median.csv', index=False)
df.head()

In [3]:
### pulls all mobility data from google
# - google: https://www.google.com/covid19/mobility/
# - quasi api used below: https://github.com/datasciencecampus/mobility-report-data-extractor
import datetime
import pandas as pd
import os
import re
import subprocess
import time

data_date = '2020-04-05'
#data_date = '2020-03-29'
seg_list = [x for _ in range(2)
            for x in ['Retail & recreation', 'Grocery & pharmacy', 'Parks',
                      'Transit stations', 'Workplace', 'Residential']]
path = '/home/jwhendy/vault/personal/covid19/'
dir_mob = 'mobility-report-data-extractor'
### run to re-download and process reports
#subprocess.call(['./lib/mobility-script.sh'])
areas = [d for d in os.listdir(os.path.join(path, dir_mob, 'output'))
         if d.startswith('US') and d.endswith('.csv') and data_date in d]
df_ts = pd.concat(pd.read_csv(os.path.join(path, dir_mob, 'output', area))
                  for area in areas)
#df_ts.to_csv('data_raw/mobility-data-ts-raw_2020-04-05.csv', index=False)
df_ts

Unnamed: 0,country,page_num,plot_num,region,plot_name,asterisk,date,value,headline
0,Alabama,1,1,Alabama,Retail & recreation,False,2020-02-23,3.354,-50%
1,Alabama,1,1,Alabama,Retail & recreation,False,2020-02-24,-1.975,-50%
2,Alabama,1,1,Alabama,Retail & recreation,False,2020-02-25,6.971,-50%
3,Alabama,1,1,Alabama,Retail & recreation,False,2020-02-26,15.597,-50%
4,Alabama,1,1,Alabama,Retail & recreation,False,2020-02-27,11.776,-50%
...,...,...,...,...,...,...,...,...,...
5929,Wyoming,13,138,Weston County,Residential,True,2020-04-01,,Not enough data for this date
5930,Wyoming,13,138,Weston County,Residential,True,2020-04-02,,Not enough data for this date
5931,Wyoming,13,138,Weston County,Residential,True,2020-04-03,,Not enough data for this date
5932,Wyoming,13,138,Weston County,Residential,True,2020-04-04,,Not enough data for this date


In [23]:
df_mod = mob_ts1_raw.copy()
df_mod = df_mod[['country', 'region', 'plot_name', 'asterisk', 'date', 'value', 'headline']]
df_mod.columns=['state', 'county', 'seg', 'conf', 'date', 'value', 'headline' ]
df_mod = df_mod[-df_mod['county'].isin(['Baltimore', 'St. Louis', 'Fairfax', 'Franklin', 'Richmond', 'Roanoke'])]


df_mod['conf'] = 1-df_mod['conf'].astype(int)
df_mod['headline'] = df_mod['headline'].str.replace('%', '')
df_mod['headline'] = df_mod['headline'].str.replace('+', '')
df_mod['headline'] = df_mod['headline'].str.replace('Not enough data for this date', '')
df_mod['headline'] = pd.to_numeric(df_mod['headline'])

df_mod_states1 = df_mod[df_mod['county'] == df_mod['state']]
df_mod = df_mod[df_mod['county'] != df_mod['state']]
df_mod['county'] = df_mod['county'].str.split(' County', expand=True)[0]
print(len_orig)
df_mod_states

#df_mod = df_mod[['state', 'county', 'seg', 'conf', 'date', 'value']]
#df_mod.to_csv('data/mobility-data-ts_2020-04-05.csv', index=False)
#df_mod_states.to_csv('data/mobility-data-ts-states_2020-03-29.csv', index=False)
#df_mod[(df_mod.state=='Colorado') & (df_mod.county=='Adams') & (df_mod.seg=='Parks')]

725238


Unnamed: 0,state,county,seg,conf,date,value,headline
0,Alabama,Alabama,Retail & recreation,1,2020-02-23,3.354,-50.0
1,Alabama,Alabama,Retail & recreation,1,2020-02-24,-1.975,-50.0
2,Alabama,Alabama,Retail & recreation,1,2020-02-25,6.971,-50.0
3,Alabama,Alabama,Retail & recreation,1,2020-02-26,15.597,-50.0
4,Alabama,Alabama,Retail & recreation,1,2020-02-27,11.776,-50.0
...,...,...,...,...,...,...,...
734263,Wyoming,Wyoming,Residential,1,2020-04-01,14.725,9.0
734264,Wyoming,Wyoming,Residential,1,2020-04-02,18.972,9.0
734265,Wyoming,Wyoming,Residential,1,2020-04-03,17.384,9.0
734266,Wyoming,Wyoming,Residential,1,2020-04-04,10.865,9.0


In [321]:
### extracting which ts data still don't match the headline
df_mod2 = df_mod.groupby(['state', 'county', 'seg'], as_index=False).agg({'value': 'last', 'headline': 'last'})
df_mod2['abs_delta'] = abs(df_mod2['value']-df_mod2['headline'])
df_mod2 = df_mod2[df_mod2['abs_delta'] > 1]
#df_mod1.to_csv('data/mobility-errata_2020-04-05.csv', index=False)

In [None]:
### making aggregate files
df_agg = df_mod.groupby(['state', 'county', 'seg'], as_index=False).agg({'conf': 'mean', 'headline': 'last'})
df_agg.columns = ['state', 'county', 'seg', 'conf', 'value']
df_agg.to_csv('data/mobility-data-agg_2020-04-05.csv', index=False)
df_agg

In [21]:
df_agg = df_mod_states.groupby(['state', 'county', 'seg'], as_index=False).agg({'conf': 'mean', 'headline': 'last'})
df_agg.columns = ['state', 'county', 'seg', 'conf', 'value']
df_agg.to_csv('data/mobility-data-agg-states_2020-04-05.csv', index=False)
df_agg

Unnamed: 0,state,county,seg,conf,value
0,Alabama,Alabama,Grocery & pharmacy,1,-18.0
1,Alabama,Alabama,Parks,1,-1.0
2,Alabama,Alabama,Residential,1,12.0
3,Alabama,Alabama,Retail & recreation,1,-50.0
4,Alabama,Alabama,Transit stations,1,-40.0
...,...,...,...,...,...
301,Wyoming,Wyoming,Parks,1,22.0
302,Wyoming,Wyoming,Residential,1,9.0
303,Wyoming,Wyoming,Retail & recreation,1,-39.0
304,Wyoming,Wyoming,Transit stations,1,-21.0


In [2]:
mob1_raw = pd.read_csv('data_raw/mobility-data-agg-raw_2020-03-29.csv')
mob2_raw = pd.read_csv('data_raw/mobility-data-agg-raw_2020-04-05.csv')
mob_ts1_raw = pd.read_csv('data_raw/mobility-data-ts-raw_2020-03-29.csv')
mob_ts2_raw = pd.read_csv('data_raw/mobility-data-ts-raw_2020-04-05.csv')
mob_ts1 = pd.read_csv('data/mobility-data-ts_2020-03-29.csv')
mob_ts2 = pd.read_csv('data/mobility-data-ts_2020-04-05.csv')
mob_ts_all = pd.read_csv('data/mobility-data-ts_all.csv')

In [314]:
#mob_ts3 = mob_ts1.append(mob_ts2[mob_ts2.date > mob_ts1.date.max()])
mob_ts4 = mob_ts1[mob_ts1.date < mob_ts2.date.min()].append(mob_ts2)
mob_ts4_states = df_mod_states1[df_mod_states1.date < df_mod_states2.date.min()].append(df_mod_states2)
#mob_ts3 = mob_ts3.sort_values(['state', 'county', 'seg', 'date'])
mob_ts4 = mob_ts4.sort_values(['state', 'county', 'seg', 'date'])
#df_ts4.to_csv(f'data/mobility-data-ts_all.csv', index=False)
#print(mob_ts3[(mob_ts3.state=='Colorado') & (mob_ts3.county=='Adams')])
#print(mob_ts4[(mob_ts4.state=='Colorado') & (mob_ts4.county=='Adams') & (mob_ts4.seg=='Parks')])
#mob_ts4.date.max()
mob_ts4.to_csv('data/mobility-data-ts_all.csv', index=False)

In [297]:
pd.set_option('display.width', 1000)

In [326]:
mob_ts_all.head()

Unnamed: 0,state,county,seg,conf,date,value
0,Alabama,Autauga,Grocery & pharmacy,1,2020-02-16,0.764
1,Alabama,Autauga,Grocery & pharmacy,1,2020-02-17,-0.382
2,Alabama,Autauga,Grocery & pharmacy,1,2020-02-18,0.385
3,Alabama,Autauga,Grocery & pharmacy,1,2020-02-19,-0.386
4,Alabama,Autauga,Grocery & pharmacy,1,2020-02-20,-2.756


In [347]:
#df_ts_sub = mob_ts_all[mob_ts_all.conf==1].iloc[0:1000]
df_ts_sub = mob_ts_all.copy()
p = ggplot(df_ts_sub, aes(x='date', y='value', color='factor(conf)', group='state+county')) + geom_line(alpha=0.05, size=0.1) + facet_wrap('~seg', nrow=3)
p = p + scale_color_manual(name='confidence', breaks=[0, 1], labels=['low', 'high'], values=['red', 'black'])
p = p + scale_x_datetime()
p = p + theme_bw() + theme(axis_text_x = element_text(angle=315, hjust=0))

In [348]:
p.save('mobility-by-segment_conf.png', dpi=150, width=10, height=6)

  warn("Saving {0} x {1} {2} image.".format(


In [17]:
df_ts_sub = mob_ts_all[mob_ts_all.conf==1]
df_ts_sub = df_ts_sub[df_ts_sub['state'].isin(['California', 'New York', 'Florida', 'Georgia'])]
df_ts_sub.state = pd.Categorical(values=df_ts_sub.state, categories=['California', 'New York', 'Florida', 'Georgia'], ordered=True)
p = ggplot(df_ts_sub, aes(x='date', y='value', group='county')) + geom_line(alpha=0.1, size=0.3) + facet_grid('state~seg')
p = p + scale_x_datetime(name=' ', date_labels='%b %d')
p = p + scale_y_continuous(name='activity compared to baseline, percent')
p = p + theme_bw() + theme(axis_text_x = element_text(angle=315, hjust=0))
p.save('mobility-by-segment_ca-ny-vs-fl-ga_all.png', dpi=150, width=12, height=6)
#p

  warn("Saving {0} x {1} {2} image.".format(
