# Census and CSSE Preprocessing

The goal of this notebook is to obtain and organize the following county-level data:

- nominal data: state, county, fips
- census data: 
  - total population
  - ethnic population(s)
  - voting statistics
  - median income
  - educational attainment
- geographic data (from GeoJSON): 
  - census area
  - latitude/longitude

The statistics gathered in this notebook will only need to be updated once the 2020 Census information is released to the public.

In [93]:
# standard EDA
import numpy as np
import pandas as pd

# processing geodata
import geopandas as gp
import pickle               # saving to pickle instead of csv
from scipy import sparse
from scipy.signal import savgol_filter          # fast smoothing of data
from shapely.geometry import asShape, Polygon

# opening external coordinates
import json

# opening urls
from urllib.request import urlopen

# pd.options.display.max_rows = 150
# pd.options.display.max_columns = 150

# 1. import census data from `census.gov`

2019 population estimates can be collected from [census.gov](https://www.census.gov/data/datasets/time-series/demo/popest/2010s-counties-total.html). For the most current estimates, we will only save data from `YEAR == 12` and `AGEGRP == 0` ([data dictionary](https://www2.census.gov/programs-surveys/popest/technical-documentation/file-layouts/2010-2019/cc-est2019-alldata.pdf)).

In [94]:
raw_eth_cols = ['TOT', 'NHWA', 'NHBA', 'NHIA', 'NHAA', 'NHNA', 'NHTOM', 'H']
sex_cols = ['_MALE', '_FEMALE']
es_cols = [e+s for e in raw_eth_cols for s in sex_cols]

pop_cols = ['SUMLEV', 'STATE', 'COUNTY', 'STNAME', 'CTYNAME', 'YEAR', 'AGEGRP', 'TOT_POP']\
       + es_cols

pop_df = pd.read_csv(
  '../data/external/cc-est2019-alldata.csv',
  encoding='latin-1',     # to avoid unicode error
  usecols=pop_cols,       # it's a big file, only import certain columns
  dtype={'SUMLEV': 'str',   # these are FIPS codes
       'STATE':'str',    
       'COUNTY':'str'},
)

# mask for 2019 estimates (12)
pop_df = pop_df.loc[(pop_df['YEAR'] == 12)]  
pop_df = pop_df.drop(columns=['YEAR'])

# rename columns to better-match nytimes data (and personal preference)
pop_df = pop_df.rename(
  columns={
    'STATE':'state_fips',
    'COUNTY':'county_fips',
    'STNAME':'state',
    'CTYNAME':'county',
  }
)

# not sure if we need this level of granularity but we can keep it for now
eth_cols = ['tot_pop_white', 'tot_pop_black', 'tot_pop_native', 'tot_pop_asian', 
      'tot_pop_pacific', 'tot_pop_twoplus', 'tot_pop_hispanic']
es_cols_2 = [(e+s).lower() for e in eth_cols for s in sex_cols]
pop_df = pop_df.rename(
  columns=dict(zip(es_cols[2:], es_cols_2))
)

pop_df.columns = pop_df.columns.str.lower()

# nytimes fips is 5-digit combo of state and county fips
pop_df['fips'] = pop_df['state_fips'] + pop_df['county_fips']
pop_df = pop_df.drop(columns=['county_fips'])

pop_df = pop_df.reset_index(drop=True)

pop_df.head()

Unnamed: 0,sumlev,state_fips,state,county,agegrp,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,...,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,fips
0,50,1,Alabama,Autauga County,0,55869,27092,28777,20138,21077,...,138,282,364,20,20,492,464,884,787,1001
1,50,1,Alabama,Autauga County,1,3277,1713,1564,1180,1072,...,6,23,19,2,3,85,64,86,60,1001
2,50,1,Alabama,Autauga County,2,3465,1787,1678,1210,1134,...,8,16,25,0,1,78,81,88,70,1001
3,50,1,Alabama,Autauga County,3,3851,1977,1874,1362,1285,...,9,17,24,0,3,66,65,94,79,1001
4,50,1,Alabama,Autauga County,4,3659,1854,1805,1291,1272,...,0,21,13,3,3,43,46,63,74,1001


In [95]:
pop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59698 entries, 0 to 59697
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   sumlev                   59698 non-null  object
 1   state_fips               59698 non-null  object
 2   state                    59698 non-null  object
 3   county                   59698 non-null  object
 4   agegrp                   59698 non-null  int64 
 5   tot_pop                  59698 non-null  int64 
 6   tot_male                 59698 non-null  int64 
 7   tot_female               59698 non-null  int64 
 8   tot_pop_white_male       59698 non-null  int64 
 9   tot_pop_white_female     59698 non-null  int64 
 10  tot_pop_black_male       59698 non-null  int64 
 11  tot_pop_black_female     59698 non-null  int64 
 12  tot_pop_native_male      59698 non-null  int64 
 13  tot_pop_native_female    59698 non-null  int64 
 14  tot_pop_asian_male       59698 non-nul

In [96]:
# remove descriptive terms from county names
# we'll use this again so it's nice to have a function
def remove_county_terms(s):
  county_terms = ['County', 'Parish', 'Municipality']
  for term in county_terms:
    s = s.str.replace(' ' + term, '')
  return s

pop_df[['county']] = pop_df[['county']].apply(remove_county_terms)

# personally like ordinal columns listed first
pop_cols = pop_df.select_dtypes(exclude='number').columns.tolist()\
       + pop_df.select_dtypes(include='number').columns.tolist()
pop_df = pop_df[pop_cols]
pop_df.head()

Unnamed: 0,sumlev,state_fips,state,county,fips,agegrp,tot_pop,tot_male,tot_female,tot_pop_white_male,...,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female
0,50,1,Alabama,Autauga,1001,0,55869,27092,28777,20138,...,105,138,282,364,20,20,492,464,884,787
1,50,1,Alabama,Autauga,1001,1,3277,1713,1564,1180,...,3,6,23,19,2,3,85,64,86,60
2,50,1,Alabama,Autauga,1001,2,3465,1787,1678,1210,...,7,8,16,25,0,1,78,81,88,70
3,50,1,Alabama,Autauga,1001,3,3851,1977,1874,1362,...,3,9,17,24,0,3,66,65,94,79
4,50,1,Alabama,Autauga,1001,4,3659,1854,1805,1291,...,4,0,21,13,3,3,43,46,63,74


In [97]:
# check to see if we have all ethnic groups covered
(pop_df.iloc[:, 9:].sum(axis=1) / pop_df['tot_pop']).describe()

count    59690.0
mean         1.0
std          0.0
min          1.0
25%          1.0
50%          1.0
75%          1.0
max          1.0
dtype: float64

In [98]:
# add sex-aggregated columns
eth_cols_3 = [e.lower() for e in eth_cols]

for e in eth_cols_3:
  pop_df.loc[:, e] =\
  pop_df.loc[:, e+'_male'] + pop_df.loc[:, e+'_female']
  
pop_df.columns

Index(['sumlev', 'state_fips', 'state', 'county', 'fips', 'agegrp', 'tot_pop',
       'tot_male', 'tot_female', 'tot_pop_white_male', 'tot_pop_white_female',
       'tot_pop_black_male', 'tot_pop_black_female', 'tot_pop_native_male',
       'tot_pop_native_female', 'tot_pop_asian_male', 'tot_pop_asian_female',
       'tot_pop_pacific_male', 'tot_pop_pacific_female',
       'tot_pop_twoplus_male', 'tot_pop_twoplus_female',
       'tot_pop_hispanic_male', 'tot_pop_hispanic_female', 'tot_pop_white',
       'tot_pop_black', 'tot_pop_native', 'tot_pop_asian', 'tot_pop_pacific',
       'tot_pop_twoplus', 'tot_pop_hispanic'],
      dtype='object')

### calculating age coefficient and adding percentages

In [99]:
# engineer an 'age' column from the age group bins
def age_coefficient(df):
  
  grouped = df[df['agegrp']!=0].groupby(by='fips')
  cols = df.select_dtypes(include='number').columns.tolist()
  cols.remove('agegrp')
  age_cols = ['age_' + c[4:] for c in cols]
  
  def _age(g):
    return (g['agegrp'] * g.loc[:, cols].T).sum(axis=1)\
         / g.loc[:, cols].sum()
  
  adf = pd.DataFrame(grouped.apply(_age).values, columns=age_cols)
  df = df[df['agegrp']==0].drop(columns='agegrp').reset_index(drop=True)
  
  return pd.concat([df, adf], axis=1)

In [100]:
pop_df = pop_df.pipe(age_coefficient)

FUTURE WORK: impute numbers using neighbors

In [101]:
pop_df.tail()

Unnamed: 0,sumlev,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,...,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic
3137,50,56,Wyoming,Sweetwater,56037,42343,21808,20535,17223,16338,...,5.368715,6.708533,6.6914,8.249993,7.766932,8.125373,7.314634,7.381818,5.69209,6.700384
3138,50,56,Wyoming,Teton,56039,23464,12142,11322,9832,9168,...,6.362963,6.478238,6.364072,9.129947,9.055172,9.121622,7.544974,9.666667,6.19661,6.424592
3139,50,56,Wyoming,Uinta,56041,20226,10224,10002,8935,8722,...,5.892308,6.529666,6.106992,8.239961,6.706349,8.451389,7.978261,5.916667,6.650641,6.316408
3140,50,56,Wyoming,Washakie,56043,7805,3963,3842,3266,3151,...,7.318182,7.125664,6.983425,9.446782,9.342105,9.490566,9.054545,,7.171642,7.055957
3141,50,56,Wyoming,Weston,56045,6927,3624,3303,3273,2963,...,5.791045,7.819355,6.530769,9.333387,8.4,8.17757,10.168142,9.0,6.221429,7.231579


In [102]:
def save_df(df, filename, csv_=False, pickle_=True, path='../data/processed/'):
  if csv_ + pickle_ == 0:
    csv_ = True
  if csv_:
    df.to_csv(f'{path}{filename}.csv', index=False)
  if pickle_:
    with open(f'{path}{filename}.p', 'wb') as file:
      pickle.dump(df, file, protocol=pickle.HIGHEST_PROTOCOL)

In [103]:
pop_df = pop_df.sort_values(by='fips')
save_df(pop_df, 'pop_df')

# 2. add census region labels

In [104]:
with urlopen('https://raw.githubusercontent.com/cphalpert/census-regions/master/us%20census%20bureau%20regions%20and%20divisions.csv') as response:
  region_df = pd.read_csv(
    response
  )
region_df.columns = region_df.columns.str.lower()
region_df.head()

Unnamed: 0,state,state code,region,division
0,Alaska,AK,West,Pacific
1,Alabama,AL,South,East South Central
2,Arkansas,AR,South,West South Central
3,Arizona,AZ,West,Mountain
4,California,CA,West,Pacific


## merge with `pop_df` to begin building `info_df`

In [105]:
info_df = pop_df.merge(region_df[['state', 'region', 'division']], on='state')
info_df.head()

Unnamed: 0,sumlev,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,...,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division
0,50,1,Alabama,Autauga,1001,55869,27092,28777,20138,21077,...,7.072427,8.727963,7.81375,9.786008,7.899381,7.125,5.35251,6.707361,South,East South Central
1,50,1,Alabama,Baldwin,1003,223234,108247,114987,89845,95902,...,6.719182,9.321749,7.849388,8.999336,7.638534,8.422764,5.8126,6.602525,South,East South Central
2,50,1,Alabama,Barbour,1005,24686,13064,11622,5894,5341,...,5.766393,9.818336,8.107733,9.852632,9.137931,6.741935,6.136842,6.142346,South,East South Central
3,50,1,Alabama,Bibb,1007,22394,11929,10465,8482,8181,...,6.975,8.871332,7.993219,8.384615,8.76087,5.833333,6.357724,7.091493,South,East South Central
4,50,1,Alabama,Blount,1009,57826,28472,29354,24494,25682,...,6.06193,8.955796,7.990826,9.79078,8.723926,8.952381,6.512329,6.240595,South,East South Central


# 2. import geojson for boundaries and census areas

In [106]:
# https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html

geo_df = gp.read_file('../data/external/cb_2018_us_county_20m/cb_2018_us_county_20m.shp')
geo_df = geo_df.sort_values(by='GEOID').reset_index(drop=True)
geo_df['ALAND'] = geo_df['ALAND'] / 1e6   # convert m^2 to km^2
geo_df = geo_df[['STATEFP', 'GEOID', 'ALAND', 'geometry']]
geo_df.rename(columns={
  'STATEFP': 'state_fips', 
  'GEOID': 'fips', 
  'ALAND': 'area'
}, inplace=True)
geo_df.head()

Unnamed: 0,state_fips,fips,area,geometry
0,1,1001,1539.602123,"POLYGON ((-86.91759 32.66417, -86.71339 32.661..."
1,1,1003,4117.546676,"POLYGON ((-88.02632 30.75336, -87.94455 30.827..."
2,1,1005,2292.144655,"POLYGON ((-85.73573 31.62449, -85.66565 31.786..."
3,1,1007,1612.167481,"POLYGON ((-87.42194 33.00338, -87.31854 33.006..."
4,1,1009,1670.103911,"POLYGON ((-86.96336 33.85822, -86.92439 33.909..."


## find neighbors (for clustering later)

In [107]:
# https://gis.stackexchange.com/a/281676

def county_neighbors(g):
  
  indices = g['fips'].tolist()
  neighbor_matrix = []
  
  for i, row in g.iterrows():
    neighbors = g[g['geometry'].intersects(row['geometry'])]['fips'].tolist()
    neighbors.remove(row['fips'])
    neighbor_matrix.append(neighbors)
  
  g['neighbors'] = neighbor_matrix
  return g

geo_df = geo_df.groupby(by='state_fips').apply(county_neighbors)
geo_df.head()

Unnamed: 0,state_fips,fips,area,geometry,neighbors
0,1,1001,1539.602123,"POLYGON ((-86.91759 32.66417, -86.71339 32.661...","[01021, 01047, 01051, 01085, 01101]"
1,1,1003,4117.546676,"POLYGON ((-88.02632 30.75336, -87.94455 30.827...","[01025, 01053, 01097, 01099, 01129]"
2,1,1005,2292.144655,"POLYGON ((-85.73573 31.62449, -85.66565 31.786...","[01011, 01045, 01067, 01109, 01113]"
3,1,1007,1612.167481,"POLYGON ((-87.42194 33.00338, -87.31854 33.006...","[01021, 01065, 01073, 01105, 01117, 01125]"
4,1,1009,1670.103911,"POLYGON ((-86.96336 33.85822, -86.92439 33.909...","[01043, 01055, 01073, 01095, 01115, 01127]"


In [108]:
# def centroid(df):
#   centroids = df['geometry'].centroid
#   return [c.coords[0] for c in centroids]

# geo_df['lon'], geo_df['lat'] = zip(*geo_df.pipe(centroid))
# geo_df.head()

In [109]:
save_df(geo_df, 'geo_df')

In [110]:
info_df = info_df.merge(geo_df[['fips', 'area', 'neighbors']], on='fips')
info_df.head()

Unnamed: 0,sumlev,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,...,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division,area,neighbors
0,50,1,Alabama,Autauga,1001,55869,27092,28777,20138,21077,...,7.81375,9.786008,7.899381,7.125,5.35251,6.707361,South,East South Central,1539.602123,"[01021, 01047, 01051, 01085, 01101]"
1,50,1,Alabama,Baldwin,1003,223234,108247,114987,89845,95902,...,7.849388,8.999336,7.638534,8.422764,5.8126,6.602525,South,East South Central,4117.546676,"[01025, 01053, 01097, 01099, 01129]"
2,50,1,Alabama,Barbour,1005,24686,13064,11622,5894,5341,...,8.107733,9.852632,9.137931,6.741935,6.136842,6.142346,South,East South Central,2292.144655,"[01011, 01045, 01067, 01109, 01113]"
3,50,1,Alabama,Bibb,1007,22394,11929,10465,8482,8181,...,7.993219,8.384615,8.76087,5.833333,6.357724,7.091493,South,East South Central,1612.167481,"[01021, 01065, 01073, 01105, 01117, 01125]"
4,50,1,Alabama,Blount,1009,57826,28472,29354,24494,25682,...,7.990826,9.79078,8.723926,8.952381,6.512329,6.240595,South,East South Central,1670.103911,"[01043, 01055, 01073, 01095, 01115, 01127]"


We will merge latitude/longitude coordinates with `info_df` when we process CSSE data.

# 3. add 2020 general election data

Mask compliance has been very political, so it would be interesting to see how political differences vary by county. Data taken from [github.com/tonmcg](https://github.com/tonmcg). Alaska data taken from [RRH Elections](https://rrhelections.com/index.php/2018/02/02/alaska-results-by-county-equivalent-1960-2016/).

County-level election data not immediately available for 2020 general election, so we will substitute 2016 election results using analysis from [RRH Elections](https://rrhelections.com/index.php/2018/02/02/alaska-results-by-county-equivalent-1960-2016/).

In [111]:
with urlopen('https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-20/master/2020_US_County_Level_Presidential_Results.csv') as response:
  elect_20_df = pd.read_csv(
    response,
    # encoding='latin-1',    # to avoid unicode error
    dtype={
      'votes_dem':'int',
      'votes_gop':'int',
      'total_votes':'int',
      'county_fips':'str'},
  )

elect_20_df.rename(
  columns={
    'state_name':'state',
    'county_name':'county',
    'county_fips':'fips',
    'votes_dem':'tot_votes_dem',
    'votes_gop':'tot_votes_gop',
    'total_votes':'tot_votes',
    'diff': 'votes_diff'
  }, inplace=True
)

elect_20_df[['county']] = elect_20_df[['county']].apply(remove_county_terms)

# # https://stackoverflow.com/a/23836353
# elect_df['fips'] = elect_df['fips'].apply('{0:0>5}'.format) 

elect_cols = ['state', 'county', 'fips', 'tot_votes_dem', 'tot_votes_gop', 'tot_votes', 'votes_diff']
elect_20_df = elect_20_df[elect_cols]
elect_20_df = elect_20_df[elect_20_df['state'] != 'Alaska']
elect_20_df = elect_20_df.sort_values(by='fips')
elect_20_df.head()

Unnamed: 0,state,county,fips,tot_votes_dem,tot_votes_gop,tot_votes,votes_diff
0,Alabama,Autauga,1001,7503,19838,27770,12335
1,Alabama,Baldwin,1003,24578,83544,109679,58966
2,Alabama,Barbour,1005,4816,5622,10518,806
3,Alabama,Bibb,1007,1986,7525,9595,5539
4,Alabama,Blount,1009,2640,24711,27588,22071


In [112]:
elect_20_df[elect_20_df.isna().any(axis=1)]

Unnamed: 0,state,county,fips,tot_votes_dem,tot_votes_gop,tot_votes,votes_diff


In [113]:
# with urlopen('https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-20/master/2016_US_County_Level_Presidential_Results.csv') as response:
#   elect_16_df = pd.read_csv(
#     response,
#     # encoding='latin-1',    # to avoid unicode error
#     dtype={
#       'votes_dem':'int',
#       'votes_gop':'int',
#       'total_votes':'int',
#       'county_fips':'str'},
#   )

# elect_16_df.rename(
#   columns={
#     'county_name':'county',
#     'combined_fips':'fips',
#     'votes_dem':'tot_votes_dem',
#     'votes_gop':'tot_votes_gop',
#     'total_votes':'tot_votes',
#     'diff': 'votes_diff'
#   }, inplace=True
# )

# elect_16_df[['county']] = elect_16_df[['county']].apply(remove_county_terms)

# # https://stackoverflow.com/a/23836353
# elect_16_df['fips'] = elect_16_df['fips'].apply('{0:0>5}'.format) 

# elect_cols = ['state_abbr', 'county', 'fips', 'tot_votes_dem', 'tot_votes_gop', 'tot_votes', 'votes_diff']
# elect_16_df = elect_16_df[elect_cols]
# elect_16_df = elect_16_df.sort_values(by='fips')
# elect_16_df.head()

## add alaska 2016 elections data

In [114]:
ak_elect_df = pd.read_excel('../data/external/2016 AK Gen Official.xlsx', sheet_name='By CE')
ak_elect_df = ak_elect_df.iloc[0:29, 0:12]
ak_elect_df.rename(
  columns={
    'Trump, Donald J. ':'tot_votes_gop',
    'Clinton, Hillary ':'tot_votes_dem',
    'ED/Muni': 'county',
    'ED Total': 'tot_votes',
  }, inplace=True
)
ak_elect_df = ak_elect_df[['county', 'tot_votes_gop', 'tot_votes_dem', 'tot_votes']].sort_values(by='county')
ak_elect_df[['tot_votes_gop', 'tot_votes_dem', 'tot_votes']] = ak_elect_df[['tot_votes_gop', 'tot_votes_dem', 'tot_votes']].astype(int)
ak_elect_df['votes_diff'] = ak_elect_df['tot_votes_gop'] - ak_elect_df['tot_votes_dem']
ak_elect_df['fips'] = pop_df[pop_df['state'] == 'Alaska']['fips'].values
ak_elect_df['state'] = ['Alaska'] * len(ak_elect_df)
ak_elect_df.head()

Unnamed: 0,county,tot_votes_gop,tot_votes_dem,tot_votes,votes_diff,fips,state
22,Aleutians East,198,121,369,77,2013,Alaska
24,Aleutians West,260,493,846,-233,2016,Alaska
19,Anchorage,39942,32130,81678,7812,2020,Alaska
12,Bethel,809,2178,3933,-1369,2050,Alaska
25,Bristol Bay,180,99,316,81,2060,Alaska


In [115]:
elect_df = pd.concat([elect_20_df, ak_elect_df]).sort_values(by='fips').reset_index(drop=True)
elect_df.shape

(3141, 7)

In [116]:
elect_df['per_diff'] = elect_df['votes_diff'] / elect_df['tot_votes']
elect_df.head()

Unnamed: 0,state,county,fips,tot_votes_dem,tot_votes_gop,tot_votes,votes_diff,per_diff
0,Alabama,Autauga,1001,7503,19838,27770,12335,0.444184
1,Alabama,Baldwin,1003,24578,83544,109679,58966,0.537623
2,Alabama,Barbour,1005,4816,5622,10518,806,0.076631
3,Alabama,Bibb,1007,1986,7525,9595,5539,0.57728
4,Alabama,Blount,1009,2640,24711,27588,22071,0.800022


In [117]:
elect_df.head()

Unnamed: 0,state,county,fips,tot_votes_dem,tot_votes_gop,tot_votes,votes_diff,per_diff
0,Alabama,Autauga,1001,7503,19838,27770,12335,0.444184
1,Alabama,Baldwin,1003,24578,83544,109679,58966,0.537623
2,Alabama,Barbour,1005,4816,5622,10518,806,0.076631
3,Alabama,Bibb,1007,1986,7525,9595,5539,0.57728
4,Alabama,Blount,1009,2640,24711,27588,22071,0.800022


In [118]:
save_df(elect_df, 'elect_df')

In [119]:
info_df = info_df.merge(elect_df.loc[:, 'fips':], on='fips', how='left')
info_df.tail()

Unnamed: 0,sumlev,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,...,age_pop_hispanic,region,division,area,neighbors,tot_votes_dem,tot_votes_gop,tot_votes,votes_diff,per_diff
3137,50,56,Wyoming,Sweetwater,56037,42343,21808,20535,17223,16338,...,6.700384,West,Mountain,27005.754244,"[56007, 56013, 56023, 56035, 56041]",3823.0,12229.0,16603.0,8406.0,0.506294
3138,50,56,Wyoming,Teton,56039,23464,12142,11322,9832,9168,...,6.424592,West,Mountain,10351.784301,"[56013, 56023, 56029, 56035]",9848.0,4341.0,14677.0,-5507.0,-0.375213
3139,50,56,Wyoming,Uinta,56041,20226,10224,10002,8935,8722,...,6.316408,West,Mountain,5391.631764,"[56023, 56037]",1591.0,7496.0,9402.0,5905.0,0.628058
3140,50,56,Wyoming,Washakie,56043,7805,3963,3842,3266,3151,...,7.055957,West,Mountain,5798.138762,"[56003, 56013, 56017, 56019, 56025, 56029]",651.0,3245.0,4012.0,2594.0,0.64656
3141,50,56,Wyoming,Weston,56045,6927,3624,3303,3273,2963,...,7.231579,West,Mountain,6210.804116,"[56005, 56009, 56011, 56027]",360.0,3107.0,3542.0,2747.0,0.775551


In [120]:
info_df[info_df.loc[:, 'tot_votes_dem':].isna().any(axis=1)]

Unnamed: 0,sumlev,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,...,age_pop_hispanic,region,division,area,neighbors,tot_votes_dem,tot_votes_gop,tot_votes,votes_diff,per_diff
548,50,15,Hawaii,Kalawao,15005,86,41,45,12,11,...,7.0,West,Pacific,31.057603,[15009],,,,,
629,50,17,Illinois,Hardin,17069,3821,1933,1888,1844,1767,...,6.752688,Midwest,East North Central,459.650851,"[17059, 17151, 17165]",0.0,0.0,0.0,0.0,


# 4. add income data

Median income statistics taken from [data.census.gov](https://data.census.gov/cedsci/table?q=s1901&tid=ACSST1Y2018.S1901) (2017 ACS 1-Year Estimates). 

- `S1903_C03_001E` -- all households
- `S1903_C03_003E` -- black
- `S1903_C03_004E` -- native
- `S1903_C03_005E` -- asian
- `S1903_C03_006E` -- pacific
- `S1903_C03_007E` -- other
- `S1903_C03_008E` -- two or more
- `S1903_C03_009E` -- hispanic
- `S1903_C03_010E` -- white only, not hispanic

In [121]:
inc_cols = [f'S1903_C03_{i:03d}E' for i in range(1,11) if i != 2]
inc_col_names = ['median_income'] + [f'median_income_{race}'
                   for race in ['black', 'native', 'asian', 'pacific', 'other', 'twoplus', 'hispanic', 'white']]
inc_dict = dict(zip(inc_cols, inc_col_names))
inc_dict.update({'GEO_ID':'fips'})

# can't use dtype 'int' here because of entries like `250000+` and `-`
inc_df = pd.read_csv(
  '../data/external/ACSST5Y2018.S1903/ACSST5Y2018.S1903_data_with_overlays.csv',
  usecols=['GEO_ID', 'NAME'] + inc_cols,
)
inc_df = inc_df.drop(0, axis=0)
inc_df = inc_df.rename(columns=inc_dict)
# inc_df['median_income'] = inc_df['median_income'].astype(float)

# joplin and kansas city
inc_df.at[inc_df['fips'] == '1600000US2937592', 'fips'] = '29JOP'
inc_df.at[inc_df['fips'] == '1600000US2938000', 'fips'] = '29KAN'
inc_df['fips'] = inc_df['fips'].str[-5:]

inc_df['county'], inc_df['state'] = zip(*inc_df['NAME'].str.split(', ').tolist())
inc_df.at[inc_df['fips'] == '29JOP', 'county'] = 'Joplin'
inc_df.at[inc_df['fips'] == '29KAN', 'county'] = 'Kansas City'
inc_df = inc_df.drop('NAME', axis=1)
inc_df['county'] = inc_df[['county']].apply(remove_county_terms)

# rio arriba taken from datausa.io
inc_df.at[inc_df['fips'] == '35039', 'median_income'] = 33_422

inc_df = inc_df.replace({'-': np.nan, '2,500-':2500, '250,000+':250000})

# can't do int because of nan
inc_df[inc_col_names] = inc_df[inc_col_names].astype(float)

inc_df.tail()

Unnamed: 0,fips,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white,county,state
3218,72149,19855.0,25714.0,,,,19535.0,17871.0,19807.0,,Villalba Municipio,Puerto Rico
3219,72151,16013.0,14852.0,,,,29063.0,19213.0,15992.0,,Yabucoa Municipio,Puerto Rico
3220,72153,14954.0,13986.0,,,,12204.0,12650.0,14927.0,,Yauco Municipio,Puerto Rico
3221,29JOP,42782.0,32500.0,39663.0,,,,41033.0,47208.0,43473.0,Joplin,Missouri
3222,29KAN,52405.0,33899.0,48929.0,49367.0,33563.0,31768.0,50538.0,44003.0,65637.0,Kansas City,Missouri


In [122]:
inc_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3222 entries, 1 to 3222
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   fips                    3222 non-null   object 
 1   median_income           3222 non-null   float64
 2   median_income_black     2019 non-null   float64
 3   median_income_native    1423 non-null   float64
 4   median_income_asian     1405 non-null   float64
 5   median_income_pacific   281 non-null    float64
 6   median_income_other     1689 non-null   float64
 7   median_income_twoplus   2190 non-null   float64
 8   median_income_hispanic  2555 non-null   float64
 9   median_income_white     3161 non-null   float64
 10  county                  3222 non-null   object 
 11  state                   3222 non-null   object 
dtypes: float64(9), object(3)
memory usage: 327.2+ KB


In [123]:
# income_df = pd.read_csv('../data/income_df.csv')

In [124]:
inc_cols = ['state', 'county', 'fips']\
       + inc_df.select_dtypes(include='number').columns.tolist()
inc_df = inc_df[inc_cols]
inc_df.tail()

Unnamed: 0,state,county,fips,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white
3218,Puerto Rico,Villalba Municipio,72149,19855.0,25714.0,,,,19535.0,17871.0,19807.0,
3219,Puerto Rico,Yabucoa Municipio,72151,16013.0,14852.0,,,,29063.0,19213.0,15992.0,
3220,Puerto Rico,Yauco Municipio,72153,14954.0,13986.0,,,,12204.0,12650.0,14927.0,
3221,Missouri,Joplin,29JOP,42782.0,32500.0,39663.0,,,,41033.0,47208.0,43473.0
3222,Missouri,Kansas City,29KAN,52405.0,33899.0,48929.0,49367.0,33563.0,31768.0,50538.0,44003.0,65637.0


In [125]:
save_df(inc_df, 'inc_df')

In [126]:
info_df = info_df.merge(inc_df.loc[:, 'fips':], on='fips', how='left')
info_df.tail()

Unnamed: 0,sumlev,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,...,per_diff,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white
3137,50,56,Wyoming,Sweetwater,56037,42343,21808,20535,17223,16338,...,0.506294,73008.0,,62188.0,72614.0,138053.0,74189.0,55284.0,61921.0,76469.0
3138,50,56,Wyoming,Teton,56039,23464,12142,11322,9832,9168,...,-0.375213,83831.0,,,98125.0,,36433.0,17188.0,45361.0,95222.0
3139,50,56,Wyoming,Uinta,56041,20226,10224,10002,8935,8722,...,0.628058,58235.0,,68125.0,,,55701.0,39205.0,39816.0,61330.0
3140,50,56,Wyoming,Washakie,56043,7805,3963,3842,3266,3151,...,0.64656,53426.0,,,,,62054.0,36118.0,50035.0,54815.0
3141,50,56,Wyoming,Weston,56045,6927,3624,3303,3273,2963,...,0.775551,52867.0,,,,,,,37870.0,55032.0


# 5. add educational attainment data

Educational attainment statistics taken from [data.census.gov](https://data.census.gov/cedsci/table?tid=ACSST1Y2018.S1501&g=0400000US04) (2017 ACS 5-Year Estimates).

- `S1501_C01_006E` -- population > 25yo
- `S1501_C01_007E` -- less than 9th grade
- `S1501_C01_008E` -- some high school
- `S1501_C01_009E` -- high school or GED
- `S1501_C01_010E` -- some college
- `S1501_C01_011E` -- associate's
- `S1501_C01_012E` -- bachelor's
- `S1501_C01_013E` -- graduate or professional

In addition, there are ethnic / sex breakdowns:
- general patterns:
  - `CO1_XXXE` -- ethnic total
  - `CO3_XXXE` -- ethnic male
  - `CO5_XXXE` -- ethnic female
- `S1501_C01_031E` -- white alone
- `S1501_C01_032E` -- white alone, high school graduate or higher
- `S1501_C01_033E` -- white alone, bachelor's degree or higher
- etc.

In [127]:
# general educational attainment columns
edu_cols = [f'S1501_C01_{i:03d}E' for i in range(6,14)]
edu_col_names = ['pop_25p', 'no_hs', 'some_hs', 'hs', 'some_college', 
         'associates', 'bachelors', 'graduate']
edu_dict = dict(zip(edu_cols, edu_col_names))
edu_dict.update({'GEO_ID':'fips'})

# education/ethnicity/sex columns
edu_eth_sex_cols = [f'S1501_C{i:02d}_{j:03d}E' for i in range(1,6,2) for j in range(31,55)]
edu_eth_sex_col_names = [f'tot_edu_{race}{sex}{edu}' 
             for sex in ['', '_male', '_female']
             for race in ['white', 'black', 'native', 'asian', 'pacific', 'other', 'twoplus', 'hispanic']
             for edu in ['', '_hsplus', '_4yplus']]
edu_eth_sex_dict = dict(zip(edu_eth_sex_cols, edu_eth_sex_col_names))
edu_dict.update(edu_eth_sex_dict)

edu_df = pd.read_csv('../data/external/ACSST5Y2018.S1501/ACSST5Y2018.S1501_data_with_overlays.csv',
           usecols=['GEO_ID', 'NAME']+edu_cols+edu_eth_sex_cols)
edu_df = edu_df.drop(0, axis=0)
for col in (edu_cols+edu_eth_sex_cols):
  edu_df[col] = edu_df[col].astype(int)
edu_df.rename(
  columns=edu_dict,
  inplace=True
)

# joplin and kansas city
edu_df.at[edu_df['fips'] == '1600000US2937592', 'fips'] = '29JOP'
edu_df.at[edu_df['fips'] == '1600000US2938000', 'fips'] = '29KAN'
edu_df['fips'] = edu_df['fips'].str[-5:]

edu_df['county'], edu_df['state'] = zip(*edu_df['NAME'].str.split(', ').tolist())
edu_df.at[edu_df['fips'] == '29JOP', 'county'] = 'Joplin'
edu_df.at[edu_df['fips'] == '29KAN', 'county'] = 'Kansas City'
edu_df = edu_df.drop('NAME', axis=1)
edu_df['county'] = edu_df[['county']].apply(remove_county_terms)

edu_df.head()

Unnamed: 0,fips,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,tot_edu_white_male_hsplus,tot_edu_white_female_hsplus,tot_edu_white_4yplus,tot_edu_white_male_4yplus,tot_edu_white_female_4yplus,...,pop_25p,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,county,state
1,1001,28726,13834,14892,26130,12588,13542,8440,4573,3867,...,37166,956,3248,12119,7554,2998,5903,4388,Autauga,Alabama
2,1003,126316,60310,66006,116288,54788,61500,41648,19863,21785,...,146989,3978,10332,40579,32266,13759,30431,15644,Baldwin,Alabama
3,1005,9171,4846,4325,7264,3657,3607,1578,814,764,...,18173,1490,3411,6486,3287,1279,1417,803,Barbour,Alabama
4,1007,12002,6037,5965,10483,5181,5302,1570,674,896,...,15780,903,1747,7471,2938,908,1197,616,Bibb,Alabama
5,1009,35774,17200,18574,29814,14167,15647,4775,1900,2875,...,39627,2967,4894,13489,8492,4775,3217,1793,Blount,Alabama


In [128]:
edu_df['edu'] = (edu_df['some_hs'] + 2*edu_df['hs'] \
         + 3*edu_df['some_college'] + 4*edu_df['associates'] \
         + 5*edu_df['bachelors'] + 6*edu_df['graduate'])\
        / edu_df['pop_25p']
for race in ['white', 'black', 'native', 'asian', 'pacific', 'other', 'twoplus', 'hispanic']:
  for sex in ['', '_male', '_female']:
    edu_df[f'edu_{race}{sex}'] = (2*edu_df[f'tot_edu_{race}{sex}_hsplus']
                    + 5*edu_df[f'tot_edu_{race}{sex}_4yplus'])\
                   / edu_df[f'tot_edu_{race}{sex}']
    edu_df[f'per_edu_{race}{sex}_nohs'] = (edu_df[f'tot_edu_{race}{sex}']
                       - edu_df[f'tot_edu_{race}{sex}_hsplus'])\
                      / edu_df[f'tot_edu_{race}{sex}']
# edu_df = edu_df.fillna(-1)
edu_df.tail()

Unnamed: 0,fips,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,tot_edu_white_male_hsplus,tot_edu_white_female_hsplus,tot_edu_white_4yplus,tot_edu_white_male_4yplus,tot_edu_white_female_4yplus,...,edu_twoplus_male,per_edu_twoplus_male_nohs,edu_twoplus_female,per_edu_twoplus_female_nohs,edu_hispanic,per_edu_hispanic_nohs,edu_hispanic_male,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs
3218,72149,13,1,12,13,1,12,1,1,0,...,1.823389,0.303103,3.119681,0.233599,2.517601,0.244641,2.129082,0.266536,2.861788,0.225244
3219,72151,0,0,0,0,0,0,0,0,0,...,1.096154,0.596154,2.145078,0.393782,2.322749,0.302041,1.979422,0.345668,2.631746,0.262777
3220,72153,29,16,13,19,6,13,5,0,5,...,2.716535,0.330709,2.036364,0.49697,2.6701,0.278716,2.389952,0.310348,2.912327,0.251367
3221,29JOP,29516,13887,15629,26830,12641,14189,7607,3766,3841,...,3.767347,0.116327,2.785542,0.028916,2.179577,0.306338,2.316,0.232,2.072327,0.36478
3222,29KAN,196115,96625,99490,184949,90966,93983,86232,41929,44303,...,3.411234,0.108556,4.008708,0.097242,2.252631,0.303633,2.234676,0.305608,2.272133,0.301487


In [129]:
save_df(edu_df, 'edu_df')

In [130]:
# edu_df = pd.read_csv('../data/processed/edu_df.csv')

In [131]:
info_df = info_df.merge(
  edu_df[['fips']+edu_df.select_dtypes(include='number').columns.tolist()], 
  on='fips', 
  how='left'
)
info_df.tail()

Unnamed: 0,sumlev,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,...,edu_twoplus_male,per_edu_twoplus_male_nohs,edu_twoplus_female,per_edu_twoplus_female_nohs,edu_hispanic,per_edu_hispanic_nohs,edu_hispanic_male,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs
3137,50,56,Wyoming,Sweetwater,56037,42343,21808,20535,17223,16338,...,2.785124,0.0,3.838235,0.0,2.143614,0.275305,1.724343,0.311353,2.624929,0.233921
3138,50,56,Wyoming,Teton,56039,23464,12142,11322,9832,9168,...,6.903846,0.0,,,1.986849,0.334145,1.184049,0.407975,2.717209,0.266977
3139,50,56,Wyoming,Uinta,56041,20226,10224,10002,8935,8722,...,2.377778,0.005556,1.847059,0.076471,1.793727,0.214717,1.818605,0.160465,1.766917,0.273183
3140,50,56,Wyoming,Washakie,56043,7805,3963,3842,3266,3151,...,1.777778,0.111111,2.326087,0.0,1.985759,0.240506,1.716049,0.32716,2.269481,0.149351
3141,50,56,Wyoming,Weston,56045,6927,3624,3303,3273,2963,...,2.0,0.0,4.125,0.0,1.673684,0.242105,1.6125,0.2875,2.0,0.0


# 6. add mask usage statistics

In [132]:
with urlopen('https://raw.githubusercontent.com/nytimes/covid-19-data/master/mask-use/mask-use-by-county.csv') as response:
  mask_df = pd.read_csv(response)
  
mask_df.rename(columns={'COUNTYFP':'fips'}, inplace=True)
mask_df['fips'] = mask_df['fips'].apply('{0:0>5}'.format)
mask_df.columns = mask_df.columns.str.lower()

mask_df.head()

Unnamed: 0,fips,never,rarely,sometimes,frequently,always
0,1001,0.053,0.074,0.134,0.295,0.444
1,1003,0.083,0.059,0.098,0.323,0.436
2,1005,0.067,0.121,0.12,0.201,0.491
3,1007,0.02,0.034,0.096,0.278,0.572
4,1009,0.053,0.114,0.18,0.194,0.459


In [133]:
mask_df['mask'] = mask_df['rarely'] + 2*mask_df['sometimes']\
          + 3*mask_df['frequently'] + 4*mask_df['always']
mask_df.tail()

Unnamed: 0,fips,never,rarely,sometimes,frequently,always,mask
3137,56037,0.061,0.295,0.23,0.146,0.268,2.265
3138,56039,0.095,0.157,0.16,0.247,0.34,2.578
3139,56041,0.098,0.278,0.154,0.207,0.264,2.263
3140,56043,0.204,0.155,0.069,0.285,0.287,2.296
3141,56045,0.142,0.129,0.148,0.207,0.374,2.542


In [134]:
save_df(mask_df, 'mask_df')

In [135]:
info_df = info_df.merge(mask_df, on='fips', how='left')
info_df.tail()

Unnamed: 0,sumlev,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,...,edu_hispanic_male,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs,never,rarely,sometimes,frequently,always,mask
3137,50,56,Wyoming,Sweetwater,56037,42343,21808,20535,17223,16338,...,1.724343,0.311353,2.624929,0.233921,0.061,0.295,0.23,0.146,0.268,2.265
3138,50,56,Wyoming,Teton,56039,23464,12142,11322,9832,9168,...,1.184049,0.407975,2.717209,0.266977,0.095,0.157,0.16,0.247,0.34,2.578
3139,50,56,Wyoming,Uinta,56041,20226,10224,10002,8935,8722,...,1.818605,0.160465,1.766917,0.273183,0.098,0.278,0.154,0.207,0.264,2.263
3140,50,56,Wyoming,Washakie,56043,7805,3963,3842,3266,3151,...,1.716049,0.32716,2.269481,0.149351,0.204,0.155,0.069,0.285,0.287,2.296
3141,50,56,Wyoming,Weston,56045,6927,3624,3303,3273,2963,...,1.6125,0.2875,2.0,0.0,0.142,0.129,0.148,0.207,0.374,2.542


In [136]:
info_df.columns

Index(['sumlev', 'state_fips', 'state', 'county', 'fips', 'tot_pop',
       'tot_male', 'tot_female', 'tot_pop_white_male', 'tot_pop_white_female',
       ...
       'edu_hispanic_male', 'per_edu_hispanic_male_nohs',
       'edu_hispanic_female', 'per_edu_hispanic_female_nohs', 'never',
       'rarely', 'sometimes', 'frequently', 'always', 'mask'],
      dtype='object', length=206)

In [137]:
info_df['pop_density'] = info_df['tot_pop'] / info_df['area']

In [138]:
# def per_population(df, divisor='tot_pop', ignore=['tot_pop']):
  
#   cols = [c for c in df.columns.tolist() if c[:4]=='tot_']
  
#   for c in cols:
#     if c not in ignore:
#       df[c.replace('tot_', 'per_')] = df[c] / df[divisor]
  
#   return df

# edu_cols_to_ignore = [c for c in info_df.columns if 'edu' in c]
# info_df = info_df.pipe(
#   per_population, 
#   ignore=['tot_pop', 'tot_dem', 'tot_gop']+edu_cols_to_ignore
# )
# info_df.head()

# import CSSE data

>Note: New York Times data has a few caveats, including treating New York City, Kansas City, and Joplin as single entities rather than including them in their respective counties. Read their [README](https://github.com/nytimes/covid-19-data/blob/master/README.md) for more information.

In [139]:
# with open('../data/processed/info_df.p', 'rb') as f:
#   info_df = pickle.load(f)
  
info_df.head()

Unnamed: 0,sumlev,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,...,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs,never,rarely,sometimes,frequently,always,mask,pop_density
0,50,1,Alabama,Autauga,1001,55869,27092,28777,20138,21077,...,0.164835,3.020661,0.088843,0.053,0.074,0.134,0.295,0.444,3.003,36.287947
1,50,1,Alabama,Baldwin,1003,223234,108247,114987,89845,95902,...,0.340487,3.2827,0.151899,0.083,0.059,0.098,0.323,0.436,2.968,54.215293
2,50,1,Alabama,Barbour,1005,24686,13064,11622,5894,5341,...,0.513924,1.11236,0.668539,0.067,0.121,0.12,0.201,0.491,2.928,10.769826
3,50,1,Alabama,Bibb,1007,22394,11929,10465,8482,8181,...,0.444444,1.56338,0.21831,0.02,0.034,0.096,0.278,0.572,3.348,13.890616
4,50,1,Alabama,Blount,1009,57826,28472,29354,24494,25682,...,0.675749,1.077058,0.535902,0.053,0.114,0.18,0.194,0.459,2.892,34.624193


In [140]:
with urlopen('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv') as response:
  csse_cases = pd.read_csv(response, dtype={'UID': str})

csse_cases.columns = csse_cases.columns.str.lower()
csse_cases = csse_cases.drop(columns=['iso2', 'iso3', 'code3', 'fips', 'admin2', 'province_state', 'country_region', 'combined_key'])
csse_cases = csse_cases.rename(
  columns={
  'uid': 'fips',
  'long_': 'lon'
  }
)
csse_cases['fips'] = csse_cases['fips'].apply(lambda x: x[-5:])
csse_cases.head()

Unnamed: 0,fips,lat,lon,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,11/28/20,11/29/20,11/30/20,12/1/20,12/2/20,12/3/20,12/4/20,12/5/20,12/6/20,12/7/20
0,1001,32.539527,-86.644082,0,0,0,0,0,0,0,...,2735,2751,2780,2818,2873,2893,2945,2979,3005,3043
1,1003,30.72775,-87.722071,0,0,0,0,0,0,0,...,8733,8820,8890,9051,9163,9341,9501,9626,9728,9821
2,1005,31.868263,-85.387129,0,0,0,0,0,0,0,...,1173,1175,1178,1189,1206,1214,1217,1219,1223,1224
3,1007,32.996421,-87.125115,0,0,0,0,0,0,0,...,1179,1188,1196,1204,1239,1252,1270,1283,1293,1299
4,1009,33.982109,-86.567906,0,0,0,0,0,0,0,...,2922,2946,2997,3061,3100,3158,3231,3281,3299,3324


In [141]:


# csse_deaths.columns = csse_deaths.columns.str.lower()
# csse_deaths = csse_deaths.drop(columns=['lat', 'long_', 'population', 'iso2', 'iso3', 'code3', 'fips', 'admin2', 'province_state', 'country_region', 'combined_key'])
# csse_deaths = csse_deaths.rename(
#   columns={
#   'uid': 'fips'
#   }
# )
# csse_deaths['fips'] = csse_deaths['fips'].apply(lambda x: x[-5:])
# csse_deaths.head()

In [142]:
info_df = info_df.merge(csse_cases[['fips', 'lat', 'lon']], on='fips', how='left')
info_df.tail()

Unnamed: 0,sumlev,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,...,per_edu_hispanic_female_nohs,never,rarely,sometimes,frequently,always,mask,pop_density,lat,lon
3137,50,56,Wyoming,Sweetwater,56037,42343,21808,20535,17223,16338,...,0.233921,0.061,0.295,0.23,0.146,0.268,2.265,1.567925,41.659439,-108.882788
3138,50,56,Wyoming,Teton,56039,23464,12142,11322,9832,9168,...,0.266977,0.095,0.157,0.16,0.247,0.34,2.578,2.266662,43.935225,-110.58908
3139,50,56,Wyoming,Uinta,56041,20226,10224,10002,8935,8722,...,0.273183,0.098,0.278,0.154,0.207,0.264,2.263,3.751369,41.287818,-110.547578
3140,50,56,Wyoming,Washakie,56043,7805,3963,3842,3266,3151,...,0.149351,0.204,0.155,0.069,0.285,0.287,2.296,1.346122,43.904516,-107.680187
3141,50,56,Wyoming,Weston,56045,6927,3624,3303,3273,2963,...,0.0,0.142,0.129,0.148,0.207,0.374,2.542,1.115315,43.839612,-104.567488


## save info_df

In [143]:
save_df(info_df, 'info_df', csv_=True)

## convert to long-form data

This will include some code up above but will provide basis for automatically grabbing CSSE data and updating our site.

In [83]:
fips = set(info_df['fips'])

def preprocess_csse(df, name):
  df.columns = df.columns.str.lower()
  columns_to_drop = ['lat', 'long_', 'iso2', 'iso3', 'code3', 'fips', 
           'admin2', 'province_state', 'country_region', 
           'combined_key']
  if 'population' in df.columns.tolist():
    columns_to_drop.append('population')
  df = df.drop(columns=columns_to_drop)
  df = df.rename(columns={'uid': 'fips'})
  df['fips'] = df['fips'].apply(lambda x: x[-5:])  
  # convert to long-form
  df = pd.DataFrame(df.set_index('fips').unstack()).rename(columns={0: name})
  return df


def make_csse_df():
  
  with urlopen('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv') as response:
    cases = pd.read_csv(response, dtype={'UID': str})
  with urlopen('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv') as response:
    deaths = pd.read_csv(response, dtype={'UID': str})
  
  cases = preprocess_csse(cases, 'cases')
  deaths = preprocess_csse(deaths, 'deaths')
  
  df = pd.concat([cases, deaths], axis=1)     # merge along multi-index
  
  df.index = df.index.set_levels(         # change date to pd.datetime
    [pd.to_datetime(df.index.levels[0]), 
    df.index.levels[1]]
    )
  
  df = df.reset_index().rename(columns={'level_0': 'date'})
  df = df[df['fips'].isin(fips)]
  
  # df[['new_cases', 'new_deaths']] = df[['cases', 'deaths']] - df.groupby(by='fips')[['cases', 'deaths']].shift()
  # df[['new_cases_c', 'new_deaths_c']] = df[['new_cases', 'new_deaths']].clip(lower=0)
  df = df.fillna(0)
  num_cols = df.select_dtypes(include='number').columns
  df[num_cols] = df[num_cols].astype(int)
  
  return df

In [84]:
csse_df = make_csse_df()

In [85]:
csse_df.tail()

Unnamed: 0,date,fips,cases,deaths
1072134,2020-12-07,56037,2176,11
1072135,2020-12-07,56039,1810,2
1072136,2020-12-07,56041,1221,6
1072138,2020-12-07,56043,556,10
1072139,2020-12-07,56045,422,2


In [86]:
csse_df.select_dtypes(include='number').columns

Index(['cases', 'deaths'], dtype='object')

In [87]:
save_df(csse_df, 'csse_df')

## engineer per capita columns

In [88]:
csse_df = csse_df.merge(
    info_df[['fips', 'tot_pop']], 
    on='fips', 
    suffixes=('_x','')
)

# df_all = df_all.drop(['county_x', 'state_x'], axis=1)
csse_df[['cases_per_100k', 'deaths_per_100k']] = csse_df[['cases', 'deaths']].div(csse_df['tot_pop'], axis=0) * 100_000
csse_df = csse_df.drop(columns=['tot_pop'])
csse_df = csse_df.sort_values(by=['date', 'fips'])

print(csse_df.shape)
csse_df.head()

(1008582, 6)


Unnamed: 0,date,fips,cases,deaths,cases_per_100k,deaths_per_100k
0,2020-01-22,1001,0,0,0.0,0.0
321,2020-01-22,1003,0,0,0.0,0.0
642,2020-01-22,1005,0,0,0.0,0.0
963,2020-01-22,1007,0,0,0.0,0.0
1284,2020-01-22,1009,0,0,0.0,0.0


In [89]:
csse_df.tail()

Unnamed: 0,date,fips,cases,deaths,cases_per_100k,deaths_per_100k
1007297,2020-12-07,56037,2176,11,5138.984012,25.97832
1007618,2020-12-07,56039,1810,2,7713.944766,8.523696
1007939,2020-12-07,56041,1221,6,6036.784337,29.664788
1008260,2020-12-07,56043,556,10,7123.638693,128.122998
1008581,2020-12-07,56045,422,2,6092.103364,28.872528


In [90]:
def add_change_cols(df, cols, pre='new_', clip=False):
    df = df.sort_values(by=['date', 'fips'])
    new_cols = [pre + c for c in cols]
    df[new_cols] = df[cols] - df.groupby(by='fips')[cols].shift()
    df[new_cols] = df[new_cols].fillna(0)
    df[new_cols] = df[new_cols].astype(int)
    if clip:
        df[new_cols] = df[new_cols].clip(lower=0)
    return (df, new_cols)

def add_window_cols(df, cols, window=7):
    df = df.sort_values(by=['date', 'fips'])
    new_cols = [c + '_' + str(window) + 'd' for c in cols]
    col_dict = dict(zip(cols, new_cols))
    df = (df.merge(df.sort_values(by=['date', 'fips'])
            .groupby('fips')
            .rolling(window, on='date', min_periods=0)[cols].mean()
            .rename(columns=col_dict), on=['fips', 'date']))
#     df[new_cols] = df[new_cols].astype(int)
    return (df, new_cols)

def add_savgol_cols(df, cols, window=7, clip=False):
    def my_savgol(x, w):
        if len(x) >= w:
            return savgol_filter(x, w, 1)
        else:
            new_window = int(np.ceil(len(x) / 2) * 2 - 1)
            if new_window <= 1:
                return x
            else:
                return savgol_filter(x, new_window, 1)
    df = df.sort_values(by=['date', 'fips'])
    new_cols = [c + '_' + str(window) + 'sg' for c in cols]
    df[new_cols] = df.groupby(by='fips')[cols].transform(lambda x: my_savgol(x, window))
    if clip:
        df[new_cols] = df[new_cols].clip(lower=0)
    return (df, new_cols)


cols = ['cases', 'deaths', 'cases_per_100k', 'deaths_per_100k']
csse_df, new_cols = add_change_cols(csse_df, cols, pre='new_', clip=True)
csse_df, window_cols = add_window_cols(csse_df, new_cols, 15)
csse_df, sg_cols = add_savgol_cols(csse_df, new_cols, 15)
csse_df['days'] = ((csse_df['date'] - csse_df['date'].max()) / np.timedelta64(1, 'D')).astype('int')

# def add_cols(df, cols, window=15, sg=True, delta=False):
#     df, new_cols = add_change_cols(df, cols, pre='new_', clip=True)
#     df, window_cols = add_window_cols(df, new_cols, window)
#     if sg:
#         df, sg_cols = add_savgol_cols(df, new_cols, window, clip=True)
#     if delta:
#         df, delta_new_cols = add_change_cols(df, new_cols, pre='delta_')
#         df, delta_window_cols = add_window_cols(df, delta_new_cols, window)
#         if sg:
#             df, delta_sg_cols = add_savgol_cols(df, delta_new_cols, window)
    
#     df['mortality_rate'] = df['deaths'] / nyt_df['cases']
#     df[f'mortality_rate_{window}d'] = df[f'new_deaths_{window}d'] / df[f'new_cases_{window}d']
#     df.loc[(df[f'new_deaths_{window}d'] ==0), f'mortality_rate_{window}d'] = 0

#     return df

In [78]:
csse_df.tail()

Unnamed: 0,date,fips,cases,deaths,tot_pop,cases_per_100k,deaths_per_100k,new_cases,new_deaths,new_cases_per_100k,...,new_deaths_per_100k_15d_x,new_cases_15d_y,new_deaths_15d_y,new_cases_per_100k_15d_y,new_deaths_per_100k_15d_y,new_cases_15sg,new_deaths_15sg,new_cases_per_100k_15sg,new_deaths_per_100k_15sg,days
1008577,2020-12-07,56037,2176,11,42343,5138.984012,25.97832,64,1,151,...,0.733333,50.333333,0.333333,118.466667,0.733333,37.083333,0.808333,87.366667,1.758333,0
1008578,2020-12-07,56039,1810,2,23464,7713.944766,8.523696,25,0,106,...,0.0,29.266667,0.0,124.266667,0.0,24.241667,0.0,102.791667,0.0,0
1008579,2020-12-07,56041,1221,6,20226,6036.784337,29.664788,23,1,113,...,0.533333,20.066667,0.133333,98.8,0.533333,17.916667,0.383333,88.0,1.533333,0
1008580,2020-12-07,56043,556,10,7805,7123.638693,128.122998,10,2,128,...,2.466667,16.066667,0.2,205.466667,2.466667,13.716667,0.425,175.391667,5.341667,0
1008581,2020-12-07,56045,422,2,6927,6092.103364,28.872528,2,0,28,...,1.866667,3.333333,0.133333,47.8,1.866667,1.958333,-0.016667,27.825,-0.233333,0


In [59]:
# csse_df[(csse_df['date'].map(lambda x: x.month) == 5) & (csse_df['fips'] == '36047')]

In [91]:
save_df(csse_df, 'csse_df')