In [211]:
# General imports
import re
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

In [212]:
# Location of the raw input files
initial_data = '../data/initial/'

# Mask use by county
masks_csv = initial_data + 'covid_mask_use.csv'
df_masks = pd.read_csv(masks_csv)

# Cases by county
counties_csv = initial_data + 'covid_us_counties.csv'
df_counties = pd.read_csv(counties_csv)

In [213]:
# Discard rows containing missing values, then preview the first rows
df_masks = df_masks.dropna()
df_masks.head()

Unnamed: 0,county_fips_code,never,rarely,sometimes,frequently,always
0,15007,0.0,0.021,0.046,0.108,0.825
1,36011,0.0,0.033,0.072,0.162,0.732
2,48209,0.0,0.007,0.022,0.115,0.855
3,51740,0.0,0.019,0.122,0.225,0.634
4,17019,0.0,0.067,0.08,0.192,0.661


In [214]:
# Discard rows containing missing values, then preview the first rows
df_counties = df_counties.dropna()
df_counties.head()

Unnamed: 0,date,county,state_name,county_fips_code,confirmed_cases,deaths
0,2020-07-14,Adair,Iowa,19001.0,17,0
1,2020-07-15,Adair,Iowa,19001.0,17,0
2,2020-07-16,Adair,Iowa,19001.0,19,0
3,2020-07-17,Adair,Iowa,19001.0,19,0
4,2020-07-18,Adair,Iowa,19001.0,20,0


In [215]:
# (rows, columns) of the mask data after dropping rows with missing values
df_masks.shape

(3142, 6)

In [216]:
# (rows, columns) of the case data after dropping rows with missing values
df_counties.shape

(648129, 6)

In [217]:
# The county codes are loaded as floats (e.g. 19001.0); store them as integers instead
fips_as_int = df_counties['county_fips_code'].astype(int)
df_counties['county_fips_code'] = fips_as_int

In [218]:
# Keep only the case rows whose county also appears in the df_masks dataset.
# isin() is a pure filter: unlike an inner merge, it cannot duplicate case rows
# if a county code were ever repeated in df_masks. The index is renumbered from 0,
# matching what the merge used to return.
in_masks = df_counties['county_fips_code'].isin(df_masks['county_fips_code'])
df_counties = df_counties[in_masks].reset_index(drop=True)
df_counties.shape

(634109, 6)

In [219]:
# Parse the 'date' column into pandas datetime values
df_counties['date'] = pd.to_datetime(df_counties['date'])

In [220]:
# How many distinct counties remain in the case data?
unique_county_codes = np.unique(df_counties['county_fips_code'])
county_count = len(unique_county_codes)
print('Number of unique counties: {}'.format(county_count))

Number of unique counties: 3132


In [221]:
# Filter a dataframe down to the rows whose county is in a given set of county codes.
# We assume the input dataframe has a 'county_fips_code' column.
def filter_county_rows(df, county_codes=None):
    """Return the rows of ``df`` whose 'county_fips_code' is in ``county_codes``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'county_fips_code' column.
    county_codes : iterable, optional
        County codes to keep. When omitted, the notebook-level
        ``unique_county_codes`` is used.

    Returns
    -------
    pandas.DataFrame
        The matching rows, keeping their original order and index labels.
    """
    if county_codes is None:
        county_codes = unique_county_codes
    # isin() produces a boolean mask that shares df's index, so the filter works
    # for any index (a mask built as a brand-new Series only lines up with a
    # default 0..n-1 index) and avoids a Python-level scan per row.
    return df[df['county_fips_code'].isin(county_codes)]

In [222]:
# Keep only the mask rows whose county is also present in the df_counties data set
# (filter_county_rows checks each row's county_fips_code against unique_county_codes)
df_masks = filter_county_rows(df_masks)

In [223]:
# Order the mask data by ascending county code and renumber the rows from 0
df_masks = df_masks.sort_values(by='county_fips_code').reset_index(drop=True)
df_masks.head()

Unnamed: 0,county_fips_code,never,rarely,sometimes,frequently,always
0,1001,0.053,0.074,0.134,0.295,0.444
1,1003,0.083,0.059,0.098,0.323,0.436
2,1005,0.067,0.121,0.12,0.201,0.491
3,1007,0.02,0.034,0.096,0.278,0.572
4,1009,0.053,0.114,0.18,0.194,0.459


In [224]:
# Order the case data by county code, then by date within each county,
# and renumber the rows from 0
sort_keys = ['county_fips_code', 'date']
df_counties = df_counties.sort_values(by=sort_keys).reset_index(drop=True)
df_counties.head()

Unnamed: 0,date,county,state_name,county_fips_code,confirmed_cases,deaths
0,2020-03-24,Autauga,Alabama,1001,1,0
1,2020-03-25,Autauga,Alabama,1001,4,0
2,2020-03-26,Autauga,Alabama,1001,6,0
3,2020-03-27,Autauga,Alabama,1001,6,0
4,2020-03-28,Autauga,Alabama,1001,6,0


In [225]:
# Summary statistics (count, mean, std, quartiles, min/max) for each column of the mask data
df_masks.describe()

Unnamed: 0,county_fips_code,never,rarely,sometimes,frequently,always
count,3132.0,3132.0,3132.0,3132.0,3132.0,3132.0
mean,30391.265964,0.080044,0.083106,0.121512,0.207805,0.507526
std,15162.386237,0.058582,0.055443,0.057973,0.063328,0.15183
min,1001.0,0.0,0.0,0.001,0.029,0.115
25%,18178.5,0.034,0.04,0.079,0.164,0.393
50%,29172.0,0.068,0.073,0.116,0.204,0.497
75%,45083.5,0.113,0.115,0.157,0.247,0.613
max,56045.0,0.432,0.384,0.422,0.549,0.889


In [226]:
# Mask-wearing frequency categories and, for each one, the mean proportion across all counties
wearing_freqs = ['never', 'rarely', 'sometimes', 'frequently', 'always']
wearing_freq_avgs = [df_masks[column].mean() for column in wearing_freqs]
wearing_freq_avgs

[0.08004406130268177,
 0.08310632183908029,
 0.12151213282247755,
 0.20780523627075373,
 0.5075261813537669]

In [227]:
# Given a list of mask wearing frequency proportions for a county, compute a mask-wearing "score" for that county
def mask_score(proportions, weights=None, averages=None):
    """Weighted sum of a county's proportions, each taken relative to an average.

    Every proportion is divided by the matching entry of ``averages`` and
    multiplied by the matching entry of ``weights``; the products are summed.

    Parameters
    ----------
    proportions : sequence of float
        Mask-wearing frequency proportions for one county, in the same order
        as ``weights`` and ``averages``.
    weights : sequence of float, optional
        Weight per frequency category. Defaults to ``[-1, -0.5, 0, 0.5, 1]``.
    averages : sequence of float, optional
        Reference average per frequency category. Defaults to the
        notebook-level ``wearing_freq_avgs``.

    Returns
    -------
    float
        The score; larger values come from larger proportions in the
        positively weighted categories.

    Raises
    ------
    ValueError
        If the three sequences do not all have the same length.

    Notes
    -----
    An average of zero leads to a division by zero for that category.
    """
    if weights is None:
        weights = [-1, -0.5, 0, 0.5, 1]
    if averages is None:
        averages = wearing_freq_avgs
    if not (len(proportions) == len(weights) == len(averages)):
        raise ValueError(
            'proportions, weights and averages must have the same length '
            '(got {}, {} and {})'.format(len(proportions), len(weights), len(averages))
        )
    # Same term and summation order as before, so scores are unchanged for the defaults
    return sum(w * (p / avg) for w, p, avg in zip(weights, proportions, averages))

In [228]:
def proportions(row):
    """Return the values of ``row`` for each category in ``wearing_freqs``, in that order."""
    return [row[column] for column in wearing_freqs]

In [229]:
# Sanity check: score of the first county in the table
first_county = df_masks.iloc[0]
mask_score(proportions(first_county))

0.4772828222866382

In [230]:
# Compute a 'mask_score' for every county (one row per county)
def _row_mask_score(row):
    """Score a single df_masks row from its mask-wearing proportions."""
    return mask_score(proportions(row))

df_masks['mask_score'] = df_masks.apply(_row_mask_score, axis=1)

In [231]:
# Inspect the mask data together with the newly added 'mask_score' column
df_masks

Unnamed: 0,county_fips_code,never,rarely,sometimes,frequently,always,mask_score
0,1001,0.053,0.074,0.134,0.295,0.444,0.477283
1,1003,0.083,0.059,0.098,0.323,0.436,0.244343
2,1005,0.067,0.121,0.120,0.201,0.491,-0.113958
3,1007,0.020,0.034,0.096,0.278,0.572,1.341511
4,1009,0.053,0.114,0.180,0.194,0.459,0.023166
...,...,...,...,...,...,...,...
3127,56037,0.061,0.295,0.230,0.146,0.268,-1.657573
3128,56039,0.095,0.157,0.160,0.247,0.340,-0.867197
3129,56041,0.098,0.278,0.154,0.207,0.264,-1.878649
3130,56043,0.204,0.155,0.069,0.285,0.287,-2.229910


In [232]:
# Standardize the 'mask_score' -> mean = 0, stdev = 1
raw_score = df_masks['mask_score']
z_score = (raw_score - raw_score.mean()) / raw_score.std()

# Normalize the standardized score -> min score = 0, max score = 1
# NOTE: min-max rescaling gives the same result for any positively scaled and
# shifted version of the scores, so the stored values no longer have mean 0 / stdev 1;
# they only reflect each county's position between the lowest and highest score.
lowest, highest = z_score.min(), z_score.max()
df_masks['mask_score'] = (z_score - lowest) / (highest - lowest)

In [233]:
# Inspect the mask data after 'mask_score' has been rescaled to the 0-1 range
df_masks

Unnamed: 0,county_fips_code,never,rarely,sometimes,frequently,always,mask_score
0,1001,0.053,0.074,0.134,0.295,0.444,0.796765
1,1003,0.083,0.059,0.098,0.323,0.436,0.765844
2,1005,0.067,0.121,0.120,0.201,0.491,0.718280
3,1007,0.020,0.034,0.096,0.278,0.572,0.911489
4,1009,0.053,0.114,0.180,0.194,0.459,0.736483
...,...,...,...,...,...,...,...
3127,56037,0.061,0.295,0.230,0.146,0.268,0.513371
3128,56039,0.095,0.157,0.160,0.247,0.340,0.618291
3129,56041,0.098,0.278,0.154,0.207,0.264,0.484024
3130,56043,0.204,0.155,0.069,0.285,0.287,0.437395


In [234]:
# Load the census population / land area data set and preview it
pops_csv = initial_data + 'census-population-landarea.csv'
df_pops = pd.read_csv(pops_csv)
df_pops.head()

Unnamed: 0,fips,PST045212,PST040210,PST120212,POP010210,LND110210,POP060210
0,0,313914040,308747508,1.7,308745538,3531905.43,87.4
1,1000,4822023,4779745,0.9,4779736,50645.33,94.4
2,1001,55514,54571,1.7,54571,594.44,91.8
3,1003,190790,182265,4.7,182265,1589.78,114.6
4,1005,27201,27457,-0.9,27457,884.88,31.0


In [235]:
# Keep only the columns we need and give them descriptive names
kept_columns = ['fips', 'PST045212', 'LND110210']
new_names = {
    'fips': 'county_fips_code',
    'PST045212': 'population',
    'LND110210': 'landarea',
}
df_pops = df_pops[kept_columns].rename(columns=new_names)
df_pops

Unnamed: 0,county_fips_code,population,landarea
0,0,313914040,3531905.43
1,1000,4822023,50645.33
2,1001,55514,594.44
3,1003,190790,1589.78
4,1005,27201,884.88
...,...,...,...
3190,56037,45267,10426.65
3191,56039,21675,3995.38
3192,56041,21025,2081.26
3193,56043,8464,2238.55
