# Exploring Correlations with Heatmaps and Scatterplots

In [1]:
import numpy as np
import numpy.polynomial.polynomial as poly      # linear regression
import pandas as pd

import pickle

import re

# opening external coordinates
import json

# opening urls
from urllib.request import urlopen

# benchmarking
from time import time

# date/time operations
from datetime import datetime, timedelta
from pytz import timezone

# plotting
import altair as alt
from altair import datum
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# for US map
from vega_datasets import data

# large datasets
alt.data_transformers.enable('data_server');

pd.options.display.max_rows = 250
pd.options.display.max_columns = 250

In [2]:
#hide
def optimize(df):
    '''
    Return a copy of ``df`` with more compact column dtypes.

    Steps, applied in order:
      1. object columns that parse cleanly as dates become datetime columns;
      2. remaining object columns whose unique values number fewer than half
         the rows become ``category``;
      3. ``int64`` / ``float64`` columns are downcast to the smallest
         integer / float dtype that holds their values.

    The input frame is not modified.
    '''
    def _as_datetime(col):
        # Only object columns are candidates for date parsing.
        if col.dtypes != 'object':
            return col
        # `errors='ignore'` is deprecated in pandas; the equivalent behaviour
        # is to keep the original column whenever parsing fails.
        try:
            return pd.to_datetime(col)
        except (ValueError, TypeError, OverflowError):
            return col

    dft = df.copy()
    dft = dft.apply(_as_datetime)

    # if there are less than half as many unique values as there are rows,
    # convert to category (the row-count guard avoids dividing by zero)
    n_rows = len(dft)
    for col in dft.select_dtypes(include='object'):
        if n_rows and len(dft[col].unique()) / n_rows < 0.5:
            dft[col] = dft[col].astype('category')

    # downcasts numeric columns if possible
    dft = dft.apply(lambda col:
        pd.to_numeric(col, downcast='integer') if col.dtypes == 'int64' else col)
    dft = dft.apply(lambda col:
        pd.to_numeric(col, downcast='float') if col.dtypes == 'float64' else col)

    return dft

In [20]:
# Load the pre-processed per-county, per-date case/death table.
# NOTE: pickle.load can execute arbitrary code embedded in the file --
# only open pickles produced by this project's own pipeline.
with open('../data/processed/nyt_df.p', 'rb') as f:
    nyt_df = pickle.load(f)
nyt_df.head()

Unnamed: 0,date,county,state,fips,cases,deaths,cases_per_100k,deaths_per_100k,new_cases,new_deaths,new_cases_per_100k,new_deaths_per_100k,new_cases_15d,new_deaths_15d,new_cases_per_100k_15d,new_deaths_per_100k_15d,new_cases_15sg,new_deaths_15sg,new_cases_per_100k_15sg,new_deaths_per_100k_15sg,delta_new_cases,delta_new_deaths,delta_new_cases_per_100k,delta_new_deaths_per_100k,delta_new_cases_15d,delta_new_deaths_15d,delta_new_cases_per_100k_15d,delta_new_deaths_per_100k_15d,delta_new_cases_15sg,delta_new_deaths_15sg,delta_new_cases_per_100k_15sg,delta_new_deaths_per_100k_15sg,days,mortality_rate,mortality_rate_15d
0,2020-01-21,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,
1,2020-01-22,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0,
2,2020-01-23,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0.0,
3,2020-01-24,Cook,Illinois,17031,1,0,0.019417,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.091667,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.025,0.0,0.0,0.0,3,0.0,
4,2020-01-24,Snohomish,Washington,53061,1,0,0.121642,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0.0,


In [4]:
# Load the per-county descriptor table (one row per fips code).
# NOTE: pickle.load can execute arbitrary code embedded in the file --
# only open pickles produced by this project's own pipeline.
with open('../data/processed/info_df.p', 'rb') as f:
    info_df = pickle.load(f)
info_df.head()

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,area_land,lon,lat,neighbors,tot_dem,tot_gop,tot_votes,per_gop,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,tot_edu_white_male_hsplus,tot_edu_white_female_hsplus,tot_edu_white_4yplus,tot_edu_white_male_4yplus,tot_edu_white_female_4yplus,tot_edu_black,tot_edu_black_male,tot_edu_black_female,tot_edu_black_hsplus,tot_edu_black_male_hsplus,tot_edu_black_female_hsplus,tot_edu_black_4yplus,tot_edu_black_male_4yplus,tot_edu_black_female_4yplus,tot_edu_native,tot_edu_native_male,tot_edu_native_female,tot_edu_native_hsplus,tot_edu_native_male_hsplus,tot_edu_native_female_hsplus,tot_edu_native_4yplus,tot_edu_native_male_4yplus,tot_edu_native_female_4yplus,tot_edu_asian,tot_edu_asian_male,tot_edu_asian_female,tot_edu_asian_hsplus,tot_edu_asian_male_hsplus,tot_edu_asian_female_hsplus,tot_edu_asian_4yplus,tot_edu_asian_male_4yplus,tot_edu_asian_female_4yplus,tot_edu_pacific,tot_edu_pacific
_male,tot_edu_pacific_female,tot_edu_pacific_hsplus,tot_edu_pacific_male_hsplus,tot_edu_pacific_female_hsplus,tot_edu_pacific_4yplus,tot_edu_pacific_male_4yplus,tot_edu_pacific_female_4yplus,tot_edu_other,tot_edu_other_male,tot_edu_other_female,tot_edu_other_hsplus,tot_edu_other_male_hsplus,tot_edu_other_female_hsplus,tot_edu_other_4yplus,tot_edu_other_male_4yplus,tot_edu_other_female_4yplus,tot_edu_twoplus,tot_edu_twoplus_male,tot_edu_twoplus_female,tot_edu_twoplus_hsplus,tot_edu_twoplus_male_hsplus,tot_edu_twoplus_female_hsplus,tot_edu_twoplus_4yplus,tot_edu_twoplus_male_4yplus,tot_edu_twoplus_female_4yplus,tot_edu_hispanic,tot_edu_hispanic_male,tot_edu_hispanic_female,tot_edu_hispanic_hsplus,tot_edu_hispanic_male_hsplus,tot_edu_hispanic_female_hsplus,tot_edu_hispanic_4yplus,tot_edu_hispanic_male_4yplus,tot_edu_hispanic_female_4yplus,pop_25p,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,edu,edu_white,per_edu_white_nohs,edu_white_male,per_edu_white_male_nohs,edu_white_female,per_edu_white_female_nohs,edu_black,per_edu_black_nohs,edu_black_male,per_edu_black_male_nohs,edu_black_female,per_edu_black_female_nohs,edu_native,per_edu_native_nohs,edu_native_male,per_edu_native_male_nohs,edu_native_female,per_edu_native_female_nohs,edu_asian,per_edu_asian_nohs,edu_asian_male,per_edu_asian_male_nohs,edu_asian_female,per_edu_asian_female_nohs,edu_pacific,per_edu_pacific_nohs,edu_pacific_male,per_edu_pacific_male_nohs,edu_pacific_female,per_edu_pacific_female_nohs,edu_other,per_edu_other_nohs,edu_other_male,per_edu_other_male_nohs,edu_other_female,per_edu_other_female_nohs,edu_twoplus,per_edu_twoplus_nohs,edu_twoplus_male,per_edu_twoplus_male_nohs,edu_twoplus_female,per_edu_twoplus_female_nohs,edu_hispanic,per_edu_hispanic_nohs,edu_hispanic_male,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs,never,rarely,sometimes,frequently,always,mask,pop_density,per_male,per_female,per_pop_white_male,per_pop_white_female,per_pop_black_male,per_pop
_black_female,per_pop_native_male,per_pop_native_female,per_pop_asian_male,per_pop_asian_female,per_pop_pacific_male,per_pop_pacific_female,per_pop_twoplus_male,per_pop_twoplus_female,per_pop_hispanic_male,per_pop_hispanic_female,per_pop_white,per_pop_black,per_pop_native,per_pop_asian,per_pop_pacific,per_pop_twoplus,per_pop_hispanic,per_votes
0,1,Alabama,Autauga,1001,55869,27092.0,28777.0,20138.0,21077.0,5171.0,5927.0,105.0,138.0,282.0,364.0,20.0,20.0,492.0,464.0,884.0,787.0,41215,11098,243,646,40,956,1671,8.422041,8.181973,8.648052,8.518373,8.928216,7.480758,8.104269,9.733333,9.826087,7.276596,8.381868,8.4,5.85,5.195122,5.519397,6.382353,7.072427,8.727963,7.81375,9.786008,7.899381,7.125,5.35251,6.707361,1539.602123,-86.643648,32.538666,"[01021, 01047, 01051, 01085, 01101]",5908.0,18110.0,24661.0,0.754018,58786.0,27643.0,,,,,,83423.0,65047.0,28726,13834,14892,26130,12588,13542,8440,4573,3867,6786,3042,3744,5459,2436,3023,1296,573,723,78,39,39,61,25,36,25,25,0,317,95,222,278,62,216,118,43,75,32,5,27,32,5,27,0,0,0,262,93,169,197,67,130,0,0,0,380,135,245,269,73,196,92,0,92,939,455,484,821,380,441,346,230,116,37166,956,3248,12119,7554,2998,5903,4388,3.174487,3.28831,0.090371,3.472676,0.090068,3.117043,0.090653,2.563808,0.19555,2.543393,0.199211,2.580395,0.192575,3.166667,0.217949,4.487179,0.358974,1.846154,0.076923,3.615142,0.123028,3.568421,0.347368,3.635135,0.027027,2.0,0.0,2.0,0.0,2.0,0.0,1.503817,0.248092,1.44086,0.27957,1.538462,0.230769,2.626316,0.292105,1.081481,0.459259,3.477551,0.2,3.591054,0.125666,4.197802,0.164835,3.020661,0.088843,0.053,0.074,0.134,0.295,0.444,3.003,36.287947,0.48492,0.51508,0.36045,0.377258,0.092556,0.106087,0.001879,0.00247,0.005048,0.006515,0.000358,0.000358,0.008806,0.008305,0.015823,0.014087,0.737708,0.198643,0.004349,0.011563,0.000716,0.017111,0.029909,0.441408
1,1,Alabama,Baldwin,1003,223234,108247.0,114987.0,89845.0,95902.0,9308.0,9907.0,753.0,754.0,911.0,1435.0,53.0,70.0,1832.0,1930.0,5545.0,4989.0,185747,19215,1507,2346,123,3762,10534,8.987202,8.806627,9.157192,9.150993,9.481721,7.587129,8.095791,9.14741,8.851459,7.246981,7.887108,8.150943,8.628571,5.757642,5.864767,6.497565,6.719182,9.321749,7.849388,8.999336,7.638534,8.422764,5.8126,6.602525,4117.546676,-87.722603,30.729584,"[01025, 01053, 01097, 01099, 01129]",18409.0,72780.0,94090.0,0.798123,55962.0,31112.0,53289.0,34763.0,,45634.0,53456.0,43279.0,59418.0,126316,60310,66006,116288,54788,61500,41648,19863,21785,12006,5593,6413,9565,4129,5436,2164,808,1356,1015,523,492,790,410,380,145,81,64,1180,426,754,980,271,709,243,147,96,9,0,9,9,0,9,0,0,0,938,469,469,695,394,301,262,119,143,1712,853,859,1529,744,785,559,199,360,5119,2749,2370,3823,1813,2010,1389,637,752,146989,3978,10332,40579,32266,13759,30431,15644,3.329113,3.489788,0.079388,3.463621,0.09156,3.513696,0.068267,2.494586,0.203315,2.19882,0.261756,2.752534,0.152347,2.270936,0.221675,2.342256,0.216061,2.195122,0.227642,2.690678,0.169492,2.997653,0.36385,2.517241,0.059682,2.0,0.0,,,2.0,0.0,2.878465,0.259062,2.948827,0.159915,2.808102,0.358209,3.418808,0.106893,2.910903,0.127784,3.923166,0.086147,2.850361,0.253174,2.477628,0.340487,3.2827,0.151899,0.083,0.059,0.098,0.323,0.436,2.968,54.215293,0.484904,0.515096,0.40247,0.429603,0.041696,0.044379,0.003373,0.003378,0.004081,0.006428,0.000237,0.000314,0.008207,0.008646,0.024839,0.022349,0.832073,0.086076,0.006751,0.010509,0.000551,0.016852,0.047188,0.421486
2,1,Alabama,Barbour,1005,24686,13064.0,11622.0,5894.0,5341.0,6260.0,5547.0,52.0,43.0,55.0,61.0,21.0,10.0,153.0,132.0,629.0,488.0,11235,11807,95,116,31,285,1117,8.784412,8.463564,9.14507,9.475568,10.196592,7.767732,8.491437,9.346154,10.465116,8.109091,10.065574,6.285714,7.7,6.418301,5.810606,6.434022,5.766393,9.818336,8.107733,9.852632,9.137931,6.741935,6.136842,6.142346,2292.144655,-85.387579,31.868235,"[01011, 01045, 01067, 01109, 01113]",4848.0,5431.0,10390.0,0.528359,34186.0,23013.0,,50417.0,,26793.0,19760.0,30417.0,47031.0,9171,4846,4325,7264,3657,3607,1578,814,764,8137,4304,3833,5551,2776,2775,552,240,312,72,72,0,42,42,0,0,0,0,88,40,48,72,27,45,5,0,5,1,0,1,0,0,0,0,0,0,345,230,115,100,76,24,44,44,0,183,80,103,153,50,103,9,0,9,573,395,178,251,192,59,76,60,16,18173,1490,3411,6486,3287,1279,1417,803,2.38062,2.444444,0.207938,2.349154,0.245357,2.551214,0.166012,1.703576,0.317808,1.568773,0.355019,1.854944,0.276024,1.166667,0.416667,1.166667,0.416667,,,1.920455,0.181818,1.35,0.325,2.395833,0.0625,0.0,1.0,,,0.0,1.0,1.217391,0.710145,1.617391,0.669565,0.417391,0.791304,1.918033,0.163934,1.25,0.375,2.436893,0.0,1.539267,0.561955,1.731646,0.513924,1.11236,0.668539,0.067,0.121,0.12,0.201,0.491,2.928,10.769826,0.529207,0.470793,0.238759,0.216357,0.253585,0.224702,0.002106,0.001742,0.002228,0.002471,0.000851,0.000405,0.006198,0.005347,0.02548,0.019768,0.455116,0.478287,0.003848,0.004699,0.001256,0.011545,0.045248,0.420886
3,1,Alabama,Bibb,1007,22394,11929.0,10465.0,8482.0,8181.0,2912.0,1807.0,50.0,41.0,21.0,25.0,5.0,1.0,116.0,130.0,343.0,280.0,16663,4719,91,46,6,246,623,8.606145,8.349484,8.89871,8.645838,9.105122,7.706387,8.455451,8.52,8.219512,7.52381,9.8,6.4,3.0,6.422414,6.3,7.186589,6.975,8.871332,7.993219,8.384615,8.76087,5.833333,6.357724,7.091493,1612.167481,-87.125115,32.996421,"[01021, 01065, 01073, 01105, 01117, 01125]",1874.0,6733.0,8748.0,0.78227,45340.0,34000.0,,,,,20329.0,42708.0,50769.0,12002,6037,5965,10483,5181,5302,1570,674,896,3316,2146,1170,2296,1377,919,200,83,117,8,8,0,8,8,0,0,0,0,37,16,21,37,16,21,37,16,21,0,0,0,0,0,0,0,0,0,9,9,0,9,9,0,0,0,0,149,108,41,108,89,19,6,6,0,313,171,142,206,95,111,0,0,0,15780,903,1747,7471,2938,908,1197,616,2.459823,2.400933,0.126562,2.27464,0.141792,2.528751,0.111148,1.686369,0.3076,1.476701,0.358341,2.07094,0.21453,2.0,0.0,2.0,0.0,,,7.0,0.0,7.0,0.0,7.0,0.0,,,,,,,2.0,0.0,2.0,0.0,,,1.651007,0.275168,1.925926,0.175926,0.926829,0.536585,1.316294,0.341853,1.111111,0.444444,1.56338,0.21831,0.02,0.034,0.096,0.278,0.572,3.348,13.890616,0.532687,0.467313,0.378762,0.365321,0.130035,0.080691,0.002233,0.001831,0.000938,0.001116,0.000223,4.5e-05,0.00518,0.005805,0.015317,0.012503,0.744083,0.210726,0.004064,0.002054,0.000268,0.010985,0.02782,0.39064
4,1,Alabama,Blount,1009,57826,28472.0,29354.0,24494.0,25682.0,453.0,419.0,143.0,139.0,73.0,90.0,14.0,7.0,345.0,385.0,2950.0,2632.0,50176,872,282,163,21,730,5582,8.651714,8.49136,8.807249,8.771209,9.131843,8.030905,7.947494,10.125874,9.446043,8.30137,9.066667,8.142857,10.571429,6.486957,6.535065,6.4,6.06193,8.955796,7.990826,9.79078,8.723926,8.952381,6.512329,6.240595,1670.103911,-86.568495,33.98143,"[01043, 01055, 01073, 01095, 01115, 01127]",2150.0,22808.0,25384.0,0.913855,48695.0,,65385.0,99219.0,,,44934.0,35495.0,49872.0,35774,17200,18574,29814,14167,15647,4775,1900,2875,596,281,315,411,192,219,22,10,12,132,22,110,94,14,80,13,13,0,124,43,81,104,43,61,62,25,37,18,0,18,18,0,18,0,0,0,211,106,105,120,56,64,90,28,62,440,212,228,361,154,207,24,22,2,2610,1468,1142,1006,476,530,82,48,34,39627,2967,4894,13489,8492,4775,3217,1793,2.606581,2.334181,0.166601,2.199651,0.176337,2.45876,0.157586,1.563758,0.310403,1.544484,0.316726,1.580952,0.304762,1.916667,0.287879,4.227273,0.363636,1.454545,0.272727,4.177419,0.16129,4.906977,0.0,3.790123,0.246914,2.0,0.0,,,2.0,0.0,3.270142,0.43128,2.377358,0.471698,4.171429,0.390476,1.913636,0.179545,1.971698,0.273585,1.859649,0.092105,0.927969,0.614559,0.811989,0.675749,1.077058,0.535902,0.053,0.114,0.18,0.194,0.459,2.892,34.624193,0.492374,0.507626,0.423581,0.444125,0.007834,0.007246,0.002473,0.002404,0.001262,0.001556,0.000242,0.000121,0.005966,0.006658,0.051015,0.045516,0.867707,0.01508,0.004877,0.002819,0.000363,0.012624,0.096531,0.438972


In [5]:
# Load the object later passed to alt.Chart(...).mark_geoshape() in make_map.
# NOTE: pickle.load can execute arbitrary code embedded in the file --
# only open pickles produced by this project's own pipeline.
with open('../data/processed/geo_altair.p', 'rb') as f:
    geo_altair = pickle.load(f)

In [6]:
# #hide
# la = timezone('US/Pacific')
# last = nyt_df['date'].max()
# df_slice = nyt_df[nyt_df['date'] == last]
# df_slice.head()

In [None]:
# #hide
# dem_df_ac = optimize(pd.read_csv('../data/processed/dem_df_ac.csv', 
#                                  dtype={'fips':'str', 'cluster':'int'}))
# print(dem_df_ac.shape)
# dem_df_ac.head()

# Exploring Correlations

In [None]:
# Left-join every nyt_df row with the info_df columns from 'fips' through
# 'tot_pop' (a label slice, so both end columns are included).
df = nyt_df.merge(info_df.loc[:, 'fips':'tot_pop'], on='fips', how='left')

In [8]:
def column_selector(info_df, columns=None, mask=None, exclude=None):
    '''
    Build a list of numeric ``info_df`` column names selected by name tokens.

    A column name is split into lowercase alphabetic tokens (the runs matched
    by ``[a-z]+``, e.g. ``per_pop_white`` -> ``per``, ``pop``, ``white``).
    A numeric column is selected when its tokens contain every entry of
    ``mask`` and none of the entries of ``exclude``.

    Parameters
    ----------
    info_df : pandas.DataFrame
        Frame whose numeric columns are the candidates.
    columns : list of str, optional
        Names to start from. The list is copied, never modified in place.
    mask : list of str (or a single str), optional
        Tokens that must all be present in a column name.
    exclude : list of str (or a single str), optional
        Tokens that must all be absent from a column name.

    Returns
    -------
    list of str
        ``columns`` followed by the newly selected names, in ``info_df``
        column order and without repeating names already listed. With
        neither ``mask`` nor ``exclude`` given, just the copy of ``columns``.
    '''
    def _as_list(tokens):
        # Accept None, a single string, or any iterable of strings.
        if tokens is None:
            return []
        if isinstance(tokens, str):
            return [tokens]
        return list(tokens)

    selected = [] if columns is None else list(columns)
    required = _as_list(mask)
    banned = _as_list(exclude)

    # nothing to filter on: return the manual selection untouched
    if not (required or banned):
        return selected

    already = set(selected)
    # only select from numeric columns, preserving the frame's column order
    for name in info_df.select_dtypes(include='number').columns.tolist():
        tokens = set(re.findall('[a-z]+', name))
        wanted = all(m in tokens for m in required)
        blocked = any(e in tokens for e in banned)
        if wanted and not blocked and name not in already:
            selected.append(name)
            already.add(name)

    return selected

In [52]:
def corr(x, y, w, useweight=True):
    '''
    Pearson correlation of ``x`` and ``y``, optionally weighted by ``w``.

    Observations where ``x`` or ``y`` is NaN are dropped; when weighting is
    on, observations with a NaN weight are dropped as well. The covariance
    terms come from ``np.cov`` (``aweights=w`` when ``useweight`` is true).

    Parameters
    ----------
    x, y : array-like of numbers (pandas Series in this notebook)
    w : array-like of numbers
        Observation weights; ignored when ``useweight`` is false.
    useweight : bool, default True

    Returns
    -------
    float
        The correlation coefficient, or NaN when it is undefined: fewer than
        two usable observations, weights that sum to zero, or zero variance
        in either input.
    '''
    keep = ~(np.isnan(x) | np.isnan(y))
    if useweight:
        keep = keep & ~np.isnan(w)

    # a covariance needs at least two observations
    if keep.sum() < 2:
        return np.nan

    try:
        cov = np.cov(x[keep], y[keep], aweights=w[keep] if useweight else None)
    except ZeroDivisionError:
        # np.cov cannot normalise when the weights sum to zero
        return np.nan

    xx, xy, yy = cov[0, 0], cov[0, 1], cov[1, 1]
    denom = np.sqrt(xx * yy)
    # zero variance (or a NaN covariance) leaves the coefficient undefined
    if not np.isfinite(denom) or denom == 0:
        return np.nan
    return xy / denom

In [10]:
def df_merger(nyt_df, info_df, columns=None, features=None, date='latest', weight='tot_pop'):
    '''
    Left-join selected ``nyt_df`` features with selected ``info_df`` columns
    on ``fips``.

    Parameters
    ----------
    nyt_df : pandas.DataFrame
        Must contain ``date``, ``county``, ``state`` and ``fips``.
    info_df : pandas.DataFrame
        Must contain ``fips`` and the ``weight`` column.
    columns : list of str, optional
        ``info_df`` columns to keep; non-numeric or unknown names are
        dropped and the rest are sorted. ``None`` means none.
    features : list of str, optional
        ``nyt_df`` columns to keep; non-numeric or unknown names are dropped.
        ``None`` means none.
    date : 'latest', 'all', or a value comparable to ``nyt_df['date']``
        Which rows of ``nyt_df`` to keep: the maximum date, every date, or
        the rows equal to the given date.
    weight : str, default 'tot_pop'
        ``info_df`` column that is always carried along.

    Returns
    -------
    pandas.DataFrame
        ``date, county, state, fips, <features>, <weight>, <columns>``.
    '''
    # make sure columns and features are valid (None is treated as empty)
    numeric_features = set(nyt_df.select_dtypes(include='number').columns)
    numeric_columns = set(info_df.select_dtypes(include='number').columns)
    features = [f for f in ([] if features is None else features)
                if f in numeric_features]
    columns = sorted(c for c in ([] if columns is None else columns)
                     if c in numeric_columns)

    # dict.fromkeys de-duplicates while keeping a stable column order
    left_columns = list(dict.fromkeys(['date', 'county', 'state', 'fips'] + features))
    right_columns = list(dict.fromkeys(['fips', weight] + columns))

    ## only process specific date and features
    if date == 'latest':
        left = nyt_df.loc[nyt_df['date'] == nyt_df['date'].max(), left_columns]
    elif date == 'all':
        left = nyt_df[left_columns]
    else:
        left = nyt_df.loc[nyt_df['date'] == date, left_columns]

    ## only process specific columns of info_df
    right = info_df[right_columns]

    return left.merge(right, on='fips', how='left')

In [23]:
def make_correlation_heatmap(
    nyt_df, info_df, columns=None, date='latest',
    features=[
        'cases_per_100k', 
        'new_cases_per_100k_15d',
        'delta_new_cases_per_100k_15d',
        'deaths_per_100k',
        'new_deaths_per_100k_15d',
        'delta_new_deaths_per_100k_15d',
        'mortality_rate',
        'mortality_rate_15d'
    ],
    useweight=True, weight='tot_pop', size=50, print_info=True
):
    '''
    Draw a correlation heatmap: rows are ``columns`` (from ``info_df``),
    x positions are ``columns`` followed by ``features`` (from ``nyt_df``).

    Parameters
    ----------
    nyt_df, info_df : pandas.DataFrame
        Passed to ``df_merger`` together with ``columns``, ``features``,
        ``date`` and ``weight``.
    columns : list of str, optional
        ``info_df`` columns to correlate. ``None`` is treated as empty.
    date : passed through to ``df_merger``.
    features : list of str
        ``nyt_df`` columns to correlate against.
    useweight : bool
        Forwarded to ``corr``; when true, observations are weighted by
        the ``weight`` column.
    weight : str
        Name of the weight column.
    size : int
        Step size handed to ``configure_view``.
    print_info : bool
        When true, print the pairs with correlation >= 0.4 or <= -0.4
        (excluding exactly +/-1), each unordered pair once.

    Returns
    -------
    A layered Altair chart (rect heatmap + text labels).
    '''
    columns = [] if columns is None else list(columns)
    features = list(features)

    df = df_merger(nyt_df, info_df, columns, features, date, weight)

    # build weighted correlation matrix from df
    x_columns = columns + features

    wcm = pd.DataFrame(index=columns, columns=x_columns)

    for y in columns:
        for x in x_columns:
            # honour the caller's useweight flag
            wcm.loc[y, x] = corr(df[y], df[x], df[weight], useweight=useweight)

    wcm = (wcm.reset_index().rename(columns={'index':'y_feature'}).dropna()
              .melt('y_feature', var_name='x_feature', value_name='corr'))
    wcm['corr'] = np.round(wcm['corr'].astype(float), 4)

    def _unique_pairs(selector):
        # Column-column pairs appear twice (a,b) / (b,a); column-feature
        # pairs appear once. Key every row by its unordered pair and keep
        # the first occurrence so nothing is dropped by accident.
        hits = wcm[selector].sort_values(by=['y_feature', 'x_feature'])
        pair_keys = pd.Series(
            [tuple(sorted(pair)) for pair in zip(hits['y_feature'], hits['x_feature'])],
            index=hits.index, dtype=object
        )
        return hits[~pair_keys.duplicated()]

    if print_info:
        print('positive correlations')
        print(_unique_pairs((wcm['corr'] >= 0.4) & (wcm['corr'] != 1)))
        print('\nnegative correlations')
        print(_unique_pairs((wcm['corr'] <= -0.4) & (wcm['corr'] != -1)))

    # build altair chart
    base = alt.Chart(wcm).encode(
        alt.X(
            'x_feature:O',
            sort=x_columns
        ),
        alt.Y(
            'y_feature:O',
            sort=columns
        )
    )
    heatmap = base.mark_rect().encode(
        color=alt.Color(
            'corr:Q',
            scale=alt.Scale(
                scheme='redblue',
                domain=[-1, 0, 1]
            )
        ),
        tooltip=[
            alt.Tooltip('x_feature:O'),
            alt.Tooltip('y_feature:O'),
            alt.Tooltip('corr:Q', title='correlation')
        ]
    )

    # text: black on pale cells, white on strongly coloured cells
    text = base.mark_text(baseline='middle').encode(
        text=alt.Text('corr:Q',format='.2f'),
        color=alt.condition(
            np.abs(alt.datum.corr) <= 0.5,
            alt.value('black'),
            alt.value('white')
        )
    )

    return (heatmap + text).configure_view(step=size)

In [49]:
# Scratch check: get_group returns a plain DataFrame, which has no `.name`
# attribute, so this line raises AttributeError (see the captured output).
nyt_df.groupby(by='date').get_group('2020-08-10').name

AttributeError: 'DataFrame' object has no attribute 'name'

In [24]:
# hand-picked descriptors plus every per-population share that is not a
# total or a per-sex breakdown; features=[] restricts the heatmap to
# descriptor-vs-descriptor correlations
columns = sorted(set(
    ['per_gop', 'mask', 'edu', 'median_income', 'age_pop', 'pop_density']
    + column_selector(info_df, mask=['per', 'pop'], exclude=['male', 'female', 'tot'])
))
make_correlation_heatmap(nyt_df, info_df, columns, features=[])

positive correlations
           y_feature         x_feature    corr
40               edu     median_income  0.7343
66               edu     per_pop_asian  0.4254
41              mask     median_income  0.4295
67              mask     per_pop_asian  0.4586
93              mask  per_pop_hispanic  0.4599
68     median_income     per_pop_asian  0.6061
147          per_gop     per_pop_white  0.6785
135    per_pop_asian   per_pop_twoplus  0.4415
139  per_pop_pacific   per_pop_twoplus  0.8468

negative correlations
            y_feature      x_feature    corr
53                edu        per_gop -0.4201
54               mask        per_gop -0.6458
145              mask  per_pop_white -0.5242
69            per_gop  per_pop_asian -0.5732
160           per_gop    pop_density -0.4718
148     per_pop_asian  per_pop_white -0.4925
149     per_pop_black  per_pop_white -0.4285
150  per_pop_hispanic  per_pop_white -0.7649


First, let's discuss features that are not quite independent of each other:

selected positive correlations (> 0.4):
- educational attainment and median income
- educational attainment and percent asian
- mask discipline and median income
- mask discipline and percent asian
- mask discipline and percent hispanic
- median income and percent asian
- percent GOP and percent white

selected negative correlations (< -0.4):
- educational attainment and percent GOP
- mask discipline and percent GOP
- mask discipline and percent white
- percent Asian and percent GOP
- population density and percent GOP

It's important to understand these correlations, since they introduce possible confounding variables when exploring correlations with COVID-specific features (e.g. `new_cases`).

# Exploring Time Series Correlations

New COVID-19 cases/deaths have varied widely depending on time of year. While we could explore correlations with total cases and deaths, it would be more interesting to see how different factors are correlated depending on the date. We compare these features to a 15-day window of new cases and deaths.

In [61]:
def make_correlation_timeseries(
    nyt_df, info_df, columns=None, feature='new_cases_per_100k_15d', 
    useweight=True, weight='tot_pop'
):
    


1.5677231595552148

In [90]:
def make_correlation_timeseries(
    nyt_df, info_df, columns=None, feature='new_cases_per_100k_15d', 
    useweight=True, weight='tot_pop', divisions=1, divide_on=None
):
    '''
    Heatmap of the per-date correlation between each of ``columns`` and
    one ``nyt_df`` feature.

    Parameters
    ----------
    nyt_df, info_df : pandas.DataFrame
        Merged through ``df_merger`` with ``date='all'``; rows containing
        any NaN are dropped before correlating. Neither frame is modified.
    columns : list of str, optional
        ``info_df`` columns to correlate. ``None`` is treated as empty.
    feature : str
        ``nyt_df`` column correlated against every entry of ``columns``.
    useweight, weight :
        Forwarded to ``corr`` / ``df_merger``.
    divisions : int, default 1
        ``1`` draws one chart from all rows of ``info_df``. A larger value
        splits ``info_df`` into that many ``pd.qcut`` bins of ``divide_on``
        and stacks one chart per bin, highest bin on top.
    divide_on : str, optional
        ``info_df`` column to bin on; required when ``divisions > 1``.

    Returns
    -------
    An Altair chart (single heatmap, or a vertical concatenation).

    Raises
    ------
    ValueError
        If ``divisions`` is below 1, or above 1 without ``divide_on``.
    '''
    columns = [] if columns is None else list(columns)

    def _timeseries(left, right):
        # one heatmap: x = date, y = column name, colour = correlation
        df = df_merger(
            left, right, columns=columns, features=[feature], date='all', 
            weight=weight
        ).dropna(axis=0)

        # set up skeleton for weighted correlation matrix
        wcm = pd.DataFrame(index=sorted(df['date'].unique()), columns=columns)

        # populate weighted correlation matrix, one date at a time;
        # dates with a single row are left empty and dropped below
        for d, g in df.groupby(by='date'):
            if len(g) > 1:
                for c in columns:
                    wcm.loc[d, c] = corr(g[c], g[feature], g[weight], useweight=useweight)

        # convert to long-form dataframe for altair
        wcm = (wcm.reset_index().rename(columns={'index':'date'}).dropna()
                  .melt('date', var_name='feature', value_name='corr'))
        wcm['corr'] = wcm['corr'].astype(float)

        # build altair chart
        # NOTE: the 'monthdate' time unit keeps month and day only, so data
        # from different years would share x positions.
        base = alt.Chart(wcm).encode(
            alt.X(
                'monthdate(date):T',
                axis=alt.Axis(format='%b %d')
            ),
            alt.Y('feature:O')
        )
        heatmap = base.mark_rect().encode(
            color=alt.Color(
                'corr:Q',
                scale=alt.Scale(
                    scheme='redblue',
                    domain=[-0.5, 0, 0.5]
                )
            ),
            tooltip=[
                alt.Tooltip('date:T', title='date'),
                alt.Tooltip('feature:O'),
                alt.Tooltip('corr:Q', title='correlation')
            ]
        )

        return heatmap

    if divisions == 1:
        return _timeseries(nyt_df, info_df).configure_view(step=15, continuousWidth=750)

    if divisions > 1 and divide_on is not None:
        # Bin in a local Series so the caller's info_df is left untouched.
        bins = pd.qcut(info_df[divide_on], divisions)
        charts = [
            _timeseries(nyt_df, info_df[bins == interval]).properties(
                title=f'{divide_on} = {str(interval)}'
            )
            # categories are the bins in ascending order (NaN excluded)
            for interval in bins.cat.categories
        ]
        return alt.vconcat(*charts[::-1]).configure_view(
            step=15, continuousWidth=750
        ).resolve_scale(x='shared')

    raise ValueError(
        'divisions must be 1, or greater than 1 together with a divide_on column'
    )

In [76]:
# Per-date correlation of the five core county descriptors with
# new_cases_per_100k_15d, using every county in a single chart.
make_correlation_timeseries(
    nyt_df, 
    info_df, 
    columns=['pop_density', 'per_gop', 'mask', 'edu', 'age_pop'], 
    feature='new_cases_per_100k_15d'
)

  del sys.path[0]


In [91]:
# Same correlations, drawn separately for each of five pd.qcut bins of
# pop_density (one stacked chart per bin).
make_correlation_timeseries(
    nyt_df, 
    info_df, 
    columns=['pop_density', 'per_gop', 'mask', 'edu', 'age_pop'], 
    feature='new_cases_per_100k_15d',
    divisions=5,
    divide_on='pop_density'
)

  del sys.path[0]


## population density, percent GOP, mask, edu, age

Given that we are discussing the rate at which a disease spreads, it makes sense that population density (`pop_density`) has the highest positive correlation with the number of total cases per capita (`cases_per_100k`).

Other features are not completely independent from population density. 
- percent GOP voters is negatively correlated with population density
- mask discipline, educational attainment, and average age are weakly correlated with population density  

Because of this, it makes sense that the percentage of GOP voters is negatively correlated with total cases per capita. However, educational attainment and average age are still negatively correlated with total cases per capita, which may indicate a stronger relationship between these features and total cases.

It would most likely be more useful to look at *recent* cases per capita (`new_cases_per_100k_15d`), as more populous areas will have higher case totals overall--so much so that their earlier numbers will likely still eclipse recent statistics of less populous counties (New York City was hit especially hard in mid-March).

Notice how in recent windows, population density, mask discipline, and educational attainment all have *lower* correlations, whereas the percentage of GOP voters shows an increase in correlation.

In [25]:
# core descriptors against the default COVID features on the latest date
make_correlation_heatmap(
    nyt_df,
    info_df,
    columns=['pop_density', 'per_gop', 'mask', 'edu', 'age_pop'],
)

positive correlations
      y_feature       x_feature    corr
55  pop_density  mortality_rate  0.4827

negative correlations
   y_feature        x_feature    corr
8        edu          per_gop -0.4201
7       mask          per_gop -0.6458
41   per_gop  deaths_per_100k -0.4035
1    per_gop      pop_density -0.4718


In [66]:
# core descriptors against the 15-day new-case feature, all counties
make_correlation_timeseries(
    nyt_df,
    info_df,
    columns=['pop_density', 'per_gop', 'mask', 'edu', 'age_pop'],
    feature='new_cases_per_100k_15d',
)

  del sys.path[0]


In [64]:
# Split the counties into two pop_density halves and stack one chart per half.
# make_correlation_timeseries already applies configure_view to what it
# returns, and Altair rejects configured charts inside a concatenation, so
# the split is delegated to the function's own divisions/divide_on path.
make_correlation_timeseries(
    nyt_df,
    info_df,
    columns=['pop_density', 'per_gop', 'mask', 'edu', 'age_pop'],
    divisions=2,
    divide_on='pop_density'
)

  del sys.path[0]


  del sys.path[0]


In [16]:
# core descriptors against the 15-day new-death feature, all counties
make_correlation_timeseries(
    nyt_df,
    info_df,
    columns=['pop_density', 'per_gop', 'mask', 'edu', 'age_pop'],
    feature='new_deaths_per_100k_15d',
)

  if __name__ == '__main__':


In [22]:
# core descriptors against mortality_rate_15d, with unweighted correlations
make_correlation_timeseries(
    nyt_df,
    info_df,
    columns=['pop_density', 'per_gop', 'mask', 'edu', 'age_pop'],
    feature='mortality_rate_15d',
    useweight=False,
)

  avg = a.mean(axis)
  ret, rcount, out=ret, casting='unsafe', subok=False)
  
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  if __name__ == '__main__':


Even with NYC out of the picture, we see that COVID-19 primarily hit urban centers first and has since moved on to more rural areas.

In [26]:
# same chart with the rows whose fips is '36NYC' left out
make_correlation_timeseries(
    nyt_df[nyt_df['fips'] != '36NYC'],
    info_df,
    columns=['pop_density', 'per_gop', 'mask', 'edu', 'age_pop'],
)

  if __name__ == '__main__':


## age

In [28]:
# numeric info_df columns whose name contains the token 'age' and none of
# 'tot', 'male', 'female'
columns = column_selector(info_df, mask=['age'], exclude=['tot', 'male', 'female'])
make_correlation_timeseries(nyt_df, info_df, columns)

  if __name__ == '__main__':


In [29]:
# reuses the `columns` selection bound in the previous cell
make_correlation_heatmap(nyt_df, info_df, columns, print_info=False)

## edu

In general, counties with lower educational attainment seem to be experiencing a higher density of cases than those with higher educational attainment. This is true across all ethnicities, but what's interesting is how Hispanic populations are disproportionately affected.

In [30]:
# numeric info_df columns whose name contains the token 'edu' and none of
# 'tot', 'male', 'female'
columns = column_selector(info_df, mask=['edu'], exclude=['tot', 'male', 'female'])
make_correlation_timeseries(nyt_df, info_df, columns)

  if __name__ == '__main__':


## ethnic percentages

In [32]:
# numeric info_df columns whose name contains both 'per' and 'pop' and none
# of 'tot', 'male', 'female'
columns = column_selector(info_df, mask=['per', 'pop'], exclude=['tot', 'male', 'female'])
make_correlation_timeseries(nyt_df, info_df, columns)

  if __name__ == '__main__':


## white age, education, income

In [33]:
# numeric info_df columns whose name contains the token 'white' but not 'tot'
columns = column_selector(info_df, mask=['white'], exclude=['tot'])
make_correlation_timeseries(nyt_df, info_df, columns)

  if __name__ == '__main__':


In [35]:
# hand-picked descriptors; rebinds the notebook-level `columns`
columns = ['per_pop_white', 'pop_density', 'per_gop', 'mask', 'edu', 'age_pop', 'median_income']
make_correlation_heatmap(nyt_df, info_df, columns, print_info=False)

## hispanic age, education, income

In [92]:
# Core descriptors against new_cases_per_100k_15d, drawn separately for each
# of five pd.qcut bins of per_pop_hispanic.
make_correlation_timeseries(
    nyt_df, 
    info_df, 
    columns=['pop_density', 'per_gop', 'mask', 'edu', 'age_pop'], 
    feature='new_cases_per_100k_15d',
    divisions=5,
    divide_on='per_pop_hispanic'
)

  del sys.path[0]


In [37]:
# numeric info_df columns whose name contains the token 'hispanic' but not 'tot'
columns = column_selector(info_df, mask=['hispanic'], exclude=['tot'])
make_correlation_timeseries(nyt_df, info_df, columns)

  if __name__ == '__main__':


In [38]:
# hand-picked descriptors, using the Hispanic-specific education, age and
# income columns; rebinds the notebook-level `columns`
columns = ['per_pop_hispanic', 'pop_density', 'per_gop', 'mask', 'edu_hispanic', 'age_pop_hispanic', 'median_income_hispanic']
make_correlation_heatmap(nyt_df, info_df, columns, print_info=False)

## black age, education, income

In [39]:
# numeric info_df columns whose name contains the token 'black' but not 'tot'
columns = column_selector(info_df, mask=['black'], exclude=['tot'])
make_correlation_timeseries(nyt_df, info_df, columns)

  if __name__ == '__main__':


## asian age, education, income

In [40]:
# numeric info_df columns whose name contains the token 'asian' but not 'tot'
columns = column_selector(info_df, mask=['asian'], exclude=['tot'])
make_correlation_timeseries(nyt_df, info_df, columns)

  if __name__ == '__main__':


In [240]:
# make_correlation_timeseries takes the two source frames separately
# (nyt_df, info_df); passing only a pre-merged frame leaves the required
# info_df argument missing.
make_correlation_timeseries(
    nyt_df,
    info_df,
    columns=['pop_density', 'per_gop', 'mask', 'edu', 'age_pop'],
    feature='new_deaths_per_100k_15d'
)

  if __name__ == '__main__':


In [241]:
# Same chart with the rows whose fips is '36NYC' left out.
# make_correlation_timeseries takes the two source frames separately
# (nyt_df, info_df); passing only a pre-merged frame leaves the required
# info_df argument missing.
make_correlation_timeseries(
    nyt_df[nyt_df['fips'] != '36NYC'],
    info_df,
    columns=['pop_density', 'per_gop', 'mask', 'edu', 'age_pop'],
    feature='new_deaths_per_100k_15d'
)

  if __name__ == '__main__':


The following are, upon inspection, positively correlated with the total number of cases and deaths per cluster:
- population density (`pop_density`)
- percent black (`per_pop_black`)
- percent hispanic (`per_pop_hispanic`)
- mask discipline (`mask`)

The following are negatively correlated with the total number of cases and deaths per county:
- percent white (`per_pop_white`)
- adjusted percent votes GOP in the 2016 general election (`per_gop`)

## new cases and deaths

The following features are positively correlated with the number of *new* cases and deaths per cluster:
- percent black
- percent hispanic

And the following are negatively correlated:
- percent white
- percent of population who voted in the general election 2016 (`per_votes`)
- median income (`median_income`)
- education coefficient (`edu`)

# Scatter Plots (with color and size axes)

In each chart:

- size -- total cases

In [None]:
#hide
# Five-stop diverging palettes from https://colorbrewer2.org/
#
# Keys are the names passed as `c_col` to the chart helpers. For each entry,
# `range` lists the colors from the low end to the high end, and `mid` is the
# value the helpers use as the middle stop of the color-scale domain.
color_dict = {
    # blue-red
    'per_gop': {
        'range': ['#2166ac', '#92c5de', '#F7F7F7', '#f4a582', '#b2182b'],
        'mid': 0.5,
    },
    # pink-green
    'mask': {
        'range': ['#c51b7d', '#f1b6da', '#F7F7F7', '#b8e186', '#4d9221'],
        'mid': info_df['mask'].median(),
    },
    'age_pop': {
        'range': ['#c51b7d', '#f1b6da', '#F7F7F7', '#b8e186', '#4d9221'],
        'mid': info_df['age_pop'].median(),
    },
    # purple-orange
    'median_income': {
        'range': ['#542788', '#b2abd2', '#F7F7F7', '#fdb863', '#b35806'],
        # hard-coded midpoint -- NOTE(review): presumably a national median
        # household income figure; confirm the source
        'mid': 61937,
    },
    # green-purple
    'education': {
        'range': ['#1b7837', '#a6dba0', '#F7F7F7', '#c2a5cf', '#762a83'],
        'mid': info_df['edu'].median(),
    },
}

# The education column in info_df is named `edu`, so expose the same palette
# under that name too; the `education` key is kept for existing callers.
color_dict['edu'] = color_dict['education']

# COVID-19 Map

In [None]:
# Load the pickled geometry object that make_map passes to alt.Chart.
# NOTE: pickle.load executes arbitrary code from the file -- only use with a
# trusted, locally generated pickle.
with open('../data/processed/geo_altair.p', 'rb') as f:
    geo_altair = pickle.load(f)

In [None]:
def make_map(df, c_col, s_col='cases', 
             c_range=None, c_mid=None):
    '''
    Draws a county choropleth of `c_col` on top of a US state background.

    NOTE: a later cell defines another function that is also called `make_map`;
    whichever of the two cells ran last is the one that gets called.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `fips`, `state`, `county`, `cases`, `deaths`,
        `mortality_rate`, `c_col` and `s_col`. Rows are joined onto the
        `geo_altair` features by matching `fips` to the feature `id`.
    c_col : str
        Column mapped to the fill color.
    s_col : str, default 'cases'
        Extra column shown in the tooltip.
    c_range : list of str, optional
        Colors of the scale range; defaults to `color_dict[c_col]['range']`.
    c_mid : number, optional
        Middle stop of the color domain; defaults to `color_dict[c_col]['mid']`.

    Returns
    -------
    Layered altair chart (state background + colored counties).
    '''
    if c_range is None:
        c_range = color_dict[c_col]['range']
    if c_mid is None:
        c_mid = color_dict[c_col]['mid']

    # Join key plus every field the chart reads. dict.fromkeys removes repeats
    # (e.g. s_col='cases') while keeping a stable order.
    cols = list(dict.fromkeys(
        ['fips', 'state', 'county', 'cases', 'deaths', 'mortality_rate', s_col, c_col]
    ))

    # Outer stops of the color domain; the nan-aware percentile keeps missing
    # values in c_col from turning the whole domain into NaN.
    c_low = float(np.nanpercentile(df[c_col], 1))
    c_high = float(np.nanpercentile(df[c_col], 99))

    states = alt.topo_feature(data.us_10m.url, feature='states')

    background = alt.Chart(states).mark_geoshape(
        fill='#F2F2F2',
        stroke='white'
    ).properties(
        title=c_col,
        width=720,
        height=480
    ).project('albersUsa')

    counties = alt.Chart(geo_altair).mark_geoshape(
        stroke='white'
    ).encode(
        color=alt.Color(
            f'{c_col}:Q',
            scale=alt.Scale(
                range=c_range,
                domain=[c_low, c_mid, c_high],
                interpolate={
                    'type':'rgb', 
                    'gamma':0.75
                }
            ),
        ),
        tooltip=[
            'state:N', 'county:N', 'cases:Q', 'deaths:Q', 
            alt.Tooltip(
                'mortality_rate:Q',
                format='.3f'
            ),
            alt.Tooltip(
                f'{s_col}:Q',
                format='.2f',
            ),
            alt.Tooltip(
                f'{c_col}:Q',
                format='.2f',
            )
        ]
    ).transform_lookup(
        lookup='id',
        from_=alt.LookupData(data=df[cols], key='fips', fields=cols)
    )

    return background + counties

In [None]:
# Map of `new_cases_per_100k_15sg` with a custom five-color palette and the
# middle stop of the color domain pinned at 0.
make_map(df_slice, c_col='new_cases_per_100k_15sg',
         c_range=['#2c7bb6','#abd9e9','#ffffbf','#fdae61','#d7191c'],
         c_mid=0)

In [None]:
def make_map(df, c_col, s_col='cases', 
             c_range=None, c_mid=None):
    '''
    Draws a bubble map: one circle per row of `df`, positioned by `lat`/`lon`,
    sized by `s_col` and colored by `c_col`, on top of a US state background.

    NOTE: this redefines the `make_map` from an earlier cell (the choropleth
    variant); whichever of the two cells ran last is the one that gets called.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `date`, `state`, `county`, `lat`, `lon`, `cases`,
        `cases_per_100k`, `deaths`, `deaths_per_100k`, `mortality_rate`,
        `s_col` and `c_col`.
    c_col : str
        Column mapped to the circle color.
    s_col : str, default 'cases'
        Column mapped to the circle size.
    c_range : list of str, optional
        Colors of the scale range; defaults to `color_dict[c_col]['range']`.
    c_mid : number, optional
        Middle stop of the color domain; defaults to `color_dict[c_col]['mid']`.

    Returns
    -------
    Layered altair chart (state background + circles).
    '''
    base_cols = ['cases', 'cases_per_100k', 'deaths', 'deaths_per_100k', 'mortality_rate']

    # Keep every column the circles read: position, size, color and tooltip
    # fields. dict.fromkeys removes repeats while keeping a stable order.
    cols = list(dict.fromkeys(
        ['date', 'state', 'county', 'lat', 'lon'] + base_cols + [s_col, c_col]
    ))

    if c_range is None:
        c_range = color_dict[c_col]['range']
    if c_mid is None:
        c_mid = color_dict[c_col]['mid']

    df = df[cols]

    # Outer stops of the color domain; the nan-aware percentile keeps missing
    # values in c_col from turning the whole domain into NaN.
    c_low = float(np.nanpercentile(df[c_col], 1))
    c_high = float(np.nanpercentile(df[c_col], 99))

    states = alt.topo_feature(data.us_10m.url, feature='states')

    background = alt.Chart(states).mark_geoshape(
        fill='#F2F2F2',
        stroke='white'
    ).properties(
        title=f'{s_col} by county (color: {c_col})',
        width=720,
        height=480
    ).project('albersUsa')

    circles = alt.Chart(df).mark_circle(
        stroke='black', strokeWidth=0.25, opacity=0.8
    ).encode(
        latitude='lat:Q',
        longitude='lon:Q',
        size=alt.Size(f'{s_col}:Q'),
        color=alt.Color(
            f'{c_col}:Q',
            scale=alt.Scale(
                range=c_range,
                domain=[c_low, c_mid, c_high],
                interpolate={
                    'type':'rgb', 
                    'gamma':0.75
                }
            ),
        ),
        tooltip=[
            'state:N', 'county:N', 'cases:Q', 'deaths:Q', 
            alt.Tooltip(
                'mortality_rate:Q',
                format='.3f'
            ),
            alt.Tooltip(
                f'{s_col}:Q',
                format='.2f',
            ),
            alt.Tooltip(
                f'{c_col}:Q',
                format='.2f',
            )
        ]
    )

    return background + circles

In [None]:
# Map with `delta_new_cases_per_100k_15sg` as the color column and
# `new_cases_per_100k_15sg` as the size column; the middle stop of the color
# domain is pinned at 0.
make_map(df_slice, c_col='delta_new_cases_per_100k_15sg', s_col='new_cases_per_100k_15sg',
         c_range=['#2c7bb6','#abd9e9','#ffffbf','#fdae61','#d7191c'],
         c_mid=0)

## COVID-19 Density and Mortality Rate vs. Population Density

In [None]:
# Inspect the result of left-joining the population, density and GOP-share columns
# from info_df onto df_slice by fips (nothing is assigned; only .info() is shown).
df_slice.merge(info_df[['fips', 'tot_pop', 'pop_density', 'per_gop']], on='fips', how='left').info()

In [None]:
def make_bubble_chart(df, x_col, y_col, c_col, s_col='cases',
                      x_scale='linear', y_scale='linear', 
                      c_range=None, c_mid=None, line=True, split=False, clip=False):
    '''
    Draws a bubble chart of `y_col` against `x_col`, sized by `s_col` and
    colored by `c_col`, optionally overlaid with weighted linear fits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `fips`, `s_col` and the tooltip columns `state`, `county`,
        `cases`, `deaths` and `mortality_rate`. `x_col` and `c_col` are looked
        up from the global `info_df` by `fips`; `y_col` is looked up there too
        when `df` does not have it.
    x_col, y_col, c_col : str
        Columns for the x position, y position and color.
    s_col : str, default 'cases'
        Column for the bubble size.
    x_scale, y_scale : str
        Altair scale types for the axes (e.g. 'linear', 'sqrt').
    c_range : list of str, optional
        Colors of the scale range; defaults to `color_dict[c_col]['range']`.
    c_mid : number, optional
        Middle stop of the color domain; defaults to `color_dict[c_col]['mid']`.
    line : bool, default True
        Overlay a degree-1 fit weighted by `tot_pop`.
    split : bool, default False
        With `line`, fit the rows with `c_col <= c_mid` and `c_col > c_mid`
        separately and color each line with the matching end of `c_range`.
    clip : bool, default False
        Passed to the circle mark as its `clip` property.

    Returns
    -------
    altair chart: the circle layer alone, or layered with the fitted line(s).

    Raises
    ------
    ValueError
        If a split fit is requested for a side other than 'lt' or 'gt'.
    '''
    if c_range is None:
        c_range = color_dict[c_col]['range']
    if c_mid is None:
        c_mid = color_dict[c_col]['mid']

    # upper end of the size domain: largest s_col value rounded up to a multiple of 10
    smax = np.max(df[s_col])
    smax = smax + (-smax) % 10

    # Columns pulled from info_df for the circles. De-duplicated because callers
    # often pass the same column as x_col and c_col (e.g. 'per_gop').
    lookup_cols = list(dict.fromkeys([x_col, c_col]))
    if y_col not in df.columns and y_col not in lookup_cols:
        lookup_cols.append(y_col)

    base = alt.Chart(df).properties(
        title=f'{y_col} vs. {x_col} ({c_col}, {s_col})',
        width=720,
        height=480
    )

    circles = base.mark_circle(
        stroke='black', strokeWidth=0.25, opacity=0.8, clip=clip
    ).encode(
        x=alt.X(
            f'{x_col}:Q',
            scale=alt.Scale(type=x_scale, zero=False),
            impute=alt.ImputeParams(value=0)
        ),
        y=alt.Y(
            f'{y_col}:Q',
            scale=alt.Scale(type=y_scale, zero=False)
        ),
        size=alt.Size(
            f'{s_col}:Q',
            scale=alt.Scale(
                domain=[1,smax],
                range=[10,2000]
            ),
        ),
        color=alt.Color(
            f'{c_col}:Q',
            scale=alt.Scale(
                range=c_range,
                # nan-aware percentiles so missing values do not yield a NaN domain
                domain=[float(np.nanpercentile(info_df[c_col], 1)),
                        c_mid,
                        float(np.nanpercentile(info_df[c_col], 99))],
                interpolate={
                    'type':'rgb', 
                    'gamma':0.75
                }
            ),
        ),
        tooltip=[
            'state:N', 'county:N', 'cases:Q', 'deaths:Q', 
            alt.Tooltip(
                'mortality_rate:Q',
                format='.3f'
            ),
            alt.Tooltip(
                f'{c_col}:Q',
                format='.2f',
            ),
            alt.Tooltip(
                f'{y_col}:Q',
                format='.2f',
            ),
            alt.Tooltip(
                f'{x_col}:Q',
                format='.2f',
            )
        ]
    ).transform_lookup(
        lookup='fips',
        from_=alt.LookupData(data=info_df, key='fips', fields=lookup_cols)
    )

    def make_line(frame, split=False, side=None):
        '''Returns the fitted-line layer for `frame`, or None if there are too few rows to fit.'''
        # Merge only the columns `frame` is missing: selecting a column twice, or
        # merging one that already exists, would leave x_col/c_col ambiguous.
        needed = list(dict.fromkeys(['tot_pop', x_col, y_col, c_col]))
        missing = [col for col in needed if col not in frame.columns]
        if missing:
            df_ = frame.merge(info_df[['fips'] + missing], on='fips', how='left')
        else:
            df_ = frame

        if split:
            if side=='gt':
                df_ = df_[df_[c_col]>c_mid]
                color = c_range[-1]
            elif side=='lt':
                df_ = df_[df_[c_col]<=c_mid]
                color = c_range[0]
            else:
                raise ValueError(f"side must be 'lt' or 'gt' when split=True, got {side!r}")
        else:
            color = 'black'

        # rows with a missing x, y or weight cannot take part in the fit
        df_ = df_.dropna(subset=list(dict.fromkeys([x_col, y_col, 'tot_pop'])))
        if len(df_) < 2:
            return None

        # NOTE(review): polyfit applies `w` to the unsquared residuals, so the
        # population enters the least-squares objective squared; pass
        # sqrt(tot_pop) if plain population weighting is intended.
        pfit = poly.polyfit(
            df_[x_col], 
            df_[y_col], 
            1, 
            w=df_['tot_pop']
        )
        xs = np.unique(df_[x_col])
        line_df = pd.DataFrame({
            'x': xs,
            'y': poly.polyval(xs, pfit)
        })
        return alt.Chart(line_df).mark_line(
            color=color,
            strokeDash=[2,1],
            clip=True
        ).encode(
            x='x:Q',
            y=alt.Y(
                'y:Q', 
                # keep the y axis on the data's range; the line is clipped to it
                scale=alt.Scale(domain=[
                    float(df_[y_col].min()),
                    float(df_[y_col].max())
                ])
            )
        )

    if not line:
        return circles

    if split:
        fits = [make_line(df, True, 'lt'), make_line(df, True, 'gt')]
    else:
        fits = [make_line(df)]

    chart = circles
    for fit_layer in fits:
        if fit_layer is not None:
            chart = chart + fit_layer
    return chart

In [None]:
# Cases per 100k against population density, colored by `age_pop`, with one
# overall fitted line (line defaults to True).
make_bubble_chart(
    df_slice,
    x_col='pop_density',
    y_col='cases_per_100k',
    c_col='age_pop',
    x_scale='sqrt',
    y_scale='sqrt',
    split=False,
)

In [None]:
# Cases per 100k against GOP vote share, colored by the same column, with one
# overall fitted line (line defaults to True).
make_bubble_chart(
    df_slice,
    x_col='per_gop',
    y_col='cases_per_100k',
    c_col='per_gop',
    x_scale='linear',
    y_scale='sqrt',
    split=False,
)

By far the largest predictor of total cases/deaths is population density. This does not appear to be an unfair characterization, given the fact that the disease spreads from person-to-person. Dense urban centers tend to lean Democratic, whereas sparse rural counties tend to lean Republican. Therefore, it makes sense that urban Democratic centers are being hit harder than their rural Republican counterparts.

The effects of urban density are further illustrated when discussing the mortality rate. In New York City, over 10% of those diagnosed have passed due to complications from the virus.

In [None]:
# Mortality rate against population density, colored by GOP vote share, with a
# separate fitted line for each side of the color midpoint.
make_bubble_chart(
    df_slice,
    x_col='pop_density',
    y_col='mortality_rate',
    c_col='per_gop',
    x_scale='sqrt',
    y_scale='sqrt',
    line=True,
    split=True,
)

In [None]:
# Mortality rate against GOP vote share, colored by the same column, with one
# overall fitted line.
make_bubble_chart(
    df_slice,
    x_col='per_gop',
    y_col='mortality_rate',
    c_col='per_gop',
    x_scale='linear',
    y_scale='sqrt',
    line=True,
    split=False,
)

## COVID-19 Density vs. Mask Discipline

In [None]:
# Cases per 100k against mask discipline, colored by GOP vote share, with one
# overall fitted line.
make_bubble_chart(
    df_slice,
    x_col='mask',
    y_col='cases_per_100k',
    c_col='per_gop',
    x_scale='sqrt',
    y_scale='sqrt',
    line=True,
    split=False,
)

## COVID-19 Density vs. Educational Attainment

Educational attainment (`edu`) is defined as the weighted average of the highest degree of education for persons 25 and older within a given county:

- no HS -- 0
- some HS -- 1
- HS diploma or GED -- 2
- some college -- 3
- associate's degree -- 4
- bachelor's degree -- 5
- graduate or professional degree -- 6

In [None]:
# Cases per 100k against educational attainment, colored by GOP vote share,
# with one overall fitted line.
# The attainment column is referred to as `edu` elsewhere in this notebook
# (`info_df['edu']`), so that name is used for the lookup -- TODO confirm that
# info_df has no separate `education` column.
make_bubble_chart(
    df_slice,
    x_col='edu',
    y_col='cases_per_100k',
    c_col='per_gop',
    x_scale='sqrt',
    y_scale='sqrt',
    line=True,
    split=False,
)

In [None]:
#hide_input
# make_bubble_chart takes its columns and scales as keyword arguments and has no
# chart_title or w_col parameter: the title is applied afterwards with
# .properties(), and the fit is always weighted by info_df['tot_pop'].
params = dict(
    # attainment column as named in info_df (`edu`) -- TODO confirm
    x_col='edu',
    x_scale='linear',
    y_col='cases_per_100k',
    y_scale='sqrt',
    s_col='cases',
    c_col='per_gop',
)

case_edu = make_bubble_chart(df_slice, **params).properties(
    title='Cases per 100k vs. Educational Attainment (% GOP)'
)
case_edu

In [None]:
#hide_input
# make_bubble_chart takes its columns and scales as keyword arguments and has no
# chart_title or w_col parameter: the title is applied afterwards with
# .properties(), and the fit is always weighted by info_df['tot_pop'].
params = dict(
    x_col='edu',
    x_scale='linear',
    y_col='new_cases_per_100k_15sg',
    y_scale='sqrt',
    s_col='cases',
    c_col='per_gop',
)

# The y axis is new cases per 100k, so the title says so (it previously read
# "Mortality Rate").
new_case_edu = make_bubble_chart(df_slice, **params).properties(
    title='New Cases per 100k vs. Educational Attainment (% GOP)'
)
new_case_edu

## COVID-19 Density vs. Mask Discipline

Mask discipline (`mask`) is defined as the weighted average of the answers to the following question: "How often do you wear a mask in public when you expect to be within six feet of another person?"

- never -- 0
- rarely -- 1
- sometimes -- 2
- frequently -- 3
- always -- 4

In [None]:
#hide_input
# make_bubble_chart takes its columns and scales as keyword arguments and has no
# chart_title or w_col parameter: the title is applied afterwards with
# .properties(), and the fit is always weighted by info_df['tot_pop'].
params = dict(
    x_col='mask',
    x_scale='linear',
    y_col='cases_per_100k',
    y_scale='sqrt',
    s_col='cases',
    c_col='per_gop',
)

mask_edu = make_bubble_chart(df_slice, **params).properties(
    title='Cases per 100k vs. Mask Discipline (% GOP)'
)
mask_edu

In [None]:
#hide_input
# make_bubble_chart takes its columns and scales as keyword arguments and has no
# chart_title or w_col parameter: the title is applied afterwards with
# .properties(), and the fit is always weighted by info_df['tot_pop'].
params = dict(
    x_col='mask',
    x_scale='linear',
    y_col='new_cases_per_100k_15sg',
    y_scale='sqrt',
    s_col='cases',
    c_col='per_gop',
)

mask_edu = make_bubble_chart(df_slice, **params).properties(
    title='New Cases per 100k vs. Mask Discipline (% GOP)'
)
mask_edu

In [None]:
#hide_input
# make_bubble_chart takes its columns and scales as keyword arguments and has no
# chart_title or w_col parameter: the title is applied afterwards with
# .properties(), and the fit is always weighted by info_df['tot_pop'].
params = dict(
    x_col='mask',
    x_scale='linear',
    y_col='delta_new_cases_per_100k_15sg',
    y_scale='sqrt',
    s_col='cases',
    c_col='per_gop',
)

mask_edu = make_bubble_chart(df_slice, **params).properties(
    title='Change in New Cases per 100k vs. Mask Discipline (% GOP)'
)
mask_edu

In [None]:
#hide_input
# make_bubble_chart takes its columns and scales as keyword arguments and has no
# chart_title or w_col parameter: the title is applied afterwards with
# .properties(), and the fit is always weighted by info_df['tot_pop'].
params = dict(
    x_col='pop_density',
    x_scale='sqrt',
    y_col='new_cases_per_100k_15sg',
    y_scale='sqrt',
    s_col='cases',
    c_col='mask',
)

mask_edu = make_bubble_chart(df_slice, **params).properties(
    title='Case Density vs. Population Density (Mask Discipline)'
)
mask_edu

# New Cases per 100k vs. Population Density, Education, and Mask Discipline

Mask use seems to increase with educational attainment and is somehow a partisan issue, with Republican counties having worse mask discipline. With the size of the circles representing new cases, we see that counties with high educational attainment and mask discipline ratings (top-right quadrant) seem to have fewer new cases than those with low education and mask discipline (bottom-left quadrant).

In [None]:
#hide_input
# make_bubble_chart takes its columns and scales as keyword arguments and has no
# chart_title or w_col parameter: the title is applied afterwards with
# .properties(), and the fit is always weighted by info_df['tot_pop'].
# NOTE(review): the y column here (`mask`) is a county feature rather than a
# case metric -- confirm it is available to the chart for df_slice.
params = dict(
    x_col='edu',
    x_scale='linear',
    y_col='mask',
    y_scale='linear',
    s_col='new_cases_per_100k_15sg',
    c_col='per_gop',
)

medley = make_bubble_chart(df_slice, **params).properties(
    title='Case Density vs. Population Density, Education, and Mask Discipline'
)
medley

This is not particularly easy to read, so maybe we'll use matplotlib instead.

In [None]:
# 3D scatter: education (x), mask discipline (y) and new cases per 100k (z);
# marker size scales with total cases and color follows GOP vote share.
# NOTE(review): reads `edu`, `mask` and `per_gop` directly from df_slice --
# confirm those columns have been merged in before this cell runs.
%matplotlib inline

sns.set_style('whitegrid')

fig = plt.figure(figsize=(17,10))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df_slice['edu'], 
           df_slice['mask'], 
           df_slice['new_cases_per_100k_15sg'], 
           s=df_slice['cases'] / 200,  # divide by 200 to keep marker areas manageable
           c=df_slice['per_gop'],
           cmap='RdBu_r')
ax.set_xlabel('educational attainment')
ax.set_ylabel('mask discipline')
ax.set_zlabel('new cases per 100k')
# camera position: 20 degrees elevation, 280 degrees azimuth
ax.view_init(20,280)
plt.show()

## New Cases per 100k vs. Percent GOP

Recent COVID-19 cases, however, seem to be more prevalent in more GOP-weighted counties.

In [None]:
# New cases per 100k against GOP vote share, colored by the same column, with
# one overall fitted line.
make_bubble_chart(
    df_slice,
    x_col='per_gop',
    y_col='new_cases_per_100k_15sg',
    c_col='per_gop',
    x_scale='linear',
    y_scale='sqrt',
    line=True,
    split=False,
)

This is most likely skewed by the fact that New York City -- a prior epicenter of COVID-19 cases and the largest group in this dataset -- has relatively few cases now.

In [None]:
# Same chart as above, but with a separate fitted line for each side of the
# color midpoint.
make_bubble_chart(
    df_slice,
    x_col='per_gop',
    y_col='new_cases_per_100k_15sg',
    c_col='per_gop',
    x_scale='linear',
    y_scale='sqrt',
    line=True,
    split=True,
)

In [None]:
# Change in new cases per 100k against GOP vote share, colored by the same
# column, with one overall fitted line.
make_bubble_chart(
    df_slice,
    x_col='per_gop',
    y_col='delta_new_cases_per_100k_15sg',
    c_col='per_gop',
    x_scale='linear',
    y_scale='sqrt',
    line=True,
    split=False,
)

In [None]:
# Change in new cases per 100k against GOP vote share, with a separate fitted
# line for each side of the color midpoint.
make_bubble_chart(
    df_slice,
    x_col='per_gop',
    y_col='delta_new_cases_per_100k_15sg',
    c_col='per_gop',
    x_scale='linear',
    y_scale='sqrt',
    line=True,
    split=True,
)

## Change in New Cases per 100k vs. Percent GOP

Changes in cases do not seem to be predicted by political affiliation.

In [None]:
#hide_input
# make_bubble_chart takes its columns and scales as keyword arguments and has no
# chart_title or w_col parameter: the title is applied afterwards with
# .properties(), and the fit is always weighted by info_df['tot_pop'].
params = dict(
    x_col='per_gop',
    x_scale='linear',
    y_col='delta_new_cases_per_100k_15sg',
    y_scale='linear',
    s_col='cases',
    c_col='per_gop',
)

del_case_gop = make_bubble_chart(df_slice, **params).properties(
    title='Change in New Cases vs. Percent GOP'
)
del_case_gop

In [None]:
#hide_input
# make_bubble_chart takes its columns and scales as keyword arguments and has no
# chart_title or w_col parameter: the title is applied afterwards with
# .properties(), and the fit is always weighted by info_df['tot_pop'].
params = dict(
    x_col='per_gop',
    x_scale='linear',
    y_col='delta_new_cases_per_100k_15sg',
    y_scale='linear',
    s_col='cases',
    c_col='per_gop',
)

# split=True fits the two sides of the color midpoint separately
del_case_gop_split = make_bubble_chart(df_slice, **params, split=True).properties(
    title='Change in New Cases vs. Percent GOP'
)
del_case_gop_split

## Case Rate Charts

Counties in the upper right portion of this chart face the highest risk of a worsening pandemic.

In [None]:
# Change in new cases per 100k against new cases per 100k, colored by GOP vote
# share, with a separate fitted line for each side of the color midpoint.
make_bubble_chart(
    df_slice,
    x_col='new_cases_per_100k_15sg',
    y_col='delta_new_cases_per_100k_15sg',
    c_col='per_gop',
    x_scale='sqrt',
    y_scale='sqrt',
    line=True,
    split=True,
)