# Exploring Trends

In [23]:
import numpy as np
import numpy.polynomial.polynomial as poly      # linear regression
import pandas as pd

import math

import pickle

import re

# opening external coordinates
import json

# opening urls
from urllib.request import urlopen

# benchmarking
from time import time

# date/time operations
from datetime import datetime, timedelta
from pytz import timezone

# plotting
import altair as alt
from altair import datum
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# for US map
from vega_datasets import data

In [24]:
# large datasets
# alt.data_transformers.enable('data_server');

pd.options.display.max_rows = 250
pd.options.display.max_columns = 250

In [25]:
#hide
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.
    '''
    dft = df.copy()
    # converts to datetime if possible
    dft = dft.apply(
        lambda col:
        pd.to_datetime(col, errors='ignore')
        if col.dtypes=='object' 
        else col
    )
    
    # if there are less than half as many unique values as there are rows, convert to category
    for col in dft.select_dtypes(include='object'):
        if len(dft[col].unique()) / len(df[col]) < 0.5:
            dft[col] = dft[col].astype('category')
            
    # downcasts numeric columns if possible
    dft = dft.apply(
        lambda col: 
        pd.to_numeric(col, downcast='integer') 
        if col.dtypes=='int64' 
        else col
    )
    dft = dft.apply(
        lambda col: 
        pd.to_numeric(col, downcast='float') 
        if col.dtypes=='float64' 
        else col
    )
    
    return dft

In [26]:
with open('../data/processed/csse_df.p', 'rb') as f:
    csse_df = pickle.load(f)
csse_df.tail()

Unnamed: 0,date,fips,cases,deaths,cases_per_100k,deaths_per_100k,new_cases,new_deaths,new_cases_per_100k,new_deaths_per_100k,new_cases_15d,new_deaths_15d,new_cases_per_100k_15d,new_deaths_per_100k_15d,new_cases_15sg,new_deaths_15sg,new_cases_per_100k_15sg,new_deaths_per_100k_15sg,days
1008577,2020-12-07,56037,2176,11,5138.984012,25.97832,64,1,151,2,50.333333,0.333333,118.466667,0.733333,37.083333,0.808333,87.366667,1.758333,0
1008578,2020-12-07,56039,1810,2,7713.944766,8.523696,25,0,106,0,29.266667,0.0,124.266667,0.0,24.241667,0.0,102.791667,0.0,0
1008579,2020-12-07,56041,1221,6,6036.784337,29.664788,23,1,113,4,20.066667,0.133333,98.8,0.533333,17.916667,0.383333,88.0,1.533333,0
1008580,2020-12-07,56043,556,10,7123.638693,128.122998,10,2,128,25,16.066667,0.2,205.466667,2.466667,13.716667,0.425,175.391667,5.341667,0
1008581,2020-12-07,56045,422,2,6092.103364,28.872528,2,0,28,0,3.333333,0.133333,47.8,1.866667,1.958333,-0.016667,27.825,-0.233333,0


In [38]:
with open('../data/processed/info_df.p', 'rb') as f:
    info_df = pickle.load(f)
    
with open('../data/processed/geo_altair.p', 'rb') as f:
    geo_altair = pickle.load(f)

In [7]:
# info_df.to_csv('../data/processed/info_df.csv', index=False)

# Helper Functions

In [28]:
def column_selector(info_df, columns='none', mask=[], exclude=[]):
    
    # only select from numeric columns
    all_columns = info_df.select_dtypes(include='number').columns.tolist()
    
    # empty container if we don't have a list going already
    if columns is 'none':
        columns = []
    elif columns is 'all':
        columns = all_columns
    
    # includes all columns that have all elements in mask
    # excludes all columns that have any elements in exclude
    if len(mask) > 0:
        mask_columns = list(set([
            c for c in all_columns
            if all(m in set(re.findall('[0-9a-z]+', c))
            for m in mask)
        ]))
        columns += mask_columns
    if len(exclude) > 0:
        columns = list(set([
            c for c in columns 
            if all(e not in set(re.findall('[0-9a-z]+', c)) 
            for e in exclude)
        ]))
        
    return sorted(list(set(columns)))

In [29]:
column_selector(info_df, mask=['edu'], exclude=['tot', 'per', 'male', 'female'])

['edu',
 'edu_asian',
 'edu_black',
 'edu_hispanic',
 'edu_native',
 'edu_other',
 'edu_pacific',
 'edu_twoplus',
 'edu_white']

In [30]:
def corr(x, y, w, useweight=True):
    
    # only uses elements that are not nan from both lists
    x_ids = ~np.isnan(x)
    y_ids = ~np.isnan(y)
    ids = x_ids & y_ids
    
    if useweight:
        try:
            [xx, xy], [_, yy] = np.cov(x[ids], y[ids], aweights=w[ids])
        except:
            print(x.name)
            print(y.name)
    else:
        [xx, xy], [_, yy] = np.cov(x[ids], y[ids])
    
    return xy / np.sqrt(xx * yy)

In [31]:
def df_merger(nyt_df, info_df, x_cols=None, y_cols=None, date='latest', weight='tot_pop'):
    '''
    x_cols : predictor columns
    y_cols : target columns
    date :   'latest', 'all', or specific date
    weight : weight column
    '''
    
    # make sure x and y are valid
    all_y = nyt_df.columns.tolist()
    for y in y_cols:
        if '_per_100k' in y:
            y_cols.append(y.replace('_per_100k', ''))
    y_cols = sorted(list(set([y for y in y_cols if y in all_y])))
    
    all_x = info_df.columns.tolist()
    x_cols = sorted(list(set([c for c in x_cols if c in all_x])))
    
    ## only process specific date and y_cols
    left_columns = list(set(['date', 'fips'] + y_cols))
    if date=='latest':
        left_df = nyt_df[nyt_df['date']==nyt_df['date'].max()][left_columns]
    elif date=='all':
        left_df = nyt_df[left_columns]
    else:
        left_df = nyt_df[nyt_df['date']==date][left_columns]

    ## only process specific x_cols
    right_columns = list(set(['fips', 'state', 'county', weight] + x_cols))
    right_df = info_df[right_columns]
    
    # https://stackoverflow.com/a/47118728/14083095
    # fills nyt_df with entries for counties that do not log cases
    # for more accurate aggregate per capita calculations
    
    # create multiindex that has every fips with every date
    mux = pd.MultiIndex \
            .from_product(
                [left_df['date'].unique(), 
                 right_df['fips'].unique()], 
                names=('date', 'fips')
            )
    # reindex data to multiindex, fill nan entries with 0
    left_df = left_df.set_index(['date','fips']) \
                     .reindex(mux)               \
                     .swaplevel(0,1)             \
                     .reset_index()              \
                     .fillna(0)
   
    df = left_df.merge(right_df, on='fips', how='outer', suffixes=('_x', ''))
    df = df.drop([x for x in df.columns if x[-2:]=='_x'], axis=1)
    
    return df

In [32]:
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [39]:
merged = df_merger(
    csse_df, info_df, 
    x_cols=column_selector(info_df, 'all', exclude=['male', 'female', 'tot', 'lat', 'lon']),
    y_cols=['new_cases_per_100k_15d', 'cases_per_100k']
)

merged.head()

Unnamed: 0,fips,date,new_cases_per_100k_15d,cases_per_100k,new_cases_15d,cases,age_pop_white,edu_twoplus,median_income_pacific,some_college,per_edu_native_nohs,state,median_income_hispanic,tot_pop,per_edu_other_nohs,edu_native,age_pop_hispanic,rarely,edu_asian,never,median_income_white,per_edu_white_nohs,age_pop_twoplus,votes_diff,per_edu_hispanic_nohs,associates,no_hs,mask,median_income_other,per_edu_black_nohs,median_income_native,edu_black,age_pop_black,median_income_asian,per_diff,per_edu_pacific_nohs,sometimes,pop_density,median_income,edu_other,age_pop,area,edu_pacific,age_pop_pacific,bachelors,graduate,frequently,age_pop_asian,always,edu_hispanic,per_edu_twoplus_nohs,some_hs,county,median_income_black,per_edu_asian_nohs,hs,pop_25p,age_pop_native,edu,edu_white,median_income_twoplus
0,1001,2020-12-07,50.4,5446.669888,28.4,3043,8.727963,2.626316,,7554,0.217949,Alabama,83423.0,55869,0.248092,3.166667,6.707361,0.074,3.615142,0.053,65047.0,0.090371,5.35251,12335.0,0.125666,2998,956,3.003,,0.19555,,2.563808,7.81375,,0.444184,0.0,0.134,36.287947,58786.0,1.503817,8.422041,1539.602123,2.0,7.125,5903,4388,0.295,7.899381,0.444,3.591054,0.292105,3248,Autauga,27643.0,0.123028,12119,37166,9.786008,3.174487,3.28831,
1,1003,2020-12-07,47.933333,4399.419443,108.133333,9821,9.321749,3.418808,,32266,0.221675,Alabama,43279.0,223234,0.259062,2.270936,6.602525,0.059,2.690678,0.083,59418.0,0.079388,5.8126,58966.0,0.253174,13759,3978,2.968,45634.0,0.203315,53289.0,2.494586,7.849388,34763.0,0.537623,0.0,0.098,54.215293,55962.0,2.878465,8.987202,4117.546676,2.0,8.422764,30431,15644,0.323,7.638534,0.436,2.850361,0.106893,10332,Baldwin,31112.0,0.169492,40579,146989,8.999336,3.329113,3.489788,53456.0
2,1005,2020-12-07,17.066667,4958.275946,4.266667,1224,9.818336,1.918033,,3287,0.416667,Alabama,30417.0,24686,0.710145,1.166667,6.142346,0.121,1.920455,0.067,47031.0,0.207938,6.136842,806.0,0.561955,1279,1490,2.928,26793.0,0.317808,,1.703576,8.107733,50417.0,0.076631,1.0,0.12,10.769826,34186.0,1.217391,8.784412,2292.144655,0.0,6.741935,1417,803,0.201,9.137931,0.491,1.539267,0.163934,3411,Barbour,23013.0,0.181818,6486,18173,9.852632,2.38062,2.444444,19760.0
3,1007,2020-12-07,48.0,5800.660891,10.866667,1299,8.871332,1.651007,,2938,0.0,Alabama,42708.0,22394,0.0,2.0,7.091493,0.034,7.0,0.02,50769.0,0.126562,6.357724,5539.0,0.341853,908,903,3.348,,0.3076,,1.686369,7.993219,,0.57728,,0.096,13.890616,45340.0,2.0,8.606145,1612.167481,,5.833333,1197,616,0.278,8.76087,0.572,1.316294,0.275168,1747,Bibb,34000.0,0.0,7471,15780,8.384615,2.459823,2.400933,20329.0
4,1009,2020-12-07,65.333333,5748.279321,38.0,3324,8.955796,1.913636,,8492,0.287879,Alabama,35495.0,57826,0.43128,1.916667,6.240595,0.114,4.177419,0.053,49872.0,0.166601,6.512329,22071.0,0.614559,4775,2967,2.892,,0.310403,65385.0,1.563758,7.990826,99219.0,0.800022,0.0,0.18,34.624193,48695.0,3.270142,8.651714,1670.103911,2.0,8.952381,3217,1793,0.194,8.723926,0.459,0.927969,0.179545,4894,Blount,,0.16129,13489,39627,9.79078,2.606581,2.334181,44934.0


In [47]:
y = merged['new_cases_per_100k_15d']
drop_cols = [c for c in merged.select_dtypes(include='number').columns.tolist() if 'cases' in c]
X = merged.select_dtypes(include='number').drop(drop_cols, axis=1).fillna(0.5)
ss = StandardScaler()
Z = ss.fit_transform(X)

In [35]:
from sklearn.ensemble import RandomForestRegressor

In [41]:
reg = RandomForestRegressor()
reg.fit(Z, y)

importance = reg.feature_importances_

In [42]:
importance_df = pd.DataFrame(columns=['feature', 'importance'])
importance_df['feature'] = X.columns.tolist()
importance_df['importance'] = importance
importance_df.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
41,always,0.102264
14,per_edu_white_nohs,0.070933
20,mask,0.052699
9,age_pop_hispanic,0.043352
25,age_pop_black,0.037469
12,never,0.034582
34,area,0.031104
49,age_pop_native,0.028683
15,age_pop_twoplus,0.028529
0,age_pop_white,0.028317


In [43]:
from xgboost import XGBRegressor

In [44]:
reg = XGBRegressor()
reg.fit(Z, y)

importance = reg.feature_importances_

importance_df = pd.DataFrame(columns=['feature', 'importance'])
importance_df['feature'] = X.columns.tolist()
importance_df['importance'] = importance
importance_df.sort_values(by='importance', ascending=False)



Unnamed: 0,feature,importance
20,mask,0.150528
41,always,0.086462
14,per_edu_white_nohs,0.048309
45,median_income_black,0.046006
26,median_income_asian,0.026577
12,never,0.025848
18,associates,0.025655
25,age_pop_black,0.023524
47,hs,0.023058
48,pop_25p,0.021876


In [50]:
# https://machinelearningmastery.com/calculate-feature-importance-with-python/
wls = LinearRegression()
wls.fit(Z, y)
coefs = wls.coef_

[x for _, x in sorted(zip(coefs, X.columns))]

['per_edu_white_nohs',
 'age_pop_twoplus',
 'per_edu_other_nohs',
 'edu_white',
 'median_income_black',
 'associates',
 'per_edu_twoplus_nohs',
 'area',
 'edu',
 'per_edu_asian_nohs',
 'votes_diff',
 'hs',
 'graduate',
 'edu_black',
 'some_college',
 'rarely',
 'median_income_asian',
 'per_edu_native_nohs',
 'never',
 'age_pop_hispanic',
 'median_income_hispanic',
 'some_hs',
 'age_pop_pacific',
 'tot_pop',
 'per_edu_hispanic_nohs',
 'bachelors',
 'median_income_native',
 'no_hs',
 'per_edu_black_nohs',
 'age_pop_white',
 'median_income',
 'edu_pacific',
 'always',
 'median_income_twoplus',
 'age_pop',
 'age_pop_black',
 'median_income_pacific',
 'edu_native',
 'sometimes',
 'per_diff',
 'edu_twoplus',
 'age_pop_native',
 'per_edu_pacific_nohs',
 'mask',
 'frequently',
 'edu_other',
 'pop_25p',
 'age_pop_asian',
 'edu_hispanic',
 'pop_density',
 'edu_asian',
 'median_income_white',
 'median_income_other']

In [53]:
Z = stats.zscore(X)
Z = sm.add_constant(Z)
weights = merged['tot_pop']

wls = sm.WLS(y, Z, weights=weights)

fit = wls.fit()
fit.summary()

0,1,2,3
Dep. Variable:,new_cases_per_100k_15d,R-squared:,0.377
Model:,WLS,Adj. R-squared:,0.367
Method:,Least Squares,F-statistic:,36.74
Date:,"Tue, 08 Dec 2020",Prob (F-statistic):,2.3e-275
Time:,17:18:33,Log-Likelihood:,-15819.0
No. Observations:,3142,AIC:,31740.0
Df Residuals:,3090,BIC:,32060.0
Df Model:,51,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,62.4210,0.725,86.076,0.000,60.999,63.843
x1,-2.1729,0.864,-2.514,0.012,-3.868,-0.478
x2,-1.0621,1.090,-0.974,0.330,-3.199,1.075
x3,-0.4340,0.259,-1.676,0.094,-0.942,0.074
x4,9.5206,1.322,7.203,0.000,6.929,12.112
x5,-1.6939,0.862,-1.965,0.050,-3.384,-0.003
x6,-2.1308,0.910,-2.343,0.019,-3.914,-0.347
x7,-35.6586,3.259,-10.943,0.000,-42.048,-29.269
x8,-0.4998,1.225,-0.408,0.683,-2.903,1.903

0,1,2,3
Omnibus:,409.77,Durbin-Watson:,1.588
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2265.085
Skew:,0.49,Prob(JB):,0.0
Kurtosis:,7.043,Cond. No.,1.21e+16


In [56]:
[x for _, x in sorted(zip(fit.params[1:], X.columns))]

['tot_pop',
 'edu',
 'per_edu_white_nohs',
 'median_income_white',
 'age_pop_native',
 'age_pop_twoplus',
 'age_pop_asian',
 'edu_hispanic',
 'age_pop_black',
 'per_edu_hispanic_nohs',
 'median_income_twoplus',
 'median_income_other',
 'edu_native',
 'age_pop_white',
 'median_income_hispanic',
 'median_income_black',
 'per_edu_native_nohs',
 'edu_other',
 'median_income_native',
 'edu_black',
 'edu_twoplus',
 'pop_density',
 'per_edu_black_nohs',
 'associates',
 'per_edu_other_nohs',
 'median_income_pacific',
 'no_hs',
 'per_edu_twoplus_nohs',
 'votes_diff',
 'per_edu_asian_nohs',
 'age_pop_pacific',
 'median_income_asian',
 'per_diff',
 'edu_pacific',
 'age_pop',
 'per_edu_pacific_nohs',
 'age_pop_hispanic',
 'bachelors',
 'area',
 'edu_asian',
 'graduate',
 'pop_25p',
 'edu_white',
 'some_hs',
 'hs',
 'some_college',
 'median_income',
 'frequently',
 'always',
 'sometimes',
 'rarely',
 'never',
 'mask']

In [49]:
wls = sm.WLS(y, X, weights=weights)

fit = wls.fit()
fit.summary()

0,1,2,3
Dep. Variable:,new_cases_per_100k_15d,R-squared (uncentered):,0.89
Model:,WLS,Adj. R-squared (uncentered):,0.888
Method:,Least Squares,F-statistic:,488.3
Date:,"Tue, 08 Dec 2020",Prob (F-statistic):,0.0
Time:,17:10:19,Log-Likelihood:,-15819.0
No. Observations:,3142,AIC:,31740.0
Df Residuals:,3091,BIC:,32050.0
Df Model:,51,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age_pop_white,-3.1686,1.310,-2.418,0.016,-5.738,-0.599
edu_twoplus,-1.0522,1.072,-0.982,0.326,-3.154,1.050
median_income_pacific,-1.979e-05,1.24e-05,-1.595,0.111,-4.41e-05,4.53e-06
some_college,9.527e-05,2.63e-05,3.626,0.000,4.37e-05,0.000
per_edu_native_nohs,-7.5520,3.855,-1.959,0.050,-15.111,0.007
median_income_hispanic,-8.542e-05,3.61e-05,-2.366,0.018,-0.000,-1.46e-05
tot_pop,-0.0001,9.76e-06,-10.904,0.000,-0.000,-8.73e-05
per_edu_other_nohs,-2.0599,4.873,-0.423,0.673,-11.615,7.495
edu_native,-1.7507,0.697,-2.513,0.012,-3.117,-0.385

0,1,2,3
Omnibus:,409.158,Durbin-Watson:,1.588
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2280.909
Skew:,0.486,Prob(JB):,0.0
Kurtosis:,7.059,Cond. No.,1.18e+16


In [46]:
x_cols = ['per_pop_hispanic']
y_cols = 'new_cases_per_100k_15d'

merged = df_merger(
    nyt_df, info_df, 
    x_cols=x_cols,
    y_cols=[y_cols]
)

y = merged[y_cols]
X = merged[x_cols].fillna(0.5)
X = stats.zscore(X)
X = sm.add_constant(X)
weights = merged['tot_pop']

wls = sm.WLS(y, X, weights=weights)

fit = wls.fit()
fit.summary()

0,1,2,3
Dep. Variable:,new_cases_per_100k_15d,R-squared:,0.018
Model:,WLS,Adj. R-squared:,0.018
Method:,Least Squares,F-statistic:,56.98
Date:,"Fri, 04 Sep 2020",Prob (F-statistic):,5.72e-14
Time:,15:48:28,Log-Likelihood:,-22248.0
No. Observations:,3140,AIC:,44500.0
Df Residuals:,3138,BIC:,44510.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,173.3941,3.040,57.029,0.000,167.433,179.356
x1,16.6015,2.199,7.549,0.000,12.289,20.914

0,1,2,3
Omnibus:,2696.453,Durbin-Watson:,1.504
Prob(Omnibus):,0.0,Jarque-Bera (JB):,335935.817
Skew:,3.472,Prob(JB):,0.0
Kurtosis:,53.194,Cond. No.,1.81


# Exploring Correlations

In [10]:
def make_correlation_table(
    nyt_df, info_df, x_cols=None, y_cols=None,
    date='latest', useweight=True, weight='tot_pop',
    threshold=0.4
):
    
    df = df_merger(nyt_df, info_df, x_cols, y_cols, date, weight)
    
    wct = pd.DataFrame(index=x_cols, columns=y_cols)
    
    for y in y_cols:
        for x in x_cols:
            wct.loc[x, y] = corr(df[y], df[x], df[weight])
    
    wct = wct[(wct >= threshold) | (wct <= -1 * threshold)].dropna()
    
    return wct.sort_values(by=y_cols[0], ascending=False)

In [11]:
columns = column_selector(info_df, 'all')
make_correlation_table(nyt_df, info_df, x_cols=columns, y_cols=['new_cases_per_100k_15d'], threshold=0.15)

Unnamed: 0,new_cases_per_100k_15d,new_cases_15d
rarely,0.486194,-0.299946
never,0.447001,-0.250148
sometimes,0.442087,-0.252474
per_gop,0.338017,-0.473156
frequently,0.337927,-0.280812
per_pop_white_male,0.253933,-0.52338
per_pop_white,0.245929,-0.524056
per_pop_white_female,0.236658,-0.523692
per_edu_white_male_nohs,0.188598,-0.270928
per_edu_white_nohs,0.182522,-0.239919


Counties with a high hispanic population have a disproportionately high number of COVID-19 cases per capita in the last 15 days, while counties with high white population seem to have a disproportionately low number of cases. Counties with higher educational attainment and higher income tend to have had less COVID cases per capita in the last 15 days.

In [12]:
def make_correlation_heatmap(
    nyt_df, info_df, date='latest', x_cols=None,
    y_cols=[
        'cases_per_100k', 
        'new_cases_per_100k_15d',
        'delta_new_cases_per_100k_15d',
        'deaths_per_100k',
        'new_deaths_per_100k_15d',
        'delta_new_deaths_per_100k_15d',
        'mortality_rate',
        'mortality_rate_15d'
    ],
    useweight=True, weight='tot_pop', size=50, print_corr=True,
    threshold=0.4
):
    
    df = df_merger(nyt_df, info_df, x_cols, y_cols, date, weight)

    # build weighted correlation matrix from df
    wcm_cols = x_cols + y_cols
    
    wcm = pd.DataFrame(index=x_cols, columns=wcm_cols)
    
    for y in wcm_cols:
        for x in x_cols:
            wcm.loc[x, y] = corr(df[x], df[y], df[weight])
    
    wcm = (wcm.reset_index().rename(columns={'index':'y_feature'}).dropna()
              .melt('y_feature', var_name='x_feature', value_name='corr'))
    wcm['corr'] = np.round(wcm['corr'].astype(float), 4)

    if print_corr:
        print('positive correlations')
        print(
            wcm[(wcm['corr'] >= threshold) & (wcm['corr'] != 1)]
            .sort_values(by=['corr', 'y_feature']).iloc[::2, :]
            .sort_values(by=['y_feature', 'x_feature'])
        )
        print('\nnegative correlations')
        print(
            wcm[(wcm['corr'] <= -1 * threshold) & (wcm['corr'] != -1)]
            .sort_values(by=['corr', 'y_feature']).iloc[::2, :]
            .sort_values(by=['y_feature', 'x_feature'])
        )
    
    # build altair chart
    base = alt.Chart(wcm).encode(
        alt.X(
            'x_feature:O',
            sort=x_cols
        ),
        alt.Y(
            'y_feature:O',
#             sort=columns
        )
    )
    heatmap = base.mark_rect().encode(
        color=alt.Color(
            'corr:Q',
            scale=alt.Scale(
                scheme='redblue',
                domain=[-1, 0, 1]
            )
        ),
        tooltip=[
            alt.Tooltip('x_feature:O'),
            alt.Tooltip('y_feature:O'),
            alt.Tooltip('corr:Q', title='correlation')
        ]
    )
    
    # text
    text = base.mark_text(baseline='middle').encode(
        text=alt.Text('corr:Q',format='.2f'),
        color=alt.condition(
            np.abs(alt.datum.corr) <= 0.5,
            alt.value('black'),
            alt.value('white')
        )
    )
    
    return (heatmap + text).configure_view(step=size)

In [13]:
columns = column_selector(
    info_df, 
    ['per_gop', 'mask', 'edu', 'median_income', 'age_pop', 'pop_density'], 
    mask=['per', 'pop'], 
    exclude=['male', 'female', 'tot']
)
make_correlation_heatmap(nyt_df, info_df, x_cols=columns, y_cols=['cases_per_100k', 'new_cases_per_100k_15d'], size=50)

positive correlations
            y_feature         x_feature    corr
66                edu     per_pop_asian  0.4250
41               mask     median_income  0.4294
67               mask     per_pop_asian  0.4586
93               mask  per_pop_hispanic  0.4599
16      median_income               edu  0.7337
44      per_pop_asian     median_income  0.6062
135     per_pop_asian   per_pop_twoplus  0.4405
215  per_pop_hispanic     new_cases_15d  0.5013
127   per_pop_twoplus   per_pop_pacific  0.8461
63      per_pop_white           per_gop  0.6777
207       pop_density             cases  0.5730

negative correlations
            y_feature      x_feature    corr
53                edu        per_gop -0.4200
54               mask        per_gop -0.6443
145              mask  per_pop_white -0.5241
212           per_gop  new_cases_15d -0.4732
69            per_gop  per_pop_asian -0.5709
149     per_pop_black  per_pop_white -0.4284
150  per_pop_hispanic  per_pop_white -0.7645
206     per_pop_whi

First, let's discuss features that not quite independent from each other:

selected positive correlations (> 0.4):
- educational attainment and median income
- educational attainment and percent asian
- mask discipline and median income
- mask discipline and percent asian
- mask discipline and percent hispanic
- median income and percent asian
- percent GOP and percent white

selected negative correlations (< -0.4):
- educational attainment and percent GOP
- mask discipline and percent GOP
- mask discipline and percent white
- percent Asian and percent GOP
- population density and percent GOP

Since there seems to be multicollinearity, we can't simply throw our data into a multiple linear regression.

## correlating cases per capita in the last 15 days

There are some (weak) correlations to recent cases per capita:

positive:
- percent GOP
- percent black
- percent hispanic

negative:
- median age
- educational attainment
- mask discipline
- median income
- percent asian
- percent white
- population density

# smoothed percentile timeseries heatmap

In [24]:
merged = df_merger(
    nyt_df, info_df, date='all',
    x_cols=column_selector(info_df, 'all', exclude=['male', 'female', 'tot']),
    y_cols=['new_cases_per_100k_15d', 'cases_per_100k']
)

merged.head()

Unnamed: 0,fips,date,new_cases_per_100k_15d,cases_per_100k,new_cases_15d,cases,sometimes,hs,state,per_votes,median_income_twoplus,median_income_other,per_edu_black_nohs,age_pop_asian,no_hs,mask,per_pop_white,per_pop_black,edu_other,some_hs,area_land,frequently,age_pop,age_pop_twoplus,edu_pacific,lon,per_edu_pacific_nohs,per_pop_twoplus,per_pop_native,median_income_white,per_pop_asian,county,edu_twoplus,per_pop_hispanic,per_edu_asian_nohs,median_income_asian,per_edu_other_nohs,never,edu,edu_native,age_pop_hispanic,tot_pop,edu_hispanic,lat,age_pop_native,rarely,edu_black,edu_white,median_income,median_income_hispanic,per_edu_hispanic_nohs,per_edu_white_nohs,median_income_native,edu_asian,per_edu_native_nohs,median_income_pacific,per_pop_pacific,bachelors,age_pop_black,always,graduate,pop_density,associates,median_income_black,per_edu_twoplus_nohs,per_gop,age_pop_pacific,some_college,pop_25p,age_pop_white
0,1001,2020-01-21,0.0,0.0,0.0,0.0,0.134,12119,Alabama,0.441408,,,0.19555,7.899381,956,3.003,0.737708,0.198643,1.503817,3248,1539.602123,0.295,8.422041,5.35251,2.0,-86.643648,0.0,0.017111,0.004349,65047.0,0.011563,Autauga,2.626316,0.029909,0.123028,,0.248092,0.053,3.174487,3.166667,6.707361,55869,3.591054,32.538666,9.786008,0.074,2.563808,3.28831,58786.0,83423.0,0.125666,0.090371,,3.615142,0.217949,,0.000716,5903,7.81375,0.444,4388,36.287947,2998,27643.0,0.292105,0.754018,7.125,7554,37166,8.727963
1,1001,2020-01-22,0.0,0.0,0.0,0.0,0.134,12119,Alabama,0.441408,,,0.19555,7.899381,956,3.003,0.737708,0.198643,1.503817,3248,1539.602123,0.295,8.422041,5.35251,2.0,-86.643648,0.0,0.017111,0.004349,65047.0,0.011563,Autauga,2.626316,0.029909,0.123028,,0.248092,0.053,3.174487,3.166667,6.707361,55869,3.591054,32.538666,9.786008,0.074,2.563808,3.28831,58786.0,83423.0,0.125666,0.090371,,3.615142,0.217949,,0.000716,5903,7.81375,0.444,4388,36.287947,2998,27643.0,0.292105,0.754018,7.125,7554,37166,8.727963
2,1001,2020-01-23,0.0,0.0,0.0,0.0,0.134,12119,Alabama,0.441408,,,0.19555,7.899381,956,3.003,0.737708,0.198643,1.503817,3248,1539.602123,0.295,8.422041,5.35251,2.0,-86.643648,0.0,0.017111,0.004349,65047.0,0.011563,Autauga,2.626316,0.029909,0.123028,,0.248092,0.053,3.174487,3.166667,6.707361,55869,3.591054,32.538666,9.786008,0.074,2.563808,3.28831,58786.0,83423.0,0.125666,0.090371,,3.615142,0.217949,,0.000716,5903,7.81375,0.444,4388,36.287947,2998,27643.0,0.292105,0.754018,7.125,7554,37166,8.727963
3,1001,2020-01-24,0.0,0.0,0.0,0.0,0.134,12119,Alabama,0.441408,,,0.19555,7.899381,956,3.003,0.737708,0.198643,1.503817,3248,1539.602123,0.295,8.422041,5.35251,2.0,-86.643648,0.0,0.017111,0.004349,65047.0,0.011563,Autauga,2.626316,0.029909,0.123028,,0.248092,0.053,3.174487,3.166667,6.707361,55869,3.591054,32.538666,9.786008,0.074,2.563808,3.28831,58786.0,83423.0,0.125666,0.090371,,3.615142,0.217949,,0.000716,5903,7.81375,0.444,4388,36.287947,2998,27643.0,0.292105,0.754018,7.125,7554,37166,8.727963
4,1001,2020-01-25,0.0,0.0,0.0,0.0,0.134,12119,Alabama,0.441408,,,0.19555,7.899381,956,3.003,0.737708,0.198643,1.503817,3248,1539.602123,0.295,8.422041,5.35251,2.0,-86.643648,0.0,0.017111,0.004349,65047.0,0.011563,Autauga,2.626316,0.029909,0.123028,,0.248092,0.053,3.174487,3.166667,6.707361,55869,3.591054,32.538666,9.786008,0.074,2.563808,3.28831,58786.0,83423.0,0.125666,0.090371,,3.615142,0.217949,,0.000716,5903,7.81375,0.444,4388,36.287947,2998,27643.0,0.292105,0.754018,7.125,7554,37166,8.727963


In [25]:
merged.shape

(769300, 70)

In [26]:
info_df['tot_pop'].describe()

count    3.140000e+03
mean     1.047088e+05
std      3.580188e+05
min      8.600000e+01
25%      1.090000e+04
50%      2.567300e+04
75%      6.770050e+04
max      1.003911e+07
Name: tot_pop, dtype: float64

In [29]:
info_df.head()

Unnamed: 0,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division,area_land,lon,lat,neighbors,tot_dem,tot_gop,tot_votes,per_gop,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,tot_edu_white_male_hsplus,tot_edu_white_female_hsplus,tot_edu_white_4yplus,tot_edu_white_male_4yplus,tot_edu_white_female_4yplus,tot_edu_black,tot_edu_black_male,tot_edu_black_female,tot_edu_black_hsplus,tot_edu_black_male_hsplus,tot_edu_black_female_hsplus,tot_edu_black_4yplus,tot_edu_black_male_4yplus,tot_edu_black_female_4yplus,tot_edu_native,tot_edu_native_male,tot_edu_native_female,tot_edu_native_hsplus,tot_edu_native_male_hsplus,tot_edu_native_female_hsplus,tot_edu_native_4yplus,tot_edu_native_male_4yplus,tot_edu_native_female_4yplus,tot_edu_asian,tot_edu_asian_male,tot_edu_asian_female,tot_edu_asian_hsplus,tot_edu_asian_male_hsplus,tot_edu_asian_female_hsplus,tot_edu_asian_4yplus,tot_edu_asian_male_4yplus,tot_edu_asian_female_4yplus,tot_edu_pacific,tot_edu_pacific_male,tot_edu_pacific_female,tot_edu_pacific_hsplus,tot_edu_pacific_male_hsplus,tot_edu_pacific_female_hsplus,tot_edu_pacific_4yplus,tot_edu_pacific_male_4yplus,tot_edu_pacific_female_4yplus,tot_edu_other,tot_edu_other_male,tot_edu_other_female,tot_edu_other_hsplus,tot_edu_other_male_hsplus,tot_edu_other_female_hsplus,tot_edu_other_4yplus,tot_edu_other_male_4yplus,tot_edu_other_female_4yplus,tot_edu_twoplus,tot_edu_twoplus_male,tot_edu_twoplus_female,tot_edu_twoplus_hsplus,tot_edu_twoplus_male_hsplus,tot_edu_twoplus_female_hsplus,tot_edu_twoplus_4yplus,tot_edu_twoplus_male_4yplus,tot_edu_twoplus_female_4yplus,tot_edu_hispanic,tot_edu_hispanic_male,tot_edu_hispanic_female,tot_edu_hispanic_hsplus,tot_edu_hispanic_male_hsplus,tot_edu_hispanic_female_hsplus,tot_edu_hispanic_4yplus,tot_edu_hispanic_male_4yplus,tot_edu_hispanic_female_4yplus,pop_25p,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,edu,edu_white,per_edu_white_nohs,edu_white_male,per_edu_white_male_nohs,edu_white_female,per_edu_white_female_nohs,edu_black,per_edu_black_nohs,edu_black_male,per_edu_black_male_nohs,edu_black_female,per_edu_black_female_nohs,edu_native,per_edu_native_nohs,edu_native_male,per_edu_native_male_nohs,edu_native_female,per_edu_native_female_nohs,edu_asian,per_edu_asian_nohs,edu_asian_male,per_edu_asian_male_nohs,edu_asian_female,per_edu_asian_female_nohs,edu_pacific,per_edu_pacific_nohs,edu_pacific_male,per_edu_pacific_male_nohs,edu_pacific_female,per_edu_pacific_female_nohs,edu_other,per_edu_other_nohs,edu_other_male,per_edu_other_male_nohs,edu_other_female,per_edu_other_female_nohs,edu_twoplus,per_edu_twoplus_nohs,edu_twoplus_male,per_edu_twoplus_male_nohs,edu_twoplus_female,per_edu_twoplus_female_nohs,edu_hispanic,per_edu_hispanic_nohs,edu_hispanic_male,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs,never,rarely,sometimes,frequently,always,mask,pop_density,per_male,per_female,per_pop_white_male,per_pop_white_female,per_pop_black_male,per_pop_black_female,per_pop_native_male,per_pop_native_female,per_pop_asian_male,per_pop_asian_female,per_pop_pacific_male,per_pop_pacific_female,per_pop_twoplus_male,per_pop_twoplus_female,per_pop_hispanic_male,per_pop_hispanic_female,per_pop_white,per_pop_black,per_pop_native,per_pop_asian,per_pop_pacific,per_pop_twoplus,per_pop_hispanic,per_votes
0,1,Alabama,Autauga,1001,55869,27092.0,28777.0,20138.0,21077.0,5171.0,5927.0,105.0,138.0,282.0,364.0,20.0,20.0,492.0,464.0,884.0,787.0,41215,11098,243,646,40,956,1671,8.422041,8.181973,8.648052,8.518373,8.928216,7.480758,8.104269,9.733333,9.826087,7.276596,8.381868,8.4,5.85,5.195122,5.519397,6.382353,7.072427,8.727963,7.81375,9.786008,7.899381,7.125,5.35251,6.707361,South,East South Central,1539.602123,-86.643648,32.538666,"[01021, 01047, 01051, 01085, 01101]",5908.0,18110.0,24661.0,0.754018,58786.0,27643.0,,,,,,83423.0,65047.0,28726,13834,14892,26130,12588,13542,8440,4573,3867,6786,3042,3744,5459,2436,3023,1296,573,723,78,39,39,61,25,36,25,25,0,317,95,222,278,62,216,118,43,75,32,5,27,32,5,27,0,0,0,262,93,169,197,67,130,0,0,0,380,135,245,269,73,196,92,0,92,939,455,484,821,380,441,346,230,116,37166,956,3248,12119,7554,2998,5903,4388,3.174487,3.28831,0.090371,3.472676,0.090068,3.117043,0.090653,2.563808,0.19555,2.543393,0.199211,2.580395,0.192575,3.166667,0.217949,4.487179,0.358974,1.846154,0.076923,3.615142,0.123028,3.568421,0.347368,3.635135,0.027027,2.0,0.0,2.0,0.0,2.0,0.0,1.503817,0.248092,1.44086,0.27957,1.538462,0.230769,2.626316,0.292105,1.081481,0.459259,3.477551,0.2,3.591054,0.125666,4.197802,0.164835,3.020661,0.088843,0.053,0.074,0.134,0.295,0.444,3.003,36.287947,0.48492,0.51508,0.36045,0.377258,0.092556,0.106087,0.001879,0.00247,0.005048,0.006515,0.000358,0.000358,0.008806,0.008305,0.015823,0.014087,0.737708,0.198643,0.004349,0.011563,0.000716,0.017111,0.029909,0.441408
1,1,Alabama,Baldwin,1003,223234,108247.0,114987.0,89845.0,95902.0,9308.0,9907.0,753.0,754.0,911.0,1435.0,53.0,70.0,1832.0,1930.0,5545.0,4989.0,185747,19215,1507,2346,123,3762,10534,8.987202,8.806627,9.157192,9.150993,9.481721,7.587129,8.095791,9.14741,8.851459,7.246981,7.887108,8.150943,8.628571,5.757642,5.864767,6.497565,6.719182,9.321749,7.849388,8.999336,7.638534,8.422764,5.8126,6.602525,South,East South Central,4117.546676,-87.722603,30.729584,"[01025, 01053, 01097, 01099, 01129]",18409.0,72780.0,94090.0,0.798123,55962.0,31112.0,53289.0,34763.0,,45634.0,53456.0,43279.0,59418.0,126316,60310,66006,116288,54788,61500,41648,19863,21785,12006,5593,6413,9565,4129,5436,2164,808,1356,1015,523,492,790,410,380,145,81,64,1180,426,754,980,271,709,243,147,96,9,0,9,9,0,9,0,0,0,938,469,469,695,394,301,262,119,143,1712,853,859,1529,744,785,559,199,360,5119,2749,2370,3823,1813,2010,1389,637,752,146989,3978,10332,40579,32266,13759,30431,15644,3.329113,3.489788,0.079388,3.463621,0.09156,3.513696,0.068267,2.494586,0.203315,2.19882,0.261756,2.752534,0.152347,2.270936,0.221675,2.342256,0.216061,2.195122,0.227642,2.690678,0.169492,2.997653,0.36385,2.517241,0.059682,2.0,0.0,,,2.0,0.0,2.878465,0.259062,2.948827,0.159915,2.808102,0.358209,3.418808,0.106893,2.910903,0.127784,3.923166,0.086147,2.850361,0.253174,2.477628,0.340487,3.2827,0.151899,0.083,0.059,0.098,0.323,0.436,2.968,54.215293,0.484904,0.515096,0.40247,0.429603,0.041696,0.044379,0.003373,0.003378,0.004081,0.006428,0.000237,0.000314,0.008207,0.008646,0.024839,0.022349,0.832073,0.086076,0.006751,0.010509,0.000551,0.016852,0.047188,0.421486
2,1,Alabama,Barbour,1005,24686,13064.0,11622.0,5894.0,5341.0,6260.0,5547.0,52.0,43.0,55.0,61.0,21.0,10.0,153.0,132.0,629.0,488.0,11235,11807,95,116,31,285,1117,8.784412,8.463564,9.14507,9.475568,10.196592,7.767732,8.491437,9.346154,10.465116,8.109091,10.065574,6.285714,7.7,6.418301,5.810606,6.434022,5.766393,9.818336,8.107733,9.852632,9.137931,6.741935,6.136842,6.142346,South,East South Central,2292.144655,-85.387579,31.868235,"[01011, 01045, 01067, 01109, 01113]",4848.0,5431.0,10390.0,0.528359,34186.0,23013.0,,50417.0,,26793.0,19760.0,30417.0,47031.0,9171,4846,4325,7264,3657,3607,1578,814,764,8137,4304,3833,5551,2776,2775,552,240,312,72,72,0,42,42,0,0,0,0,88,40,48,72,27,45,5,0,5,1,0,1,0,0,0,0,0,0,345,230,115,100,76,24,44,44,0,183,80,103,153,50,103,9,0,9,573,395,178,251,192,59,76,60,16,18173,1490,3411,6486,3287,1279,1417,803,2.38062,2.444444,0.207938,2.349154,0.245357,2.551214,0.166012,1.703576,0.317808,1.568773,0.355019,1.854944,0.276024,1.166667,0.416667,1.166667,0.416667,,,1.920455,0.181818,1.35,0.325,2.395833,0.0625,0.0,1.0,,,0.0,1.0,1.217391,0.710145,1.617391,0.669565,0.417391,0.791304,1.918033,0.163934,1.25,0.375,2.436893,0.0,1.539267,0.561955,1.731646,0.513924,1.11236,0.668539,0.067,0.121,0.12,0.201,0.491,2.928,10.769826,0.529207,0.470793,0.238759,0.216357,0.253585,0.224702,0.002106,0.001742,0.002228,0.002471,0.000851,0.000405,0.006198,0.005347,0.02548,0.019768,0.455116,0.478287,0.003848,0.004699,0.001256,0.011545,0.045248,0.420886
3,1,Alabama,Bibb,1007,22394,11929.0,10465.0,8482.0,8181.0,2912.0,1807.0,50.0,41.0,21.0,25.0,5.0,1.0,116.0,130.0,343.0,280.0,16663,4719,91,46,6,246,623,8.606145,8.349484,8.89871,8.645838,9.105122,7.706387,8.455451,8.52,8.219512,7.52381,9.8,6.4,3.0,6.422414,6.3,7.186589,6.975,8.871332,7.993219,8.384615,8.76087,5.833333,6.357724,7.091493,South,East South Central,1612.167481,-87.125115,32.996421,"[01021, 01065, 01073, 01105, 01117, 01125]",1874.0,6733.0,8748.0,0.78227,45340.0,34000.0,,,,,20329.0,42708.0,50769.0,12002,6037,5965,10483,5181,5302,1570,674,896,3316,2146,1170,2296,1377,919,200,83,117,8,8,0,8,8,0,0,0,0,37,16,21,37,16,21,37,16,21,0,0,0,0,0,0,0,0,0,9,9,0,9,9,0,0,0,0,149,108,41,108,89,19,6,6,0,313,171,142,206,95,111,0,0,0,15780,903,1747,7471,2938,908,1197,616,2.459823,2.400933,0.126562,2.27464,0.141792,2.528751,0.111148,1.686369,0.3076,1.476701,0.358341,2.07094,0.21453,2.0,0.0,2.0,0.0,,,7.0,0.0,7.0,0.0,7.0,0.0,,,,,,,2.0,0.0,2.0,0.0,,,1.651007,0.275168,1.925926,0.175926,0.926829,0.536585,1.316294,0.341853,1.111111,0.444444,1.56338,0.21831,0.02,0.034,0.096,0.278,0.572,3.348,13.890616,0.532687,0.467313,0.378762,0.365321,0.130035,0.080691,0.002233,0.001831,0.000938,0.001116,0.000223,4.5e-05,0.00518,0.005805,0.015317,0.012503,0.744083,0.210726,0.004064,0.002054,0.000268,0.010985,0.02782,0.39064
4,1,Alabama,Blount,1009,57826,28472.0,29354.0,24494.0,25682.0,453.0,419.0,143.0,139.0,73.0,90.0,14.0,7.0,345.0,385.0,2950.0,2632.0,50176,872,282,163,21,730,5582,8.651714,8.49136,8.807249,8.771209,9.131843,8.030905,7.947494,10.125874,9.446043,8.30137,9.066667,8.142857,10.571429,6.486957,6.535065,6.4,6.06193,8.955796,7.990826,9.79078,8.723926,8.952381,6.512329,6.240595,South,East South Central,1670.103911,-86.568495,33.98143,"[01043, 01055, 01073, 01095, 01115, 01127]",2150.0,22808.0,25384.0,0.913855,48695.0,,65385.0,99219.0,,,44934.0,35495.0,49872.0,35774,17200,18574,29814,14167,15647,4775,1900,2875,596,281,315,411,192,219,22,10,12,132,22,110,94,14,80,13,13,0,124,43,81,104,43,61,62,25,37,18,0,18,18,0,18,0,0,0,211,106,105,120,56,64,90,28,62,440,212,228,361,154,207,24,22,2,2610,1468,1142,1006,476,530,82,48,34,39627,2967,4894,13489,8492,4775,3217,1793,2.606581,2.334181,0.166601,2.199651,0.176337,2.45876,0.157586,1.563758,0.310403,1.544484,0.316726,1.580952,0.304762,1.916667,0.287879,4.227273,0.363636,1.454545,0.272727,4.177419,0.16129,4.906977,0.0,3.790123,0.246914,2.0,0.0,,,2.0,0.0,3.270142,0.43128,2.377358,0.471698,4.171429,0.390476,1.913636,0.179545,1.971698,0.273585,1.859649,0.092105,0.927969,0.614559,0.811989,0.675749,1.077058,0.535902,0.053,0.114,0.18,0.194,0.459,2.892,34.624193,0.492374,0.507626,0.423581,0.444125,0.007834,0.007246,0.002473,0.002404,0.001262,0.001556,0.000242,0.000121,0.005966,0.006658,0.051015,0.045516,0.867707,0.01508,0.004877,0.002819,0.000363,0.012624,0.096531,0.438972


In [35]:
info_df[(info_df['per_gop'] > 0.025) & (info_df['per_gop'] <= 0.075)]['per_gop']

319    0.042509
Name: per_gop, dtype: float64

In [31]:
np.quantile(info_df['per_gop'].tolist(), 0.5)

nan

In [14]:
def df_splitter(info_df, split_on, splits=2, equal_pop=True, mode='verbose'):
    
    if mode not in ['verbose', 'mean', 'percentile']:
        mode = verbose
        
    info_df = info_df[~info_df[split_on].isna()].sort_values(by=split_on)
    
    if equal_pop:
        # https://stackoverflow.com/a/31871770/14083095
        # splitting df into approx equal populations
        info_df['pop_cumsum'] = info_df['tot_pop'].cumsum()
        subpop = info_df['pop_cumsum'].max() / splits
        info_df['split'] = (info_df['pop_cumsum'] / subpop).apply(math.ceil)
    else:
        # splitting df into approx equal shapes
        info_df['split'] = pd.qcut(info_df[split_on], splits)
        
    replace_dict = {}
    to_replace = info_df['split'].unique()
    
    # renaming our splits into something more readable
    for i, s in enumerate(to_replace):
        if mode == 'verbose':
            replace_dict[s] = f"[{info_df.loc[info_df['split']==s,split_on].min():.2f},"\
            f" {info_df.loc[info_df['split']==s,split_on].max():.2f}]"
        elif mode == 'mean':
            replace_dict[s] = np.round(
                info_df.loc[info_df['split']==s,split_on].mean(),
                decimals=3
            )
        else:
            replace_dict[s] = (100/splits) * (int(i)+1)
    info_df['split'] = info_df['split'].replace(replace_dict)
    
    return info_df

In [15]:
def make_heatmap_timeseries(
    nyt_df, info_df, y='new_cases_per_100k', splits=25, split_on=None,
    equal_pop=True, mode='percentile', group=True
):

    y_title = split_on
    y_subtitle = 'county'
    if equal_pop:
        y_subtitle = 'pop'
    # check number of splits and only split on numeric columns
    # otherwise, use names as the different lines (setting splits=1)
    splits = int(splits)
    if split_on in info_df.select_dtypes(exclude='number').columns:
        splits = 1
    y_ = [y]
    # refer to non-per-capita column
    if '_per_100k' in y:
        y_ = [y.replace('_per_100k', '')]
    elif y is 'mortality_rate':
        y_ = ['cases', 'deaths']
    # first split df so that we can plot different lines
    if splits > 1:
        info_df = df_splitter(info_df, split_on, splits, equal_pop, mode)
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on, 'split'], y_cols=y_, date='all',
            weight='tot_pop'
        )
        # 'split' column generated by df_splitter()
        split_on = 'split'
    else:
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on], y_cols=y_, date='all'
        )
        
    # recalculate aggregates
    if '_per_100k' in y:
        y_ = y.replace('_per_100k', '')
        data = merged.groupby(by=['date', split_on])[y_].sum().fillna(0)\
               / merged.groupby(by=['date', split_on])['tot_pop'].sum() * 100_000
    elif y is 'mortality_rate':
        data = merged.groupby(by=['date', split_on])['deaths'].sum()\
               / merged.groupby(by=['date', split_on])['cases'].sum()
    else:
        data = merged.groupby(by=['date', split_on])[y].sum().fillna(0)
    data = data.reset_index().rename(columns={0: y})
    
    y_alt = f'{split_on}:O'
    
    # nearest point selection
    nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['date'], empty='all')
    
    # title
    dx = 160
    dy = splits*9
    title = alt.Chart(data).mark_text(dx=dx, dy=dy, size=20).encode(
        text='monthdate(date):T'
    ).transform_filter(nearest)
    
    w_title = alt.Chart(data).mark_text(dx=dx, dy=dy, stroke='white', strokeWidth=3, size=20).encode(
        text='monthdate(date):T'
    ).transform_filter(nearest)
    
    # right panel: heatmap
    heatmap = alt.Chart(data).mark_rect().encode(
        alt.X(
            'monthdate(date):T',
            axis=alt.Axis(format='%b %d')
        ),
        alt.Y(
            y_alt,
            sort=alt.EncodingSortField(f'{split_on}', order='descending'),
            title=f'{y_title} ({y_subtitle} {mode})'
        ),
        color=alt.Color(
            f'{y}:Q',
            scale=alt.Scale(
                scheme='lightmulti'
            )
        )
    ).add_selection(nearest)
    
    # left panel: bar chart
    bars = alt.Chart(data).mark_bar().encode(
        alt.X(
            f'{y}:Q',
            scale=alt.Scale(
                domain=[0, data[y].max()]
            )
        ),
        alt.Y(
            y_alt,
            sort=alt.EncodingSortField(f'{split_on}', order='descending'),
            title=f'{y_title} ({y_subtitle} {mode})'
        ),
        color=alt.Color(
            f'{y}:Q',
            scale=alt.Scale(
                scheme='lightmulti'
            )
        ),
        tooltip=[
            alt.Tooltip(f'{y}:Q'),
            alt.Tooltip(y_alt),
        ]
    ).transform_filter(nearest)
    
    
    # selects nearest points based on date
    selectors = alt.Chart(data).mark_point().encode(
        x='monthdate(date):T',
        opacity=alt.value(0)
    ).add_selection(nearest)
    
    return (heatmap | bars+w_title+title).properties(
        title=f'{y} vs {y_title}'
    )

# Aggregate Differences

In [16]:
def df_splitter(info_df, split_on, splits=2, equal_pop=True, mode='verbose'):
    
    if mode not in ['verbose', 'mean', 'percentile']:
        mode = verbose
        
    info_df = info_df[~info_df[split_on].isna()].sort_values(by=split_on)
    
    if equal_pop:
        # https://stackoverflow.com/a/31871770/14083095
        # splitting df into approx equal populations
        info_df['pop_cumsum'] = info_df['tot_pop'].cumsum()
        subpop = info_df['pop_cumsum'].max() / splits
        info_df['split'] = (info_df['pop_cumsum'] / subpop).apply(math.ceil)
    else:
        # splitting df into approx equal shapes
        info_df['split'] = pd.qcut(info_df[split_on], splits)
        
    replace_dict = {}
    to_replace = info_df['split'].unique()
    
    # renaming our splits into something more readable
    for i, s in enumerate(to_replace):
        if mode == 'verbose':
            replace_dict[s] = f"[{info_df.loc[info_df['split']==s,split_on].min():.2f},"\
            f" {info_df.loc[info_df['split']==s,split_on].max():.2f}]"
        elif mode == 'mean':
            replace_dict[s] = np.round(
                info_df.loc[info_df['split']==s,split_on].mean(),
                decimals=3
            )
        else:
            replace_dict[s] = (100/splits) * (int(i)+1)
    info_df['split'] = info_df['split'].replace(replace_dict)
    
    return info_df

In [38]:
def make_line_timeseries(
    nyt_df, info_df, y='new_cases_per_100k_15sg', splits=2, split_on=None, 
    equal_pop=True
):
    
    # check number of splits and only split on numeric columns
    # otherwise, use names as the different lines (setting splits=1)
    splits = int(splits)
    if split_on in info_df.select_dtypes(exclude='number').columns:
        splits = 1
    y_ = [y]
    if '_per_100k' in y:
        y_ = [y.replace('_per_100k', '')]
    elif y is 'mortality_rate':
        y_ = ['cases', 'deaths']
    # first split df so that we can plot different lines
    if splits > 1:
        info_df = df_splitter(info_df, split_on, splits, equal_pop)
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on, 'split'], y_cols=y_, date='all',
            weight='tot_pop'
        )
        # 'split' column generated by df_splitter()
        split_on = 'split'
    else:
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on], y_cols=y_, date='all'
        )
        
    # recalculate aggregates
    if '_per_100k' in y:
        y_ = y.replace('_per_100k', '')
        data = merged.groupby(by=['date', split_on])[y_].sum().fillna(0)\
               / merged.groupby(by=['date', split_on])['tot_pop'].sum() * 100_000
    elif y is 'mortality_rate':
        data = merged.groupby(by=['date', split_on])['deaths'].sum()\
               / merged.groupby(by=['date', split_on])['cases'].sum()

#     elif y is 'mortality_rate':
#         data = merged.groupby(by=['date', split_on])[y_].sum().fillna(0)\
#                / merged.groupby(by=['date', split_on])['tot_pop'].sum()
    else:
        data = merged.groupby(by=['date', split_on])[y].sum().fillna(0)
    data = data.reset_index().rename(columns={0: y})
    
    
    # nearest point selection
    nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['date'], empty='none')
    
    # base line chart
    lines = alt.Chart(data).mark_line().encode(
        x='date:T',
        y=alt.Y(
            f'{y}:Q',
            title=y.replace('_', ' ')
        ),
        color=f'{split_on}:O'
    )
    
    # selects nearest points based on date
    selectors = alt.Chart(data).mark_point().encode(
        x='date:T',
        opacity=alt.value(0)
    ).add_selection(nearest)
    
    # marks a point on line where selected
    points = lines.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )
    
    # white background for text
    white_text = lines.mark_text(align='left', dx=5, dy=-5, stroke='white', strokeWidth=3).encode(
        text=alt.condition(nearest, f'{y}:Q', alt.value(' '), format='.1f')
    )
    
    # text showing y value
    text = lines.mark_text(align='left', dx=5, dy=-5).encode(
        text=alt.condition(nearest, f'{y}:Q', alt.value(' '), format='.1f')
    )
    
    # rule showing nearest selector
    rules = alt.Chart(data).mark_rule(color='gray').encode(
        x='date:T',
        size=alt.value(1)
    ).transform_filter(nearest)
    
    return alt.layer(
        lines, selectors, points, rules, white_text, text
    ).configure_axis(
        gridDash=[1,2]
    ).properties(
        width=640, height=384
    )

In [45]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15d', 
    splits=3,
    split_on='per_gop', 
    equal_pop=True)

In [44]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15d', 
    splits=3,
    split_on='mask', 
    equal_pop=True)

In [43]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15d', 
    splits=3,
    split_on='pop_density', 
    equal_pop=True)

# Visualizing Via Heatmap

In [22]:
def make_heatmap_timeseries(
    nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on=None,
    equal_pop=True, mode='percentile'
):
    y_title = split_on
    y_subtitle = 'county'
    if equal_pop:
        y_subtitle = 'pop'
    # check number of splits and only split on numeric columns
    # otherwise, use names as the different lines (setting splits=1)
    splits = int(splits)
    if split_on in info_df.select_dtypes(exclude='number').columns:
        splits = 1
    y_ = [y]
    if '_per_100k' in y:
        y_ = [y.replace('_per_100k', '')]
    elif y is 'mortality_rate':
        y_ = ['cases', 'deaths']
    # first split df so that we can plot different lines
    if splits > 1:
        info_df = df_splitter(info_df, split_on, splits, equal_pop, mode)
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on, 'split'], y_cols=y_, date='all',
            weight='tot_pop'
        )
        # 'split' column generated by df_splitter()
        split_on = 'split'
    else:
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on], y_cols=y_, date='all'
        )
        
    # recalculate aggregates
    if '_per_100k' in y:
        y_ = y.replace('_per_100k', '')
        data = merged.groupby(by=['date', split_on])[y_].sum().fillna(0)\
               / merged.groupby(by=['date', split_on])['tot_pop'].sum() * 100_000
    elif y is 'mortality_rate':
        data = merged.groupby(by=['date', split_on])['deaths'].sum()\
               / merged.groupby(by=['date', split_on])['cases'].sum()
    else:
        data = merged.groupby(by=['date', split_on])[y].sum().fillna(0)
    data = data.reset_index().rename(columns={0: y})
    
    y_alt = f'{split_on}:O'
    
    # nearest point selection
    nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['date'], empty='all')
    
    # title
    dx = 160
    dy = splits*9
    title = alt.Chart(data).mark_text(dx=dx, dy=dy, size=20).encode(
        text='monthdate(date):T'
    ).transform_filter(nearest)
    
    w_title = alt.Chart(data).mark_text(dx=dx, dy=dy, stroke='white', strokeWidth=3, size=20).encode(
        text='monthdate(date):T'
    ).transform_filter(nearest)
    
    # right panel: heatmap
    heatmap = alt.Chart(data).mark_rect().encode(
        alt.X(
            'monthdate(date):T',
            axis=alt.Axis(format='%b %d')
        ),
        alt.Y(
            y_alt,
            sort=alt.EncodingSortField(f'{split_on}', order='descending'),
            title=f'{y_title} ({y_subtitle} {mode})'
        ),
        color=alt.Color(
            f'{y}:Q',
            scale=alt.Scale(
                scheme='lightmulti'
            )
        )
    ).add_selection(nearest)
    
    # left panel: bar chart
    bars = alt.Chart(data).mark_bar().encode(
        alt.X(
            f'{y}:Q',
            scale=alt.Scale(
                domain=[0, data[y].max()]
            )
        ),
        alt.Y(
            y_alt,
            sort=alt.EncodingSortField(f'{split_on}', order='descending'),
            title=f'{y_title} ({y_subtitle} {mode})'
        ),
        color=alt.Color(
            f'{y}:Q',
            scale=alt.Scale(
                scheme='lightmulti'
            )
        ),
        tooltip=[
            alt.Tooltip(f'{y}:Q'),
            alt.Tooltip(y_alt),
        ]
    ).transform_filter(nearest)
    
    
    # selects nearest points based on date
    selectors = alt.Chart(data).mark_point().encode(
        x='monthdate(date):T',
        opacity=alt.value(0)
    ).add_selection(nearest)
    
    return (heatmap | bars+w_title+title).properties(
        title=f'{y} vs {y_title}'
    )

In [33]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15d', splits=5, split_on='pop_density', equal_pop=True, mode='percentile')

In [34]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15d', splits=5, split_on='mask', equal_pop=True, mode='percentile')

In [35]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15d', splits=, split_on='per_gop', equal_pop=True, mode='percentile')

In [127]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on='per_pop_black', equal_pop=True, mode='percentile')

In [26]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on='edu', equal_pop=True, mode='percentile')

In [129]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on='mask', equal_pop=True, mode='percentile')

In [130]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on='median_income', equal_pop=True, mode='percentile')