# Exploring Trends

In [21]:
import numpy as np
import numpy.polynomial.polynomial as poly      # linear regression
import pandas as pd

import scipy.stats as stats
from scipy.signal import savgol_filter          # fast smoothing of data

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

import statsmodels.api as sm

import math
import pickle
import re

# opening external coordinates
import json

# opening urls
from urllib.request import urlopen

# benchmarking
from time import time

# date/time operations
from datetime import datetime, timedelta
from pytz import timezone

# plotting
import altair as alt
from altair import datum
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# for US map
from vega_datasets import data

In [2]:
# large datasets
# alt.data_transformers.enable('data_server');

pd.options.display.max_rows = 250
pd.options.display.max_columns = 250

## Import and process CSSE data

In [3]:
#hide
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.
    '''
    dft = df.copy()
    # converts to datetime if possible
    dft = dft.apply(
        lambda col:
        pd.to_datetime(col, errors='ignore')
        if col.dtypes=='object' 
        else col
    )
    
    # if there are less than half as many unique values as there are rows, convert to category
    for col in dft.select_dtypes(include='object'):
        if len(dft[col].unique()) / len(df[col]) < 0.5:
            dft[col] = dft[col].astype('category')
            
    # downcasts numeric columns if possible
    dft = dft.apply(
        lambda col: 
        pd.to_numeric(col, downcast='integer') 
        if col.dtypes=='int64' 
        else col
    )
    dft = dft.apply(
        lambda col: 
        pd.to_numeric(col, downcast='float') 
        if col.dtypes=='float64' 
        else col
    )
    
    return dft

In [4]:
with open('../data/processed/info_df.p', 'rb') as f:
    info_df = pickle.load(f)
    
with open('../data/processed/geo_altair.p', 'rb') as f:
    geo_altair = pickle.load(f)

In [5]:
fips = set(info_df['fips'])

def preprocess_csse(df, name):
  df.columns = df.columns.str.lower()
  columns_to_drop = ['lat', 'long_', 'iso2', 'iso3', 'code3', 'fips', 
           'admin2', 'province_state', 'country_region', 
           'combined_key']
  if 'population' in df.columns.tolist():
    columns_to_drop.append('population')
  df = df.drop(columns=columns_to_drop)
  df = df.rename(columns={'uid': 'fips'})
  df['fips'] = df['fips'].apply(lambda x: x[-5:])  
  # convert to long-form
  df = pd.DataFrame(df.set_index('fips').unstack()).rename(columns={0: name})
  return df


def make_csse_df():
  
  with urlopen('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv') as response:
    cases = pd.read_csv(response, dtype={'UID': str})
  with urlopen('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv') as response:
    deaths = pd.read_csv(response, dtype={'UID': str})
  
  cases = preprocess_csse(cases, 'cases')
  deaths = preprocess_csse(deaths, 'deaths')
  
  df = pd.concat([cases, deaths], axis=1)     # merge along multi-index
  
  df.index = df.index.set_levels(         # change date to pd.datetime
    [pd.to_datetime(df.index.levels[0]), 
    df.index.levels[1]]
    )
  
  df = df.reset_index().rename(columns={'level_0': 'date'})
  df = df[df['fips'].isin(fips)]
  
  # df[['new_cases', 'new_deaths']] = df[['cases', 'deaths']] - df.groupby(by='fips')[['cases', 'deaths']].shift()
  # df[['new_cases_c', 'new_deaths_c']] = df[['new_cases', 'new_deaths']].clip(lower=0)
  df = df.fillna(0)
  num_cols = df.select_dtypes(include='number').columns
  df[num_cols] = df[num_cols].astype(int)
  
  return df

In [6]:
csse_df = make_csse_df()
csse_df.tail()

Unnamed: 0,date,fips,cases,deaths
1185694,2021-01-10,56037,3162,24
1185695,2021-01-10,56039,2353,4
1185696,2021-01-10,56041,1708,8
1185698,2021-01-10,56043,805,21
1185699,2021-01-10,56045,485,4


In [8]:
# with open('../data/processed/csse_df.p', 'rb') as f:
#     csse_df = pickle.load(f)
# csse_df.tail()

In [7]:
# info_df.to_csv('../data/processed/info_df.csv', index=False)

## Engineer per capita columns

In [35]:
info_df['per_votes_gop'] = info_df['tot_votes_gop'] / info_df['tot_votes']

In [7]:
csse_df = csse_df.merge(
    info_df[['fips', 'tot_pop']], 
    on='fips', 
    suffixes=('_x','')
)

csse_df[['cases_per_100k', 'deaths_per_100k']] = csse_df[['cases', 'deaths']].div(csse_df['tot_pop'], axis=0) * 100_000
csse_df = csse_df.drop(columns=['tot_pop'])
csse_df = csse_df.sort_values(by=['date', 'fips'])

print(csse_df.shape)
csse_df.tail()

(1115410, 6)


Unnamed: 0,date,fips,cases,deaths,cases_per_100k,deaths_per_100k
1113989,2021-01-10,56037,3162,24,7467.586142,56.679971
1114344,2021-01-10,56039,2353,4,10028.128196,17.047392
1114699,2021-01-10,56041,1708,8,8444.576288,39.553051
1115054,2021-01-10,56043,805,21,10313.901345,269.058296
1115409,2021-01-10,56045,485,4,7001.587989,57.745056


In [8]:
def add_change_cols(df, cols, pre='new_', clip=False):
  df = df.sort_values(by=['date', 'fips'])
  new_cols = [pre + c for c in cols]
  df[new_cols] = df[cols] - df.groupby(by='fips')[cols].shift()
  df[new_cols] = df[new_cols].fillna(0)
  df[new_cols] = df[new_cols].astype(int)
  if clip:
    df[new_cols] = df[new_cols].clip(lower=0)
  return (df, new_cols)

def add_window_cols(df, cols, window=7):
  df = df.sort_values(by=['date', 'fips'])
  new_cols = [c + '_' + str(window) + 'd' for c in cols]
  col_dict = dict(zip(cols, new_cols))
  df = (df.merge(df.sort_values(by=['date', 'fips'])
          .groupby('fips')
          .rolling(window, on='date', min_periods=0)[cols].mean()
          .rename(columns=col_dict), on=['fips', 'date']))
#     df[new_cols] = df[new_cols].astype(int)
  return (df, new_cols)

def add_savgol_cols(df, cols, window=7, clip=False):
  def my_savgol(x, w):
    if len(x) >= w:
      return savgol_filter(x, w, 1)
    else:
      new_window = int(np.ceil(len(x) / 2) * 2 - 1)
      if new_window <= 1:
        return x
      else:
        return savgol_filter(x, new_window, 1)
  df = df.sort_values(by=['date', 'fips'])
  new_cols = [c + '_' + str(window) + 'sg' for c in cols]
  df[new_cols] = df.groupby(by='fips')[cols].transform(lambda x: my_savgol(x, window))
  if clip:
    df[new_cols] = df[new_cols].clip(lower=0)
  return (df, new_cols)


cols = ['cases', 'deaths', 'cases_per_100k', 'deaths_per_100k']
csse_df, new_cols = add_change_cols(csse_df, cols, pre='new_', clip=True)
csse_df, window_cols = add_window_cols(csse_df, new_cols, 15)
csse_df, sg_cols = add_savgol_cols(csse_df, new_cols, 15)
csse_df['days'] = ((csse_df['date'] - csse_df['date'].max()) / np.timedelta64(1, 'D')).astype('int')

csse_df.tail()

Unnamed: 0,date,fips,cases,deaths,cases_per_100k,deaths_per_100k,new_cases,new_deaths,new_cases_per_100k,new_deaths_per_100k,new_cases_15d,new_deaths_15d,new_cases_per_100k_15d,new_deaths_per_100k_15d,new_cases_15sg,new_deaths_15sg,new_cases_per_100k_15sg,new_deaths_per_100k_15sg,days
1115405,2021-01-10,56037,3162,24,7467.586142,56.679971,5,0,11,0,21.133333,0.6,49.466667,1.333333,17.208333,1.175,40.041667,2.633333,0
1115406,2021-01-10,56039,2353,4,10028.128196,17.047392,5,0,21,0,18.2,0.133333,77.2,0.533333,28.75,-0.091667,122.025,-0.366667,0
1115407,2021-01-10,56041,1708,8,8444.576288,39.553051,15,0,74,0,14.2,0.066667,69.666667,0.266667,20.275,0.166667,99.891667,0.666667,0
1115408,2021-01-10,56043,805,21,10313.901345,269.058296,5,0,64,0,3.933333,0.6,49.933333,7.6,2.683333,-0.125,34.033333,-1.65,0
1115409,2021-01-10,56045,485,4,7001.587989,57.745056,0,0,0,0,1.6,0.133333,22.733333,1.866667,0.275,0.283333,3.708333,3.966667,0


# Helper Functions

In [9]:
def column_selector(info_df, columns='none', include=[], exclude=[]):
  
  # only select from numeric columns
  all_columns = info_df.select_dtypes(include='number').columns.tolist()
  
  # empty container if we don't have a list going already
  if columns is 'none':
    columns = []
  elif columns is 'all':
    columns = all_columns
  
  # includes all columns that have all elements in mask
  # excludes all columns that have any elements in exclude
  if len(include) > 0:
    include_columns = list(set([
      c for c in all_columns
      if all(i in set(re.findall('[0-9a-z]+', c))
      for i in include)
    ]))
    columns += include_columns
  if len(exclude) > 0:
    columns = list(set([
      c for c in columns 
      if all(e not in set(re.findall('[0-9a-z]+', c)) 
      for e in exclude)
    ]))
    
  return sorted(list(set(columns)))

In [10]:
column_selector(info_df, include=['edu'], exclude=['tot', 'per', 'male', 'female'])

['edu',
 'edu_asian',
 'edu_black',
 'edu_hispanic',
 'edu_native',
 'edu_other',
 'edu_pacific',
 'edu_twoplus',
 'edu_white']

In [11]:
def corr(x, y, w, useweight=True):
  
  # only uses elements that are not nan from both lists
  x_ids = ~np.isnan(x)
  y_ids = ~np.isnan(y)
  ids = x_ids & y_ids
  
  if useweight:
    try:
      [xx, xy], [_, yy] = np.cov(x[ids], y[ids], aweights=w[ids])
    except:
      print(x.name)
      print(y.name)
  else:
    [xx, xy], [_, yy] = np.cov(x[ids], y[ids])
  
  return xy / np.sqrt(xx * yy)

In [12]:
def df_merger(nyt_df, info_df, x_cols=None, y_cols=None, date='latest', weight='tot_pop'):
  '''
  x_cols : predictor columns
  y_cols : target columns
  date :   'latest', 'all', or specific date
  weight : weight column
  '''
  
  # make sure x and y are valid
  all_y = nyt_df.columns.tolist()
  for y in y_cols:
    if '_per_100k' in y:
      y_cols.append(y.replace('_per_100k', ''))
  y_cols = sorted(list(set([y for y in y_cols if y in all_y])))
  
  all_x = info_df.columns.tolist()
  x_cols = sorted(list(set([c for c in x_cols if c in all_x])))
  
  ## only process specific date and y_cols
  left_columns = list(set(['date', 'fips'] + y_cols))
  if date=='latest':
    left_df = nyt_df[nyt_df['date']==nyt_df['date'].max()][left_columns]
  elif date=='all':
    left_df = nyt_df[left_columns]
  else:
    left_df = nyt_df[nyt_df['date']==date][left_columns]

  ## only process specific x_cols
  right_columns = list(set(['fips', 'state', 'county', weight] + x_cols))
  right_df = info_df[right_columns]
  
  # https://stackoverflow.com/a/47118728/14083095
  # fills nyt_df with entries for counties that do not log cases
  # for more accurate aggregate per capita calculations
  
  # create multiindex that has every fips with every date
  mux = pd.MultiIndex \
      .from_product(
        [left_df['date'].unique(), 
         right_df['fips'].unique()], 
        names=('date', 'fips')
      )
  # reindex data to multiindex, fill nan entries with 0
  left_df = left_df.set_index(['date','fips']) \
                   .reindex(mux)               \
                   .swaplevel(0,1)             \
                   .reset_index()              \
                   .fillna(0)
   
  df = left_df.merge(right_df, on='fips', how='outer', suffixes=('_x', ''))
  df = df.drop([x for x in df.columns if x[-2:]=='_x'], axis=1)
  
  return df

In [13]:
csse_df.columns

Index(['date', 'fips', 'cases', 'deaths', 'cases_per_100k', 'deaths_per_100k',
       'new_cases', 'new_deaths', 'new_cases_per_100k', 'new_deaths_per_100k',
       'new_cases_15d', 'new_deaths_15d', 'new_cases_per_100k_15d',
       'new_deaths_per_100k_15d', 'new_cases_15sg', 'new_deaths_15sg',
       'new_cases_per_100k_15sg', 'new_deaths_per_100k_15sg', 'days'],
      dtype='object')

In [14]:
merged = df_merger(
    csse_df, info_df, 
    x_cols=column_selector(info_df, 'all', exclude=['male', 'female', 'tot', 'lat', 'lon']),
    y_cols=['new_cases_per_100k_15d', 'cases_per_100k']
)

merged.head()

Unnamed: 0,fips,date,cases_per_100k,new_cases_15d,cases,new_cases_per_100k_15d,area,state,edu_hispanic,per_edu_hispanic_nohs,mask,per_edu_white_nohs,per_edu_twoplus_nohs,some_hs,sometimes,associates,always,age_pop_black,edu_twoplus,pop_25p,rarely,median_income,edu_other,edu_white,median_income_asian,age_pop_asian,per_edu_other_nohs,frequently,median_income_hispanic,age_pop_white,per_edu_native_nohs,age_pop_native,median_income_white,per_edu_black_nohs,per_diff,graduate,per_edu_asian_nohs,edu_black,median_income_twoplus,no_hs,age_pop_hispanic,edu_native,some_college,edu_asian,votes_diff,hs,per_edu_pacific_nohs,age_pop_pacific,edu,median_income_other,edu_pacific,bachelors,median_income_black,median_income_pacific,never,median_income_native,tot_pop,county,age_pop,age_pop_twoplus,pop_density
0,1001,2021-01-10,8732.928816,58.666667,4879,104.466667,1539.602123,Alabama,3.591054,0.125666,3.003,0.090371,0.292105,3248,0.134,2998,0.444,7.81375,2.626316,37166,0.074,58786.0,1.503817,3.28831,,7.899381,0.248092,0.295,83423.0,8.727963,0.217949,9.786008,65047.0,0.19555,0.444184,4388,0.123028,2.563808,,956,6.707361,3.166667,7554,3.615142,12335.0,12119,0.0,7.125,3.174487,,2.0,5903,27643.0,,0.053,,55869,Autauga,8.422041,5.35251,36.287947
1,1003,2021-01-10,6865.889605,174.6,15327,77.733333,4117.546676,Alabama,2.850361,0.253174,2.968,0.079388,0.106893,10332,0.098,13759,0.436,7.849388,3.418808,146989,0.059,55962.0,2.878465,3.489788,34763.0,7.638534,0.259062,0.323,43279.0,9.321749,0.221675,8.999336,59418.0,0.203315,0.537623,15644,0.169492,2.494586,53456.0,3978,6.602525,2.270936,32266,2.690678,58966.0,40579,0.0,8.422764,3.329113,45634.0,2.0,30431,31112.0,,0.083,53289.0,223234,Baldwin,8.987202,5.8126,54.215293
2,1005,2021-01-10,6716.35745,17.333333,1658,69.866667,2292.144655,Alabama,1.539267,0.561955,2.928,0.207938,0.163934,3411,0.12,1279,0.491,8.107733,1.918033,18173,0.121,34186.0,1.217391,2.444444,50417.0,9.137931,0.710145,0.201,30417.0,9.818336,0.416667,9.852632,47031.0,0.317808,0.076631,803,0.181818,1.703576,19760.0,1490,6.142346,1.166667,3287,1.920455,806.0,6486,1.0,6.741935,2.38062,26793.0,0.0,1417,23013.0,,0.067,,24686,Barbour,8.784412,6.136842,10.769826
3,1007,2021-01-10,9158.703224,20.8,2051,92.333333,1612.167481,Alabama,1.316294,0.341853,3.348,0.126562,0.275168,1747,0.096,908,0.572,7.993219,1.651007,15780,0.034,45340.0,2.0,2.400933,,8.76087,0.0,0.278,42708.0,8.871332,0.0,8.384615,50769.0,0.3076,0.57728,616,0.0,1.686369,20329.0,903,7.091493,2.0,2938,7.0,5539.0,7471,,5.833333,2.459823,,,1197,34000.0,,0.02,,22394,Bibb,8.606145,6.357724,13.890616
4,1009,2021-01-10,8760.765054,41.333333,5066,70.933333,1670.103911,Alabama,0.927969,0.614559,2.892,0.166601,0.179545,4894,0.18,4775,0.459,7.990826,1.913636,39627,0.114,48695.0,3.270142,2.334181,99219.0,8.723926,0.43128,0.194,35495.0,8.955796,0.287879,9.79078,49872.0,0.310403,0.800022,1793,0.16129,1.563758,44934.0,2967,6.240595,1.916667,8492,4.177419,22071.0,13489,0.0,8.952381,2.606581,,2.0,3217,,,0.053,65385.0,57826,Blount,8.651714,6.512329,34.624193


In [15]:
y = merged['new_cases_per_100k_15d']
drop_cols = [c for c in merged.select_dtypes(include='number').columns.tolist() if 'cases' in c]
X = merged.select_dtypes(include='number').drop(drop_cols, axis=1).fillna(0.5)
ss = StandardScaler()
Z = ss.fit_transform(X)

In [17]:
reg = RandomForestRegressor()
reg.fit(Z, y)

importance = reg.feature_importances_

In [18]:
importance_df = pd.DataFrame(columns=['feature', 'importance'])
importance_df['feature'] = X.columns.tolist()
importance_df['importance'] = importance

In [19]:
importance_df.sort_values(by='importance', ascending=False).head(10)

Unnamed: 0,feature,importance
4,per_edu_white_nohs,0.099213
20,frequently,0.060641
10,age_pop_black,0.044736
0,area,0.041297
52,pop_density,0.041219
41,edu,0.027981
32,no_hs,0.02756
50,age_pop,0.027169
37,votes_diff,0.025515
27,per_diff,0.025123


In [22]:
reg = XGBRegressor()
reg.fit(Z, y)

importance = reg.feature_importances_

importance_df = pd.DataFrame(columns=['feature', 'importance'])
importance_df['feature'] = X.columns.tolist()
importance_df['importance'] = importance
importance_df.sort_values(by='importance', ascending=False).head(10)



Unnamed: 0,feature,importance
4,per_edu_white_nohs,0.08518
10,age_pop_black,0.059532
32,no_hs,0.039301
20,frequently,0.036397
41,edu,0.034988
37,votes_diff,0.034767
50,age_pop,0.032215
52,pop_density,0.027812
38,hs,0.026921
13,rarely,0.023737


In [23]:
# https://machinelearningmastery.com/calculate-feature-importance-with-python/
wls = LinearRegression()
wls.fit(Z, y)
coefs = wls.coef_

[x for _, x in sorted(zip(coefs, X.columns))][:10]

['pop_25p',
 'always',
 'frequently',
 'sometimes',
 'rarely',
 'tot_pop',
 'median_income',
 'age_pop',
 'edu',
 'pop_density']

In [25]:
# Z = stats.zscore(X)
# Z = sm.add_constant(Z)
# weights = merged['tot_pop']

# wls = sm.WLS(y, Z, weights=weights)

# fit = wls.fit()
# fit.summary()

In [26]:
# [x for _, x in sorted(zip(fit.params[1:], X.columns))]

In [27]:
wls = sm.WLS(y, X, weights=weights)

fit = wls.fit()
fit.summary()

0,1,2,3
Dep. Variable:,new_cases_per_100k_15d,R-squared (uncentered):,0.924
Model:,WLS,Adj. R-squared (uncentered):,0.923
Method:,Least Squares,F-statistic:,741.6
Date:,"Mon, 11 Jan 2021",Prob (F-statistic):,0.0
Time:,11:27:35,Log-Likelihood:,-15902.0
No. Observations:,3142,AIC:,31910.0
Df Residuals:,3091,BIC:,32220.0
Df Model:,51,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
area,0.0009,7.63e-05,11.465,0.000,0.001,0.001
edu_hispanic,-2.8558,1.592,-1.794,0.073,-5.977,0.265
per_edu_hispanic_nohs,11.0043,8.128,1.354,0.176,-4.932,26.941
mask,8.3822,4.540,1.846,0.065,-0.520,17.285
per_edu_white_nohs,180.8935,23.523,7.690,0.000,134.771,227.016
per_edu_twoplus_nohs,4.3087,7.262,0.593,0.553,-9.929,18.547
some_hs,0.0003,7.72e-05,3.976,0.000,0.000,0.000
sometimes,56.5010,11.606,4.868,0.000,33.744,79.258
associates,0.0005,5.07e-05,9.042,0.000,0.000,0.001

0,1,2,3
Omnibus:,664.101,Durbin-Watson:,1.68
Prob(Omnibus):,0.0,Jarque-Bera (JB):,21879.459
Skew:,0.219,Prob(JB):,0.0
Kurtosis:,15.92,Cond. No.,1.14e+16


# Exploring Correlations

In [28]:
def make_correlation_table(
    nyt_df, info_df, x_cols=None, y_cols=None,
    date='latest', useweight=True, weight='tot_pop',
    threshold=0.4
):
    
    df = df_merger(nyt_df, info_df, x_cols, y_cols, date, weight)
    
    wct = pd.DataFrame(index=x_cols, columns=y_cols)
    
    for y in y_cols:
        for x in x_cols:
            wct.loc[x, y] = corr(df[y], df[x], df[weight])
    
    wct = wct[(wct >= threshold) | (wct <= -1 * threshold)].dropna()
    
    return wct.sort_values(by=y_cols[0], ascending=False)

In [31]:
columns = column_selector(info_df, 'all', exclude=['tot'])
make_correlation_table(csse_df, info_df, x_cols=columns, y_cols=['new_cases_per_100k_15d'], threshold=0.15)

Unnamed: 0,new_cases_per_100k_15d,new_cases_15d
no_hs,0.476045,0.97332
some_hs,0.476009,0.964239
some_college,0.462654,0.948088
associates,0.460172,0.946078
pop_25p,0.440774,0.947637
hs,0.439447,0.927567
bachelors,0.402274,0.925863
area,0.398379,0.344875
graduate,0.352004,0.871665
per_edu_hispanic_nohs,0.236341,0.183094


Counties with a high hispanic population have a disproportionately high number of COVID-19 cases per capita in the last 15 days, while counties with high white population seem to have a disproportionately low number of cases. Counties with higher educational attainment and higher income tend to have had less COVID cases per capita in the last 15 days.

In [32]:
def make_correlation_heatmap(
    nyt_df, info_df, date='latest', x_cols=None,
    y_cols=[
        'cases_per_100k', 
        'new_cases_per_100k_15d',
        'delta_new_cases_per_100k_15d',
        'deaths_per_100k',
        'new_deaths_per_100k_15d',
        'delta_new_deaths_per_100k_15d',
        'mortality_rate',
        'mortality_rate_15d'
    ],
    useweight=True, weight='tot_pop', size=50, print_corr=True,
    threshold=0.4
):
    
    df = df_merger(nyt_df, info_df, x_cols, y_cols, date, weight)

    # build weighted correlation matrix from df
    wcm_cols = x_cols + y_cols
    
    wcm = pd.DataFrame(index=x_cols, columns=wcm_cols)
    
    for y in wcm_cols:
        for x in x_cols:
            wcm.loc[x, y] = corr(df[x], df[y], df[weight])
    
    wcm = (wcm.reset_index().rename(columns={'index':'y_feature'}).dropna()
              .melt('y_feature', var_name='x_feature', value_name='corr'))
    wcm['corr'] = np.round(wcm['corr'].astype(float), 4)

    if print_corr:
        print('positive correlations')
        print(
            wcm[(wcm['corr'] >= threshold) & (wcm['corr'] != 1)]
            .sort_values(by=['corr', 'y_feature']).iloc[::2, :]
            .sort_values(by=['y_feature', 'x_feature'])
        )
        print('\nnegative correlations')
        print(
            wcm[(wcm['corr'] <= -1 * threshold) & (wcm['corr'] != -1)]
            .sort_values(by=['corr', 'y_feature']).iloc[::2, :]
            .sort_values(by=['y_feature', 'x_feature'])
        )
    
    # build altair chart
    base = alt.Chart(wcm).encode(
        alt.X(
            'x_feature:O',
            sort=x_cols
        ),
        alt.Y(
            'y_feature:O',
#             sort=columns
        )
    )
    heatmap = base.mark_rect().encode(
        color=alt.Color(
            'corr:Q',
            scale=alt.Scale(
                scheme='redblue',
                domain=[-1, 0, 1]
            )
        ),
        tooltip=[
            alt.Tooltip('x_feature:O'),
            alt.Tooltip('y_feature:O'),
            alt.Tooltip('corr:Q', title='correlation')
        ]
    )
    
    # text
    text = base.mark_text(baseline='middle').encode(
        text=alt.Text('corr:Q',format='.2f'),
        color=alt.condition(
            np.abs(alt.datum.corr) <= 0.5,
            alt.value('black'),
            alt.value('white')
        )
    )
    
    return (heatmap + text).configure_view(step=size)

In [36]:
columns = column_selector(
    info_df, 
    ['per_votes_gop', 'mask', 'edu', 'median_income', 'age_pop', 'pop_density'], 
    include=['per', 'pop'], 
    exclude=['male', 'female', 'tot']
)
make_correlation_heatmap(csse_df, info_df, x_cols=columns, y_cols=['cases_per_100k', 'new_cases_per_100k_15d'], size=50)

positive correlations
   y_feature      x_feature    corr
19       edu  median_income  0.7372
20      mask  median_income  0.4284

negative correlations
   y_feature      x_feature    corr
25       edu  per_votes_gop -0.5019
26      mask  per_votes_gop -0.6557


First, let's discuss features that not quite independent from each other:

selected positive correlations (> 0.4):
- educational attainment and median income
- educational attainment and percent asian
- mask discipline and median income
- mask discipline and percent asian
- mask discipline and percent hispanic
- median income and percent asian
- percent GOP and percent white

selected negative correlations (< -0.4):
- educational attainment and percent GOP
- mask discipline and percent GOP
- mask discipline and percent white
- percent Asian and percent GOP
- population density and percent GOP

Since there seems to be multicollinearity, we can't simply throw our data into a multiple linear regression.

## correlating cases per capita in the last 15 days

There are some (weak) correlations to recent cases per capita:

positive:
- percent GOP
- percent black
- percent hispanic

negative:
- median age
- educational attainment
- mask discipline
- median income
- percent asian
- percent white
- population density

# smoothed percentile timeseries heatmap

In [44]:
merged = df_merger(
    csse_df, info_df, date='all',
    x_cols=column_selector(info_df, 'all', exclude=['male', 'female', 'tot']),
    y_cols=['new_cases_per_100k_15d', 'cases_per_100k']
)

merged.head()

Unnamed: 0,fips,date,new_cases_per_100k_15d,cases_per_100k,cases,new_cases_15d,tot_pop,lon,per_votes_gop,edu_hispanic,pop_density,edu_pacific,per_edu_asian_nohs,age_pop_native,county,median_income_black,per_edu_pacific_nohs,per_edu_black_nohs,associates,per_edu_twoplus_nohs,age_pop,age_pop_twoplus,always,bachelors,median_income,no_hs,per_edu_hispanic_nohs,per_edu_white_nohs,pop_25p,edu_native,frequently,median_income_asian,age_pop_white,mask,some_hs,age_pop_pacific,graduate,edu_other,median_income_hispanic,median_income_pacific,per_diff,edu_white,per_edu_other_nohs,area,median_income_white,age_pop_black,age_pop_hispanic,edu_black,state,rarely,votes_diff,sometimes,median_income_other,age_pop_asian,per_edu_native_nohs,edu,edu_twoplus,lat,median_income_twoplus,edu_asian,median_income_native,never,some_college,hs
0,1001,2020-01-22,0.0,0.0,0,0.0,55869,-86.644082,0.714368,3.591054,36.287947,2.0,0.123028,9.786008,Autauga,27643.0,0.0,0.19555,2998,0.292105,8.422041,5.35251,0.444,5903,58786.0,956,0.125666,0.090371,37166,3.166667,0.295,,8.727963,3.003,3248,7.125,4388,1.503817,83423.0,,0.444184,3.28831,0.248092,1539.602123,65047.0,7.81375,6.707361,2.563808,Alabama,0.074,12335.0,0.134,,7.899381,0.217949,3.174487,2.626316,32.539527,,3.615142,,0.053,7554,12119
1,1001,2020-01-23,0.0,0.0,0,0.0,55869,-86.644082,0.714368,3.591054,36.287947,2.0,0.123028,9.786008,Autauga,27643.0,0.0,0.19555,2998,0.292105,8.422041,5.35251,0.444,5903,58786.0,956,0.125666,0.090371,37166,3.166667,0.295,,8.727963,3.003,3248,7.125,4388,1.503817,83423.0,,0.444184,3.28831,0.248092,1539.602123,65047.0,7.81375,6.707361,2.563808,Alabama,0.074,12335.0,0.134,,7.899381,0.217949,3.174487,2.626316,32.539527,,3.615142,,0.053,7554,12119
2,1001,2020-01-24,0.0,0.0,0,0.0,55869,-86.644082,0.714368,3.591054,36.287947,2.0,0.123028,9.786008,Autauga,27643.0,0.0,0.19555,2998,0.292105,8.422041,5.35251,0.444,5903,58786.0,956,0.125666,0.090371,37166,3.166667,0.295,,8.727963,3.003,3248,7.125,4388,1.503817,83423.0,,0.444184,3.28831,0.248092,1539.602123,65047.0,7.81375,6.707361,2.563808,Alabama,0.074,12335.0,0.134,,7.899381,0.217949,3.174487,2.626316,32.539527,,3.615142,,0.053,7554,12119
3,1001,2020-01-25,0.0,0.0,0,0.0,55869,-86.644082,0.714368,3.591054,36.287947,2.0,0.123028,9.786008,Autauga,27643.0,0.0,0.19555,2998,0.292105,8.422041,5.35251,0.444,5903,58786.0,956,0.125666,0.090371,37166,3.166667,0.295,,8.727963,3.003,3248,7.125,4388,1.503817,83423.0,,0.444184,3.28831,0.248092,1539.602123,65047.0,7.81375,6.707361,2.563808,Alabama,0.074,12335.0,0.134,,7.899381,0.217949,3.174487,2.626316,32.539527,,3.615142,,0.053,7554,12119
4,1001,2020-01-26,0.0,0.0,0,0.0,55869,-86.644082,0.714368,3.591054,36.287947,2.0,0.123028,9.786008,Autauga,27643.0,0.0,0.19555,2998,0.292105,8.422041,5.35251,0.444,5903,58786.0,956,0.125666,0.090371,37166,3.166667,0.295,,8.727963,3.003,3248,7.125,4388,1.503817,83423.0,,0.444184,3.28831,0.248092,1539.602123,65047.0,7.81375,6.707361,2.563808,Alabama,0.074,12335.0,0.134,,7.899381,0.217949,3.174487,2.626316,32.539527,,3.615142,,0.053,7554,12119


In [24]:
merged.shape

(1105984, 63)

In [25]:
info_df['tot_pop'].describe()

count    3.142000e+03
mean     1.044683e+05
std      3.334567e+05
min      8.600000e+01
25%      1.090250e+04
50%      2.572600e+04
75%      6.807275e+04
max      1.003911e+07
Name: tot_pop, dtype: float64

In [26]:
info_df.head()

Unnamed: 0,sumlev,state_fips,state,county,fips,tot_pop,tot_male,tot_female,tot_pop_white_male,tot_pop_white_female,tot_pop_black_male,tot_pop_black_female,tot_pop_native_male,tot_pop_native_female,tot_pop_asian_male,tot_pop_asian_female,tot_pop_pacific_male,tot_pop_pacific_female,tot_pop_twoplus_male,tot_pop_twoplus_female,tot_pop_hispanic_male,tot_pop_hispanic_female,tot_pop_white,tot_pop_black,tot_pop_native,tot_pop_asian,tot_pop_pacific,tot_pop_twoplus,tot_pop_hispanic,age_pop,age_male,age_female,age_pop_white_male,age_pop_white_female,age_pop_black_male,age_pop_black_female,age_pop_native_male,age_pop_native_female,age_pop_asian_male,age_pop_asian_female,age_pop_pacific_male,age_pop_pacific_female,age_pop_twoplus_male,age_pop_twoplus_female,age_pop_hispanic_male,age_pop_hispanic_female,age_pop_white,age_pop_black,age_pop_native,age_pop_asian,age_pop_pacific,age_pop_twoplus,age_pop_hispanic,region,division,area,neighbors,tot_votes_dem,tot_votes_gop,tot_votes,votes_diff,per_diff,median_income,median_income_black,median_income_native,median_income_asian,median_income_pacific,median_income_other,median_income_twoplus,median_income_hispanic,median_income_white,tot_edu_white,tot_edu_white_male,tot_edu_white_female,tot_edu_white_hsplus,tot_edu_white_male_hsplus,tot_edu_white_female_hsplus,tot_edu_white_4yplus,tot_edu_white_male_4yplus,tot_edu_white_female_4yplus,tot_edu_black,tot_edu_black_male,tot_edu_black_female,tot_edu_black_hsplus,tot_edu_black_male_hsplus,tot_edu_black_female_hsplus,tot_edu_black_4yplus,tot_edu_black_male_4yplus,tot_edu_black_female_4yplus,tot_edu_native,tot_edu_native_male,tot_edu_native_female,tot_edu_native_hsplus,tot_edu_native_male_hsplus,tot_edu_native_female_hsplus,tot_edu_native_4yplus,tot_edu_native_male_4yplus,tot_edu_native_female_4yplus,tot_edu_asian,tot_edu_asian_male,tot_edu_asian_female,tot_edu_asian_hsplus,tot_edu_asian_male_hsplus,tot_edu_asian_female_hsplus,tot_edu_asian_4yplus,tot_edu_asian_male_4yplus,tot_edu_asian_female_4yplus,tot_edu_pacific,tot_edu_pacific_male,tot_edu_pacific_female,tot_edu_pacific_hsplus,tot_edu_pacific_male_hsplus,tot_edu_pacific_female_hsplus,tot_edu_pacific_4yplus,tot_edu_pacific_male_4yplus,tot_edu_pacific_female_4yplus,tot_edu_other,tot_edu_other_male,tot_edu_other_female,tot_edu_other_hsplus,tot_edu_other_male_hsplus,tot_edu_other_female_hsplus,tot_edu_other_4yplus,tot_edu_other_male_4yplus,tot_edu_other_female_4yplus,tot_edu_twoplus,tot_edu_twoplus_male,tot_edu_twoplus_female,tot_edu_twoplus_hsplus,tot_edu_twoplus_male_hsplus,tot_edu_twoplus_female_hsplus,tot_edu_twoplus_4yplus,tot_edu_twoplus_male_4yplus,tot_edu_twoplus_female_4yplus,tot_edu_hispanic,tot_edu_hispanic_male,tot_edu_hispanic_female,tot_edu_hispanic_hsplus,tot_edu_hispanic_male_hsplus,tot_edu_hispanic_female_hsplus,tot_edu_hispanic_4yplus,tot_edu_hispanic_male_4yplus,tot_edu_hispanic_female_4yplus,pop_25p,no_hs,some_hs,hs,some_college,associates,bachelors,graduate,edu,edu_white,per_edu_white_nohs,edu_white_male,per_edu_white_male_nohs,edu_white_female,per_edu_white_female_nohs,edu_black,per_edu_black_nohs,edu_black_male,per_edu_black_male_nohs,edu_black_female,per_edu_black_female_nohs,edu_native,per_edu_native_nohs,edu_native_male,per_edu_native_male_nohs,edu_native_female,per_edu_native_female_nohs,edu_asian,per_edu_asian_nohs,edu_asian_male,per_edu_asian_male_nohs,edu_asian_female,per_edu_asian_female_nohs,edu_pacific,per_edu_pacific_nohs,edu_pacific_male,per_edu_pacific_male_nohs,edu_pacific_female,per_edu_pacific_female_nohs,edu_other,per_edu_other_nohs,edu_other_male,per_edu_other_male_nohs,edu_other_female,per_edu_other_female_nohs,edu_twoplus,per_edu_twoplus_nohs,edu_twoplus_male,per_edu_twoplus_male_nohs,edu_twoplus_female,per_edu_twoplus_female_nohs,edu_hispanic,per_edu_hispanic_nohs,edu_hispanic_male,per_edu_hispanic_male_nohs,edu_hispanic_female,per_edu_hispanic_female_nohs,never,rarely,sometimes,frequently,always,mask,pop_density,lat,lon
0,50,1,Alabama,Autauga,1001,55869,27092,28777,20138,21077,5171,5927,105,138,282,364,20,20,492,464,884,787,41215,11098,243,646,40,956,1671,8.422041,8.181973,8.648052,8.518373,8.928216,7.480758,8.104269,9.733333,9.826087,7.276596,8.381868,8.4,5.85,5.195122,5.519397,6.382353,7.072427,8.727963,7.81375,9.786008,7.899381,7.125,5.35251,6.707361,South,East South Central,1539.602123,"[01021, 01047, 01051, 01085, 01101]",7503.0,19838.0,27770.0,12335.0,0.444184,58786.0,27643.0,,,,,,83423.0,65047.0,28726,13834,14892,26130,12588,13542,8440,4573,3867,6786,3042,3744,5459,2436,3023,1296,573,723,78,39,39,61,25,36,25,25,0,317,95,222,278,62,216,118,43,75,32,5,27,32,5,27,0,0,0,262,93,169,197,67,130,0,0,0,380,135,245,269,73,196,92,0,92,939,455,484,821,380,441,346,230,116,37166,956,3248,12119,7554,2998,5903,4388,3.174487,3.28831,0.090371,3.472676,0.090068,3.117043,0.090653,2.563808,0.19555,2.543393,0.199211,2.580395,0.192575,3.166667,0.217949,4.487179,0.358974,1.846154,0.076923,3.615142,0.123028,3.568421,0.347368,3.635135,0.027027,2.0,0.0,2.0,0.0,2.0,0.0,1.503817,0.248092,1.44086,0.27957,1.538462,0.230769,2.626316,0.292105,1.081481,0.459259,3.477551,0.2,3.591054,0.125666,4.197802,0.164835,3.020661,0.088843,0.053,0.074,0.134,0.295,0.444,3.003,36.287947,32.539527,-86.644082
1,50,1,Alabama,Baldwin,1003,223234,108247,114987,89845,95902,9308,9907,753,754,911,1435,53,70,1832,1930,5545,4989,185747,19215,1507,2346,123,3762,10534,8.987202,8.806627,9.157192,9.150993,9.481721,7.587129,8.095791,9.14741,8.851459,7.246981,7.887108,8.150943,8.628571,5.757642,5.864767,6.497565,6.719182,9.321749,7.849388,8.999336,7.638534,8.422764,5.8126,6.602525,South,East South Central,4117.546676,"[01025, 01053, 01097, 01099, 01129]",24578.0,83544.0,109679.0,58966.0,0.537623,55962.0,31112.0,53289.0,34763.0,,45634.0,53456.0,43279.0,59418.0,126316,60310,66006,116288,54788,61500,41648,19863,21785,12006,5593,6413,9565,4129,5436,2164,808,1356,1015,523,492,790,410,380,145,81,64,1180,426,754,980,271,709,243,147,96,9,0,9,9,0,9,0,0,0,938,469,469,695,394,301,262,119,143,1712,853,859,1529,744,785,559,199,360,5119,2749,2370,3823,1813,2010,1389,637,752,146989,3978,10332,40579,32266,13759,30431,15644,3.329113,3.489788,0.079388,3.463621,0.09156,3.513696,0.068267,2.494586,0.203315,2.19882,0.261756,2.752534,0.152347,2.270936,0.221675,2.342256,0.216061,2.195122,0.227642,2.690678,0.169492,2.997653,0.36385,2.517241,0.059682,2.0,0.0,,,2.0,0.0,2.878465,0.259062,2.948827,0.159915,2.808102,0.358209,3.418808,0.106893,2.910903,0.127784,3.923166,0.086147,2.850361,0.253174,2.477628,0.340487,3.2827,0.151899,0.083,0.059,0.098,0.323,0.436,2.968,54.215293,30.72775,-87.722071
2,50,1,Alabama,Barbour,1005,24686,13064,11622,5894,5341,6260,5547,52,43,55,61,21,10,153,132,629,488,11235,11807,95,116,31,285,1117,8.784412,8.463564,9.14507,9.475568,10.196592,7.767732,8.491437,9.346154,10.465116,8.109091,10.065574,6.285714,7.7,6.418301,5.810606,6.434022,5.766393,9.818336,8.107733,9.852632,9.137931,6.741935,6.136842,6.142346,South,East South Central,2292.144655,"[01011, 01045, 01067, 01109, 01113]",4816.0,5622.0,10518.0,806.0,0.076631,34186.0,23013.0,,50417.0,,26793.0,19760.0,30417.0,47031.0,9171,4846,4325,7264,3657,3607,1578,814,764,8137,4304,3833,5551,2776,2775,552,240,312,72,72,0,42,42,0,0,0,0,88,40,48,72,27,45,5,0,5,1,0,1,0,0,0,0,0,0,345,230,115,100,76,24,44,44,0,183,80,103,153,50,103,9,0,9,573,395,178,251,192,59,76,60,16,18173,1490,3411,6486,3287,1279,1417,803,2.38062,2.444444,0.207938,2.349154,0.245357,2.551214,0.166012,1.703576,0.317808,1.568773,0.355019,1.854944,0.276024,1.166667,0.416667,1.166667,0.416667,,,1.920455,0.181818,1.35,0.325,2.395833,0.0625,0.0,1.0,,,0.0,1.0,1.217391,0.710145,1.617391,0.669565,0.417391,0.791304,1.918033,0.163934,1.25,0.375,2.436893,0.0,1.539267,0.561955,1.731646,0.513924,1.11236,0.668539,0.067,0.121,0.12,0.201,0.491,2.928,10.769826,31.868263,-85.387129
3,50,1,Alabama,Bibb,1007,22394,11929,10465,8482,8181,2912,1807,50,41,21,25,5,1,116,130,343,280,16663,4719,91,46,6,246,623,8.606145,8.349484,8.89871,8.645838,9.105122,7.706387,8.455451,8.52,8.219512,7.52381,9.8,6.4,3.0,6.422414,6.3,7.186589,6.975,8.871332,7.993219,8.384615,8.76087,5.833333,6.357724,7.091493,South,East South Central,1612.167481,"[01021, 01065, 01073, 01105, 01117, 01125]",1986.0,7525.0,9595.0,5539.0,0.57728,45340.0,34000.0,,,,,20329.0,42708.0,50769.0,12002,6037,5965,10483,5181,5302,1570,674,896,3316,2146,1170,2296,1377,919,200,83,117,8,8,0,8,8,0,0,0,0,37,16,21,37,16,21,37,16,21,0,0,0,0,0,0,0,0,0,9,9,0,9,9,0,0,0,0,149,108,41,108,89,19,6,6,0,313,171,142,206,95,111,0,0,0,15780,903,1747,7471,2938,908,1197,616,2.459823,2.400933,0.126562,2.27464,0.141792,2.528751,0.111148,1.686369,0.3076,1.476701,0.358341,2.07094,0.21453,2.0,0.0,2.0,0.0,,,7.0,0.0,7.0,0.0,7.0,0.0,,,,,,,2.0,0.0,2.0,0.0,,,1.651007,0.275168,1.925926,0.175926,0.926829,0.536585,1.316294,0.341853,1.111111,0.444444,1.56338,0.21831,0.02,0.034,0.096,0.278,0.572,3.348,13.890616,32.996421,-87.125115
4,50,1,Alabama,Blount,1009,57826,28472,29354,24494,25682,453,419,143,139,73,90,14,7,345,385,2950,2632,50176,872,282,163,21,730,5582,8.651714,8.49136,8.807249,8.771209,9.131843,8.030905,7.947494,10.125874,9.446043,8.30137,9.066667,8.142857,10.571429,6.486957,6.535065,6.4,6.06193,8.955796,7.990826,9.79078,8.723926,8.952381,6.512329,6.240595,South,East South Central,1670.103911,"[01043, 01055, 01073, 01095, 01115, 01127]",2640.0,24711.0,27588.0,22071.0,0.800022,48695.0,,65385.0,99219.0,,,44934.0,35495.0,49872.0,35774,17200,18574,29814,14167,15647,4775,1900,2875,596,281,315,411,192,219,22,10,12,132,22,110,94,14,80,13,13,0,124,43,81,104,43,61,62,25,37,18,0,18,18,0,18,0,0,0,211,106,105,120,56,64,90,28,62,440,212,228,361,154,207,24,22,2,2610,1468,1142,1006,476,530,82,48,34,39627,2967,4894,13489,8492,4775,3217,1793,2.606581,2.334181,0.166601,2.199651,0.176337,2.45876,0.157586,1.563758,0.310403,1.544484,0.316726,1.580952,0.304762,1.916667,0.287879,4.227273,0.363636,1.454545,0.272727,4.177419,0.16129,4.906977,0.0,3.790123,0.246914,2.0,0.0,,,2.0,0.0,3.270142,0.43128,2.377358,0.471698,4.171429,0.390476,1.913636,0.179545,1.971698,0.273585,1.859649,0.092105,0.927969,0.614559,0.811989,0.675749,1.077058,0.535902,0.053,0.114,0.18,0.194,0.459,2.892,34.624193,33.982109,-86.567906


In [27]:
info_df[(info_df['per_gop'] > 0.025) & (info_df['per_gop'] <= 0.075)]['per_gop']

KeyError: 'per_gop'

In [31]:
np.quantile(info_df['per_gop'].tolist(), 0.5)

nan

In [45]:
def df_splitter(info_df, split_on, splits=2, equal_pop=True, mode='verbose'):
    
    if mode not in ['verbose', 'mean', 'percentile']:
        mode = verbose
        
    info_df = info_df[~info_df[split_on].isna()].sort_values(by=split_on)
    
    if equal_pop:
        # https://stackoverflow.com/a/31871770/14083095
        # splitting df into approx equal populations
        info_df['pop_cumsum'] = info_df['tot_pop'].cumsum()
        subpop = info_df['pop_cumsum'].max() / splits
        info_df['split'] = (info_df['pop_cumsum'] / subpop).apply(math.ceil)
    else:
        # splitting df into approx equal shapes
        info_df['split'] = pd.qcut(info_df[split_on], splits)
        
    replace_dict = {}
    to_replace = info_df['split'].unique()
    
    # renaming our splits into something more readable
    for i, s in enumerate(to_replace):
        if mode == 'verbose':
            replace_dict[s] = f"[{info_df.loc[info_df['split']==s,split_on].min():.2f},"\
            f" {info_df.loc[info_df['split']==s,split_on].max():.2f}]"
        elif mode == 'mean':
            replace_dict[s] = np.round(
                info_df.loc[info_df['split']==s,split_on].mean(),
                decimals=3
            )
        else:
            replace_dict[s] = (100/splits) * (int(i)+1)
    info_df['split'] = info_df['split'].replace(replace_dict)
    
    return info_df

In [46]:
def make_heatmap_timeseries(
    nyt_df, info_df, y='new_cases_per_100k', splits=25, split_on=None,
    equal_pop=True, mode='percentile', group=True
):

    y_title = split_on
    y_subtitle = 'county'
    if equal_pop:
        y_subtitle = 'pop'
    # check number of splits and only split on numeric columns
    # otherwise, use names as the different lines (setting splits=1)
    splits = int(splits)
    if split_on in info_df.select_dtypes(exclude='number').columns:
        splits = 1
    y_ = [y]
    # refer to non-per-capita column
    if '_per_100k' in y:
        y_ = [y.replace('_per_100k', '')]
    elif y is 'mortality_rate':
        y_ = ['cases', 'deaths']
    # first split df so that we can plot different lines
    if splits > 1:
        info_df = df_splitter(info_df, split_on, splits, equal_pop, mode)
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on, 'split'], y_cols=y_, date='all',
            weight='tot_pop'
        )
        # 'split' column generated by df_splitter()
        split_on = 'split'
    else:
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on], y_cols=y_, date='all'
        )
        
    # recalculate aggregates
    if '_per_100k' in y:
        y_ = y.replace('_per_100k', '')
        data = merged.groupby(by=['date', split_on])[y_].sum().fillna(0)\
               / merged.groupby(by=['date', split_on])['tot_pop'].sum() * 100_000
    elif y is 'mortality_rate':
        data = merged.groupby(by=['date', split_on])['deaths'].sum()\
               / merged.groupby(by=['date', split_on])['cases'].sum()
    else:
        data = merged.groupby(by=['date', split_on])[y].sum().fillna(0)
    data = data.reset_index().rename(columns={0: y})
    
    y_alt = f'{split_on}:O'
    
    # nearest point selection
    nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['date'], empty='all')
    
    # title
    dx = 160
    dy = splits*9
    title = alt.Chart(data).mark_text(dx=dx, dy=dy, size=20).encode(
        text='monthdate(date):T'
    ).transform_filter(nearest)
    
    w_title = alt.Chart(data).mark_text(dx=dx, dy=dy, stroke='white', strokeWidth=3, size=20).encode(
        text='monthdate(date):T'
    ).transform_filter(nearest)
    
    # right panel: heatmap
    heatmap = alt.Chart(data).mark_rect().encode(
        alt.X(
            'monthdate(date):T',
            axis=alt.Axis(format='%b %d')
        ),
        alt.Y(
            y_alt,
            sort=alt.EncodingSortField(f'{split_on}', order='descending'),
            title=f'{y_title} ({y_subtitle} {mode})'
        ),
        color=alt.Color(
            f'{y}:Q',
            scale=alt.Scale(
                scheme='lightmulti'
            )
        )
    ).add_selection(nearest)
    
    # left panel: bar chart
    bars = alt.Chart(data).mark_bar().encode(
        alt.X(
            f'{y}:Q',
            scale=alt.Scale(
                domain=[0, data[y].max()]
            )
        ),
        alt.Y(
            y_alt,
            sort=alt.EncodingSortField(f'{split_on}', order='descending'),
            title=f'{y_title} ({y_subtitle} {mode})'
        ),
        color=alt.Color(
            f'{y}:Q',
            scale=alt.Scale(
                scheme='lightmulti'
            )
        ),
        tooltip=[
            alt.Tooltip(f'{y}:Q'),
            alt.Tooltip(y_alt),
        ]
    ).transform_filter(nearest)
    
    
    # selects nearest points based on date
    selectors = alt.Chart(data).mark_point().encode(
        x='monthdate(date):T',
        opacity=alt.value(0)
    ).add_selection(nearest)
    
    return (heatmap | bars+w_title+title).properties(
        title=f'{y} vs {y_title}'
    )

# Aggregate Differences

In [39]:
def df_splitter(info_df, split_on, splits=2, equal_pop=True, mode='verbose'):
    
    if mode not in ['verbose', 'mean', 'percentile']:
        mode = verbose
        
    info_df = info_df[~info_df[split_on].isna()].sort_values(by=split_on)
    
    if equal_pop:
        # https://stackoverflow.com/a/31871770/14083095
        # splitting df into approx equal populations
        info_df['pop_cumsum'] = info_df['tot_pop'].cumsum()
        subpop = info_df['pop_cumsum'].max() / splits
        info_df['split'] = (info_df['pop_cumsum'] / subpop).apply(math.ceil)
    else:
        # splitting df into approx equal shapes
        info_df['split'] = pd.qcut(info_df[split_on], splits)
        
    replace_dict = {}
    to_replace = info_df['split'].unique()
    
    # renaming our splits into something more readable
    for i, s in enumerate(to_replace):
        if mode == 'verbose':
            replace_dict[s] = f"[{info_df.loc[info_df['split']==s,split_on].min():.2f},"\
            f" {info_df.loc[info_df['split']==s,split_on].max():.2f}]"
        elif mode == 'mean':
            replace_dict[s] = np.round(
                info_df.loc[info_df['split']==s,split_on].mean(),
                decimals=3
            )
        else:
            replace_dict[s] = (100/splits) * (int(i)+1)
    info_df['split'] = info_df['split'].replace(replace_dict)
    
    return info_df

In [48]:
def make_line_timeseries(
    nyt_df, info_df, y='new_cases_per_100k_15sg', splits=2, split_on=None, 
    equal_pop=True
):
    
    # check number of splits and only split on numeric columns
    # otherwise, use names as the different lines (setting splits=1)
    splits = int(splits)
    if split_on in info_df.select_dtypes(exclude='number').columns:
        splits = 1
    y_ = [y]
    if '_per_100k' in y:
        y_ = [y.replace('_per_100k', '')]
    elif y is 'mortality_rate':
        y_ = ['cases', 'deaths']
    # first split df so that we can plot different lines
    if splits > 1:
        info_df = df_splitter(info_df, split_on, splits, equal_pop)
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on, 'split'], y_cols=y_, date='all',
            weight='tot_pop'
        )
        # 'split' column generated by df_splitter()
        split_on = 'split'
    else:
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on], y_cols=y_, date='all'
        )
        
    # recalculate aggregates
    if '_per_100k' in y:
        y_ = y.replace('_per_100k', '')
        data = merged.groupby(by=['date', split_on])[y_].sum().fillna(0)\
               / merged.groupby(by=['date', split_on])['tot_pop'].sum() * 100_000
    elif y is 'mortality_rate':
        data = merged.groupby(by=['date', split_on])['deaths'].sum()\
               / merged.groupby(by=['date', split_on])['cases'].sum()

#     elif y is 'mortality_rate':
#         data = merged.groupby(by=['date', split_on])[y_].sum().fillna(0)\
#                / merged.groupby(by=['date', split_on])['tot_pop'].sum()
    else:
        data = merged.groupby(by=['date', split_on])[y].sum().fillna(0)
    data = data.reset_index().rename(columns={0: y})
    
    
    # nearest point selection
    nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['date'], empty='none')
    
    # base line chart
    lines = alt.Chart(data).mark_line().encode(
        x='date:T',
        y=alt.Y(
            f'{y}:Q',
            title=y.replace('_', ' ')
        ),
        color=f'{split_on}:O'
    )
    
    # selects nearest points based on date
    selectors = alt.Chart(data).mark_point().encode(
        x='date:T',
        opacity=alt.value(0)
    ).add_selection(nearest)
    
    # marks a point on line where selected
    points = lines.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )
    
    # white background for text
    white_text = lines.mark_text(align='left', dx=5, dy=-5, stroke='white', strokeWidth=3).encode(
        text=alt.condition(nearest, f'{y}:Q', alt.value(' '), format='.1f')
    )
    
    # text showing y value
    text = lines.mark_text(align='left', dx=5, dy=-5).encode(
        text=alt.condition(nearest, f'{y}:Q', alt.value(' '), format='.1f')
    )
    
    # rule showing nearest selector
    rules = alt.Chart(data).mark_rule(color='gray').encode(
        x='date:T',
        size=alt.value(1)
    ).transform_filter(nearest)
    
    return alt.layer(
        lines, selectors, points, rules, white_text, text
    ).configure_axis(
        gridDash=[1,2]
    ).properties(
        width=640, height=384
    )

In [45]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15d', 
    splits=3,
    split_on='per_gop', 
    equal_pop=True)

In [44]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15d', 
    splits=3,
    split_on='mask', 
    equal_pop=True)

In [43]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15d', 
    splits=3,
    split_on='pop_density', 
    equal_pop=True)

# Visualizing Via Heatmap

In [37]:
def make_heatmap_timeseries(
    nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on=None,
    equal_pop=True, mode='percentile'
):
    y_title = split_on
    y_subtitle = 'county'
    if equal_pop:
        y_subtitle = 'pop'
    # check number of splits and only split on numeric columns
    # otherwise, use names as the different lines (setting splits=1)
    splits = int(splits)
    if split_on in info_df.select_dtypes(exclude='number').columns:
        splits = 1
    y_ = [y]
    if '_per_100k' in y:
        y_ = [y.replace('_per_100k', '')]
    elif y is 'mortality_rate':
        y_ = ['cases', 'deaths']
    # first split df so that we can plot different lines
    if splits > 1:
        info_df = df_splitter(info_df, split_on, splits, equal_pop, mode)
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on, 'split'], y_cols=y_, date='all',
            weight='tot_pop'
        )
        # 'split' column generated by df_splitter()
        split_on = 'split'
    else:
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on], y_cols=y_, date='all'
        )
        
    # recalculate aggregates
    if '_per_100k' in y:
        y_ = y.replace('_per_100k', '')
        data = merged.groupby(by=['date', split_on])[y_].sum().fillna(0)\
               / merged.groupby(by=['date', split_on])['tot_pop'].sum() * 100_000
    elif y is 'mortality_rate':
        data = merged.groupby(by=['date', split_on])['deaths'].sum()\
               / merged.groupby(by=['date', split_on])['cases'].sum()
    else:
        data = merged.groupby(by=['date', split_on])[y].sum().fillna(0)
    data = data.reset_index().rename(columns={0: y})
    
    y_alt = f'{split_on}:O'
    
    # nearest point selection
    nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['date'], empty='all')
    
    # title
    dx = 160
    dy = splits*9
    title = alt.Chart(data).mark_text(dx=dx, dy=dy, size=20).encode(
        text='monthdate(date):T'
    ).transform_filter(nearest)
    
    w_title = alt.Chart(data).mark_text(dx=dx, dy=dy, stroke='white', strokeWidth=3, size=20).encode(
        text='monthdate(date):T'
    ).transform_filter(nearest)
    
    # right panel: heatmap
    heatmap = alt.Chart(data).mark_rect().encode(
        alt.X(
            'monthdate(date):T',
            axis=alt.Axis(format='%b %d'),
            scale=alt.Scale(zero=False)
        ),
        alt.Y(
            y_alt,
            sort=alt.EncodingSortField(f'{split_on}', order='descending'),
            title=f'{y_title} ({y_subtitle} {mode})'
        ),
        color=alt.Color(
            f'{y}:Q',
            scale=alt.Scale(
                scheme='lightmulti'
            )
        )
    ).add_selection(nearest)
    
    # left panel: bar chart
    bars = alt.Chart(data).mark_bar().encode(
        alt.X(
            f'{y}:Q',
            scale=alt.Scale(
                domain=[0, data[y].max()]
            )
        ),
        alt.Y(
            y_alt,
            sort=alt.EncodingSortField(f'{split_on}', order='descending'),
            title=f'{y_title} ({y_subtitle} {mode})'
        ),
        color=alt.Color(
            f'{y}:Q',
            scale=alt.Scale(
                scheme='lightmulti'
            )
        ),
        tooltip=[
            alt.Tooltip(f'{y}:Q'),
            alt.Tooltip(y_alt),
        ]
    ).transform_filter(nearest)
    
    
    # selects nearest points based on date
    selectors = alt.Chart(data).mark_point().encode(
        x='monthdate(date):T',
        opacity=alt.value(0)
    ).add_selection(nearest)
    
    return (heatmap | bars+w_title+title).properties(
        title=f'{y} vs {y_title}'
    )

In [40]:
make_heatmap_timeseries(csse_df[csse_df['date'] < '2021-01-01'], info_df, y='new_cases_per_100k_15d', splits=10, split_on='per_votes_gop', equal_pop=True, mode='percentile')

In [41]:
make_heatmap_timeseries(csse_df[csse_df['date'] < '2021-01-01'], info_df, y='new_cases_per_100k_15d', splits=10, split_on='mask', equal_pop=True, mode='percentile')

In [42]:
make_heatmap_timeseries(csse_df[csse_df['date'] < '2021-01-01'], info_df, y='new_cases_per_100k_15d', splits=10, split_on='pop_density', equal_pop=True, mode='percentile')

In [43]:
make_heatmap_timeseries(csse_df[csse_df['date'] < '2021-01-01'], info_df, y='new_cases_per_100k_15d', splits=10, split_on='edu', equal_pop=True, mode='percentile')

In [44]:
make_heatmap_timeseries(csse_df[csse_df['date'] < '2021-01-01'], info_df, y='new_cases_per_100k_15d', splits=10, split_on='median_income', equal_pop=True, mode='percentile')