# Exploring Trends

In [1]:
import numpy as np
import numpy.polynomial.polynomial as poly      # linear regression
import pandas as pd

import math

import pickle

import re

# opening external coordinates
import json

# opening urls
from urllib.request import urlopen

# benchmarking
from time import time

# date/time operations
from datetime import datetime, timedelta
from pytz import timezone

# plotting
import altair as alt
from altair import datum
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# for US map
from vega_datasets import data

In [2]:
# large datasets
# alt.data_transformers.enable('data_server');

pd.options.display.max_rows = 250
pd.options.display.max_columns = 250

In [3]:
#hide
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.
    '''
    dft = df.copy()
    # converts to datetime if possible
    dft = dft.apply(
        lambda col:
        pd.to_datetime(col, errors='ignore')
        if col.dtypes=='object' 
        else col
    )
    
    # if there are less than half as many unique values as there are rows, convert to category
    for col in dft.select_dtypes(include='object'):
        if len(dft[col].unique()) / len(df[col]) < 0.5:
            dft[col] = dft[col].astype('category')
            
    # downcasts numeric columns if possible
    dft = dft.apply(
        lambda col: 
        pd.to_numeric(col, downcast='integer') 
        if col.dtypes=='int64' 
        else col
    )
    dft = dft.apply(
        lambda col: 
        pd.to_numeric(col, downcast='float') 
        if col.dtypes=='float64' 
        else col
    )
    
    return dft

In [4]:
with open('../data/processed/nyt_df.p', 'rb') as f:
    nyt_df = pickle.load(f)
nyt_df.tail()

Unnamed: 0,date,county,state,fips,cases,deaths,cases_per_100k,deaths_per_100k,new_cases,new_deaths,new_cases_per_100k,new_deaths_per_100k,new_cases_15d,new_deaths_15d,new_cases_per_100k_15d,new_deaths_per_100k_15d,new_cases_15sg,new_deaths_15sg,new_cases_per_100k_15sg,new_deaths_per_100k_15sg,delta_new_cases,delta_new_deaths,delta_new_cases_per_100k,delta_new_deaths_per_100k,delta_new_cases_15d,delta_new_deaths_15d,delta_new_cases_per_100k_15d,delta_new_deaths_per_100k_15d,delta_new_cases_15sg,delta_new_deaths_15sg,delta_new_cases_per_100k_15sg,delta_new_deaths_per_100k_15sg,days,mortality_rate,mortality_rate_15d
503041,2020-09-09,Sweetwater,Wyoming,56037,316,2,746.286281,4.723331,2,0,4,0,21.0,0.0,45.0,0.0,2.275,0.0,4.875,0.0,-3,0,-7,0,-3.0,0.0,-7.0,0.0,0.55,0.0,1.133333,0.0,232,0.006329,0.0
503042,2020-09-09,Teton,Wyoming,56039,451,1,1922.09342,4.261848,1,0,4,0,39.0,0.0,161.0,0.0,1.9,0.0,7.683333,0.0,-2,0,-8,0,-3.0,0.0,-13.0,0.0,-0.275,0.0,-1.116667,0.0,232,0.002217,0.0
503043,2020-09-09,Uinta,Wyoming,56041,308,2,1522.792445,9.888263,0,0,0,0,26.0,0.0,120.0,0.0,0.183333,0.0,0.625,0.0,-1,0,-4,0,0.0,0.0,0.0,0.0,-0.65,0.0,-3.0,0.0,232,0.006494,0.0
503044,2020-09-09,Washakie,Wyoming,56043,110,6,1409.352979,76.873799,0,0,0,0,3.0,1.0,36.0,12.0,0.35,0.041667,4.2,0.5,0,0,0,0,0.0,0.0,0.0,0.0,-0.075,-0.025,-0.9,-0.3,232,0.054545,0.333333
503045,2020-09-09,Weston,Wyoming,56045,21,0,303.161542,0.0,2,0,28,0,9.0,0.0,126.0,0.0,0.15,0.0,2.1,0.0,2,0,28,0,1.0,0.0,14.0,0.0,0.416667,0.0,5.833333,0.0,232,0.0,0.0


In [5]:
with open('../data/processed/info_df.p', 'rb') as f:
    info_df = pickle.load(f)
    
with open('../data/processed/geo_altair.p', 'rb') as f:
    geo_altair = pickle.load(f)

In [4]:
info_df.to_csv('../data/processed/info_df.csv', index=False)

# Helper Functions

In [6]:
def column_selector(info_df, columns='none', mask=[], exclude=[]):
    
    # only select from numeric columns
    all_columns = info_df.select_dtypes(include='number').columns.tolist()
    
    # empty container if we don't have a list going already
    if columns is 'none':
        columns = []
    elif columns is 'all':
        columns = all_columns
    
    # includes all columns that have all elements in mask
    # excludes all columns that have any elements in exclude
    if len(mask) > 0:
        mask_columns = list(set([
            c for c in all_columns
            if all(m in set(re.findall('[0-9a-z]+', c))
            for m in mask)
        ]))
        columns += mask_columns
    if len(exclude) > 0:
        columns = list(set([
            c for c in columns 
            if all(e not in set(re.findall('[0-9a-z]+', c)) 
            for e in exclude)
        ]))
        
    return sorted(list(set(columns)))

In [7]:
column_selector(info_df, mask=['edu'], exclude=['tot', 'per', 'male', 'female'])

['edu',
 'edu_asian',
 'edu_black',
 'edu_hispanic',
 'edu_native',
 'edu_other',
 'edu_pacific',
 'edu_twoplus',
 'edu_white']

In [8]:
def corr(x, y, w, useweight=True):
    
    # only uses elements that are not nan from both lists
    x_ids = ~np.isnan(x)
    y_ids = ~np.isnan(y)
    ids = x_ids & y_ids
    
    if useweight:
        try:
            [xx, xy], [_, yy] = np.cov(x[ids], y[ids], aweights=w[ids])
        except:
            print(x.name)
            print(y.name)
    else:
        [xx, xy], [_, yy] = np.cov(x[ids], y[ids])
    
    return xy / np.sqrt(xx * yy)

In [9]:
def df_merger(nyt_df, info_df, x_cols=None, y_cols=None, date='latest', weight='tot_pop'):
    
    # make sure x and y are valid
    all_y = nyt_df.columns.tolist()
    for y in y_cols:
        if '_per_100k' in y:
            y_cols.append(y.replace('_per_100k', ''))
    y_cols = sorted(list(set([y for y in y_cols if y in all_y])))
    
    all_x = info_df.columns.tolist()
    x_cols = sorted(list(set([c for c in x_cols if c in all_x])))
    
    ## only process specific date and y_cols
    left_columns = list(set(['date', 'fips'] + y_cols))
    if date=='latest':
        left_df = nyt_df[nyt_df['date']==nyt_df['date'].max()][left_columns]
    elif date=='all':
        left_df = nyt_df[left_columns]
    else:
        left_df = nyt_df[nyt_df['date']==date][left_columns]

    ## only process specific x_cols
    right_columns = list(set(['fips', 'state', 'county', weight] + x_cols))
    right_df = info_df[right_columns]
    
    # https://stackoverflow.com/a/47118728/14083095
    # fills nyt_df with entries for counties that do not log cases
    # for more accurate aggregate per capita calculations
    
    # create multiindex that has every fips with every date
    mux = pd.MultiIndex \
            .from_product(
                [left_df['date'].unique(), 
                 right_df['fips'].unique()], 
                names=('date', 'fips')
            )
    # reindex data to multiindex, fill nan entries with 0
    left_df = left_df.set_index(['date','fips']) \
                     .reindex(mux)               \
                     .swaplevel(0,1)             \
                     .reset_index()              \
                     .fillna(0)
   
    df = left_df.merge(right_df, on='fips', how='outer', suffixes=('_x', ''))
    df = df.drop([x for x in df.columns if x[-2:]=='_x'], axis=1)
    
    return df

In [10]:
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [11]:
merged = df_merger(
    nyt_df, info_df, 
    x_cols=column_selector(info_df, 'all', exclude=['male', 'female', 'tot', 'lat', 'lon']),
    y_cols=['new_cases_per_100k_15d', 'cases_per_100k']
)

merged.head()

Unnamed: 0,fips,date,cases,new_cases_15d,cases_per_100k,new_cases_per_100k_15d,edu_hispanic,median_income_black,some_hs,median_income_other,median_income_hispanic,graduate,per_edu_asian_nohs,sometimes,per_edu_native_nohs,some_college,age_pop_hispanic,median_income_white,per_edu_black_nohs,area_land,per_pop_white,per_edu_pacific_nohs,per_pop_hispanic,age_pop_pacific,edu_black,age_pop,age_pop_black,bachelors,edu_white,hs,median_income_native,tot_pop,median_income,per_edu_other_nohs,per_votes,per_pop_pacific,pop_density,frequently,edu,edu_asian,median_income_pacific,median_income_asian,per_edu_white_nohs,per_pop_twoplus,pop_25p,median_income_twoplus,associates,no_hs,edu_twoplus,mask,always,rarely,state,edu_other,per_pop_native,per_pop_black,never,per_pop_asian,per_edu_hispanic_nohs,age_pop_native,per_edu_twoplus_nohs,edu_native,age_pop_twoplus,age_pop_asian,county,age_pop_white,per_gop,edu_pacific
0,1001,2020-09-09,1522.0,168.0,2724.229895,295.0,3.591054,27643.0,3248,,83423.0,4388,0.123028,0.134,0.217949,7554,6.707361,65047.0,0.19555,1539.602123,0.737708,0.0,0.029909,7.125,2.563808,8.422041,7.81375,5903,3.28831,12119,,55869,58786.0,0.248092,0.441408,0.000716,36.287947,0.295,3.174487,3.615142,,,0.090371,0.017111,37166,,2998,956,2.626316,3.003,0.444,0.074,Alabama,1.503817,0.004349,0.198643,0.053,0.011563,0.125666,9.786008,0.292105,3.166667,5.35251,7.899381,Autauga,8.727963,0.754018,2.0
1,1003,2020-09-09,4787.0,597.0,2144.386608,261.0,2.850361,31112.0,10332,45634.0,43279.0,15644,0.169492,0.098,0.221675,32266,6.602525,59418.0,0.203315,4117.546676,0.832073,0.0,0.047188,8.422764,2.494586,8.987202,7.849388,30431,3.489788,40579,53289.0,223234,55962.0,0.259062,0.421486,0.000551,54.215293,0.323,3.329113,2.690678,,34763.0,0.079388,0.016852,146989,53456.0,13759,3978,3.418808,2.968,0.436,0.059,Alabama,2.878465,0.006751,0.086076,0.083,0.010509,0.253174,8.999336,0.106893,2.270936,5.8126,7.638534,Baldwin,9.321749,0.798123,2.0
2,1005,2020-09-09,778.0,54.0,3151.583894,216.0,1.539267,23013.0,3411,26793.0,30417.0,803,0.181818,0.12,0.416667,3287,6.142346,47031.0,0.317808,2292.144655,0.455116,1.0,0.045248,6.741935,1.703576,8.784412,8.107733,1417,2.444444,6486,,24686,34186.0,0.710145,0.420886,0.001256,10.769826,0.201,2.38062,1.920455,,50417.0,0.207938,0.011545,18173,19760.0,1279,1490,1.918033,2.928,0.491,0.121,Alabama,1.217391,0.003848,0.478287,0.067,0.004699,0.561955,9.852632,0.163934,1.166667,6.136842,9.137931,Barbour,9.818336,0.528359,0.0
3,1007,2020-09-09,591.0,64.0,2639.099759,275.0,1.316294,34000.0,1747,,42708.0,616,0.0,0.096,0.0,2938,7.091493,50769.0,0.3076,1612.167481,0.744083,,0.02782,5.833333,1.686369,8.606145,7.993219,1197,2.400933,7471,,22394,45340.0,0.0,0.39064,0.000268,13.890616,0.278,2.459823,7.0,,,0.126562,0.010985,15780,20329.0,908,903,1.651007,3.348,0.572,0.034,Alabama,2.0,0.004064,0.210726,0.02,0.002054,0.341853,8.384615,0.275168,2.0,6.357724,8.76087,Bibb,8.871332,0.78227,
4,1009,2020-09-09,1401.0,241.0,2422.785598,411.0,0.927969,,4894,,35495.0,1793,0.16129,0.18,0.287879,8492,6.240595,49872.0,0.310403,1670.103911,0.867707,0.0,0.096531,8.952381,1.563758,8.651714,7.990826,3217,2.334181,13489,65385.0,57826,48695.0,0.43128,0.438972,0.000363,34.624193,0.194,2.606581,4.177419,,99219.0,0.166601,0.012624,39627,44934.0,4775,2967,1.913636,2.892,0.459,0.114,Alabama,3.270142,0.004877,0.01508,0.053,0.002819,0.614559,9.79078,0.179545,1.916667,6.512329,8.723926,Blount,8.955796,0.913855,2.0


In [94]:
y = merged['new_cases_per_100k_15d']
drop_cols = [c for c in merged.select_dtypes(include='number').columns.tolist() if 'cases' in c]
X = merged.select_dtypes(include='number').drop(drop_cols, axis=1).fillna(0.5)
ss = StandardScaler()
Z = ss.fit_transform(X)

In [60]:
from sklearn.ensemble import RandomForestRegressor

In [83]:
reg = RandomForestRegressor()
reg.fit(Z, y)

importance = reg.feature_importances_

In [84]:
importance_df = pd.DataFrame(columns=['feature', 'importance'])
importance_df['feature'] = X.columns.tolist()
importance_df['importance'] = importance
importance_df.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
40,per_pop_black,0.087891
26,age_pop_twoplus,0.06615
9,per_votes,0.051346
10,median_income_hispanic,0.04968
36,mask,0.043538
18,edu,0.043256
34,per_pop_white,0.04013
16,median_income,0.034286
7,per_gop,0.032593
8,per_pop_twoplus,0.031074


In [86]:
from xgboost import XGBRegressor

In [95]:
reg = XGBRegressor()
reg.fit(Z, y)

importance = reg.feature_importances_

importance_df = pd.DataFrame(columns=['feature', 'importance'])
importance_df['feature'] = X.columns.tolist()
importance_df['importance'] = importance
importance_df.sort_values(by='importance', ascending=False)



Unnamed: 0,feature,importance
21,no_hs,0.155314
51,per_pop_black,0.057617
12,rarely,0.053008
34,age_pop_twoplus,0.051315
0,edu_pacific,0.046423
15,median_income_hispanic,0.03549
9,per_pop_twoplus,0.034069
25,edu,0.028447
45,age_pop_hispanic,0.027031
13,sometimes,0.025557


In [56]:
# https://machinelearningmastery.com/calculate-feature-importance-with-python/
wls = LinearRegression()
wls.fit(Z, y)
coefs = wls.coef_

for i, v in enumerate(coefs):
    print(f'Feature {i}:: Score: {v:.4f}')

Feature 0:: Score: -34.5082
Feature 1:: Score: -30.6381
Feature 2:: Score: -36.0355
Feature 3:: Score: -26.7619
Feature 4:: Score: -1.4084


In [39]:

X = stats.zscore(X)
X = sm.add_constant(X)
weights = merged['tot_pop']

wls = sm.WLS(y, X, weights=weights)

fit = wls.fit()
fit.summary()

0,1,2,3
Dep. Variable:,new_cases_per_100k_15d,R-squared:,0.144
Model:,WLS,Adj. R-squared:,0.143
Method:,Least Squares,F-statistic:,105.8
Date:,"Fri, 04 Sep 2020",Prob (F-statistic):,1.8300000000000002e-103
Time:,15:44:41,Log-Likelihood:,-22031.0
No. Observations:,3140,AIC:,44070.0
Df Residuals:,3134,BIC:,44110.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,236.1857,4.028,58.634,0.000,228.288,244.084
x1,-29.5564,3.464,-8.532,0.000,-36.349,-22.764
x2,-25.2531,3.919,-6.444,0.000,-32.937,-17.570
x3,-46.4902,4.427,-10.500,0.000,-55.171,-37.809
x4,-13.5220,3.158,-4.282,0.000,-19.714,-7.330
x5,-6.1741,0.597,-10.349,0.000,-7.344,-5.004

0,1,2,3
Omnibus:,3206.938,Durbin-Watson:,1.632
Prob(Omnibus):,0.0,Jarque-Bera (JB):,474861.136
Skew:,4.644,Prob(JB):,0.0
Kurtosis:,62.525,Cond. No.,11.9


In [46]:
x_cols = ['per_pop_hispanic']
y_cols = 'new_cases_per_100k_15d'

merged = df_merger(
    nyt_df, info_df, 
    x_cols=x_cols,
    y_cols=[y_cols]
)

y = merged[y_cols]
X = merged[x_cols].fillna(0.5)
X = stats.zscore(X)
X = sm.add_constant(X)
weights = merged['tot_pop']

wls = sm.WLS(y, X, weights=weights)

fit = wls.fit()
fit.summary()

0,1,2,3
Dep. Variable:,new_cases_per_100k_15d,R-squared:,0.018
Model:,WLS,Adj. R-squared:,0.018
Method:,Least Squares,F-statistic:,56.98
Date:,"Fri, 04 Sep 2020",Prob (F-statistic):,5.72e-14
Time:,15:48:28,Log-Likelihood:,-22248.0
No. Observations:,3140,AIC:,44500.0
Df Residuals:,3138,BIC:,44510.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,173.3941,3.040,57.029,0.000,167.433,179.356
x1,16.6015,2.199,7.549,0.000,12.289,20.914

0,1,2,3
Omnibus:,2696.453,Durbin-Watson:,1.504
Prob(Omnibus):,0.0,Jarque-Bera (JB):,335935.817
Skew:,3.472,Prob(JB):,0.0
Kurtosis:,53.194,Cond. No.,1.81


# Exploring Correlations

In [60]:
def make_correlation_table(
    nyt_df, info_df, x_cols=None, y_cols=None,
    date='latest', useweight=True, weight='tot_pop',
    threshold=0.4
):
    
    df = df_merger(nyt_df, info_df, x_cols, y_cols, date, weight)
    
    wct = pd.DataFrame(index=x_cols, columns=y_cols)
    
    for y in y_cols:
        for x in x_cols:
            wct.loc[x, y] = corr(df[y], df[x], df[weight])
    
    wct = wct[(wct >= threshold) | (wct <= -1 * threshold)].dropna()
    
    return wct.sort_values(by=y_cols[0], ascending=False)

In [65]:
columns = column_selector(info_df, 'all')
make_correlation_table(nyt_df, info_df, x_cols=columns, y_cols=['new_cases_per_100k_15d'], threshold=0.15)

Unnamed: 0,new_cases_per_100k_15d,new_cases_15d
sometimes,0.23238,-0.264128
per_edu_hispanic_nohs,0.224571,0.208264
never,0.207661,-0.242312
per_edu_hispanic_female_nohs,0.204751,0.246553
per_edu_white_female_nohs,0.1958,-0.186053
rarely,0.190198,-0.300529
per_edu_white_nohs,0.188966,-0.224939
per_edu_white_male_nohs,0.176967,-0.251602
per_pop_hispanic_male,0.174789,0.577865
per_pop_hispanic,0.171016,0.579669


Counties with a high hispanic population have a disproportionately high number of COVID-19 cases per capita in the last 15 days, while counties with high white population seem to have a disproportionately low number of cases. Counties with higher educational attainment and higher income tend to have had less COVID cases per capita in the last 15 days.

In [80]:
def make_correlation_heatmap(
    nyt_df, info_df, date='latest', x_cols=None,
    y_cols=[
        'cases_per_100k', 
        'new_cases_per_100k_15d',
        'delta_new_cases_per_100k_15d',
        'deaths_per_100k',
        'new_deaths_per_100k_15d',
        'delta_new_deaths_per_100k_15d',
        'mortality_rate',
        'mortality_rate_15d'
    ],
    useweight=True, weight='tot_pop', size=50, print_corr=True,
    threshold=0.4
):
    
    df = df_merger(nyt_df, info_df, x_cols, y_cols, date, weight)

    # build weighted correlation matrix from df
    wcm_cols = x_cols + y_cols
    
    wcm = pd.DataFrame(index=x_cols, columns=wcm_cols)
    
    for y in wcm_cols:
        for x in x_cols:
            wcm.loc[x, y] = corr(df[x], df[y], df[weight])
    
    wcm = (wcm.reset_index().rename(columns={'index':'y_feature'}).dropna()
              .melt('y_feature', var_name='x_feature', value_name='corr'))
    wcm['corr'] = np.round(wcm['corr'].astype(float), 4)

    if print_corr:
        print('positive correlations')
        print(
            wcm[(wcm['corr'] >= threshold) & (wcm['corr'] != 1)]
            .sort_values(by=['corr', 'y_feature']).iloc[::2, :]
            .sort_values(by=['y_feature', 'x_feature'])
        )
        print('\nnegative correlations')
        print(
            wcm[(wcm['corr'] <= -1 * threshold) & (wcm['corr'] != -1)]
            .sort_values(by=['corr', 'y_feature']).iloc[::2, :]
            .sort_values(by=['y_feature', 'x_feature'])
        )
    
    # build altair chart
    base = alt.Chart(wcm).encode(
        alt.X(
            'x_feature:O',
            sort=x_cols
        ),
        alt.Y(
            'y_feature:O',
#             sort=columns
        )
    )
    heatmap = base.mark_rect().encode(
        color=alt.Color(
            'corr:Q',
            scale=alt.Scale(
                scheme='redblue',
                domain=[-1, 0, 1]
            )
        ),
        tooltip=[
            alt.Tooltip('x_feature:O'),
            alt.Tooltip('y_feature:O'),
            alt.Tooltip('corr:Q', title='correlation')
        ]
    )
    
    # text
    text = base.mark_text(baseline='middle').encode(
        text=alt.Text('corr:Q',format='.2f'),
        color=alt.condition(
            np.abs(alt.datum.corr) <= 0.5,
            alt.value('black'),
            alt.value('white')
        )
    )
    
    return (heatmap + text).configure_view(step=size)

In [81]:
columns = column_selector(
    info_df, 
    ['per_gop', 'mask', 'edu', 'median_income', 'age_pop', 'pop_density'], 
    mask=['per', 'pop'], 
    exclude=['male', 'female', 'tot']
)
make_correlation_heatmap(nyt_df, info_df, x_cols=columns, y_cols=['cases_per_100k', 'new_cases_per_100k_15d'], size=50)

positive correlations
            y_feature         x_feature    corr
40                edu     median_income  0.7337
66                edu     per_pop_asian  0.4250
41               mask     median_income  0.4294
67               mask     per_pop_asian  0.4586
93               mask  per_pop_hispanic  0.4599
147           per_gop     per_pop_white  0.6777
44      per_pop_asian     median_income  0.6062
135     per_pop_asian   per_pop_twoplus  0.4405
176  per_pop_hispanic    cases_per_100k  0.4973
215  per_pop_hispanic     new_cases_15d  0.5797
139   per_pop_pacific   per_pop_twoplus  0.8461

negative correlations
            y_feature       x_feature    corr
53                edu         per_gop -0.4200
54               mask         per_gop -0.6443
212           per_gop   new_cases_15d -0.4196
69            per_gop   per_pop_asian -0.5709
160           per_gop     pop_density -0.4707
149     per_pop_black   per_pop_white -0.4284
150  per_pop_hispanic   per_pop_white -0.7645
206     per

First, let's discuss features that not quite independent from each other:

selected positive correlations (> 0.4):
- educational attainment and median income
- educational attainment and percent asian
- mask discipline and median income
- mask discipline and percent asian
- mask discipline and percent hispanic
- median income and percent asian
- percent GOP and percent white

selected negative correlations (< -0.4):
- educational attainment and percent GOP
- mask discipline and percent GOP
- mask discipline and percent white
- percent Asian and percent GOP
- population density and percent GOP

Since there seems to be multicollinearity, we can't simply throw our data into a multiple linear regression.

## correlating cases per capita in the last 15 days

There are some (weak) correlations to recent cases per capita:

positive:
- percent GOP
- percent black
- percent hispanic

negative:
- median age
- educational attainment
- mask discipline
- median income
- percent asian
- percent white
- population density

# Aggregate Differences

In [82]:
def df_splitter(info_df, split_on, splits=2, equal_pop=True, mode='verbose'):
    
    if mode not in ['verbose', 'mean', 'percentile']:
        mode = verbose
        
    info_df = info_df[~info_df[split_on].isna()].sort_values(by=split_on)
    
    if equal_pop:
        # https://stackoverflow.com/a/31871770/14083095
        # splitting df into approx equal populations
        info_df['pop_cumsum'] = info_df['tot_pop'].cumsum()
        subpop = info_df['pop_cumsum'].max() / splits
        info_df['split'] = (info_df['pop_cumsum'] / subpop).apply(math.ceil)
    else:
        # splitting df into approx equal shapes
        info_df['split'] = pd.qcut(info_df[split_on], splits)
        
    replace_dict = {}
    to_replace = info_df['split'].unique()
    
    # renaming our splits into something more readable
    for i, s in enumerate(to_replace):
        if mode == 'verbose':
            replace_dict[s] = f"[{info_df.loc[info_df['split']==s,split_on].min():.2f},"\
            f" {info_df.loc[info_df['split']==s,split_on].max():.2f}]"
        elif mode == 'mean':
            replace_dict[s] = np.round(
                info_df.loc[info_df['split']==s,split_on].mean(),
                decimals=3
            )
        else:
            replace_dict[s] = (100/splits) * (int(i)+1)
    info_df['split'] = info_df['split'].replace(replace_dict)
    
    return info_df

In [86]:
def make_line_timeseries(
    nyt_df, info_df, y='new_cases_per_100k_15sg', splits=2, split_on=None, 
    equal_pop=True
):
    
    # check number of splits and only split on numeric columns
    # otherwise, use names as the different lines (setting splits=1)
    splits = int(splits)
    if split_on in info_df.select_dtypes(exclude='number').columns:
        splits = 1
    y_ = [y]
    if '_per_100k' in y:
        y_ = [y.replace('_per_100k', '')]
    elif y is 'mortality_rate':
        y_ = ['cases', 'deaths']
    # first split df so that we can plot different lines
    if splits > 1:
        info_df = df_splitter(info_df, split_on, splits, equal_pop)
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on, 'split'], y_cols=y_, date='all',
            weight='tot_pop'
        )
        # 'split' column generated by df_splitter()
        split_on = 'split'
    else:
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on], y_cols=y_, date='all'
        )
        
    # recalculate aggregates
    if '_per_100k' in y:
        y_ = y.replace('_per_100k', '')
        data = merged.groupby(by=['date', split_on])[y_].sum().fillna(0)\
               / merged.groupby(by=['date', split_on])['tot_pop'].sum() * 100_000
    elif y is 'mortality_rate':
        data = merged.groupby(by=['date', split_on])['deaths'].sum()\
               / merged.groupby(by=['date', split_on])['cases'].sum()

#     elif y is 'mortality_rate':
#         data = merged.groupby(by=['date', split_on])[y_].sum().fillna(0)\
#                / merged.groupby(by=['date', split_on])['tot_pop'].sum()
    else:
        data = merged.groupby(by=['date', split_on])[y].sum().fillna(0)
    data = data.reset_index().rename(columns={0: y})
    
    
    # nearest point selection
    nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['date'], empty='none')
    
    # base line chart
    lines = alt.Chart(data).mark_line().encode(
        x='date:T',
        y=alt.Y(
            f'{y}:Q',
            title=y.replace('_', ' ')
        ),
        color=f'{split_on}:N'
    )
    
    # selects nearest points based on date
    selectors = alt.Chart(data).mark_point().encode(
        x='date:T',
        opacity=alt.value(0)
    ).add_selection(nearest)
    
    # marks a point on line where selected
    points = lines.mark_point().encode(
        opacity=alt.condition(nearest, alt.value(1), alt.value(0))
    )
    
    # white background for text
    white_text = lines.mark_text(align='left', dx=5, dy=-5, stroke='white', strokeWidth=3).encode(
        text=alt.condition(nearest, f'{y}:Q', alt.value(' '), format='.1f')
    )
    
    # text showing y value
    text = lines.mark_text(align='left', dx=5, dy=-5).encode(
        text=alt.condition(nearest, f'{y}:Q', alt.value(' '), format='.1f')
    )
    
    # rule showing nearest selector
    rules = alt.Chart(data).mark_rule(color='gray').encode(
        x='date:T',
        size=alt.value(1)
    ).transform_filter(nearest)
    
    return alt.layer(
        lines, selectors, points, rules, white_text, text
    ).configure_axis(
        gridDash=[1,2]
    ).properties(
        width=640, height=384
    )

In [87]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15sg', 
    splits=2,
    split_on='per_gop', 
    equal_pop=True)

In [88]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15sg', 
    splits=2,
    split_on='per_pop_black', 
    equal_pop=True)

In [89]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15sg', 
    splits=2,
    split_on='per_pop_hispanic', 
    equal_pop=True)

In [90]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15sg', 
    splits=2,
    split_on='edu', 
    equal_pop=True)

In [91]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15sg', 
    splits=2,
    split_on='mask', 
    equal_pop=True)

In [92]:
make_line_timeseries(
    nyt_df, 
    info_df, 
    y='new_cases_per_100k_15sg', 
    splits=2,
    split_on='median_income', 
    equal_pop=True)

# Visualizing Via Heatmap

In [112]:
def make_heatmap_timeseries(
    nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on=None,
    equal_pop=True, mode='percentile'
):
    y_title = split_on
    y_subtitle = 'county'
    if equal_pop:
        y_subtitle = 'pop'
    # check number of splits and only split on numeric columns
    # otherwise, use names as the different lines (setting splits=1)
    splits = int(splits)
    if split_on in info_df.select_dtypes(exclude='number').columns:
        splits = 1
    y_ = [y]
    if '_per_100k' in y:
        y_ = [y.replace('_per_100k', '')]
    elif y is 'mortality_rate':
        y_ = ['cases', 'deaths']
    # first split df so that we can plot different lines
    if splits > 1:
        info_df = df_splitter(info_df, split_on, splits, equal_pop, mode)
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on, 'split'], y_cols=y_, date='all',
            weight='tot_pop'
        )
        # 'split' column generated by df_splitter()
        split_on = 'split'
    else:
        merged = df_merger(
            nyt_df, info_df, x_cols=[split_on], y_cols=y_, date='all'
        )
        
    # recalculate aggregates
    if '_per_100k' in y:
        y_ = y.replace('_per_100k', '')
        data = merged.groupby(by=['date', split_on])[y_].sum().fillna(0)\
               / merged.groupby(by=['date', split_on])['tot_pop'].sum() * 100_000
    elif y is 'mortality_rate':
        data = merged.groupby(by=['date', split_on])['deaths'].sum()\
               / merged.groupby(by=['date', split_on])['cases'].sum()
    else:
        data = merged.groupby(by=['date', split_on])[y].sum().fillna(0)
    data = data.reset_index().rename(columns={0: y})
    
    y_alt = f'{split_on}:O'
    
    # nearest point selection
    nearest = alt.selection(type='single', nearest=True, on='mouseover',
                            fields=['date'], empty='all')
    
    # title
    dx = 160
    dy = splits*9
    title = alt.Chart(data).mark_text(dx=dx, dy=dy, size=20).encode(
        text='monthdate(date):T'
    ).transform_filter(nearest)
    
    w_title = alt.Chart(data).mark_text(dx=dx, dy=dy, stroke='white', strokeWidth=3, size=20).encode(
        text='monthdate(date):T'
    ).transform_filter(nearest)
    
    # right panel: heatmap
    heatmap = alt.Chart(data).mark_rect().encode(
        alt.X(
            'monthdate(date):T',
            axis=alt.Axis(format='%b %d')
        ),
        alt.Y(
            y_alt,
            sort=alt.EncodingSortField(f'{split_on}', order='descending'),
            title=f'{y_title} ({y_subtitle} {mode})'
        ),
        color=alt.Color(
            f'{y}:Q',
            scale=alt.Scale(
                scheme='lightmulti'
            )
        )
    ).add_selection(nearest)
    
    # left panel: bar chart
    bars = alt.Chart(data).mark_bar().encode(
        alt.X(
            f'{y}:Q',
            scale=alt.Scale(
                domain=[0, data[y].max()]
            )
        ),
        alt.Y(
            y_alt,
            sort=alt.EncodingSortField(f'{split_on}', order='descending'),
            title=f'{y_title} ({y_subtitle} {mode})'
        ),
        color=alt.Color(
            f'{y}:Q',
            scale=alt.Scale(
                scheme='lightmulti'
            )
        ),
        tooltip=[
            alt.Tooltip(f'{y}:Q'),
            alt.Tooltip(y_alt),
        ]
    ).transform_filter(nearest)
    
    
    # selects nearest points based on date
    selectors = alt.Chart(data).mark_point().encode(
        x='monthdate(date):T',
        opacity=alt.value(0)
    ).add_selection(nearest)
    
    return (heatmap | bars+w_title+title).properties(
        title=f'{y} vs {y_title}'
    )

In [124]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on='per_gop', equal_pop=True, mode='percentile')

In [125]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on='per_pop_white', equal_pop=True, mode='percentile')

In [126]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on='per_pop_hispanic', equal_pop=True, mode='percentile')

In [127]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on='per_pop_black', equal_pop=True, mode='percentile')

In [128]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on='edu', equal_pop=True, mode='percentile')

In [129]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on='mask', equal_pop=True, mode='percentile')

In [130]:
make_heatmap_timeseries(nyt_df, info_df, y='new_cases_per_100k_15sg', splits=10, split_on='median_income', equal_pop=True, mode='percentile')