In [1]:
#hide
import json
from datetime import datetime, timedelta
from pytz import timezone
from time import time
from urllib.request import urlopen

import numpy as np
import numpy.polynomial.polynomial as poly
import pandas as pd

import altair as alt
from altair import datum
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

alt.data_transformers.enable('data_server');

In [2]:
#hide
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.

    The input is copied first, so the caller's frame is never modified.
    Three passes are applied, in this order:

    1. every ``object`` column that parses cleanly as datetimes is converted
       to a datetime dtype; columns that do not parse are left as they are,
    2. every remaining ``object`` column whose number of distinct values is
       less than half the number of rows becomes ``category``,
    3. ``int64`` columns are downcast to the smallest integer dtype and
       ``float64`` columns to the smallest float dtype pandas will accept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and possibly narrower dtypes.
    '''
    dft = df.copy()
    n_rows = len(dft)

    # converts to datetime if possible
    # (explicit try/except instead of the deprecated errors='ignore')
    for col in dft.select_dtypes(include='object'):
        try:
            dft[col] = pd.to_datetime(dft[col])
        except (ValueError, TypeError, OverflowError):
            # not a date-like column: deliberately keep it unchanged
            continue

    # if there are less than half as many unique values as there are rows, convert to category
    # (the n_rows guard avoids dividing by zero on an empty frame)
    for col in dft.select_dtypes(include='object'):
        if n_rows and len(dft[col].unique()) / n_rows < 0.5:
            dft[col] = dft[col].astype('category')

    # downcasts numeric columns if possible
    for col in dft.select_dtypes(include='int64'):
        dft[col] = pd.to_numeric(dft[col], downcast='integer')
    for col in dft.select_dtypes(include='float64'):
        dft[col] = pd.to_numeric(dft[col], downcast='float')

    return dft

In [3]:
#hide
# Main table used by every chart below; optimize() narrows its column dtypes.
df = optimize(pd.read_csv('data/df.csv'))
# Second table, used by the iceberg helpers to find the equal-population split.
# `fips` is forced to str and `cluster` to int at read time
# (presumably so FIPS codes keep their leading zeros -- TODO confirm).
pop_df_ac = optimize(pd.read_csv('data/pop_df_ac.csv', 
                                 dtype={'fips':'str', 'cluster':'int'}))
# df['days'] = ((df['date'] - df['date'].min()) / np.timedelta64(1, 'D')).astype('int')
# df['rate'] = df['deaths'] / df['cases']
# la = timezone('US/Pacific')
# last = df['date'].max()
# df_slice = df[df['date'] == last]
# df_slice.head()

In [4]:
#hide
# Color settings for the iceberg charts, keyed by the column used for color:
#   scheme - name of the color scheme passed to alt.Scale
#   sort   - sort direction passed to alt.Color
#   mid    - value passed to alt.Scale as domainMid
color_dict = {
    'per_gop': {
        'scheme': 'redblue',
        'sort': 'descending',
        'mid': 0.5,
    },
    'mask': {
        'scheme': 'brownbluegreen',
        'sort': 'ascending',
        'mid': pop_df_ac['mask'].median(),
    },
    'median_income': {
        'scheme': 'purplegreen',
        'sort': 'ascending',
        # NOTE(review): hard-coded midpoint, unlike 'mask' and 'edu' which use the
        # column median; presumably a national median income figure -- confirm source
        'mid': 61937,
    },
    'edu': {
        'scheme': 'blueorange',
        'sort': 'ascending',
        'mid': pop_df_ac['edu'].median(),
    },
}

# "Iceberg" Plots

I've coined the term "iceberg" to refer to a stacked area chart whose two halves are split along the color axis, with one half drawn above the zero line and the other below it. The halves are chosen so that their total populations are approximately equal.

In [5]:
#collapse-hide
def make_iceberg_chart(df, pop_df_ac, params):
    '''
    Builds an "iceberg" chart: two stacked area charts sharing a zero line,
    split on the value of the color column.

    Rows whose color value is above the cutoff are stacked upwards; rows at or
    below the cutoff are stacked downwards (their y values are negated). The
    cutoff is the color value at which the cumulative `total_pop` of
    `pop_df_ac`, sorted by the color column, is about to reach half of the
    total.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. The chart reads the columns `date`, `state`, `county`,
        params['y_col'] and params['c_col']. The frame is not modified.
    pop_df_ac : pandas.DataFrame
        Table with a `total_pop` column and the params['c_col'] column; used
        only to locate the cutoff.
    params : dict
        'chart_title' - chart title,
        'y_col'       - column plotted on the y axis,
        'c_col'       - column used for color and for the split; must also be
                        a key of the module-level `color_dict`.

    Returns
    -------
    altair.LayerChart
        Top half, bottom half and zero line layered together.

    Side effects
    ------------
    Prints the percentage of `total_pop` that falls in the rows up to and
    including the cutoff row, as a check that the split is close to 50%.
    '''
    c_col = params['c_col']
    y_col = params['y_col']
    palette = color_dict[c_col]

    # https://stackoverflow.com/a/41489086
    # find cutoff to split dataframe into equal-population halves
    sorted_pop_df_ac = pop_df_ac.sort_values(by=c_col)
    total = sorted_pop_df_ac['total_pop'].sum()
    half = total / 2
    below_half = np.flatnonzero(sorted_pop_df_ac['total_pop'].cumsum() < half)
    # last position where cumsum < half; when even the first row reaches half
    # there is no such position, so split right after the first row instead of
    # failing with an IndexError
    cutoff_idx = below_half[-1] if below_half.size else 0
    cutoff_val = sorted_pop_df_ac[c_col].iloc[cutoff_idx]
    # The cutoff is embedded in a Vega expression through its repr(). A NumPy
    # scalar's repr is not reliably a plain number (NumPy 2 prints e.g.
    # `np.float32(0.5)`), so convert it to a native Python scalar first.
    if isinstance(cutoff_val, np.generic):
        cutoff_val = cutoff_val.item()

    # checks the half (positional slice made explicit with .iloc)
    h0 = sorted_pop_df_ac['total_pop'].iloc[:cutoff_idx + 1].sum() / total
    print(f'{h0*100:.1f}')

    base = alt.Chart(df).properties(
        title=params['chart_title'],
        width=720,
        height=480
    )

    # bracket access on `datum` builds the same field reference the original
    # obtained through eval('datum.<col>'), without evaluating a string
    c_field = datum[c_col]

    # both halves share one color encoding; the domain is clipped to the
    # 1st-99th percentile of the distinct color values
    c_values = df[c_col].unique()
    color = alt.Color(
        c_col + ':Q',
        scale=alt.Scale(
            scheme=palette['scheme'],
            domain=[np.percentile(c_values, 1),
                    np.percentile(c_values, 99)],
            domainMid=palette['mid'],
        ),
        sort=palette['sort'],
    )

    # top of iceberg: c_col > cutoff_val
    top = base.mark_area(opacity=0.8).transform_filter(
        c_field > cutoff_val
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y(y_col + ':Q'),
        color=color,
        order=alt.Order(c_col, sort='ascending')
    )

    # bottom of iceberg: c_col <= cutoff_val, drawn below zero
    bottom = base.mark_area(opacity=0.8).transform_filter(
        c_field <= cutoff_val
    ).transform_calculate(
        y_alt=datum[y_col] * -1
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y('y_alt:Q', axis=alt.Axis(format='(.2s')),
        color=color,
        order=alt.Order(c_col, sort='descending')
    )

    # zero line: computed inside the chart spec so the caller's dataframe is
    # not given an extra 'zero' column as a side effect
    zero = base.transform_calculate(
        zero='0'
    ).mark_line(color='black').encode(
        x='date:T',
        y='zero:Q',
        size=alt.value(0.5)
    )

    layer = alt.layer(top, bottom, zero).encode(
        tooltip=[
            'state:N', 'county:N', 'date:T',
            alt.Tooltip(
                y_col + ':Q',
                format='.2f',
            )
        ]
    )

    return layer

In [6]:
#hide
def make_iceberg_chart_2(df, pop_df_ac, params):
    '''
    Work-in-progress variant of make_iceberg_chart that adds a red line layer.

    NOTE(review): this function looks unfinished and is not called anywhere in
    the visible notebook. See the NOTE(review) comments in the body for the
    specific problems; it should be completed or removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. A 'zero' column is added to it in place.
    pop_df_ac : pandas.DataFrame
        Table with a `total_pop` column and the params['c_col'] column; used
        to locate the cutoff value.
    params : dict
        Reads the keys 'chart_title', 'y_col', 'c_col', 'c_scheme', 'c_mid'
        and 'c_sort'.

    Returns
    -------
    The object produced by alt.layer(...).encode(...).
    '''
    
    # https://stackoverflow.com/a/41489086
    # find cutoff to split dataframe into equal-population halves
    total = pop_df_ac['total_pop'].sum()
    half = total / 2
    sorted_pop_df_ac = pop_df_ac.sort_values(by=params['c_col'])
    # returns last index where cumsum < half
    cutoff_idx = np.flatnonzero(sorted_pop_df_ac['total_pop'].cumsum() < half)[-1]
    cutoff_val = sorted_pop_df_ac[params['c_col']].iloc[cutoff_idx]
    
    # checks the half
    h0 = sorted_pop_df_ac['total_pop'][:cutoff_idx + 1].sum() / total
    print(f'{h0*100:.1f}')
    
    base = alt.Chart(df).properties(
        title=params['chart_title'],
        width=720,
        height=480
    )
    
    # string form of the field reference; eval()'d below against `datum`
    c_datum = 'datum.' + params['c_col']
    
    # top of iceberg: c_col > cutoff_val
    # NOTE(review): the joinaggregate field `cases0` is not used anywhere else
    # in this function.
    points0 = base.mark_area().transform_filter(
        (eval(c_datum) > cutoff_val)
    ).transform_joinaggregate(
        cases0='sum(cases)'
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y(params['y_col'] + ':Q'),
        color=alt.Color(
            params['c_col'] + ':Q',
            scale=alt.Scale(
                scheme=params['c_scheme'],
                domain=[np.percentile(df[params['c_col']].unique(), 0.5), 
                        np.percentile(df[params['c_col']].unique(), 99.5)],
                domainMid=params['c_mid'],
            ),
            sort=params['c_sort'],
        ),
        order=alt.Order(params['c_col'], sort='ascending')
    )
    
    # bottom of iceberg: c_col <= cutoff_val
    # NOTE(review): this layer encodes `y_alt`, but unlike make_iceberg_chart no
    # transform_calculate in this function creates that field.
    # NOTE(review): the color domain here uses the 1st/99th percentiles while
    # the top layer uses 0.5/99.5 -- confirm which is intended.
    points1 = base.mark_area().transform_filter(
        (eval(c_datum) <= cutoff_val)
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y('y_alt:Q', axis=alt.Axis(format='(.2s')),
        color=alt.Color(
            params['c_col'] + ':Q',
            scale=alt.Scale(
                scheme=params['c_scheme'],
                domain=[np.percentile(df[params['c_col']].unique(), 1), 
                        np.percentile(df[params['c_col']].unique(), 99)],
                domainMid=params['c_mid'],
            ),
            sort=params['c_sort'],
        ),
        order=alt.Order(params['c_col'], sort='descending'),
    )
    
    # NOTE(review): `.transform.calculate(` looks like a typo for
    # `.transform_calculate(` (the form used elsewhere in this notebook).
    # NOTE(review): the expression string ends in `> cu`, which looks truncated
    # (possibly meant to compare against cutoff_val).
    # NOTE(review): `is_top` is never used, and the `ratio` field encoded on y
    # is not produced by any transform in this function.
    line = base.mark_line(
        color='red'
    ).transform.calculate(
        is_top = c_datum + '> cu'
    ).encode(
        x='date',
        y='ratio'
    )
    
    # adds a constant column to the caller's dataframe for the zero line
    df['zero'] = 0
    zero = base.mark_line(color='black').encode(
        x='date',
        y='zero',
        size=alt.value(0.5)
    )
    
    layer = alt.layer(points0, points1, line, zero).encode(
        tooltip=[
            'state:N', 'county:N', 'date:T',
            alt.Tooltip(
                params['y_col'] + ':Q',
                format='.2f',
            )
        ]
    )
    
    return layer

## Comparing New Cases vs. Percent GOP

In [7]:
#hide_input
# Iceberg chart of new_cases, colored and split by per_gop.
params = {
    'chart_title': 'Cases per Day vs. Percent GOP',
    'y_col': 'new_cases',
    'c_col': 'per_gop',
}

ice_gop = make_iceberg_chart(df, pop_df_ac, params)
# ice_gop.save('docs/ice_gop.html')
ice_gop

49.7


Democratic counties still comprise most daily cases compared to their Republican counterparts. However, when splitting the country into "Democratic" and "Republican" halves, we see that Republican-leaning counties currently account for more than half of the new daily cases.

In [8]:
#hide_input
# Normalized stacked area of `cases` over time, colored by per_gop.
norm_cases = alt.Chart(df).mark_area().encode(
    x=alt.X('date:T', title='Date'),
    y=alt.Y('cases:Q', stack='normalize'),
    color=alt.Color(
        'per_gop:Q',
        sort='descending',
        scale=alt.Scale(
            scheme='redblue',
            domainMid=0.5,
            # color domain: 1st-99th percentile of the distinct per_gop values
            domain=list(np.percentile(df['per_gop'].unique(), [1, 99])),
        ),
    ),
)

# norm_cases.save('docs/norm_cases.html')
norm_cases

When looking at the share of total cases, Republican counties are catching up due to the virus hitting the Southern states.

In [30]:
#hide_input
# Normalized stacked area of `new_cases` over time, colored by per_gop.
norm_new_cases = alt.Chart(df).mark_area().encode(
    x=alt.X('date:T', title='Date'),
    y=alt.Y('new_cases:Q', stack='normalize'),
    color=alt.Color(
        'per_gop:Q',
        sort='descending',
        scale=alt.Scale(
            scheme='redblue',
            domainMid=0.5,
            # color domain: 1st-99th percentile of the distinct per_gop values
            domain=list(np.percentile(df['per_gop'].unique(), [1, 99])),
        ),
    ),
)

# norm_new_cases.save('docs/norm_new_cases.html')
norm_new_cases

## Comparing New Cases vs. Educational Attainment

Counties with a lower educational attainment coefficient account for a disproportionate share of new cases:

In [9]:
#hide_input
# Iceberg chart of new_cases, colored and split by edu.
params = {
    'chart_title': 'Cases per Day vs. Educational Attainment',
    'y_col': 'new_cases',
    'c_col': 'edu',
}

ice_edu = make_iceberg_chart(df, pop_df_ac, params)
# ice_edu.save('docs/ice_edu.html')
ice_edu

49.2


## Comparing New Cases vs. Median Income

Strangely enough, while there didn't seem to be a strong correlation between new cases and median income (for the most recent 15-day window), this iceberg plot shows that low-income counties are currently disproportionately affected by COVID-19.

In [10]:
#hide_input
# Iceberg chart of new_cases, colored and split by median_income.
params = {
    'chart_title': 'Cases per Day vs. Median Income',
    'y_col': 'new_cases',
    'c_col': 'median_income',
}

ice_inc = make_iceberg_chart(df, pop_df_ac, params)
# ice_inc.save('docs/ice_inc.html')
ice_inc

49.8


## Comparing New Cases vs. Mask Discipline

While dense counties were hit the hardest in the beginning, they now seem to be doing slightly better.

In [16]:
#hide_input
# Iceberg chart of new_cases, colored and split by mask.
params = {
    'chart_title': 'Cases per Day vs. Mask Discipline',
    'y_col': 'new_cases',
    'c_col': 'mask',
}

ice_mask = make_iceberg_chart(df, pop_df_ac, params)
# ice_mask.save('docs/ice_mask.html')
ice_mask

50.0


In [None]:
#hide
def make_normed_iceberg_chart(df, params):
    '''
    Builds an iceberg chart whose y values are divided by a population sum.

    Rows with params['c_col'] >= params['c_mid'] are stacked upwards and rows
    below it are stacked downwards (negated). Within each half, the y value of
    every row is divided by the sum of `total_pop` over the rows of that half.

    NOTE(review): the join-aggregate has no groupby, so `total_pop` is summed
    over every row that passes the filter rather than once per county --
    confirm that this is the intended denominator.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. The chart reads the columns `date`, `state`, `county`,
        `total_pop`, params['y_col'] and params['c_col']. The frame is not
        modified.
    params : dict
        'chart_title' - chart title,
        'y_col'       - column plotted on the y axis,
        'c_col'       - column used for color and for the split,
        'c_mid'       - split threshold, also used as the color scale midpoint,
        'c_scheme'    - color scheme name,
        'c_sort'      - sort direction of the color channel.

    Returns
    -------
    altair.LayerChart
        Top half, bottom half and zero line layered together.
    '''
    c_col = params['c_col']
    y_col = params['y_col']

    # The threshold is embedded in a Vega expression through its repr(); a
    # NumPy scalar's repr is not reliably a plain number (NumPy 2 prints e.g.
    # `np.float64(0.5)`), so convert it to a native Python scalar first.
    c_mid = params['c_mid']
    if isinstance(c_mid, np.generic):
        c_mid = c_mid.item()

    base = alt.Chart(df).properties(
        title=params['chart_title'],
        width=720,
        height=480
    )

    # bracket access on `datum` builds the same field references the original
    # obtained through eval('datum.<col>'), without evaluating a string
    c_field = datum[c_col]
    y_field = datum[y_col]

    c_values = df[c_col].unique()

    def half_color(lo_pct, hi_pct):
        # color encoding with the domain clipped to the given percentiles of
        # the distinct color values
        return alt.Color(
            c_col + ':Q',
            scale=alt.Scale(
                scheme=params['c_scheme'],
                domain=[np.percentile(c_values, lo_pct),
                        np.percentile(c_values, hi_pct)],
                domainMid=c_mid,
            ),
            sort=params['c_sort'],
        )

    # top of iceberg
    points0 = base.mark_area().transform_filter(
        c_field >= c_mid
    ).transform_joinaggregate(
        pop_0='sum(total_pop)',
    ).transform_calculate(
        per_y=y_field / datum.pop_0
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y('per_y:Q'),
        color=half_color(0.5, 99.5),
        order=alt.Order(c_col, sort='ascending')
    )

    # bottom of iceberg
    # NOTE(review): this half clips the color domain at the 1st/99th
    # percentiles while the top half uses 0.5/99.5; kept as in the original --
    # confirm which pair is intended.
    points1 = base.mark_area().transform_filter(
        c_field < c_mid
    ).transform_joinaggregate(
        pop_1='sum(total_pop)',
    ).transform_calculate(
        per_y=y_field / datum.pop_1 * -1
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y('per_y:Q', axis=alt.Axis(format='(.2s')),
        color=half_color(1, 99),
        order=alt.Order(c_col, sort='descending'),
    )

    # zero line: computed inside the chart spec so the caller's dataframe is
    # not given an extra 'zero' column as a side effect
    zero = base.transform_calculate(
        zero='0'
    ).mark_line(color='black').encode(
        x='date:T',
        y='zero:Q',
        size=alt.value(0.5)
    )

    layer = alt.layer(points0, points1, zero).encode(
        tooltip=[
            'state:N', 'county:N',
            alt.Tooltip(
                y_col + ':Q',
                format='.0f',
            )
        ]
    )

    return layer