In [175]:
#hide
import json
from datetime import datetime, timedelta
from pytz import timezone
from time import time
from urllib.request import urlopen

import numpy as np
import numpy.polynomial.polynomial as poly
import pandas as pd
from scipy.stats import ttest_ind

import altair as alt
from altair import datum
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

# Alternative kept for reference: write chart data to a JSON file instead.
# alt.data_transformers.enable('json');
# Use Altair's 'data_server' transformer so chart data is not embedded in each chart spec.
# NOTE(review): presumably requires the altair_data_server package — confirm in the environment.
alt.data_transformers.enable('data_server');
# Use the plain 'mimetype' renderer; the notebook frontend must be able to display Vega-Lite output.
alt.renderers.enable('mimetype');

In [176]:
#hide
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.

    The input is never modified; all conversions happen on a copy.

    Steps, in order:
      1. object columns that parse cleanly as dates become datetime columns;
      2. remaining object columns whose number of distinct values is less than
         half the number of rows become 'category';
      3. int64 columns are downcast to the smallest integer type that fits and
         float64 columns are downcast to the smallest float type that fits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. A zero-row frame is accepted.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and (possibly) tighter dtypes.
    '''
    dft = df.copy()
    n_rows = len(dft)

    # converts to datetime if possible
    # (explicit try/except instead of the deprecated errors='ignore')
    for col in dft.columns[dft.dtypes == 'object']:
        try:
            dft[col] = pd.to_datetime(dft[col])
        except (ValueError, TypeError, OverflowError):
            # not date-like: leave the column exactly as it was
            pass

    # if there are less than half as many unique values as there are rows, convert to category
    # (n_rows guard avoids dividing by zero on an empty frame)
    for col in dft.select_dtypes(include='object'):
        if n_rows and dft[col].nunique(dropna=False) / n_rows < 0.5:
            dft[col] = dft[col].astype('category')

    # downcasts numeric columns if possible
    for col in dft.columns:
        dtype = dft[col].dtype
        if dtype == 'int64':
            dft[col] = pd.to_numeric(dft[col], downcast='integer')
        elif dtype == 'float64':
            dft[col] = pd.to_numeric(dft[col], downcast='float')

    return dft

In [177]:
#hide
# Load the per-date table and the population table, tightening dtypes with optimize().
df = optimize(pd.read_csv('data/df.csv'))
# 'fips' is read as text and 'cluster' as an integer rather than letting pandas infer them.
pop_df_ac = optimize(pd.read_csv('data/pop_df_ac.csv', 
                                 dtype={'fips':'str', 'cluster':'int'}))
# Whole days elapsed since the earliest date present in df.
df['days'] = ((df['date'] - df['date'].min()) / np.timedelta64(1, 'D')).astype('int')
# Deaths divided by cases for each row.
df['rate'] = df['deaths'] / df['cases']
# NOTE(review): `la` is not referenced anywhere else in the visible notebook — confirm it is still needed.
la = timezone('US/Pacific')
# Most recent date in the data, and the rows belonging to it.
last = df['date'].max()
df_slice = df[df['date'] == last]
df_slice.head()

Unnamed: 0,state,cluster,date,cases,deaths,county,total_pop,fips,white,black,...,delta_new_cases_7sg,delta_new_deaths_7sg,delta_new_cases_per_100k_7sg,delta_new_deaths_per_100k_7sg,delta_new_cases_15sg,delta_new_deaths_15sg,delta_new_cases_per_100k_15sg,delta_new_deaths_per_100k_15sg,days,rate
79721,Alabama,3,2020-08-03,7939,135,Autauga-Coffee-Covington-Crenshaw-Dale-Elmore-...,471885,01001-01031-01039-01041-01045-01051-01061-0106...,333136,102762,...,-5.964286,0.642857,-1.263928,0.136232,-12.908334,-0.466667,-2.735483,-0.098894,195,0.017005
79722,Alabama,5,2020-08-03,12409,221,Baldwin-Mobile,636444,01003-01097,419294,167703,...,-125.178574,-1.607143,-19.668434,-0.252519,-50.441666,-0.733333,-7.925547,-0.115224,195,0.01781
79723,Alabama,8,2020-08-03,2637,31,Barbour-Bullock-Macon-Russell,110816,01005-01011-01087-01113,42892,59019,...,-8.571428,-0.214286,-7.734829,-0.193371,-4.566667,-0.2,-4.120945,-0.180479,195,0.011756
79724,Alabama,7,2020-08-03,5711,125,Bibb-Butler-Chilton-Conecuh-Dallas-Escambia-Lo...,202625,01007-01013-01021-01035-01047-01053-01085-01099,113944,76232,...,-4.392857,-0.321429,-2.167974,-0.158632,-4.841667,-0.65,-2.389472,-0.32079,195,0.021888
79725,Alabama,0,2020-08-03,16463,269,Blount-Calhoun-Chambers-Cherokee-Clay-Cleburne...,908217,01009-01015-01017-01019-01027-01029-01037-0104...,712824,116482,...,-33.964287,-2.428571,-3.739666,-0.2674,-16.575001,-1.458333,-1.825004,-0.160571,195,0.01634


In [178]:
# List every column available in df.
df.columns

Index(['state', 'cluster', 'date', 'cases', 'deaths', 'county', 'total_pop',
       'fips', 'white', 'black', 'asian', 'hispanic', 'area', 'lon', 'lat',
       'votes_gop', 'votes_dem', 'total_votes', 'median_income', 'pop25',
       'no_hs', 'some_hs', 'hs', 'some_college', 'associates', 'bachelors',
       'graduate', 'never', 'rarely', 'sometimes', 'frequently', 'always',
       'pop_density', 'per_white', 'per_black', 'per_asian', 'per_hispanic',
       'per_total_votes', 'per_votes_gop', 'per_votes_dem', 'per_no_hs',
       'per_some_hs', 'per_hs', 'per_some_college', 'per_associates',
       'per_bachelors', 'per_graduate', 'edu', 'mask', 'per_gop', 'gop',
       'cases_per_100k', 'deaths_per_100k', 'new_cases', 'new_deaths',
       'new_cases_per_100k', 'new_deaths_per_100k', 'new_cases_7sg',
       'new_deaths_7sg', 'new_cases_per_100k_7sg', 'new_deaths_per_100k_7sg',
       'new_cases_15sg', 'new_deaths_15sg', 'new_cases_per_100k_15sg',
       'new_deaths_per_100k_15sg', 'delt

In [179]:
#hide
#https://colorbrewer2.org/

# Diverging palette per color column.
#   'range': three hex colors, ordered [low end, neutral midpoint, high end]
#   'mid':   the column value that the neutral midpoint color is anchored to
color_dict = {
    # blue-red
    'per_gop': {
        'range': ['#2166ac', '#F7F7F7', '#b2182b'],
        'mid': 0.5,
    },
    # pink-green
    'mask': {
        'range': ['#c51b7d', '#F7F7F7', '#4d9221'],
        'mid': pop_df_ac['mask'].median(),
    },
    # purple-orange
    'median_income': {
        'range': ['#542788', '#F7F7F7', '#b35806'],
        # NOTE(review): hard-coded midpoint; presumably a national median income figure — confirm the source
        'mid': 61937,
    },
    # green-purple
    'edu': {
        'range': ['#1b7837', '#F7F7F7', '#762a83'],
        'mid': pop_df_ac['edu'].median(),
    },
}

# Funnel Charts

In [180]:
#collapse-hide
def _significant_dates(df, value_col, c_col, c_mid, alpha=0.05):
    '''
    Finds the dates on which `value_col` differs between the two halves.

    For each date, rows with `c_col <= c_mid` (left half) are compared with
    rows with `c_col > c_mid` (right half) using an independent two-sample
    t-test. `ttest_ind` reports a two-sided p-value; halving it gives the
    one-sided p-value in the direction indicated by the sign of t, and that
    halved value is what is compared against `alpha`.

    Returns `(left_higher, right_higher)`: two lists of dates in ascending
    order, where the left (resp. right) half has the larger mean.
    '''
    left_higher, right_higher = [], []
    # groupby visits every date exactly once, in ascending order
    for date, day in df.groupby('date', sort=True):
        left = day.loc[day[c_col] <= c_mid, value_col].values
        right = day.loc[day[c_col] > c_mid, value_col].values
        if left.size == 0 or right.size == 0:
            # nothing to compare on this date
            continue
        t, p = ttest_ind(left, right)
        # a NaN p-value fails this comparison and the date is skipped
        if p / 2 < alpha:
            # t > 0 means mean(left) > mean(right)
            if t > 0:
                left_higher.append(date)
            else:
                right_higher.append(date)
    return left_higher, right_higher


def _shade_spans(days, last_day):
    '''
    Merges ascending, consecutive calendar days into {'start', 'end'} spans.

    Each span ends one day after its last member so that a single day still
    has visible height, but never past `last_day` (the final date in the
    charted data) so the shading does not stretch the date axis.
    '''
    one_day = pd.Timedelta(days=1)
    spans = []
    for day in days:
        if spans and spans[-1]['end'] == day:
            # `day` directly follows the current span: extend it
            spans[-1]['end'] = day + one_day
        else:
            spans.append({'start': day, 'end': day + one_day})
    for span in spans:
        span['end'] = min(span['end'], last_day)
    return spans


def make_funnel_chart(df, x_col, c_col, c_range, c_mid):
    '''
    Builds a "funnel" chart: time runs down the y-axis and `x_col`, scaled to
    per-100k of each half's population, grows leftwards for rows with
    `c_col <= c_mid` and rightwards for rows with `c_col > c_mid`.

    Dates on which the two halves differ significantly (one-sided t-test on
    `<x_col>_per_100k_15sg`, alpha = 0.05) are shaded with the color of the
    half that has the higher mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'date', 'state', 'county', 'total_pop', `x_col`,
        `<x_col>_per_100k_15sg` and `c_col`. Only rows dated 2020-03-01 or
        later are charted. The frame is not modified.
    x_col : str
        Column plotted on the x-axis (e.g. 'new_cases').
    c_col : str
        Column used to split the data in two and to color the areas.
    c_range : sequence of 3 colors
        [left/low color, neutral color, right/high color].
    c_mid : number
        Split value; also the value mapped to the neutral color.

    Returns
    -------
    alt.LayerChart

    Notes
    -----
    The color domain is taken from the module-level `pop_df_ac` frame.
    '''
    x_col_2 = x_col + '_per_100k_15sg'
    cols = ['date', 'state', 'county', 'total_pop', x_col, x_col_2, c_col]

    df = df.loc[df['date'] >= '2020-03-01', cols]
    # plain float so the value is rendered as a bare number in the filter expression
    c_mid = float(c_mid)

    base = alt.Chart(df).properties(
        title=x_col + ' vs. ' + c_col,
        width=480,
        height=720
    )

    def half_area(in_half, scale, stack_order):
        # One side of the funnel. The population denominator is computed per
        # date (groupby) so it is the half's population on that day rather
        # than the sum of total_pop over every date in the data.
        return base.mark_area().transform_filter(
            in_half
        ).transform_joinaggregate(
            half_pop='sum(total_pop)',
            groupby=['date']
        ).transform_calculate(
            per_x=f"datum['{x_col}'] / datum.half_pop * {scale}"
        ).encode(
            alt.X('per_x:Q'),
            alt.Y('date:T', title='Date'),
            order=alt.Order(c_col, sort=stack_order)
        )

    # left of funnel: c_col <= c_mid (negative so it grows leftwards)
    left = half_area(alt.datum[c_col] <= c_mid, -100000, 'descending')
    # right of funnel: c_col > c_mid
    right = half_area(alt.datum[c_col] > c_mid, 100000, 'ascending')

    # just a line at zero to make it easier to see
    zero = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(
        color='black',
        size=0.5
    ).encode(x='x')

    # shade the dates where the halves are statistically different,
    # using the color of the half with the higher mean
    last_day = df['date'].max()
    rects = []
    significant = _significant_dates(df, x_col_2, c_col, c_mid)
    for shade, days in zip((c_range[0], c_range[-1]), significant):
        if days:
            rects.append(
                alt.Chart(pd.DataFrame(_shade_spans(days, last_day))).mark_rect(
                    color=shade,
                    opacity=0.1
                ).encode(
                    y='start:T',
                    y2='end:T'
                )
            )

    funnel = alt.layer(left, right).encode(
        color=alt.Color(
            c_col + ':Q',
            scale=alt.Scale(
                range=c_range,
                # clip the color domain to the 1st-99th percentile of the population table
                domain=[float(np.percentile(pop_df_ac[c_col], 1)),
                        c_mid,
                        float(np.percentile(pop_df_ac[c_col], 99))],
                interpolate={
                    'type':'rgb', 
                    'gamma':0.75
                }
            ),
        ),
        tooltip=[
            'state:N', 'county:N', 'date:T',
            alt.Tooltip(
                x_col + ':Q',
                format='.2f',
            ),
            alt.Tooltip(
                c_col + ':Q',
                format='.2f',
            )
        ]
    )

    return alt.layer(*rects, funnel, zero)

## New Cases vs. Percent GOP

In [181]:
# New cases split at an even GOP/Dem vote share.
funnel_gop = make_funnel_chart(
    df,
    'new_cases',
    'per_gop',
    color_dict['per_gop']['range'],
    color_dict['per_gop']['mid'],
)
funnel_gop

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


Democratic-leaning counties still account for more daily cases than their Republican-leaning counterparts. However, when the country is split into "Democratic" and "Republican" halves, we see that Republican-leaning counties currently account for more than half of the new daily cases.

## Comparing New Cases vs. Educational Attainment

Counties with a lower educational attainment coefficient have a disproportionate share of new cases:

In [182]:
# New cases split at the median educational attainment coefficient.
funnel_edu = make_funnel_chart(
    df,
    'new_cases',
    'edu',
    color_dict['edu']['range'],
    color_dict['edu']['mid'],
)
funnel_edu

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


## Comparing New Cases vs. Median Income

Strangely enough, while there didn't seem to be a strong correlation between new cases and median income (for the most recent 15-day window), we can see from this funnel chart that low-income counties are currently disproportionately affected by COVID-19.

In [183]:
#hide_input
# New cases split at the median-income midpoint from color_dict.
funnel_inc = make_funnel_chart(
    df,
    'new_cases',
    'median_income',
    color_dict['median_income']['range'],
    color_dict['median_income']['mid'],
)
funnel_inc

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


## Comparing New Cases vs. Mask Discipline

While dense counties were hit the hardest in the beginning, they now seem to be doing slightly better.

In [184]:
#hide_input
# New cases split at the median mask-usage score.
funnel_mask = make_funnel_chart(
    df,
    'new_cases',
    'mask',
    color_dict['mask']['range'],
    color_dict['mask']['mid'],
)
funnel_mask

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [186]:
# New cases split at the median population density (pink-green palette).
make_funnel_chart(
    df,
    'new_cases',
    'pop_density',
    ['#c51b7d', '#F7F7F7', '#4d9221'],
    pop_df_ac['pop_density'].median(),
)

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [None]:
#collapse-hide
def make_iceberg_chart(df, pop_df_ac, params):
    '''
    Builds an "iceberg" chart: stacked areas of `params['y_col']` over time,
    drawn upwards for rows whose `params['c_col']` is above a cutoff and
    downwards (negated) for rows at or below it.

    The cutoff is the `c_col` value that splits `pop_df_ac` into two halves
    of (approximately) equal total population.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'date', 'state', 'county', `c_col` and `y_col`. Not modified.
    pop_df_ac : pandas.DataFrame
        Needs 'total_pop' and `c_col`; used only to find the cutoff.
    params : dict
        Keys 'c_col', 'y_col' and 'chart_title'.

    Returns
    -------
    alt.LayerChart

    Side effects
    ------------
    Prints the percentage of the population at or below the cutoff.
    Reads the module-level `color_dict` for the color scale.
    '''
    c_col, y_col = params['c_col'], params['y_col']

    # https://stackoverflow.com/a/41489086
    # find cutoff along color column to split dataframe into equal-population halves
    total = pop_df_ac['total_pop'].sum()
    by_color = pop_df_ac.sort_values(by=c_col)
    # last position where the running population is still below half
    cutoff_idx = np.flatnonzero(by_color['total_pop'].cumsum() < total / 2)[-1]
    # plain float so the value is rendered as a bare number in the filter expression
    cutoff_val = float(by_color[c_col].iloc[cutoff_idx])

    # checks the half
    h0 = by_color['total_pop'].iloc[:cutoff_idx + 1].sum() / total
    print(f'{h0*100:.1f}')

    base = alt.Chart(df).properties(
        title=params['chart_title'],
        width=720,
        height=480
    )

    # one color encoding shared by both halves
    palette = color_dict[c_col]
    levels = df[c_col].unique()
    lo, hi = (float(np.percentile(levels, q)) for q in (1, 99))
    if 'scheme' in palette:
        scale = alt.Scale(scheme=palette['scheme'], domain=[lo, hi],
                          domainMid=palette['mid'])
    else:
        # color_dict entries in this notebook carry a 3-color 'range' rather than a named scheme
        scale = alt.Scale(range=palette['range'], domain=[lo, palette['mid'], hi])
    color_kwargs = {'scale': scale}
    if 'sort' in palette:
        color_kwargs['sort'] = palette['sort']
    color = alt.Color(c_col + ':Q', **color_kwargs)

    # top of iceberg: c_col > cutoff_val
    top = base.mark_area(opacity=0.8).transform_filter(
        alt.datum[c_col] > cutoff_val
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y(y_col + ':Q'),
        color=color,
        order=alt.Order(c_col, sort='ascending')
    )

    # bottom of iceberg: c_col <= cutoff_val, negated so it hangs below zero
    bottom = base.mark_area(opacity=0.8).transform_filter(
        alt.datum[c_col] <= cutoff_val
    ).transform_calculate(
        y_alt=f"datum['{y_col}'] * -1"
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y('y_alt:Q', axis=alt.Axis(format='(.2s')),
        color=color,
        order=alt.Order(c_col, sort='descending')
    )

    # zero line drawn from its own one-row frame, so the caller's df
    # does not need (and no longer gets) a helper 'zero' column
    zero = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(
        color='black',
        size=0.5
    ).encode(y='y')

    iceberg = alt.layer(top, bottom).encode(
        tooltip=[
            'state:N', 'county:N', 'date:T',
            alt.Tooltip(
                y_col + ':Q',
                format='.2f',
            )
        ]
    )

    return alt.layer(iceberg, zero)

In [None]:
#collapse-hide
def make_iceberg_chart_2(df, pop_df_ac, params):
    '''
    Iceberg chart with significance shading.

    Stacked areas of `params['y_col']` over time are drawn upwards for rows
    whose `params['c_col']` is above a cutoff and downwards (negated) for
    rows at or below it. The cutoff is the `c_col` value that splits
    `pop_df_ac` into two halves of (approximately) equal total population.
    Dates on which 'new_cases_per_100k' differs between the halves
    (two-sided t-test, p < 0.05) are shaded gray.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'date', 'state', 'county', 'new_cases_per_100k', `c_col` and
        `y_col`. Not modified.
    pop_df_ac : pandas.DataFrame
        Needs 'total_pop' and `c_col`; used only to find the cutoff.
    params : dict
        Keys 'c_col', 'y_col' and 'chart_title'.

    Returns
    -------
    alt.LayerChart

    Side effects
    ------------
    Prints the percentage of the population at or below the cutoff.
    Reads the module-level `color_dict` for the color scale.
    '''
    c_col, y_col = params['c_col'], params['y_col']

    # https://stackoverflow.com/a/41489086
    # find cutoff along color column to split dataframe into equal-population halves
    total = pop_df_ac['total_pop'].sum()
    by_color = pop_df_ac.sort_values(by=c_col)
    # last position where the running population is still below half
    cutoff_idx = np.flatnonzero(by_color['total_pop'].cumsum() < total / 2)[-1]
    # plain float so the value is rendered as a bare number in the filter expression
    cutoff_val = float(by_color[c_col].iloc[cutoff_idx])

    # checks the half
    h0 = by_color['total_pop'].iloc[:cutoff_idx + 1].sum() / total
    print(f'{h0*100:.1f}')

    base = alt.Chart(df).properties(
        title=params['chart_title'],
        width=720,
        height=480
    )

    # one color encoding shared by both halves
    palette = color_dict[c_col]
    levels = df[c_col].unique()
    lo, hi = (float(np.percentile(levels, q)) for q in (1, 99))
    if 'scheme' in palette:
        scale = alt.Scale(scheme=palette['scheme'], domain=[lo, hi],
                          domainMid=palette['mid'])
    else:
        # color_dict entries in this notebook carry a 3-color 'range' rather than a named scheme
        scale = alt.Scale(range=palette['range'], domain=[lo, palette['mid'], hi])
    color_kwargs = {'scale': scale}
    if 'sort' in palette:
        color_kwargs['sort'] = palette['sort']
    color = alt.Color(c_col + ':Q', **color_kwargs)

    # top of iceberg: c_col > cutoff_val
    top = base.mark_area(opacity=1).transform_filter(
        alt.datum[c_col] > cutoff_val
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y(y_col + ':Q'),
        color=color,
        order=alt.Order(c_col, sort='ascending')
    )

    # bottom of iceberg: c_col <= cutoff_val, negated so it hangs below zero
    bottom = base.mark_area(opacity=1).transform_filter(
        alt.datum[c_col] <= cutoff_val
    ).transform_calculate(
        y_alt=f"datum['{y_col}'] * -1"
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y('y_alt:Q', axis=alt.Axis(format='(.2s')),
        color=color,
        order=alt.Order(c_col, sort='descending')
    )

    # just a line at zero to make it easier to see; drawn from its own
    # one-row frame so the caller's df is left untouched
    zero = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(
        color='black',
        size=0.5
    ).encode(y='y')

    # find dates where new cases is statistically different between samples
    # (groupby visits each date once, in ascending order)
    significant = []
    for date, day in df.groupby('date', sort=True):
        below = day.loc[day[c_col] <= cutoff_val, 'new_cases_per_100k'].values
        above = day.loc[day[c_col] > cutoff_val, 'new_cases_per_100k'].values
        if below.size == 0 or above.size == 0:
            continue
        _, p = ttest_ind(below, above)
        if p < 0.05:
            significant.append(date)

    # merge consecutive days into spans; each span ends one day after its
    # last member so that a single day still has visible width
    one_day = pd.Timedelta(days=1)
    spans = []
    for date in significant:
        if spans and spans[-1]['end'] == date:
            spans[-1]['end'] = date + one_day
        else:
            spans.append({'start': date, 'end': date + one_day})

    iceberg = alt.layer(top, bottom).encode(
        tooltip=[
            'state:N', 'county:N', 'date:T',
            alt.Tooltip(
                y_col + ':Q',
                format='.2f',
            )
        ]
    )

    layers = [iceberg, zero]
    if spans:
        # shading goes underneath; skipped entirely when no date is significant
        rects = alt.Chart(pd.DataFrame(spans)).mark_rect(color='gray', opacity=0.2).encode(
            x='start:T',
            x2='end:T'
        )
        layers.insert(0, rects)

    return alt.layer(*layers)

In [None]:
#collapse-hide
def make_iceberg_chart_3(df, pop_df_ac, params):
    '''
    Iceberg chart normalized per 100k, split at the color midpoint.

    Rows are split at `color_dict[params['c_col']]['mid']`. For each half,
    `params['y_col']` is divided by that half's population on the same date
    and scaled to per-100k; the upper half is drawn upwards and the lower
    half downwards (negated). Dates on which 'new_cases_per_100k' differs
    between the halves (two-sided t-test, p < 0.05) are shaded gray.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'date', 'state', 'county', 'total_pop', 'new_cases_per_100k',
        `c_col` and `y_col`. Not modified.
    pop_df_ac : pandas.DataFrame
        Accepted for signature compatibility; not used by this function.
    params : dict
        Keys 'c_col', 'y_col' and 'chart_title'.

    Returns
    -------
    alt.LayerChart

    Notes
    -----
    Reads the module-level `color_dict` for the split value and color scale.
    '''
    c_col, y_col = params['c_col'], params['y_col']
    palette = color_dict[c_col]
    # plain float so the value is rendered as a bare number in the filter expression
    c_mid = float(palette['mid'])

    base = alt.Chart(df).properties(
        title=params['chart_title'],
        width=720,
        height=480
    )

    # one color encoding shared by both halves
    levels = df[c_col].unique()
    lo, hi = (float(np.percentile(levels, q)) for q in (1, 99))
    if 'scheme' in palette:
        scale = alt.Scale(scheme=palette['scheme'], domain=[lo, hi],
                          domainMid=palette['mid'])
    else:
        # color_dict entries in this notebook carry a 3-color 'range' rather than a named scheme
        scale = alt.Scale(range=palette['range'], domain=[lo, palette['mid'], hi])
    color_kwargs = {'scale': scale}
    if 'sort' in palette:
        color_kwargs['sort'] = palette['sort']
    color = alt.Color(c_col + ':Q', **color_kwargs)

    def half_area(in_half, per_100k, stack_order):
        # One half of the iceberg. The population denominator is computed
        # per date (groupby) so it is the half's population on that day
        # rather than the sum of total_pop over every date in the data.
        return base.mark_area(opacity=1).transform_filter(
            in_half
        ).transform_joinaggregate(
            pop='sum(total_pop)',
            groupby=['date']
        ).transform_calculate(
            per_y=f"datum['{y_col}'] / datum.pop * {per_100k}"
        ).encode(
            alt.X('date:T', title='Date'),
            alt.Y('per_y:Q'),
            color=color,
            order=alt.Order(c_col, sort=stack_order)
        )

    # top of iceberg: c_col > mid
    top = half_area(alt.datum[c_col] > c_mid, 100000, 'ascending')
    # bottom of iceberg: c_col <= mid (negative so it hangs below zero)
    bottom = half_area(alt.datum[c_col] <= c_mid, -100000, 'descending')

    # just a line at zero to make it easier to see
    zero = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(
        color='black',
        size=0.5
    ).encode(y='y')

    # find dates where {new cases} is statistically different between halves
    # (groupby visits each date once, in ascending order)
    significant = []
    for date, day in df.groupby('date', sort=True):
        below = day.loc[day[c_col] <= c_mid, 'new_cases_per_100k'].values
        above = day.loc[day[c_col] > c_mid, 'new_cases_per_100k'].values
        if below.size == 0 or above.size == 0:
            continue
        _, p = ttest_ind(below, above)
        if p < 0.05:
            significant.append(date)

    # merge consecutive days into spans; each span ends one day after its
    # last member so that a single day still has visible width
    one_day = pd.Timedelta(days=1)
    spans = []
    for date in significant:
        if spans and spans[-1]['end'] == date:
            spans[-1]['end'] = date + one_day
        else:
            spans.append({'start': date, 'end': date + one_day})

    iceberg = alt.layer(top, bottom).encode(
        tooltip=[
            'state:N', 'county:N', 'date:T',
            alt.Tooltip(
                y_col + ':Q',
                format='.2f',
            )
        ]
    )

    layers = [iceberg, zero]
    if spans:
        # shading goes underneath; skipped entirely when no date is significant
        rects = alt.Chart(pd.DataFrame(spans)).mark_rect(color='gray', opacity=0.2).encode(
            x='start:T',
            x2='end:T'
        )
        layers.insert(0, rects)

    return alt.layer(*layers)

In [None]:
#hide
def make_normed_iceberg_chart(df, params):
    '''
    Iceberg chart of `params['y_col']` normalized by each half's population.

    Rows with `c_col >= c_mid` are drawn upwards and rows with
    `c_col < c_mid` downwards (negated). In each half, `y_col` is divided by
    that half's total population on the same date.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'date', 'state', 'county', 'total_pop', `c_col` and `y_col`.
        Not modified.
    params : dict
        Keys 'chart_title', 'c_col', 'y_col', 'c_mid' (split value and
        color midpoint), 'c_scheme' (color scheme name) and 'c_sort'
        (sort passed to the color encoding).

    Returns
    -------
    alt.LayerChart
    '''
    c_col, y_col = params['c_col'], params['y_col']
    # plain float so the value is rendered as a bare number in the filter expression
    c_mid = float(params['c_mid'])

    base = alt.Chart(df).properties(
        title=params['chart_title'],
        width=720,
        height=480
    )

    # One color encoding for both halves. The two halves previously asked
    # for different percentile domains (0.5-99.5 vs 1-99); a single
    # definition keeps the color mapping unambiguous. The wider domain is used.
    levels = df[c_col].unique()
    color = alt.Color(
        c_col + ':Q',
        scale=alt.Scale(
            scheme=params['c_scheme'],
            domain=[float(np.percentile(levels, 0.5)),
                    float(np.percentile(levels, 99.5))],
            domainMid=params['c_mid'],
        ),
        sort=params['c_sort'],
    )

    # top of iceberg
    # (population is aggregated per date so the denominator is the half's
    # population on that day, not the sum over every date in the data)
    points0 = base.mark_area().transform_filter(
        alt.datum[c_col] >= c_mid
    ).transform_joinaggregate(
        pop_0='sum(total_pop)',
        groupby=['date']
    ).transform_calculate(
        per_y=f"datum['{y_col}'] / datum.pop_0"
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y('per_y:Q'),
        color=color,
        order=alt.Order(c_col, sort='ascending')
    )

    # bottom of iceberg (negated so it hangs below zero)
    points1 = base.mark_area().transform_filter(
        alt.datum[c_col] < c_mid
    ).transform_joinaggregate(
        pop_1='sum(total_pop)',
        groupby=['date']
    ).transform_calculate(
        per_y=f"datum['{y_col}'] / datum.pop_1 * -1"
    ).encode(
        alt.X('date:T', title='Date'),
        alt.Y('per_y:Q', axis=alt.Axis(format='(.2s')),
        color=color,
        order=alt.Order(c_col, sort='descending'),
    )

    # zero line drawn from its own one-row frame, so the caller's df
    # does not need (and no longer gets) a helper 'zero' column
    zero = alt.Chart(pd.DataFrame({'y': [0]})).mark_rule(
        color='black',
        size=0.5
    ).encode(y='y')

    iceberg = alt.layer(points0, points1).encode(
        tooltip=[
            'state:N', 'county:N',
            alt.Tooltip(
                y_col + ':Q',
                format='.0f',
            )
        ]
    )

    return alt.layer(iceberg, zero)

In [None]:
#collapse-hide
def make_funnel_chart(df, pop_df_ac, params):
    '''
    Funnel chart driven by a `params` dict and the module-level `color_dict`.

    Time runs down the y-axis. `params['x_col']`, scaled to per-100k of each
    half's population on the same date, grows leftwards for rows with
    `c_col <= mid` and rightwards for rows with `c_col > mid`, where `mid`
    is `color_dict[params['c_col']]['mid']`. Dates on which
    `<x_col>_per_100k_15sg` differs between the halves (two-sided t-test,
    p < 0.05) are shaded gray.

    NOTE(review): this notebook defines `make_funnel_chart` twice. This
    definition has a different signature from the earlier
    `make_funnel_chart(df, x_col, c_col, c_range, c_mid)` and replaces it
    once this cell has been run.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs 'date', 'state', 'county', 'total_pop', `c_col`, `x_col` and
        `<x_col>_per_100k_15sg`. Only rows dated 2020-03-01 or later are
        charted. Not modified.
    pop_df_ac : pandas.DataFrame
        Accepted for signature compatibility; not used by this function.
    params : dict
        Keys 'x_col', 'c_col' and 'chart_title'.

    Returns
    -------
    alt.LayerChart
    '''
    x_col, c_col = params['x_col'], params['c_col']
    x_col_2 = x_col + '_per_100k_15sg'
    cols = ['date', 'state', 'county', 'total_pop', c_col, x_col, x_col_2]

    df = df.loc[df['date'] >= '2020-03-01', cols]

    palette = color_dict[c_col]
    # plain float so the value is rendered as a bare number in the filter expression
    c_mid = float(palette['mid'])

    base = alt.Chart(df).properties(
        title=params['chart_title'],
        width=480,
        height=720
    )

    # one color encoding shared by both halves
    levels = df[c_col].unique()
    lo, hi = (float(np.percentile(levels, q)) for q in (1, 99))
    if 'scheme' in palette:
        scale = alt.Scale(scheme=palette['scheme'], domain=[lo, hi],
                          domainMid=palette['mid'])
    else:
        # color_dict entries in this notebook carry a 3-color 'range' rather than a named scheme
        scale = alt.Scale(range=palette['range'], domain=[lo, palette['mid'], hi])
    color_kwargs = {'scale': scale}
    if 'sort' in palette:
        color_kwargs['sort'] = palette['sort']
    color = alt.Color(c_col + ':Q', **color_kwargs)

    def half_area(in_half, per_100k, stack_order):
        # One side of the funnel. The population denominator is computed per
        # date (groupby) so it is the half's population on that day rather
        # than the sum of total_pop over every date in the data.
        return base.mark_area().transform_filter(
            in_half
        ).transform_joinaggregate(
            pop='sum(total_pop)',
            groupby=['date']
        ).transform_calculate(
            per_x=f"datum['{x_col}'] / datum.pop * {per_100k}"
        ).encode(
            alt.X('per_x:Q'),
            alt.Y('date:T', title='Date'),
            color=color,
            order=alt.Order(c_col, sort=stack_order)
        )

    # left of funnel: c_col <= mid (negative so it grows leftwards)
    left = half_area(alt.datum[c_col] <= c_mid, -100000, 'descending')
    # right of funnel: c_col > mid
    right = half_area(alt.datum[c_col] > c_mid, 100000, 'ascending')

    # just a line at zero to make it easier to see
    zero = alt.Chart(pd.DataFrame({'x': [0]})).mark_rule(
        color='black',
        size=0.5
    ).encode(x='x')

    # find dates where {new cases} is statistically different between halves
    # (groupby visits each date once, in ascending order)
    significant = []
    for date, day in df.groupby('date', sort=True):
        below = day.loc[day[c_col] <= c_mid, x_col_2].values
        above = day.loc[day[c_col] > c_mid, x_col_2].values
        if below.size == 0 or above.size == 0:
            continue
        _, p = ttest_ind(below, above)
        if p < 0.05:
            significant.append(date)

    # merge consecutive days into spans; each span ends one day after its
    # last member, but never past the final date in the charted data so the
    # shading does not stretch the date axis
    one_day = pd.Timedelta(days=1)
    last_day = df['date'].max()
    spans = []
    for date in significant:
        if spans and spans[-1]['end'] == date:
            spans[-1]['end'] = date + one_day
        else:
            spans.append({'start': date, 'end': date + one_day})
    for span in spans:
        span['end'] = min(span['end'], last_day)

    funnel = alt.layer(left, right).encode(
        tooltip=[
            'state:N', 'county:N', 'date:T',
            alt.Tooltip(
                x_col + ':Q',
                format='.2f',
            )
        ]
    )

    layers = [funnel, zero]
    if spans:
        # shading goes underneath; skipped entirely when no date is significant
        rects = alt.Chart(pd.DataFrame(spans)).mark_rect(color='gray', opacity=0.2).encode(
            y='start:T',
            y2='end:T'
        )
        layers.insert(0, rects)

    return alt.layer(*layers)