In [8]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from urllib.request import urlopen
import json

import itertools

from time import time
from datetime import datetime, timedelta
from pytz import timezone

import altair as alt
from vega_datasets import data

In [10]:
def optimize(df):
    '''
    Optimizes the data types in a pandas dataframe.

    The input is copied first, so the caller's frame is never modified.
    Three passes run in order:

    1. object columns that pandas can parse as dates become datetime columns;
       columns that fail to parse are left exactly as they were
    2. remaining object columns whose number of distinct values is less than
       half the number of rows become ``category``
    3. ``int64`` / ``float64`` columns are downcast to the smallest integer /
       float dtype that can hold their values

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and rows, using more compact dtypes.
    '''
    def _as_datetime(col):
        # pd.to_datetime(..., errors='ignore') is deprecated since pandas 2.2;
        # catching the failure here keeps the same "return the input unchanged" fallback
        try:
            return pd.to_datetime(col)
        except (ValueError, TypeError, OverflowError):
            return col

    dft = df.copy()
    # converts to datetime if possible
    dft = dft.apply(lambda col: _as_datetime(col) if col.dtypes == 'object' else col)
    # if there are less than half as many unique values as there are rows, convert to category
    n_rows = len(dft)
    for col in dft.select_dtypes(include='object'):
        # n_rows guard: never divide by zero on a frame without rows
        if n_rows and dft[col].nunique(dropna=False) / n_rows < 0.5:
            dft[col] = dft[col].astype('category')
    # downcasts numeric columns if possible
    dft = dft.apply(lambda col: pd.to_numeric(col, downcast='integer') if col.dtypes == 'int64' else col)
    dft = dft.apply(lambda col: pd.to_numeric(col, downcast='float') if col.dtypes == 'float64' else col)
    return dft

In [11]:
# Load data/df.csv and pass the raw frame straight through optimize() to compact its dtypes.
df = pd.read_csv('data/df.csv').pipe(optimize)
df.head()

Unnamed: 0,date,county,state,fips,cases,deaths,area,population,lat,lon,...,new_cases_per_100k_7d,new_deaths_per_100k_7d,new_case_density_7d,new_death_density_7d,delta_new_cases_7d,delta_new_deaths_7d,delta_new_cases_per_100k_7d,delta_new_deaths_per_100k_7d,delta_new_case_density_7d,delta_new_death_density_7d
0,2020-01-21,Snohomish,Washington,53061,1,0,2087.272949,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2020-01-22,Snohomish,Washington,53061,1,0,2087.272949,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2020-01-23,Snohomish,Washington,53061,1,0,2087.272949,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2020-01-24,Cook,Illinois,17031,1,0,945.325989,5150233,41.839622,-87.817429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2020-01-24,Snohomish,Washington,53061,1,0,2087.272949,822083,48.046917,-121.69278,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
# Days elapsed between each row's date and the earliest date in the frame, stored as integers.
days_since_start = (df['date'] - df['date'].min()) / np.timedelta64(1, 'D')
df['days'] = days_since_start.astype('int')

In [122]:
# Largest day offset in the frame, i.e. the offset of the most recent date from the earliest one.
df['days'].max()

173

In [13]:
# Distinct weekday codes present in the data (pandas numbers Monday as 0 through Sunday as 6).
df['date'].dt.dayofweek.unique()

array([1, 2, 3, 4, 5, 6, 0], dtype=int64)

In [138]:
# Pacific timezone handle; not used by any of the cells visible here.
la = timezone('US/Pacific')

# Restrict the frame to the rows recorded on the most recent date.
last = df['date'].max()
df_slice = df.loc[df['date'].eq(last)]
df_slice.head()

Unnamed: 0,date,county,state,fips,cases,deaths,area,population,lat,lon,...,new_deaths_per_100k_7d,new_case_density_7d,new_death_density_7d,delta_new_cases_7d,delta_new_deaths_7d,delta_new_cases_per_100k_7d,delta_new_deaths_per_100k_7d,delta_new_case_density_7d,delta_new_death_density_7d,days
316491,2020-07-12,Autauga,Alabama,1001,706,16,594.435974,55869,32.536152,-86.641197,...,0.767101,0.039144,0.00129,-0.285714,0.142857,-0.5114,0.2557,-0.00086,0.00043,173
316492,2020-07-12,Baldwin,Alabama,1003,1294,12,1589.784058,223234,30.725863,-87.723953,...,0.127989,0.016625,8.1e-05,7.428571,0.0,3.327706,0.0,0.002093,0.0,173
316493,2020-07-12,Barbour,Alabama,1005,403,2,884.875977,24686,31.867889,-85.389244,...,0.0,0.033353,0.0,0.428571,0.0,1.736091,0.0,0.001962,0.0,173
316494,2020-07-12,Bibb,Alabama,1007,228,1,622.58197,22394,32.996456,-87.124962,...,0.0,0.035863,0.0,0.142857,0.0,0.637926,0.0,0.001025,0.0,173
316495,2020-07-12,Blount,Alabama,1009,331,1,644.776001,57826,33.985249,-86.569756,...,0.0,0.03525,0.0,2.714286,0.0,4.693884,0.0,0.00728,0.0,173


In [137]:
# County-level choropleth: change in new cases per 100k (7d) on the latest date held in df_slice.
counties = alt.topo_feature('https://vega.github.io/vega-datasets/data/us-10m.json', 'counties')

# Light grey base layer so counties without a matching data row are still drawn.
# Same projection and size as the data layer so both layers line up.
background = alt.Chart(counties).mark_geoshape(
    fill='#F2F2F2',
    stroke='white',
    strokeWidth=0.5
).properties(
    width=720,
    height=480
).project('albersUsa')

counts = alt.Chart(counties).mark_geoshape().encode(
    color=alt.Color(
        'delta_new_cases_per_100k_7d:Q',
        scale=alt.Scale(
            scheme='redyellowblue',
            domainMid=0,
            domain=[-10,20]
        ),
        sort='descending',
        title='change in new cases per 100k'
    )
).transform_lookup(
    # the us-10m.json shapes expose their FIPS code as `id`; they have no `fips`
    # property, so the join has to key on `id` on the map side and `fips` on the data side
    lookup='id',
    from_=alt.LookupData(
        # only ship the join key and the plotted value, not every column of the slice
        df_slice[['fips', 'delta_new_cases_per_100k_7d']],
        'fips',
        ['delta_new_cases_per_100k_7d']
    )
).properties(
    width=720,
    height=480
).project('albersUsa')

background + counts

In [136]:
# Bubble map of new cases per county with a slider that selects the day to show.
# NOTE: despite its name, `counties` holds the *states* outlines in this cell.
counties = alt.topo_feature('https://vega.github.io/vega-datasets/data/us-10m.json', 'states')
background = alt.Chart(counties).mark_geoshape(
    fill='#F2F2F2',
    stroke='white',
    strokeWidth=2
).properties(
    width=720,
    height=480
).project('albersUsa')

# Slider over the `days` column in weekly steps. Starting at max_day % step makes the
# last slider stop land exactly on the most recent day, which is also the initial value.
step = 7
max_day = int(df['days'].max())  # plain int so the chart spec holds native numbers
slider = alt.binding_range(min=max_day % step, max=max_day, step=step)
select_date = alt.selection_single(name='days', fields=['days'], bind=slider, init={'days': max_day})

# Serve the data from a local data server instead of embedding it in the chart spec
# (presumably because the full frame is too large to inline - confirm).
alt.data_transformers.enable('data_server')

points = alt.Chart(df).mark_circle(
    stroke='#444444',
    strokeWidth=0.5,
).properties(
    title='New Cases per County (7d avg)'
).encode(
    latitude='lat:Q',
    longitude='lon:Q',
    # circle size encodes the 7-day average of new cases
    size=alt.Size(
        'new_cases_7d:Q',
        scale=alt.Scale(
            base=20,
            domain=[0,6000],
            range=[0,10000]
        ),
        legend=alt.Legend(
            values=[10, 100, 1000, 2000]
        ),
        title='new cases (7d avg)'
    ),
    # circle colour encodes the change in new cases per 100k, diverging around 0
    color=alt.Color(
        'delta_new_cases_per_100k_7d:Q',
        scale=alt.Scale(
            scheme='redyellowblue',
            domainMid=0,
            domain=[-10,20]
        ),
        sort='descending',
        title='change in new cases per 100k'
    ),
    tooltip=[
        'state:N', 'county:N',
        alt.Tooltip(
            'new_cases_7d:Q',
            format='.1f',
            title='new cases (7d avg)',
        ),
        alt.Tooltip(
            'delta_new_cases_per_100k_7d:Q',
            format='.1f',
            title='change in new cases per 100k'
        )]
).add_selection(
    select_date
).transform_filter(
    # keep only the rows whose `days` value matches the slider position
    select_date
)

background + points