In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
!cd ~/Documents/Data_Projects/COVID/COVID-19

In [32]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 150)
pd.set_option('display.max_rows', 150)

from datetime import timedelta
import math

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
mpl.rcParams['figure.dpi']= 600
sns.set()

import plotly
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go

from data_clean import *
from viz_helper import *
%load_ext autoreload
%autoreload 10

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
# Read in live data: country-level series from Johns Hopkins (CSSE),
# state-level and county-level series from the New York Times.
hopkins_global_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
nyt_states_url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv'
nyt_counties_url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'

countries = pd.read_csv(hopkins_global_url)
states = pd.read_csv(nyt_states_url)
local = pd.read_csv(nyt_counties_url)

In [16]:
# Run the project cleaning helpers over each raw download, in the same order
# as the downloads: Hopkins countries, NYT states, NYT counties.
df_country, df_state, df_local = (
    clean_data_hopkins(countries, country='all'),
    clean_data_nyt(states, level='state'),
    clean_data_nyt(local, level='local'),
)


divide by zero encountered in log


invalid value encountered in log



In [57]:
# Load the containment dates and keep one row per state: after sorting by
# date, the last row of each state group is that state's latest entry.
containment = pd.read_csv('containment.txt')
containment.columns = ['State', 'Date']
containment = (
    containment
    .sort_values('Date')
    .groupby('State')
    .last()
    .reset_index()
)
# Spot-check a few random rows.
containment.sample(5)

Unnamed: 0,State,Date
4,Colorado,2020-03-26
12,Illinois,2020-03-22
10,Hawaii,2020-03-25
20,Michigan,2020-03-24
32,Pennsylvania,2020-03-19


In [17]:
# Latest rows of the cleaned US series.
is_us = df_country['Country'] == 'US'
df_country[is_us].tail()

Unnamed: 0,Country,Date,Confirmed,EpidemicStartDate,DaysElapsed,NewConfirmed,DaysElapsed_Log,Confirmed_Log,NewConfirmed_Log
12048,US,2020-03-28,121478,2020-01-22,67,19821.0,4.204693,11.707488,9.894497
12228,US,2020-03-29,140886,2020-01-22,68,19408.0,4.219508,11.855706,9.873441
12408,US,2020-03-30,161807,2020-01-22,69,20921.0,4.234107,11.99416,9.948509
12588,US,2020-03-31,188172,2020-01-22,70,26365.0,4.248495,12.145112,10.179793
12768,US,2020-04-01,213372,2020-01-22,71,25200.0,4.26268,12.270792,10.134599


In [18]:
# Latest rows of the cleaned county-level series for Albemarle.
is_albemarle = df_local['County'] == 'Albemarle'
df_local[is_albemarle].tail()

Unnamed: 0,County,State,Date,Confirmed,EpidemicStartDate,DaysElapsed,NewConfirmed,DaysElapsed_Log,Confirmed_Log,NewConfirmed_Log
337,Albemarle,Virginia,2020-03-28,16,2020-03-07,22,8.0,3.091042,2.772589,2.079442
338,Albemarle,Virginia,2020-03-29,17,2020-03-07,23,1.0,3.135494,2.833213,0.0
339,Albemarle,Virginia,2020-03-30,19,2020-03-07,24,2.0,3.178054,2.944439,0.693147
340,Albemarle,Virginia,2020-03-31,21,2020-03-07,25,2.0,3.218876,3.044522,0.693147
341,Albemarle,Virginia,2020-04-01,21,2020-03-07,26,0.0,3.258097,3.044522,-inf


In [19]:
# NYT counts have recently been higher than the daily reports from Hopkins
# (a negative difference means the NYT state total exceeds the Hopkins US total).
# Select `Confirmed` before summing so only that column is aggregated; summing
# the whole frame also tries to aggregate the non-numeric columns, which is
# wasted work and can fail on columns that do not support addition.
hopkins_us_daily = df_country[df_country.Country == 'US'].groupby('Date')['Confirmed'].sum()
nyt_us_daily = df_state.groupby('Date')['Confirmed'].sum()
(hopkins_us_daily - nyt_us_daily).to_frame().tail(7)

Unnamed: 0_level_0,Confirmed
Date,Unnamed: 1_level_1
2020-03-26,-1697.0
2020-03-27,-991.0
2020-03-28,-2150.0
2020-03-29,-1275.0
2020-03-30,-1989.0
2020-03-31,338.0
2020-04-01,-1089.0


In [20]:
# New York only: daily new cases against cumulative cases on log-log axes,
# coloured by days elapsed.
px.scatter(
    data_frame=df_state[df_state.State.isin(['New York'])],
    x='Confirmed', y='NewConfirmed', color='DaysElapsed', symbol='State',
    log_x=True, log_y=True,
    # `labels` is keyed by column name (not by 'x'/'y'); with the old keys the
    # custom axis titles were silently ignored.
    labels={'NewConfirmed': 'Daily New Cases', 'Confirmed': 'Cumulative Confirmed Cases'},
    hover_name='Date', hover_data=['NewConfirmed', 'Confirmed', 'DaysElapsed'],
    title='Logged New Confirmed Cases vs Logged Cumulative Cases Per Day', template='ggplot2',
)

In [63]:
# Display the colour strings in plotly's sequential `Purp` palette.
px.colors.sequential.Purp

['rgb(243, 224, 247)',
 'rgb(228, 199, 241)',
 'rgb(209, 175, 232)',
 'rgb(185, 152, 221)',
 'rgb(159, 130, 206)',
 'rgb(130, 109, 186)',
 'rgb(99, 88, 159)']

In [76]:
# States to compare. The order of this list drives legend order and colours.
# NOTE(review): this rebinds `states`, which an earlier cell uses for the raw
# NYT state-level DataFrame; re-running the cleaning cell after this one will
# then receive a list. Consider renaming one of the two.
states = ['New York', 'New Jersey', 'Washington', 'Virginia']

# Pin one colour per state so the scatter markers and the containment line for
# a state always match. Relying on `color_discrete_sequence` alone assigns
# colours in order of appearance in the data, which need not match the list.
palette = px.colors.sequential.Aggrnyl
state_colors = {state: palette[i % len(palette)] for i, state in enumerate(states)}

fig = px.scatter(
    data_frame=df_state[df_state.State.isin(states)],
    x='Date', y='Confirmed', color='State',
    log_x=False, log_y=True,
    # `labels` is keyed by column name; the y axis here is the cumulative count.
    labels={'Confirmed': 'Cumulative Confirmed Cases'},
    hover_name='Date', hover_data=['NewConfirmed', 'Confirmed', 'DaysElapsed'],
    title='Confirmed Cases in Select States', template='ggplot2',
    color_discrete_map=state_colors,
    category_orders={'State': states},
)

# Draw a vertical line at each state's containment date.
for state in states:
    matches = containment.loc[containment.State == state, 'Date']
    if matches.empty:
        # Skip states with no containment row instead of raising IndexError.
        print('No containment date found for {}'.format(state))
        continue
    containment_date = matches.values[0]
    fig.add_shape(dict(type="line",
                       # x in data coordinates; y in paper coordinates (0..1) so
                       # the line spans the full plot height whatever the case
                       # counts are, rather than a hard-coded 1..100000.
                       xref='x', yref='paper',
                       x0=containment_date, y0=0, x1=containment_date, y1=1,
                       line=dict(color=state_colors[state],
                                 width=3
                                )))
    print('{} entered containment on {}'.format(state, containment_date))

fig.show()

New York entered containment on 2020-03-23
New Jersey entered containment on 2020-03-22
Washington entered containment on 2020-03-23
Virginia entered containment on 2020-03-30


In [22]:
# US vs South Korea vs Italy: daily new cases against cumulative cases on
# log-log axes, one colour per country.
px.scatter(
    data_frame=df_country[df_country.Country.isin(['US', 'Korea, South', 'Italy'])],
    x='Confirmed', y='NewConfirmed', color='Country',
    log_x=True, log_y=True,
    # `labels` is keyed by column name (not by 'x'/'y'); with the old keys the
    # custom axis titles were silently ignored.
    labels={'NewConfirmed': 'Daily New Cases', 'Confirmed': 'Cumulative Confirmed Cases'},
    hover_name='Date', hover_data=['NewConfirmed', 'Confirmed', 'DaysElapsed'],
    title='New Confirmed Cases vs Cumulative Cases Per Day', template='ggplot2',
)