In [4]:
import numpy as np
import pandas as pd
import plotly.express as px
from datetime import datetime
import dateutil.parser
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def data_parser(time_in):
    """
    Process dates in dataframe to string format
    """
    return dateutil.parser.parse(time_in).strftime('%m/%d')

In [6]:
df_data = pd.read_csv("states-daily.csv")  # daily U.S. data from https://covidtracking.com/
df_data.sort_values(by='date', ascending=True, inplace=True)

df_states = pd.read_csv("nst-est2019-popchg2010_2019.csv")  # census population data
df_state_names = pd.read_csv("state_names.csv")  # state names

# map state names to abbreviations
d_abb_to_name = {}
for i in range(0, df_state_names.shape[0]):
    d_abb_to_name[df_state_names['Abbreviation'].values[i]] = df_state_names['State'].values[i]
    
# get population by state
d_pop = {}
for v in d_abb_to_name.keys():
    d_pop[v] = df_states.loc[df_states['NAME'] == d_abb_to_name[v]][['POPESTIMATE2019']].values[0][0]

# for each state, get the slice of dataframe as appropriate, and then convert dates
ddf_states = {}
for name, group in df_data.groupby("state"):
    ddf_states[name] = group
    ddf_states[name]['date_parsed'] = ddf_states[name].loc[:, ('dateChecked')].apply(lambda x: data_parser(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



# Visualize Raw Data
This section simply checks to see whether the data can be visualized appropriately. We are primarily interested in state-by-state pairs.

In [8]:
states_of_interest = ['GA', 'NY']

for v in states_of_interest:
    curr_date = ddf_states[v]['date_parsed']
    curr_pos = ddf_states[v]['positive'].values
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=curr_date, y=curr_pos,
                  mode='lines',
                  name='positive cases'))
    fig.update_layout(title='Positive Cases of COVID19',
                   xaxis_title='Date',
                   yaxis_title='Number of Positive Cases')
    fig.show()


In [20]:
v = 'CA'
curr_date = ddf_states[v]['date_parsed']
curr_pos = ddf_states[v]['positive'].values

In [21]:
ddf_states[v]

Unnamed: 0,date,state,positive,negative,pending,hospitalized,death,total,dateChecked,date_parsed
856,20200304,CA,53.0,462.0,,,,515,2020-03-04T21:00:00Z,03/04
832,20200305,CA,53.0,462.0,,,,515,2020-03-05T21:00:00Z,03/05
796,20200306,CA,60.0,462.0,,,,522,2020-03-06T21:00:00Z,03/06
746,20200307,CA,69.0,462.0,,,,531,2020-03-07T21:00:00Z,03/07
695,20200308,CA,88.0,462.0,,,,550,2020-03-08T20:00:00Z,03/08
646,20200309,CA,114.0,690.0,,,,804,2020-03-09T20:00:00Z,03/09
595,20200310,CA,133.0,690.0,,,,823,2020-03-10T20:00:00Z,03/10
544,20200311,CA,157.0,916.0,,,,1073,2020-03-11T20:00:00Z,03/11
493,20200312,CA,202.0,916.0,,,4.0,1118,2020-03-12T20:00:00Z,03/12
442,20200313,CA,202.0,916.0,,,4.0,1118,2020-03-13T20:00:00Z,03/13


In [23]:
print(d_pop[v])

39512223


# Visualize Per Million
It's more instructive to look at the data in terms of cases per million. We also look at tests per million.

In [24]:
def viz_pos_vs_test(ddf_states_in, d_pop_in, states_in):
    date = ddf_states_in[states_in[0]]['date_parsed']
    pos1 = ddf_states_in[states_in[0]]['positive'].values
    pos2 = ddf_states_in[states_in[1]]['positive'].values

    neg1 = np.nan_to_num(ddf_states_in[states_in[0]]['negative'].values)
    neg2 = np.nan_to_num(ddf_states_in[states_in[1]]['negative'].values)

    test1 = pos1+neg1
    test2 = pos2+neg2


    fig = go.Figure()
    #fig = make_subplots(rows=2, cols=1)

    fig.add_trace(go.Scatter(
        x=date,
        y=pos1,
        name = "cases per million ({})".format(states_in[0]),
        connectgaps=True # override default to connect the gaps
    ))
    fig.add_trace(go.Scatter(
        x=date,
        y=pos2,
        name="cases per million ({})".format(states_in[1]),
        connectgaps=True # override default to connect the gaps
    ))

    # Add figure title
    fig.update_layout(
        title_text="Positive Cases of COVID19 (Per Million)"
    )

    # Set x-axis title
    fig.update_xaxes(title_text="date")

    fig.show()


    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=date,
        y=test1,
        name = "tests per million ({})".format(states_in[0]),
        connectgaps=True # override default to connect the gaps
    ))
    fig.add_trace(go.Scatter(
        x=date,
        y=test2,
        name="tests per million ({})".format(states_in[1]),
        connectgaps=True # override default to connect the gaps
    ))

    # Add figure title
    fig.update_layout(
        title_text="Tests of COVID19 (Per Million)"
    )

    # Set x-axis title
    fig.update_xaxes(title_text="date")
    return fig



pairs = [('NY', 'WA')]
for curr_pair in pairs:    
    print("\n\nMaking figures for pair {}".format(curr_pair))
    curr_fig = viz_pos_vs_test(ddf_states_in=ddf_states, d_pop_in=d_pop, states_in=curr_pair)
    curr_fig.show()




Making figures for pair ('NY', 'WA')


In [25]:
pairs = [('NY', 'CA')]
for curr_pair in pairs:    
    print("\n\nMaking figures for pair {}".format(curr_pair))
    curr_fig = viz_pos_vs_test(ddf_states_in=ddf_states, d_pop_in=d_pop, states_in=curr_pair)
    curr_fig.show()



Making figures for pair ('NY', 'CA')


In [14]:
pairs = [('NY', 'GA')]
for curr_pair in pairs:    
    print("\n\nMaking figures for pair {}".format(curr_pair))
    curr_fig = viz_pos_vs_test(ddf_states_in=ddf_states, d_pop_in=d_pop, states_in=curr_pair)
    curr_fig.show()



Making figures for pair ('NY', 'GA')
