# kevchin - Visualizing US Covid Cases by State

## Chart and Quantify Covid Infection Curves
  * chart the normal slowing growth curves
  * determine the slope of the curve
  * compare across geographies (States and Counties)

In [1]:
!pip install arviz==0.6.1
!pip install pymc3==3.8
!pip install Theano==1.0.4



In [2]:
%matplotlib inline
import numpy as np
from IPython.display import display, Markdown
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import seaborn as sns
import plotly.express as px
from datetime import timedelta

sns.set_context('talk')
plt.style.use('seaborn-whitegrid')

import warnings
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


## Load the dataset (code adapted from the twiecki/covid19 repository):
  * https://github.com/twiecki/covid19/blob/master/covid19_exp_model.ipynb
  * a 'narrow' format for the data
  * fewer columns and more rows

In [3]:
def load_individual_US_timeseries(name, delCols=[]):
    """Download one JHU CSSE US time-series CSV and reshape it to long format.

    Parameters
    ----------
    name : str
        Suffix of the file name, e.g. 'confirmed_US'; it is inserted into
        ``time_series_covid19_{name}.csv`` and, lower-cased, stored in the
        'type' column.
    delCols : list
        Extra column names to drop in addition to the bookkeeping columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by date (DatetimeIndex named 'date') with the columns
        'country', 'admin2', 'state', 'type' and 'cases' — one row per
        original row and date column.
    """
    base_url='https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series'
    id_levels = ['Country_Region', 'Admin2', 'Province_State', 'Lat', 'Long_']
    wide = pd.read_csv(f'{base_url}/time_series_covid19_{name}.csv',
                       index_col=id_levels)

    # Identifier columns that must not be stacked along with the date columns.
    bookkeeping = ['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Combined_Key']
    unwanted = bookkeeping + delCols if len(delCols) > 0 else bookkeeping
    wide = wide.drop(unwanted, axis=1)

    # Admin2 == 'City/Region' for US
    wide['type'] = name.lower()
    wide.columns.name = 'date'

    # Move 'type' into the index and discard the coordinates, so that only
    # the date columns are left to be stacked into rows.
    keyed = wide.set_index('type', append=True)
    keyed = keyed.reset_index(['Lat', 'Long_'], drop=True)
    long_df = keyed.stack().reset_index().set_index('date')

    long_df.index = pd.to_datetime(long_df.index)
    long_df.columns = ['country', 'admin2','state', 'type', 'cases']
    return long_df

def add_confirmed_after_n_days_column(df, n_days=100, relevant_threshold=100):
    """Add a ``days_since_{n_days}`` column and index the frame by date.

    For every state, the reference date is the earliest 'date' on which any
    row of that state has ``cases >= n_days``. Despite its name, ``n_days``
    is therefore a case-count threshold. The new column holds the signed
    number of days between each row's date and that reference date; states
    that never reach the threshold get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'state', 'date' (datetime-like) and 'cases'.
        The frame is not modified; a new frame is returned.
    n_days : int
        Case-count threshold; also used in the name of the new column.
    relevant_threshold : int
        Unused. Kept only so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra column, indexed by 'date'.
    """
    # Earliest date per state with at least `n_days` cases.
    reached = df.loc[df["cases"] >= n_days]
    first_dates = reached.groupby("state")["date"].min()

    # to_datetime keeps the column datetime-typed (all NaT) even when no
    # state reaches the threshold, so the subtraction below cannot fail.
    first_seen = pd.to_datetime(df["state"].map(first_dates))

    out = df.copy()
    out[f"days_since_{n_days}"] = (out["date"] - first_seen).dt.days
    return out.set_index("date")


### Load Johns Hopkins University data for Confirmed Cases
  * a long format (one row per county and date) for the analysis frames
  * a wide format (one row per county, one additional column per day of data) for the stacked-area plots

In [4]:
## Data in Long format
# County-level confirmed counts, downloaded and reshaped by the loader.
cases_df = load_individual_US_timeseries('confirmed_US')
# The loader returns a date-indexed frame; reset_index() turns 'date' back
# into a column, which the helper reads as df.date.
cases_df = add_confirmed_after_n_days_column(cases_df.reset_index())

# Re-aggregate per (state, county, date, type) and recompute days_since_100.
# NOTE(review): the helper keys its lookup on 'state', so at county level
# days_since_100 is measured from the first date any row of that state
# reached the threshold — confirm this is intended.
cityCases_df = cases_df.groupby(['state', 'admin2', 'date', 'type']).sum().reset_index()
cityCases_df = add_confirmed_after_n_days_column(cityCases_df, 100)

# Collapse counties into one row per (state, date, type).
stateCases_df = cityCases_df.groupby(['state', 'date', 'type']).sum().reset_index()
stateCases_df = add_confirmed_after_n_days_column(stateCases_df, 100)

## Data in Wide format
# Kept in the CSV's original layout (one column per date); plotStateGeo reads
# this frame. NOTE(review): this looks like the same file the loader already
# downloaded, fetched a second time through a different URL — confirm.
confirmed_cases_us_url = 'https://github.com/CSSEGISandData/COVID-19/blob/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_US.csv?raw=true'
confirmed_cases_url = confirmed_cases_us_url
confirmed_cases = pd.read_csv(confirmed_cases_url, sep=',')

In [5]:
# Preview the long-format county frame (date index, one row per county and day).
cases_df.head()

Unnamed: 0_level_0,country,admin2,state,type,cases,days_since_100
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-22,US,,American Samoa,confirmed_us,0,
2020-01-23,US,,American Samoa,confirmed_us,0,
2020-01-24,US,,American Samoa,confirmed_us,0,
2020-01-25,US,,American Samoa,confirmed_us,0,
2020-01-26,US,,American Samoa,confirmed_us,0,


In [6]:
# Preview the wide-format frame (one row per county, one column per reporting date).
confirmed_cases.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,2/19/20,...,6/9/20,6/10/20,6/11/20,6/12/20,6/13/20,6/14/20,6/15/20,6/16/20,6/17/20,6/18/20,6/19/20,6/20/20,6/21/20,6/22/20,6/23/20,6/24/20,6/25/20,6/26/20,6/27/20,6/28/20,6/29/20,6/30/20,7/1/20,7/2/20,7/3/20,7/4/20,7/5/20,7/6/20,7/7/20,7/8/20,7/9/20,7/10/20,7/11/20,7/12/20,7/13/20,7/14/20,7/15/20,7/16/20,7/17/20,7/18/20
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,"American Samoa, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,"Guam, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,180,180,183,183,183,183,185,186,188,192,200,222,222,222,225,226,231,247,247,247,253,257,267,280,280,280,280,301,303,307,309,310,310,310,312,312,313,314,314,314
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,"Northern Mariana Islands, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,30,31,31,31,31,31,31,31,31,31,33,33,33,36,36,37,37,37
3,63072001,PR,PRI,630,72001.0,Adjuntas,Puerto Rico,US,18.180117,-66.754367,"Adjuntas, Puerto Rico, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,36,38,38,39,38,38,38,38,38,40,40,43,43,43,43,43,43,43,43,47,48,50,50,50,50,50,52,58,59,59,60,61,63,63,66,66,66,66,67,68
4,63072003,PR,PRI,630,72003.0,Aguada,Puerto Rico,US,18.360255,-67.175131,"Aguada, Puerto Rico, US",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,27,27,27,27,28,28,28,28,28,28,29,29,29,29,30,30,32,32,33,34,34,34,35,36,36,37,37,48,48,48,50,50,52,54,57,58,60,60,66,67


# Plot states that have flattened the curve
### Flag for using Plotly plots
  * Will default to using Plotly
  * Can set to False if you prefer

In [7]:
# Switch handed to the plotting helpers: True renders with Plotly,
# False with matplotlib/seaborn.
plotlyChart = True
#plotlyChart = False

In [8]:
def plotStateGeo(stateName, title="Infections by Date", plotly=True):
  """Stacked area chart of confirmed cases per county (Admin2) for one state.

  Reads the module-level wide frame `confirmed_cases`. The chart starts at
  the first date on which the state's total is above zero.

  Parameters
  ----------
  stateName : str
      Value to match against the 'Province_State' column.
  title : str
      Chart title.
  plotly : bool
      True draws with plotly express, False with a matplotlib stackplot.

  Raises
  ------
  ValueError
      If no row matches `stateName` or the state has no cases on any date.
  """
  # Date headers look like 'M/D/YY'. Matching the full pattern (instead of
  # the substring '/20') keeps columns from years other than 2020.
  dateCols = confirmed_cases.columns.str.match(r'\d{1,2}/\d{1,2}/\d{2}$')
  colList = list(confirmed_cases.columns[dateCols])
  inState = confirmed_cases["Province_State"] == stateName

  # State-wide total per date; the first positive total marks the chart start.
  dailyTotals = confirmed_cases.loc[inState, dateCols].sum(axis=0)
  withCases = dailyTotals[dailyTotals > 0]
  if withCases.empty:
    raise ValueError("No confirmed cases found for state: " + str(stateName))
  firstDateState = withCases.index[0]

  plotCols = colList[colList.index(firstDateState):]
  slice_df = confirmed_cases.loc[inState, ['Admin2'] + plotCols].copy()

  if (plotly):
    chart_df = pd.melt(slice_df, id_vars=['Admin2'],
                       var_name='date', value_name='cases')
    chart_df.columns = ['area', 'date', 'cases']
    chart_df['state'] = stateName
    fig = px.area(chart_df, x="date", y="cases", color="area",
        line_group="state",
        labels={
                     "cases": "Confirmed Cases",
                     "date": "",
               },
        title=title)
    fig.update_layout(showlegend=True)
    fig.show()
  else:
    # One list of counts per county; the x axis is the day offset from the
    # first date with cases.
    countySeries = slice_df[plotCols].values.tolist()
    fig = plt.figure(figsize=(20,10))
    fig.subplots_adjust(top=0.8)
    ax1 = fig.add_subplot(111)
    ax1.set_ylabel('Confirmed Cases')
    ax1.set_xlabel('')
    ax1.set_title(title)
    x = range(len(countySeries[0]))
    ax1.stackplot(x, countySeries)
    plt.show()

# One stacked-area chart per state, rendered with the backend chosen by plotlyChart.
for stateName in ['New York', 'Illinois', 'New Jersey']:
  plotStateGeo(stateName, stateName+"- Infections by Initial Detection Date", plotlyChart )

In [9]:
def _plotTransmissionScatter(df, groupCol, plotly, titleString):
  """Bubble chart of 'change' against 'lastN', coloured by `groupCol`.

  Shared implementation behind plotAreas and plotStates, so both use the
  same axis labels and layout. `df` needs the columns 'lastN', 'change',
  'size' and `groupCol`.
  """
  xLabel = "Time (days elapsed)"
  yLabel = "R (rate of transmission, #)"
  if (plotly):
    fig = px.scatter(df, x="lastN", y="change", log_x=False, size='size',
                  color=groupCol, hover_name=groupCol, hover_data=["size"])
    fig.update_layout(
        title=titleString,
        xaxis_title=xLabel,
        yaxis_title=yLabel,
    )
    fig.show()
  else:
    grid = sns.relplot(y="change", x="lastN",  hue=groupCol, size="size",
                sizes=(40, 400), alpha=.5, palette="muted",
                height=6, data=df)
    grid.fig.set_size_inches(20,10)
    grid.set(xlabel=xLabel, ylabel=yLabel, title=titleString)
    # Replace the facet-level legend with a compact three-column one.
    grid._legend.remove()
    plt.legend(ncol=3, prop={'size': 8})
    plt.show()

def plotAreas(df, plotly, titleString):
  """Scatter of slope ('change') over window offset ('lastN'), one colour per 'area'.

  df          : frame with the columns 'area', 'lastN', 'change', 'size'
  plotly      : True for plotly express, False for seaborn/matplotlib
  titleString : chart title
  """
  _plotTransmissionScatter(df, 'area', plotly, titleString)

def plotStates(df, plotly, titleString):
  """Scatter of slope ('change') over window offset ('lastN'), one colour per 'state'.

  df          : frame with the columns 'state', 'lastN', 'change', 'size'
  plotly      : True for plotly express, False for seaborn/matplotlib
  titleString : chart title
  """
  _plotTransmissionScatter(df, 'state', plotly, titleString)

### Week over Week, Determine slope for last n days 
Use slope of form y = mX + B (polynomial degree = 1)
  * m = slope
  * B = intercept
  * n=14, n=30 is estimated incubation and hospitalization period respectively

For each weekly offset (2 to 19 weeks back from the latest date) we fit a line to a 14-day window.
  * the window starts that many days before the latest date and covers its first 14 days
  * an offset is skipped when it would reach back to, or beyond, the first day the area had 100 or more cases

In [10]:
stateName = 'Florida'
stateLevel_df = cityCases_df.loc[cityCases_df['state'] == stateName,].copy()
polynomialDegree = 1  # straight-line fit: y = res[0] * x + res[1]
lol = []
for s in stateLevel_df['admin2'].unique():
  # Rows of this county with at least 100 cases. Filtering once per county
  # (not once per week) yields the same rows for every window below.
  area_df = stateLevel_df.loc[stateLevel_df['admin2'] == s,]
  area_df = area_df[area_df["cases"] >= 100]
  since100 = len(area_df.index)

  for week in np.arange(2,20):
    days = week * 7
    # Keep only windows that start strictly inside the >=100-case history;
    # this also skips counties without such rows (since100 == 0).
    if (days >= since100):
      continue

    # Fit on the first 14 rows of the trailing `days`-row window.
    y = area_df.tail(days).head(14)['cases']
    # .iloc, not y[0]: y is indexed by date, and integer keys on a
    # non-integer index only work through a deprecated positional fallback.
    lastY = y.iloc[0] # cases seen at start of period

    x = np.arange(0, len(y))
    res = np.polyfit(x, y, polynomialDegree)

    # 'change' is 1 + the fitted slope; 'size' is the count at window start.
    lol.append([s, since100, 1+np.round(res[0],4), lastY, days])

flat_df = pd.DataFrame(lol, columns=['area', 'since100','change', 'size', 'lastN'])
flat_df = flat_df.fillna(0)
flat_df = flat_df[flat_df['change'] > 0]
flat_df = flat_df.sort_values(by=['area','lastN'], ascending=[False, False])
state_area_df = flat_df.copy()
# Negative offsets so that time runs left to right on the x axis.
state_area_df['lastN'] = -state_area_df['lastN']
state_area_df.head()

Unnamed: 0,area,since100,change,size,lastN
591,Washington,25,4.0967,110,-21
590,Washington,25,11.5011,121,-14
589,Walton,58,2.5736,106,-56
588,Walton,58,3.4308,117,-49
587,Walton,58,4.0396,130,-42


In [11]:
# Bubble chart of the county-level slopes computed above for `stateName`.
plotAreas (state_area_df,
            plotlyChart, "Covid19 Transmission Rates within "+ stateName)

In [12]:
polynomialDegree = 1  # straight-line fit: y = res[0] * x + res[1]
lol = []
for s in stateCases_df['state'].unique():
  # Rows of this state with at least 100 cases. Filtering once per state
  # (not once per week) yields the same rows for every window below.
  area_df = stateCases_df.loc[stateCases_df['state'] == s,]
  area_df = area_df[area_df["cases"] >= 100]
  since100 = len(area_df.index)

  for week in np.arange(2,20):
    days = week * 7
    # Keep only windows that start strictly inside the >=100-case history;
    # this also skips states without such rows (since100 == 0).
    if (days >= since100):
      continue

    # Fit on the first 14 rows of the trailing `days`-row window.
    y = area_df.tail(days).head(14)['cases']
    # .iloc, not y[0]: y is indexed by date, and integer keys on a
    # non-integer index only work through a deprecated positional fallback.
    lastY = y.iloc[0] # cases seen at start of period

    x = np.arange(0, len(y))
    res = np.polyfit(x, y, polynomialDegree)

    # 'change' is 1 + the fitted slope; 'size' is the count at window start.
    lol.append([s, since100, 1+np.round(res[0],4), lastY, days])

flat_df = pd.DataFrame(lol, columns=['state', 'since100','change', 'size', 'lastN'])
flat_df = flat_df.fillna(0)
flat_df = flat_df[flat_df['change'] > 0]
flat_df = flat_df.sort_values(by=['state','lastN'], ascending=[False, False])
states_all_df = flat_df.copy()
# Negative offsets so that time runs left to right on the x axis.
states_all_df['lastN'] = -states_all_df['lastN']

In [13]:
# Spot-check the per-window slopes for one state.
states_all_df.loc[states_all_df['state'] == 'Florida',].head(5)

Unnamed: 0,state,since100,change,size,lastN
157,Florida,126,818.1077,1004,-119
156,Florida,126,1103.4945,4246,-112
155,Florida,126,1019.4242,12350,-105
154,Florida,126,858.2396,19895,-98
153,Florida,126,685.3538,26314,-91


In [14]:
# Restrict the state-level slopes to the listed states and chart them together.
stateList = ['New York', 'Illinois', 'New Jersey']
plotStates (states_all_df.loc[states_all_df['state'].isin(stateList),],
            plotlyChart, "States with Early Cases of Covid19")

## Calculate the percentage change so we can compare
  * between states
  * over a specific time period

### We will use 7 day rolling percent change
  * look at last N days
  * in this case N = 90

In [15]:
Ndays = 90
# Days with at least one case; 'date' becomes a regular column.
pct_df = stateCases_df.loc[stateCases_df['cases'] >0,].reset_index()

# Count 7 rows earlier within the same state.
# NOTE(review): this equals "7 days earlier" only if every state has one row
# per day in date order — confirm against the upstream data.
shifted = pct_df.groupby('state')['cases'].shift(7)
pct_df['weekly_percent'] = 100 * (pct_df['cases'] - shifted) / shifted

# The first 7 rows of every state have nothing to compare against.
pct_df = pct_df.dropna(subset=['weekly_percent']).copy()
pct_df['datetime'] = pd.to_datetime(pct_df['date'])
pct_df = pct_df.set_index('datetime')

# Keep the last Ndays days (cut-off included). A boolean mask is used instead
# of a string slice because the index is ordered by state, not by date.
cutoff = pct_df['date'].max() - timedelta(days=Ndays)
lastNdays = str(cutoff)
pct_df = pct_df[pct_df['date'] >= cutoff]
pct_df.loc[pct_df['state'] == 'New York',].head(10)

Unnamed: 0_level_0,date,state,type,cases,days_since_100,weekly_percent
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-19,2020-04-19,New York,confirmed_us,243382,42,28.751065
2020-04-20,2020-04-20,New York,confirmed_us,248416,43,26.905374
2020-04-21,2020-04-21,New York,confirmed_us,253519,44,24.873904
2020-04-22,2020-04-22,New York,confirmed_us,258222,45,20.409039
2020-04-23,2020-04-23,New York,confirmed_us,263460,46,17.778543
2020-04-24,2020-04-24,New York,confirmed_us,271590,47,17.776901
2020-04-25,2020-04-25,New York,confirmed_us,282143,48,18.810059
2020-04-26,2020-04-26,New York,confirmed_us,288045,49,18.350987
2020-04-27,2020-04-27,New York,confirmed_us,291996,50,17.543153
2020-04-28,2020-04-28,New York,confirmed_us,295106,51,16.403899


In [16]:
def plotPercentChange(df, stateList, 
                      titleString='Normalized 7 day % Rate of Change (over last '+str(Ndays)+' days)',
                      plotly=True):
  """Line chart of 'weekly_percent' over 'date', one line per selected state.

  Parameters
  ----------
  df : pandas.DataFrame
      Needs the columns 'state', 'date' and 'weekly_percent'.
  stateList : list-like
      States to keep (matched against the 'state' column).
  titleString : str
      Chart title. An empty (or None) title falls back to the standard
      caption built from the current value of the global `Ndays`.
  plotly : bool
      True draws with plotly express, False with seaborn/matplotlib.
  """
  if titleString:
    title = titleString
  else:
    title = 'Normalized 7 day % Rate of Change (over last '+str(Ndays)+' days)'

  # Filter once; both back ends draw the same rows.
  chart_df = df.loc[df['state'].isin(stateList),]

  if (plotly):
    fig = px.line(chart_df, x="date", y="weekly_percent", color="state",
                line_group="state", hover_name="state",
                      labels={
                      "weekly_percent": '7 day window change (%)',
                      "date": "date",
                },
          title=title)
    fig.update_xaxes(title_text='')
    fig.show()
  else:
    # Size this figure only; sns.set(rc=...) would change the seaborn theme
    # for every later plot in the notebook.
    fig, ax = plt.subplots(figsize=(20,10))
    sns.lineplot(x = "date", y = "weekly_percent", data=chart_df, hue="state", ax=ax)
    ax.set_title(title)
    ax.set_ylabel('7 day window change (%)')
    ax.set_xlabel('')
    for item in ax.get_xticklabels():
        item.set_rotation(45)
    plt.show()

## Approximate the rate of change
  * look at last 'N' days
    * N=14 and N=30 might be interesting
  * sum up % change over that period
  * sort by that sum

In [17]:
def measureChange(stateName, lastN=30):
    """Sum of the row-over-row fractional changes in cases for one state.

    Takes the rows of the module-level `stateCases_df` that belong to
    `stateName` and have a positive case count, computes pct_change() on
    'cases', and adds up the last `lastN` values (missing values are skipped
    by sum()).
    """
    is_state = stateCases_df['state'] == stateName
    has_cases = stateCases_df['cases'] > 0
    growth = stateCases_df.loc[is_state & has_cases, 'cases'].pct_change()
    recent = growth[-lastN:]
    return recent.sum()

# One row per state present in stateCases_df.
d = {"states": pd.Series(stateCases_df['state'].unique())}
ranked_df = pd.DataFrame(d)

# Number of trailing rows handed to measureChange as lastN.
# Note: `nDays` is a different global from `Ndays` used by the percent-change cells.
nDays = 14
ranked_df['rate'] = ranked_df.apply(lambda row : 
                                    measureChange(row['states'], nDays), axis=1)
# Ascending sort: head() holds the smallest summed growth, tail() the largest.
ranked_df = ranked_df.sort_values(by='rate')
ranked_df.head()

Unnamed: 0,states,rate
30,New Jersey,0.021635
32,New York,0.024202
6,Connecticut,0.024903
21,Massachusetts,0.03052
40,Rhode Island,0.046272


In [18]:
# ranked_df is sorted ascending by 'rate': head() = smallest summed growth.
stateList = ranked_df.head(10)['states']
stateCount = len(stateList)
plotStates (states_all_df.loc[states_all_df['state'].isin(stateList),],
            plotlyChart, "Top "+ str(stateCount) 
            +" States with Declining/Low Rates of Covid19 (last "+str(nDays)+" days)")

topState = ranked_df.head(1).iloc[0]['states']
plotStateGeo(topState, topState+' - is Least Infection Change % by State (last '+str(nDays)+" days)" , plotlyChart)
stateList = ranked_df.head(5)['states']
titleString = 'Declining States: '+'Normalized 7 day % Rate of Change (over last '+str(Ndays)+' days)'
# Pass the notebook-wide flag so this chart follows plotlyChart as well.
plotPercentChange(pct_df, stateList, titleString, plotlyChart)

# tail() = largest summed growth.
stateList = ranked_df.tail(10)['states']
stateCount = len(stateList)
plotStates (states_all_df.loc[states_all_df['state'].isin(stateList),],
            plotlyChart, "Top "+ str(stateCount) 
            +" States with Rising/High Rates of Covid19 (last "+str(nDays)+" days)")

titleString = 'Rising States: '+'Normalized 7 day % Rate of Change (over last '+str(Ndays)+' days)'
stateList = ranked_df.tail(5)['states']
plotPercentChange(pct_df, stateList, titleString, plotlyChart)

topState = ranked_df.tail(1).iloc[0]['states']
plotStateGeo(topState, topState+' - is Most Infection Change % by State (last '+str(nDays)+" days)", plotlyChart)



In [19]:
# Hand-picked list of states; the chart title is built from its length.
stateList = ['Arkansas', 'Iowa', 'Nebraska', 'North Dakota', 
          'South Dakota', 'Utah',  'Wyoming']
stateCount = len(stateList)
plotStates (states_all_df.loc[states_all_df['state'].isin(stateList),],
            plotlyChart, "The "+ str(stateCount) 
            +" States with No Lock-down Policy")

## Look at percentage change over 30 days

In [20]:
Ndays = 30
# Days with at least one case; 'date' becomes a regular column.
pct30_df = stateCases_df.loc[stateCases_df['cases'] >0,].reset_index()

# Count 7 rows earlier within the same state.
# NOTE(review): this equals "7 days earlier" only if every state has one row
# per day in date order — confirm against the upstream data.
shifted = pct30_df.groupby('state')['cases'].shift(7)
pct30_df['weekly_percent'] = 100 * (pct30_df['cases'] - shifted) / shifted

# The first 7 rows of every state have nothing to compare against.
pct30_df = pct30_df.dropna(subset=['weekly_percent']).copy()
pct30_df['datetime'] = pd.to_datetime(pct30_df['date'])
pct30_df = pct30_df.set_index('datetime')

# Keep the last Ndays days (cut-off included). A boolean mask is used instead
# of a string slice because the index is ordered by state, not by date.
cutoff = pct30_df['date'].max() - timedelta(days=Ndays)
lastNdays = str(cutoff)
pct30_df = pct30_df[pct30_df['date'] >= cutoff]
pct30_df.loc[pct30_df['state'] == 'New York',].head(10)


Unnamed: 0_level_0,date,state,type,cases,days_since_100,weekly_percent
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-06-18,2020-06-18,New York,confirmed_us,385760,102,1.278053
2020-06-19,2020-06-19,New York,confirmed_us,386556,103,1.268489
2020-06-20,2020-06-20,New York,confirmed_us,387272,104,1.213182
2020-06-21,2020-06-21,New York,confirmed_us,387936,105,1.20316
2020-06-22,2020-06-22,New York,confirmed_us,388488,106,1.183506
2020-06-23,2020-06-23,New York,confirmed_us,389085,107,1.172723
2020-06-24,2020-06-24,New York,confirmed_us,389666,108,1.174632
2020-06-25,2020-06-25,New York,confirmed_us,390415,109,1.206709
2020-06-26,2020-06-26,New York,confirmed_us,391220,110,1.206552
2020-06-27,2020-06-27,New York,confirmed_us,391923,111,1.200965


In [21]:
stateList = ['New York', 'Illinois', 'New Jersey', 'California', 'Texas', 'Florida']
stateCount = len(stateList)
titleString = "The "+ str(stateCount) +" States with a Lock-down Policy over "+str(Ndays)+" days"
# Pass the notebook-wide flag so this chart follows plotlyChart as well.
plotPercentChange(pct30_df, stateList, titleString, plotlyChart)

In [22]:
xStates = 10
# Fixed seed so that Restart & Run All draws the same states every time.
randomSeed = 42
titleString = 'Random States: '+'Normalized 7 day % Rate of Change (over last '+str(Ndays)+' days)'
# Sample distinct state names directly instead of shuffling every row first;
# never ask for more states than are available.
uniqueStates = pd.Series(pct30_df['state'].unique())
stateList = list(uniqueStates.sample(min(xStates, len(uniqueStates)),
                                     random_state=randomSeed))
# Pass the notebook-wide flag so this chart follows plotlyChart as well.
plotPercentChange(pct30_df, stateList, titleString, plotlyChart)

In [23]:
state = 'Alabama'
# Census Bureau 2019 population estimates, fetched over https.
census = 'https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/national/totals/nst-est2019-alldata.csv'
pop_df = pd.read_csv(census,usecols=['NAME', 'POPESTIMATE2019'])
# Rename by name rather than by position, so the result does not depend on
# the column order in the file.
pop_df = pop_df.rename(columns={'NAME': 'state', 'POPESTIMATE2019': 'population'})
allStates = list(stateCases_df['state'].unique())
# .copy() so the column assignments below write to an independent frame
# rather than to a filtered view (SettingWithCopyWarning).
pop_df = pop_df[pop_df['state'].isin(allStates)].copy()
pop_df['millions'] = np.round(pop_df['population']/1e6,2)
N_population = pop_df.loc[pop_df['state'] == state, 'population'].iloc[0]
# Labels run from Quintile5 for the smallest populations to Quintile1 for the largest.
bin_labels_5 = ['Quintile'+str(x) for x in range(5,0,-1)]
pop_df['populationGroup'] = pd.qcut(pop_df['population'],
                                    q=[0, .2, .4, .6, .8, 1],
                              labels=bin_labels_5)
print (pop_df.columns, state, N_population)

# Mean weekly percent change per state over the pct30_df window.
rateByPop_df = pd.pivot_table(pct30_df, 
                           index=['state'], values=['weekly_percent'], 
                           aggfunc='mean').reset_index()
rateByPop_df['weekly_percent'] = np.round(rateByPop_df['weekly_percent'],2)
rateByPop_df = pd.merge(rateByPop_df, pop_df, on='state')

# Smallest days_since_100 value inside the window, per state.
ageSince100_df = pd.pivot_table(pct30_df, 
                           index=['state'], values=['days_since_100'], 
                           aggfunc='min').reset_index()
ageSince100_df.columns = ['state', 'since100cases']

# Mean of the 'cases' column inside the window, per state.
cases30_df = pd.pivot_table(pct30_df, 
                           index=['state'], values=['cases'], 
                           aggfunc='mean').reset_index()
cases30_df.columns = ['state', 'avgCases30Day']
cases30_df['avgCases30Day'] = cases30_df['avgCases30Day'].astype(int)

rateByPop_df = pd.merge(rateByPop_df, ageSince100_df, on='state')
rateByPop_df = pd.merge(rateByPop_df, cases30_df, on='state')
rateByPop_df['cases_per_mm'] = np.round(rateByPop_df['avgCases30Day']/rateByPop_df['millions'],2)
del ageSince100_df
del cases30_df
rateByPop_df.head()


Index(['state', 'population', 'millions', 'populationGroup'], dtype='object') Alabama 4903185


Unnamed: 0,state,weekly_percent,population,millions,populationGroup,since100cases,avgCases30Day,cases_per_mm
0,Alabama,21.19,4903185,4.9,Quintile3,90,43077,8791.22
1,Alaska,23.17,731545,0.73,Quintile5,81,1120,1534.25
2,Arizona,34.42,7278717,7.28,Quintile2,89,91448,12561.54
3,Arkansas,22.59,3017804,3.02,Quintile4,90,22801,7550.0
4,California,20.88,39512223,39.51,Quintile1,101,260133,6583.98


In [24]:
# Bubble chart: x = days since 100 cases, y = mean weekly change,
# bubble size = population in millions, colour = population quintile.
titleString = 'Flatten - Grade States by Population, Rate of Transmission, and Disease Maturity (last 30 days)'
fig = px.scatter(
    rateByPop_df,
    x="since100cases",
    y="weekly_percent",
    log_x=False,
    size='millions',
    color='populationGroup',
    hover_name="state",
    hover_data=["avgCases30Day"],
)
fig.update_layout(title=titleString,
                  xaxis_title="Days since 100 cases (higher=worse)",
                  yaxis_title="Weekly Change Rate (%) (higher=worse)")
fig.show()

In [25]:
# NOTE(review): this cell builds the same chart as the preceding cell (same
# frame, columns, title and axis labels) — confirm whether a different
# view was intended here.
fig = px.scatter(rateByPop_df, x="since100cases", y="weekly_percent", log_x=False, size='millions',
              color='populationGroup', hover_name="state", hover_data=["avgCases30Day"])
titleString = 'Flatten - Grade States by Population, Rate of Transmission, and Disease Maturity (last 30 days)'
fig.update_layout(
    title=titleString,
    xaxis_title="Days since 100 cases (higher=worse)",
    yaxis_title="Weekly Change Rate (%) (higher=worse)",
)

#fig.update_xaxes(autorange="reversed")
fig.show()

In [26]:
# Bubble chart: x = days since 100 cases, y = cases_per_mm,
# bubble size = population in millions, colour = population quintile.
titleString = 'Impacted - States by Population, Rate of Transmission, and Disease Maturity (last 30 days)'
fig = px.scatter(
    rateByPop_df,
    x="since100cases",
    y="cases_per_mm",
    log_x=False,
    size='millions',
    color='populationGroup',
    hover_name="state",
    hover_data=["avgCases30Day"],
)
fig.update_layout(title=titleString,
                  xaxis_title="Days since 100 cases (lower=worse)",
                  yaxis_title="Cumulative Cases per Million (higher=worse)")
fig.show()

## Now look at the counties of selected states that are not doing well:
  * Arizona
  * Florida
  * Texas
  * California

In [27]:
# County-level stacked-area chart for each listed state.
stateList = ['Arizona', 'Florida', 'Texas', 'California']
for stateName in stateList:
  plotStateGeo(stateName, stateName+"- Infections by Initial Detection Date", 
               plotlyChart )

# State-level slope bubbles for the same four states on one chart.
plotStates (states_all_df.loc[states_all_df['state'].isin(stateList),],
            plotlyChart, "The Large Increasing States: "+ ",".join(stateList) 
            +" combined Chart")

# Same chart for a second hand-picked list, for comparison.
stateList = ['New York', 'New Jersey', 'Illinois', 'Massachusetts']
plotStates (states_all_df.loc[states_all_df['state'].isin(stateList),],
            plotlyChart, "The Large Decreasing States: "+ ",".join(stateList) 
            +" combined Chart")

In [28]:
stateList = ['New York', 'New Jersey', 'Illinois', 'Massachusetts']

titleString = 'Decreasing States: '+'Normalized 7 day % Rate of Change (over last '+str(90)+' days)'
# Pass the notebook-wide flag so this chart follows plotlyChart as well.
plotPercentChange(pct_df, stateList, titleString, plotlyChart)

In [29]:
stateList = ['New York', 'New Jersey', 'Illinois', 'Massachusetts']

titleString = 'Decreasing States: '+'Normalized 7 day % Rate of Change (over last '+str(30)+' days)'
# Pass the notebook-wide flag so this chart follows plotlyChart as well.
plotPercentChange(pct30_df, stateList, titleString, plotlyChart)

# Done