<IMG SRC="https://github.com/jacquesroy/byte-size-data-science/raw/master/images/Banner.png" ALT="BSDS Banner" WIDTH=1195 HEIGHT=200>

<table align="left">
    <tr><td>
<a rel="license" href="http://creativecommons.org/licenses/by/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by/4.0/88x31.png" /></a></td><td>This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>.</td>
    </tr>
    <tr><td>Jacques Roy, Byte Size Data Science</td><td> </td></tr>
    </table>

# Time Series with Covid Data

In [None]:
# Embed the YouTube video that accompanies this notebook
from IPython.display import IFrame

video_url = "https://www.youtube.com/embed/-OCj9L11y_0?rel=0&amp;controls=0&amp;showinfo=0"
# The bare expression on the last line is what makes the player render as the cell output
IFrame(src=video_url, width=560, height=315)

In [None]:
# Libraries needed in the notebook
import requests, json
import pandas as pd
import numpy as np
import datetime as dt
import dateutil.parser
from io import StringIO
import math

# pd.set_option('display.max_colwidth', -1)

import matplotlib.pyplot as plt
# matplotlib.patches lets us create colored patches, which we can use for legends in plots
import matplotlib.patches as mpatches
%matplotlib inline

## Getting the data
Let's look at data from the CDC.

In [None]:
# Library used to read datasets
# https://github.com/xmunoz/sodapy
!pip install sodapy 2>&1 >pipsodapy.txt

from sodapy import Socrata

In [None]:
# Anonymous access: the application token is None and neither a username nor
# a password is supplied. This only works with public data sets.
cdc_domain = "data.cdc.gov"
client = Socrata(cdc_domain, None)

### Get records from March 2021
This API returns the data as character strings and limits queries to 10,000 records. We need to loop.

We also select specific attributes, the same as in notebook 84.

In [None]:
from datetime import date

# A fixed start date (rather than today's date) is used so that results can be
# compared across future runs of the notebook.
start_date = date(2021, 3, 1).isoformat()

# Row filter and column list passed to the API on every request
where = "submission_date >= '%s'" % start_date
select = ",".join(
    ["submission_date", "state", "tot_cases", "new_case", "tot_death", "new_death"]
)

In [None]:
# Download every matching record, one page at a time.
# Pages are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previously fetched rows on every iteration.
page_size = 10000
pages = []
offset = 0
while True:
    # An explicit order keeps the paging stable: without it the API does not
    # promise the same row order from one request to the next, so pages could
    # overlap or skip rows.
    result = client.get('9mfq-cb36', select=select, where=where, order=':id',
                        limit=page_size, offset=offset)
    if len(result) == 0:
        break  # an empty page means everything has been read
    pages.append(pd.DataFrame(result))
    offset += page_size

# ignore_index gives the combined frame a single 0..n-1 index instead of
# repeating each page's own 0..9999 labels.
data_df = pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()

print("Number of records: " + str(data_df.shape[0]))
data_df.head()

### We need to convert to the proper data types

In [None]:
# Target dtype for each column: the date becomes a timestamp, every count a float
count_columns = ['tot_cases', 'new_case', 'tot_death', 'new_death']
need_types = {'submission_date': 'datetime64[ns]'}
need_types.update({column: float for column in count_columns})

# Convert first, then order chronologically and by state within each day
typed_df = data_df.astype(need_types)
data2_df = typed_df.sort_values(['submission_date', 'state'])

data2_df.head()

### States population
We use this so we can compare states.

Amazingly difficult to find. I used a table found at: https://data.ers.usda.gov/reports.aspx?ID=17827

Using the 2019 column. The important part is that we have a fixed data point for the population of each state.

In [None]:
# Population table embedded as CSV text: one row per state code, plus the
# national total ('US'), the District of Columbia and Puerto Rico.
states_pop = """
state,description,total
US,United States,328239523
AL,Alabama,4903185
AK,Alaska,731545
AZ,Arizona,7278717
AR,Arkansas,3017804
CA,California,39512223
CO,Colorado,5758736
CT,Connecticut,3565287
DE,Delaware,973764
DC,District of Columbia,705749
FL,Florida,21477737
GA,Georgia,10617423
HI,Hawaii,1415872
ID,Idaho,1787065
IL,Illinois,12671821
IN,Indiana,6732219
IA,Iowa,3155070
KS,Kansas,2913314
KY,Kentucky,4467673
LA,Louisiana,4648794
ME,Maine,1344212
MD,Maryland,6045680
MA,Massachusetts,6892503
MI,Michigan,9986857
MN,Minnesota,5639632
MS,Mississippi,2976149
MO,Missouri,6137428
MT,Montana,1068778
NE,Nebraska,1934408
NV,Nevada,3080156
NH,New Hampshire,1359711
NJ,New Jersey,8882190
NM,New Mexico,2096829
NY,New York,19453561
NC,North Carolina,10488084
ND,North Dakota,762062
OH,Ohio,11689100
OK,Oklahoma,3956971
OR,Oregon,4217737
PA,Pennsylvania,12801989
RI,Rhode Island,1059361
SC,South Carolina,5148714
SD,South Dakota,884659
TN,Tennessee,6829174
TX,Texas,28995881
UT,Utah,3205958
VT,Vermont,623989
VA,Virginia,8535519
WA,Washington,7614893
WV,West Virginia,1792147
WI,Wisconsin,5822434
WY,Wyoming,578759
PR,Puerto Rico,3193694
"""

# Parse the text as if it were a CSV file; the first non-blank line is the header
pop_df = pd.read_csv(StringIO(states_pop))
# pop_df.head()

### Plot multiple states using the number of cases per 100K people
Also plot the US to see which states are above and below the national average

In [None]:
# National daily totals: for each submission date, add up the numeric measures
# over every row reported for that day.
# Only the count columns are summed. Aggregating the whole frame would also try
# to "sum" the string `state` column, and passing np.sum to agg() is deprecated
# in recent pandas releases.
# NOTE(review): the total covers every `state` value returned by the API;
# confirm that this matches the 'US' population row used for normalization.
count_cols = ['tot_cases', 'new_case', 'tot_death', 'new_death']
us_df = data2_df.groupby('submission_date')[count_cols].sum()
# us_df.tail()

In [None]:
# US population expressed in units of 100K residents; used as the divisor that
# turns national daily counts into cases per 100K residents.
us_population = pop_df.loc[pop_df['state'] == 'US', 'total'].values[0]
us_100k = us_population / 100000
us_100k_df = us_df[['new_case']].div(us_100k)

In [None]:
# States to compare against the national curve
states_list = ['AZ', 'CA', 'FL', 'HI', 'MI', 'ND', 'NY', 'OR', 'SD', 'TX']

# Keep only the rows for those states and the three columns the plots need
selected_rows = data2_df['state'].isin(states_list)
states_df = data2_df.loc[selected_rows, ['submission_date', 'state', 'new_case']]

In [None]:
# One panel per state, two panels per row; each panel shows the state's daily
# new cases per 100K residents together with the national curve.
nb_rows = math.ceil(len(states_list) / 2)

fig, axes = plt.subplots(nrows=nb_rows, ncols=2, figsize=(15, 20))
for ix, ax in enumerate(axes.flatten()):
    if ix >= len(states_list):
        fig.delaxes(ax)  # Remove the empty panel left over by an odd number of states
        continue

    state_code = states_list[ix]
    # State population in units of 100K residents
    div_val = pop_df[pop_df.state == state_code].total.values[0] / 100000

    # assign() builds a new frame; writing the column back onto the filtered
    # slice (tmp_pd.new_case = ...) triggers pandas' SettingWithCopyWarning.
    tmp_pd = states_df.loc[states_df['state'] == state_code, ['submission_date', 'new_case']]
    tmp_pd = tmp_pd.assign(new_case=tmp_pd['new_case'] / div_val)

    tmp_pd.plot.line(ax=ax, x='submission_date', y='new_case', label=state_code,
                     title=state_code + " New Cases", legend=True)
    us_100k_df.plot.line(ax=ax, y='new_case', label="US", legend=True)
    ax.set_xlabel('')

## Oregon with moving average
Use Pandas **`rolling`** capability

In [None]:
# Oregon rows, indexed by date; the date is also kept as a regular column
is_oregon = data2_df['state'] == 'OR'
or_df = data2_df.loc[is_oregon].set_index('submission_date', drop=False)

# 7-day rolling mean; min_periods=1 averages whatever is available for the first days
weekly_mean = or_df['new_case'].rolling(7, min_periods=1).mean()
or_df = or_df.assign(ma7=weekly_mean)

fig, ax = plt.subplots(figsize=(18, 6))
for column in ('new_case', 'ma7'):
    or_df[column].plot.line(ax=ax, legend=True, grid=True)
ax.set_title('Daily cases, Oregon')
plt.show()

### Different moving average
`ewm`: exponentially weighted moving average

In [None]:
# Exponentially weighted mean of the daily cases, computed with a half-life of 4.0
smoothed_cases = or_df['new_case'].ewm(halflife=4.0).mean()
or_df = or_df.assign(ewm=smoothed_cases)

# Raw series and both smoothed versions on the same axes
fig, ax = plt.subplots(figsize=(18, 6))
for column in ('new_case', 'ma7', 'ewm'):
    or_df[column].plot.line(ax=ax, legend=True, grid=True)
ax.set_title('Daily cases, Oregon')
plt.show()

## Comparing States with moving averages
MI, ND, OR, and the USA

In [None]:
fig, ax = plt.subplots(figsize=(18, 6))

for state in ('MI', 'ND', 'OR'):
    # State population in units of 100K residents
    per_100k = pop_df.loc[pop_df['state'] == state, 'total'].values[0] / 100000
    state_df = data2_df.loc[data2_df['state'] == state].set_index('submission_date', drop=False)

    # New columns are added with assign(); overwriting 'new_case' on the
    # filtered frame causes a pandas warning.
    state_df = state_df.assign(normalized=state_df['new_case'] / per_100k)
    state_df = state_df.assign(ma7=state_df['normalized'].rolling(7, min_periods=1).mean())

    state_df['ma7'].plot.line(ax=ax, label=state, legend=True, grid=True)

# Same 7-day rolling mean for the national per-100K series
us_ma7 = us_100k_df['new_case'].rolling(7, min_periods=1).mean()
us_ma7.plot.line(ax=ax, label='USA', legend=True, grid=True)
ax.set_title('Daily cases moving average')
plt.show()

### TS Analysis
- Stationary
- AR: Autoregression
- MA: Moving average
- ARMA: Autoregressive moving average
- ARIMA: Autoregressive integrated moving average


https://www.statsmodels.org/stable/examples/index.html#time-series-analysis

## Stationary?

In [None]:
import statsmodels.graphics.tsaplots as sgt
import statsmodels.tsa.stattools as sts
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
# Function for color display
from IPython.display import Markdown, display

def printmd(string):
    """Render `string` as Markdown in the cell output instead of plain text."""
    rendered = Markdown(string)
    display(rendered)
    
def is_stationary(adf, name) :
    """Report whether an augmented Dickey-Fuller result indicates stationarity.

    Parameters
    ----------
    adf : sequence
        Result tuple as indexed here: adf[0] is the test statistic, adf[1] the
        p-value and adf[4] a dict of critical values with '1%' and '5%' keys.
    name : str
        Label for the series, inserted into the printed message.

    Returns
    -------
    None
        The verdict is printed, or rendered through `printmd` with a red
        "NOT" when the series is not found to be stationary.
    """
    statistic, p_value, critical_values = adf[0], adf[1], adf[4]

    # 0.05 is the conventional significance level; the previous threshold of
    # 0.5 was a typo that let clearly insignificant results through.
    if p_value < 0.05:
        # Strictest level first, so the strongest claim that holds is reported
        for level in ('1%', '5%'):
            if statistic < critical_values[level]:
                print('The {} time series is stationary within the {} margin'.format(name, level))
                return

    printmd("The {} time series is <span style='color:{}'>**NOT**</span> stationary".format(name,'red'))
    return

In [None]:
# Augmented Dickey-Fuller test on Oregon's daily new cases
adf = sts.adfuller(or_df.new_case)
# Fourth field relabelled from the misspelled 'nubs' to 'nobs'
print('adf: {}\npvalue: {}\nusedlag: {}\nnobs: {}'.format(adf[0],adf[1],adf[2],adf[3]))
print('critical values: {}\nicbest: {}'.format(adf[4],adf[5]))
# Label the verdict with the series actually tested (it was labelled 'total')
is_stationary(adf, 'Oregon new cases')

### ARIMA
Predicting the next time step.

In [None]:
from statsmodels.tsa.ar_model import AutoReg, ar_select_order
from statsmodels.tsa.api import acf, pacf, graphics

In [None]:
!pip install pmdarima 2>&1 >pmdarima.out

from statsmodels.tsa.arima_model import ARIMA
from pmdarima.arima import auto_arima

In [None]:
# Let pmdarima search for an ARIMA model of Oregon's daily new cases
oregon_new_cases = or_df['new_case']
model_auto = auto_arima(oregon_new_cases)

In [None]:
# Show the summary of the selected model as the cell output
arima_summary = model_auto.summary()
arima_summary