<IMG SRC="https://github.com/jacquesroy/byte-size-data-science/raw/master/images/Banner.png" ALT="BSDS Banner" WIDTH=1195 HEIGHT=200>

<table align="left">
    <tr><td>
<a rel="license" href="http://creativecommons.org/licenses/by/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by/4.0/88x31.png" /></a></td><td>This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>.</td>
    </tr>
    <tr><td>Jacques Roy, Byte Size Data Science</td><td> </td></tr>
    </table>

# Time Series with Covid Data

In [None]:
# Embed the YouTube video that accompanies this notebook
from IPython.display import IFrame

VIDEO_URL = "https://www.youtube.com/embed/awC9-O_evAo?rel=0&amp;controls=0&amp;showinfo=0"
# The IFrame is the cell's last expression, so the notebook renders it
IFrame(src=VIDEO_URL, width=560, height=315)

In [None]:
# Libraries needed in the notebook
import requests, json
import pandas as pd
import numpy as np
import datetime as dt
import dateutil.parser
from io import StringIO
import math

# pd.set_option('display.max_colwidth', -1)

import matplotlib.pyplot as plt
# matplotlib.patches lets us create colored patches, which we can use for legends in plots
import matplotlib.patches as mpatches
%matplotlib inline

## Getting the data
Let's look at data from the CDC.

In [None]:
# Library used to read datasets
# https://github.com/xmunoz/sodapy
# Install sodapy from inside the notebook. pip's standard output is written to
# pipsodapy.txt; because `2>&1` appears before the file redirection, standard
# error is not sent to that file and still shows up in the cell output.
!pip install sodapy 2>&1 >pipsodapy.txt

# Socrata is the client class used in the following cells to query the portal
from sodapy import Socrata

In [None]:
# No application token (None), username, or password is supplied, so this
# unauthenticated client can only read public data sets.
CDC_DOMAIN = "data.cdc.gov"
client = Socrata(CDC_DOMAIN, None)

### Get the available datasets
You can also browse them at: https://data.cdc.gov/browse?q=covid&sortBy=relevance

In [None]:
# Fetch the catalog of datasets exposed by the portal (returned as an array)
results = client.datasets()
dataset_count = len(results)
print("Number of datasets: {}".format(dataset_count))

In [None]:
# Keep the 'resource' entry of every dataset whose name mentions COVID
xxx = []
for entry in results:
    resource = entry['resource']
    if "COVID" in resource['name']:
        xxx.append(resource)
print("Number of COVID related datasets: {}".format(len(xxx)))
# Display only the dataset names
[resource['name'] for resource in xxx]

In [None]:
# Get a specific dataset to move forward with
DATASET_NAME = 'United States COVID-19 Cases and Deaths by State over Time'
matching_datasets = [res for res in xxx if res['name'] == DATASET_NAME]
if not matching_datasets:
    # Same exception type a bare `[...][0]` would raise, but with a message
    # that says what is missing instead of "list index out of range".
    raise IndexError("Dataset not found in the COVID catalog: {!r}".format(DATASET_NAME))
# If several entries share the name, the first one is used
info = matching_datasets[0]
info

### Get some metadata on the columns

In [None]:
# One line per column: name (padded to 15 characters) and its declared datatype
column_types = info['columns_datatype']
for position, column_name in enumerate(info['columns_name']):
    print("{:15}|{}".format(column_name, column_types[position]))

In [None]:
# Get the column metadata for the dataset selected in `info`
meta = client.get_metadata(info['id'])
row_format = "{:15}|{:14}|{:8}|{}"
print(row_format.format("Column name","type","non-null", "description"))
print("===============|==============|========")
for c in meta['columns'] :
    # 'cachedContents' and 'description' are read defensively: if a column
    # lacks either key we print an empty field instead of raising KeyError.
    # NOTE(review): assumes these keys can be absent for some columns - confirm.
    non_null = c.get('cachedContents', {}).get('non_null', '')
    print(row_format.format(c['name'], c['dataTypeName'], non_null, c.get('description', '')))

### Get all the records
This API returns the data as character strings and limits queries to 10,000 records. We need to loop.

In [None]:
PAGE_SIZE = 10000  # the API limits each query to 10,000 records


def fetch_all_records(client, dataset_id, page_size=PAGE_SIZE):
    """Download every record of a dataset, one page at a time.

    Parameters
    ----------
    client : object exposing ``get(dataset_id, offset=..., limit=...)``
    dataset_id : identifier of the dataset to download
    page_size : number of records requested per call

    Returns
    -------
    pandas.DataFrame
        All records with a fresh 0..n-1 index; an empty frame when the
        dataset has no records.
    """
    pages = []
    offset = 0
    while True:
        page = client.get(dataset_id, offset=offset, limit=page_size)
        if len(page) == 0:
            break  # an empty page means we are past the last record
        pages.append(pd.DataFrame(page))
        offset += page_size
    if not pages:
        return pd.DataFrame()
    # Concatenate once at the end: DataFrame.append was removed in pandas 2.0,
    # and growing a frame inside the loop copies all previous rows every time.
    return pd.concat(pages, ignore_index=True)


data_df = fetch_all_records(client, info['id'])

print("Number of records: " + str(data_df.shape[0]))
# data_df.head()

### We need to convert to the proper data types

In [None]:
# Every value arrives as a string; list the columns by the type they should have
date_columns = ['submission_date', 'created_at']
float_columns = [
    'tot_cases', 'conf_cases', 'prob_cases', 'new_case', 'pnew_case',
    'tot_death', 'new_death', 'pnew_death', 'conf_death', 'prob_death',
]
need_types = {name: 'datetime64[ns]' for name in date_columns}
need_types.update({name: float for name in float_columns})

# Convert, then order the rows chronologically and by state within each date
typed_df = data_df.astype(need_types)
data2_df = typed_df.sort_values(['submission_date', 'state'])

data2_df.head()

In [None]:
# Last five rows of the typed, sorted frame
data2_df.tail(n=5)

### Limit the columns we use
We use only: 'submission_date', 'state', 'tot_cases', 'new_case', 'tot_death', 'new_death'

In [None]:
# Columns kept for the rest of the analysis
kept_columns = ['submission_date', 'state', 'tot_cases', 'new_case', 'tot_death', 'new_death']
data3_df = data2_df[kept_columns]

### Let's see a set of rows for a specific state

In [None]:
# Last five rows reported for Oregon
is_oregon = data3_df['state'] == 'OR'
data3_df.loc[is_oregon].tail(5)

## Plot some curves: New cases
The X axis is the index so we set it to `submission_date`

In [None]:
# Oregon rows, indexed by date so the line plot puts submission_date on the X axis
or_df = data3_df.loc[data3_df['state'] == 'OR']
or_df.index = or_df['submission_date']

fig, ax = plt.subplots(figsize=(18,6))
or_df['new_case'].plot.line(ax=ax, legend=True, grid=True)
ax.set_title('Daily cases, Oregon')
plt.show()

In [None]:
# New York rows, indexed by date so the line plot puts submission_date on the X axis
ny_df = data3_df.loc[data3_df['state'] == 'NY']
ny_df.index = ny_df['submission_date']

fig, ax = plt.subplots(figsize=(18,6))
ny_df['new_case'].plot.line(ax=ax, legend=True, grid=True)
ax.set_title('Daily cases, New York')
plt.show()

In [None]:
# Select the rows labelled '2020-04' on the submission_date index and show the first 10
ny_df.loc['2020-04'].iloc[:10]

### Plot New York and Oregon on the same graph

In [None]:
# Draw both states on one set of axes so the curves can be compared directly
fig, ax = plt.subplots(figsize=(18,6))
for state_label, state_frame in (('Oregon', or_df), ('New York', ny_df)):
    state_frame['new_case'].plot.line(ax=ax, label=state_label, legend=True, grid=True)
ax.set_title('Daily cases, Oregon and New York')
plt.show()

### Population:
- Oregon state  :  4,217,737
- New York state: 19,453,561

New York has over 4.6 times Oregon's population.

### States population
We use this so we can compare states.

Amazingly difficult to find. I used a table found at: https://data.ers.usda.gov/reports.aspx?ID=17827

Using the 2019 column. The important part is that we have a fixed data point of population by state.

In [None]:
# Population table embedded as CSV text: state code, full name, total residents.
# It includes a 'US' row holding the national total.
states_pop = """
state,description,total
US,United States,328239523
AL,Alabama,4903185
AK,Alaska,731545
AZ,Arizona,7278717
AR,Arkansas,3017804
CA,California,39512223
CO,Colorado,5758736
CT,Connecticut,3565287
DE,Delaware,973764
DC,District of Columbia,705749
FL,Florida,21477737
GA,Georgia,10617423
HI,Hawaii,1415872
ID,Idaho,1787065
IL,Illinois,12671821
IN,Indiana,6732219
IA,Iowa,3155070
KS,Kansas,2913314
KY,Kentucky,4467673
LA,Louisiana,4648794
ME,Maine,1344212
MD,Maryland,6045680
MA,Massachusetts,6892503
MI,Michigan,9986857
MN,Minnesota,5639632
MS,Mississippi,2976149
MO,Missouri,6137428
MT,Montana,1068778
NE,Nebraska,1934408
NV,Nevada,3080156
NH,New Hampshire,1359711
NJ,New Jersey,8882190
NM,New Mexico,2096829
NY,New York,19453561
NC,North Carolina,10488084
ND,North Dakota,762062
OH,Ohio,11689100
OK,Oklahoma,3956971
OR,Oregon,4217737
PA,Pennsylvania,12801989
RI,Rhode Island,1059361
SC,South Carolina,5148714
SD,South Dakota,884659
TN,Tennessee,6829174
TX,Texas,28995881
UT,Utah,3205958
VT,Vermont,623989
VA,Virginia,8535519
WA,Washington,7614893
WV,West Virginia,1792147
WI,Wisconsin,5822434
WY,Wyoming,578759
PR,Puerto Rico,3193694
"""

# Wrap the text in an in-memory file object and parse it as CSV
pop_buffer = StringIO(states_pop)
pop_df = pd.read_csv(pop_buffer)
pop_buffer.close()
pop_df.head()

### Compare Oregon and New York
This time proportionally

In [None]:
# Divisors: each state's population expressed in units of 100K residents
or_div = pop_df.loc[pop_df.state == 'OR', 'total'].values[0] / 100000
ny_div = pop_df.loc[pop_df.state == 'NY', 'total'].values[0] / 100000

# Always rescale from the raw counts in data3_df rather than from the current
# or_df/ny_df values, so running this cell more than once does not divide the
# series again. or_df and ny_df hold the same rows, in the same order, as these
# filters, so the raw values are matched by position.
or_raw_new_case = data3_df.loc[data3_df['state'] == 'OR', 'new_case'].to_numpy()
ny_raw_new_case = data3_df.loc[data3_df['state'] == 'NY', 'new_case'].to_numpy()

# .assign returns a new frame (keeping the submission_date index) instead of
# writing into a filtered slice, which avoids SettingWithCopyWarning.
or_df = or_df.assign(new_case=or_raw_new_case / or_div)
ny_df = ny_df.assign(new_case=ny_raw_new_case / ny_div)

In [None]:
# Same comparison as the raw-count plot, but new_case now holds cases per 100K
# residents, so the title and Y axis say so to keep the two figures distinguishable.
fig, ax = plt.subplots(figsize=(18,6))
or_df['new_case'].plot.line(ax=ax, label='Oregon', legend=True, grid=True)
ny_df['new_case'].plot.line(ax=ax, label='New York', legend=True, grid=True)
ax.set_title('Daily cases per 100K residents, Oregon and New York')
ax.set_ylabel('New cases per 100K residents')
plt.show()

### Create a timeseries for the entire US

In [None]:
# Build the national series: for every submission date, add up all the state rows.
# The numeric columns are selected explicitly so the string 'state' column is
# never aggregated (summing strings concatenates them), and the built-in
# .sum() replaces passing np.sum to .agg().
value_columns = ['tot_cases', 'new_case', 'tot_death', 'new_death']
us_df = data3_df.groupby('submission_date')[value_columns].sum()
us_df.tail()

### Plot multiple states using the number of cases per 100K people
Also plot the US to see which states are above and below the national average

In [None]:
# US population in units of 100K residents: the divisor that turns raw counts
# into cases per 100K residents
us_population = pop_df.loc[pop_df.state == 'US', 'total'].values[0]
us_100k = us_population / 100000
us_100k_df = us_df[['new_case']].div(us_100k)

In [None]:
# States to chart individually
states_list = ['AZ', 'CA', 'FL', 'HI', 'MI', 'ND', 'NY', 'OR', 'SD', 'TX']

# Rows for those states only, restricted to the columns the charts need
in_selection = data3_df['state'].isin(states_list)
states_df = data3_df.loc[in_selection, ['submission_date', 'state', 'new_case']]

In [None]:
# Two charts per row; round up so an odd number of states still gets a slot
nb_rows = math.ceil(len(states_list) / 2)

fig, axes = plt.subplots(nrows=nb_rows, ncols=2)
fig.set_figheight(20)
fig.set_figwidth(15)
for ix, ax in enumerate(axes.flatten()) :
    if ix >= len(states_list) :
        fig.delaxes(ax) # Remove empty graph if any
        continue
    state = states_list[ix]
    # State population in units of 100K residents
    div_val = pop_df.loc[pop_df.state == state, 'total'].values[0] / 100000
    state_cases = states_df.loc[states_df['state'] == state, ['submission_date', 'new_case']]
    # .assign builds a new frame instead of writing into the filtered slice,
    # which avoids pandas' SettingWithCopyWarning.
    state_cases = state_cases.assign(new_case=state_cases['new_case'] / div_val)
    state_cases.plot.line(ax=ax, x='submission_date', y='new_case', label=state,
                          title=state + " New Cases", legend=True)
    # Overlay the national per-100K curve for reference
    us_100k_df.plot.line(ax=ax, y='new_case', label="US", legend=True)
    ax.set_xlabel('')

### Compare selected states with each other

In [None]:
states_list = ['AZ','CA','MI','OR','SD']
# Create the figure once, at its final size. A separate plt.figure() call
# before plt.subplots() would open an additional figure with no axes.
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(18, 6))
for state in states_list :
    # State population in units of 100K residents
    diviser = pop_df.loc[pop_df.state == state, 'total'].values[0] / 100000
    # drop=True: the old index is not needed, so do not keep it as a column
    tmp_df = data3_df.loc[data3_df['state'] == state].reset_index(drop=True)
    tmp_df['new_case_div'] = tmp_df['new_case'].div(diviser)
    tmp_df.plot.line(ax=axes, x='submission_date',y='new_case_div',label=state,legend=True,grid=True)
axes.set_title('Daily Cases for: {}'.format(", ".join(states_list)))
# The plotted values are new cases divided by population/100K
axes.set_ylabel('New cases per 100K residents')
plt.show()