# covid stuff

----------------------------------------

- **created** by z: `2020-03-30`
- last **updated**: `2020-04-01T12:51:42PDT`

## _preamble_

#### import packages

In [31]:
import pathlib
import requests
import re
import math
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

#### disable request warning

In [32]:
# Silence urllib3's InsecureRequestWarning (raised for HTTPS requests made
# without certificate verification) for the rest of the session.
# NOTE(review): the only request visible in this notebook does not pass
# verify=False, so this suppression may be unnecessary -- confirm before removing.
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

#### directories

In [33]:
# cwd = pathlib.Path.cwd()
# cwd
# docs_dir = pathlib.Path("/private/var/mobile/Library/Mobile Documents/iCloud~AsheKube~Carnets/Documents")
# data_dir = docs_dir / "data"

# where data will be saved/loaded
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
data_dir = pathlib.Path("/Users/zarek/Dropbox/code/github/zcovid/data")
if not data_dir.is_dir():
    # parents=True also creates any missing intermediate directories (a plain
    # mkdir() raises FileNotFoundError in that case); exist_ok=True keeps the
    # call safe if the directory appears between the check and the creation.
    data_dir.mkdir(parents=True, exist_ok=True)
    print(">>> created dir {}".format(data_dir))

#### URLs

In [34]:
# base URL for data downloads: the csse_covid_19_data folder on the master
# branch of the CSSEGISandData/COVID-19 GitHub repository (raw file access);
# the per-series CSV path is appended to it when downloading
base_tsdata_url = "https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data"

#### regexes

In [35]:
# matches labels of the form digits/digits/digits (e.g. "1/22/20"); used to
# tell the per-date columns of the CSVs apart from the descriptive columns
date_re = re.compile(r"\d+/\d+/\d+")

#### time-series keys

In [36]:
# Time-series keys, each of the form "<datum>_<domain>". They select the CSV
# file to download and serve as the top-level keys of the data container.
tskeys = [
    "confirmed_US",
    "confirmed_global",
    "deaths_US",
    "deaths_global",
    "recovered_global",
]

#### utility functions

In [37]:
# sinum(): format number SI

# Prefix table keyed by order of magnitude, i.e. the exponent applied to the
# base (1000, or 1024 in binary mode). sinum() clamps its exponent to the
# range covered here (-6 .. 6).
SINUM_PREFIXES = {
    -6: {'short': "a", 'long': "atto"},
    -5: {'short': "f", 'long': "femto"},
    -4: {'short': "p", 'long': "pico"},
    -3: {'short': "n", 'long': "nano"},
    -2: {'short': "μ", 'long': "micro"},
    -1: {'short': "m", 'long': "milli"},
    0: {'short': " ", 'long': "-"},
    1: {'short': "k", 'long': "kilo"},
    2: {'short': "M", 'long': "mega"},
    3: {'short': "G", 'long': "giga"},
    4: {'short': "T", 'long': "tera"},
    5: {'short': "P", 'long': "peta"},
    6: {'short': "E", 'long': "exa"}
}

def sinum(num, unit='B', fmt="{coef:.3f} {pfx}{unit}", long_pfx=False, strip_zeros=True, binary=False, verbose=False):
    """Format a number as a coefficient followed by an SI prefix and a unit.

    Parameters
    ----------
    num : int or float
        Value to format.
    unit : str
        Unit label appended after the prefix. In binary mode it is prefixed
        with "i" (e.g. "B" -> "iB", giving "kiB", "MiB", ...).
    fmt : str
        ``str.format`` template; receives the fields ``coef``, ``pfx`` and
        ``unit``.
    long_pfx : bool
        Use the long prefix name ("kilo") instead of the short one ("k").
    strip_zeros : bool
        Accepted for backward compatibility; currently has no effect.
        TODO: implement trailing-zero stripping or drop the parameter.
    binary : bool
        Scale by powers of 1024 instead of powers of 1000.
    verbose : bool
        Print the call arguments and each intermediate value.

    Returns
    -------
    str
        ``fmt`` filled in with the scaled coefficient, the prefix and the unit.

    Raises
    ------
    AssertionError
        If ``num`` is not an int/float or ``unit`` is not a str.
    """

    # check inputs
    assert isinstance(num, (int, float))
    assert isinstance(unit, str)

    # verbosity...
    if verbose:
        print(f">>> sinum(num={num!r}, unit={unit!r}, fmt={fmt!r}, long_pfx={long_pfx}, strip_zeros={strip_zeros}, binary={binary}, verbose={verbose})")

    def _verb(name, value):
        # print one aligned "name = value" line, only in verbose mode
        if verbose:
            print("\t{:<12s} = {:>20s}".format(
                name,
                value if isinstance(value, str) else repr(value)
            ))

    # binary mods
    if binary:
        log_base = 1024
        unit = f"i{unit!s}"
    else:
        log_base = 1000
        unit = str(unit)

    # order of magnitude
    magnitude = abs(num)
    if magnitude == 0 or (isinstance(num, float) and not math.isfinite(num)):
        # zero has no logarithm; inf/nan cannot be floored -- leave them unscaled
        oom = 0
    else:
        oom = math.floor(math.log(magnitude, log_base))
        # math.log(x, base) is a quotient of two rounded logarithms, so at (or
        # right next to) an exact power of the base it can land just on the
        # wrong side of an integer; nudge the exponent back so that
        # log_base**oom <= magnitude < log_base**(oom + 1)
        if magnitude >= log_base ** (oom + 1):
            oom += 1
        elif magnitude < log_base ** oom:
            oom -= 1
    # clamp to the prefixes available in SINUM_PREFIXES
    oom = max(-6, min(6, oom))
    _verb('oom', oom)

    # coefficient
    coef = num / (log_base ** oom)
    _verb('coef', coef)

    # SI prefix
    pfx = SINUM_PREFIXES[oom]['long' if long_pfx else 'short']
    _verb('pfx', pfx)

    # string out
    out = fmt.format(coef=coef, pfx=pfx, unit=unit)
    _verb('out', out)

    return out


# tests (disabled: flip the guard to True to print the sample outputs)
if False:
    _big = 824578912342345.2345
    _small = 0.00082457891234
    _alt_fmt = "{coef:12.5f} // {pfx} // {unit}"
    _samples = [
        (82457891234, dict(verbose=True)),
        (82457891234, dict(verbose=True, unit='')),
        (82457891234, dict(verbose=True, unit='bloops')),
        (82457891234, dict(binary=True, verbose=True)),
        (_small, dict(verbose=True)),
        (_big, dict(verbose=True)),
        (_small, dict(binary=True, verbose=True)),
        (_big, dict(binary=True, verbose=True)),
        (_big, dict(binary=True, verbose=True, fmt=_alt_fmt)),
        (_big, dict(binary=True, fmt=_alt_fmt)),
        (1, dict(verbose=True)),
        (-1, dict(verbose=True)),
        (0, dict(verbose=True)),
        (-82457891234, dict(verbose=True)),
        (8786996786798967896872457891234, dict(verbose=True)),
    ]
    for _value, _options in _samples:
        print(sinum(_value, **_options), end='\n\n')

## load data

#### initialize data container `d`

In [40]:
# data container: one (initially empty) dict per time-series key; the cells
# below store the same set of entries under every key
d = {tsk: {} for tsk in tskeys}
d

{'confirmed_US': {},
 'confirmed_global': {},
 'deaths_US': {},
 'deaths_global': {},
 'recovered_global': {}}

#### download data from github, save to file

In [41]:
# same load and save process for each tskey
print(f">>> loading CSVs, saving in ``{data_dir}''\n")
for tsk in tskeys:
    print(">>> getting data for '{}'".format(tsk))
    entry = d[tsk]
    entry['url'] = f"{base_tsdata_url}/csse_covid_19_time_series/time_series_covid19_{tsk}.csv"
    # The repository is public, so no credentials are sent. The timeout keeps
    # a stalled connection from hanging the notebook indefinitely.
    entry['req'] = requests.get(entry['url'], timeout=30)
    # Fail loudly on an HTTP error instead of overwriting a good CSV with an
    # error page.
    entry['req'].raise_for_status()
    entry['csv'] = data_dir / f"time_series_covid19_{tsk}.csv"
    if entry['csv'].is_file() and entry['csv'].stat().st_size > 0:
        print(f"--> CSV for '{tsk}' will be overwritten")
    print("--> writing ``.../{}'' ... ".format(entry['csv'].name), end='')
    # Write the payload as bytes: no decode/encode round trip through the
    # locale encoding, and the file is closed (fully flushed) before its size
    # is read back below.
    entry['csv'].write_bytes(entry['req'].content)
    print("wrote {}\n".format(sinum(entry['csv'].stat().st_size)))

>>> loading CSVs, saving in ``/Users/zarek/Dropbox/code/github/zcovid/data''

>>> getting data for 'confirmed_US'
--> CSV for 'confirmed_US' will be overwritten
--> writing ``.../time_series_covid19_confirmed_US.csv'' ... wrote 766.891 kB

>>> getting data for 'confirmed_global'
--> CSV for 'confirmed_global' will be overwritten
--> writing ``.../time_series_covid19_confirmed_global.csv'' ... wrote 53.769 kB

>>> getting data for 'deaths_US'
--> CSV for 'deaths_US' will be overwritten
--> writing ``.../time_series_covid19_deaths_US.csv'' ... wrote 779.130 kB

>>> getting data for 'deaths_global'
--> CSV for 'deaths_global' will be overwritten
--> writing ``.../time_series_covid19_deaths_global.csv'' ... wrote 44.961 kB

>>> getting data for 'recovered_global'
--> CSV for 'recovered_global' will be overwritten
--> writing ``.../time_series_covid19_recovered_global.csv'' ... wrote 45.985 kB



#### load data from CSV just saved

In [42]:
# iterate tskeys, loading from CSV saved above
for tsk in tskeys:
    d[tsk]['df'] = pd.read_csv(d[tsk]['csv'])
    print(f"--> read CSV data for {tsk}")
# create backup of d
# dict.copy() alone is shallow: the backup would share the inner dicts and the
# DataFrames with d, so later in-place edits of d[tsk]['df'] would change the
# backup as well. Copy each inner dict and each DataFrame instead; the other
# values (URL, response, path) are not modified later and can be shared.
d_BAK = {
    key: {
        name: (value.copy() if isinstance(value, pd.DataFrame) else value)
        for name, value in entry.items()
    }
    for key, entry in d.items()
}

--> read CSV data for confirmed_US
--> read CSV data for confirmed_global
--> read CSV data for deaths_US
--> read CSV data for deaths_global
--> read CSV data for recovered_global


## clean up data

#### add index columns to `d[tsk]['df']` (the actual dataframe for each key), then reorder the columns as desired

In [43]:
# raw CSV header -> normalised name (every name is lower-cased afterwards)
col_subs = {
    'Province_State': 'subregion',
    'Province/State': 'subregion',
    'Country_Region': 'region',
    'Country/Region': 'region',
    'Long_': 'long'
}

# index columns that go first, in this order, when a frame has them
priority_cols = [
    'locid',
    'region',
    'subregion',
    'combined_key'
]

# union of the columns seen across all frames (the frames don't share one
# schema), in first-seen order
all_indx_cols = []
all_date_cols = []

# same clean-up for every time series
for tsk in tskeys:

    print(f">>> cleaning up dataframe for '{tsk}'")

    entry = d[tsk]
    frame = entry['df']

    # extra index columns, added in place; the key reads "<datum>_<domain>"
    key_parts = tsk.split('_')
    frame['tskey'] = tsk
    frame['domain'] = key_parts[1]
    frame['datum'] = key_parts[0]
    frame['locid'] = frame.index

    # normalised column names, applied to the frame in place
    entry['all_cols'] = [col_subs.get(name, name).lower() for name in frame.columns]
    frame.columns = entry['all_cols']

    # split into date columns and everything else (the index columns)
    entry['date_cols'] = [name for name in entry['all_cols'] if date_re.match(name)]
    entry['indx_cols'] = [name for name in entry['all_cols'] if name not in entry['date_cols']]

    # positions of the index columns: priority columns first, the rest in
    # their existing order
    unordered = entry['indx_cols']
    front = [unordered.index(name) for name in priority_cols if name in unordered]
    col_idxs = front + [pos for pos in range(len(unordered)) if pos not in front]
    print(col_idxs)
    entry['indx_cols'] = [unordered[pos] for pos in col_idxs]

    # extend the cross-frame unions, keeping first-seen order
    for name in entry['indx_cols']:
        if name not in all_indx_cols:
            all_indx_cols.append(name)
    for name in entry['date_cols']:
        if name not in all_date_cols:
            all_date_cols.append(name)

    # store the frame with index columns first, then date columns
    entry['all_cols'] = entry['indx_cols'] + entry['date_cols']
    entry['df'] = frame[entry['all_cols']]

print(all_indx_cols)
print(all_date_cols)

# display the frame of the last key processed
d[tsk]['df']


>>> cleaning up dataframe for 'confirmed_US'
[14, 7, 6, 10, 0, 1, 2, 3, 4, 5, 8, 9, 11, 12, 13]
>>> cleaning up dataframe for 'confirmed_global'
[7, 1, 0, 2, 3, 4, 5, 6]
>>> cleaning up dataframe for 'deaths_US'
[15, 7, 6, 10, 0, 1, 2, 3, 4, 5, 8, 9, 11, 12, 13, 14]
>>> cleaning up dataframe for 'deaths_global'
[7, 1, 0, 2, 3, 4, 5, 6]
>>> cleaning up dataframe for 'recovered_global'
[7, 1, 0, 2, 3, 4, 5, 6]
['locid', 'region', 'subregion', 'combined_key', 'uid', 'iso2', 'iso3', 'code3', 'fips', 'admin2', 'lat', 'long', 'tskey', 'domain', 'datum', 'population']
['1/22/2020', '1/23/2020', '1/24/2020', '1/25/2020', '1/26/2020', '1/27/2020', '1/28/2020', '1/29/2020', '1/30/2020', '1/31/2020', '2/1/2020', '2/2/2020', '2/3/2020', '2/4/2020', '2/5/2020', '2/6/2020', '2/7/2020', '2/8/2020', '2/9/2020', '2/10/2020', '2/11/2020', '2/12/2020', '2/13/2020', '2/14/2020', '2/15/2020', '2/16/2020', '2/17/2020', '2/18/2020', '2/19/2020', '2/20/2020', '2/21/2020', '2/22/2020', '2/23/2020', '2/24/2020'

Unnamed: 0,locid,region,subregion,lat,long,tskey,domain,datum,1/22/20,1/23/20,...,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20,3/29/20,3/30/20,3/31/20
0,0,Afghanistan,,33.000000,65.000000,recovered_global,global,recovered,0,0,...,1,1,1,2,2,2,2,2,2,5
1,1,Albania,,41.153300,20.168300,recovered_global,global,recovered,0,0,...,2,2,10,17,17,31,31,33,44,52
2,2,Algeria,,28.033900,1.659600,recovered_global,global,recovered,0,0,...,65,65,24,65,29,29,31,31,37,46
3,3,Andorra,,42.506300,1.521800,recovered_global,global,recovered,0,0,...,1,1,1,1,1,1,1,1,10,10
4,4,Angola,,-11.202700,17.873900,recovered_global,global,recovered,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237,237,United Kingdom,Turks and Caicos Islands,21.694000,-71.797900,recovered_global,global,recovered,0,0,...,0,0,0,0,0,0,0,0,0,0
238,238,MS Zaandam,,0.000000,0.000000,recovered_global,global,recovered,0,0,...,0,0,0,0,0,0,0,0,0,0
239,239,Botswana,,-22.328500,24.684900,recovered_global,global,recovered,0,0,...,0,0,0,0,0,0,0,0,0,0
240,240,Burundi,,-3.373100,29.918900,recovered_global,global,recovered,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
for tsk in tskeys:
    # Snapshot the flat-column frame under 'df_BAK'; the next cell rebuilds
    # d[tsk]['df'] from that snapshot with MultiIndex columns.
    #
    # pd.MultiIndex is a subclass of pd.Index, so isinstance(..., pd.Index) is
    # true for every frame and cannot tell the two layouts apart. Test for
    # MultiIndex explicitly, so that re-running this cell after the
    # transposition does not overwrite the flat snapshot.
    columns = d[tsk]['df'].columns
    if ('df_BAK' not in d[tsk]) or not isinstance(columns, pd.MultiIndex):
        d[tsk]['df_BAK'] = d[tsk]['df'].copy()

In [45]:
for tsk in tskeys:

    entry = d[tsk]
    flat = entry['df_BAK']

    # the index columns become the levels of a column MultiIndex
    mindx_df = flat[entry['indx_cols']]
    mindx = pd.MultiIndex.from_frame(mindx_df)

    # dates run down the rows, one column per original row of the flat frame
    by_date = flat[entry['date_cols']].transpose()
    by_date.columns = mindx
    # row labels: date strings -> datetimes
    by_date.index = pd.to_datetime(by_date.index)

    entry['df'] = by_date

# display the reshaped frame of the first key
d[tskeys[0]]['df']

locid,0,1,2,3,4,5,6,7,8,9,...,3243,3244,3245,3246,3247,3248,3249,3250,3251,3252
region,US,US,US,US,US,US,US,US,US,US,...,US,US,US,US,US,US,US,US,US,US
subregion,American Samoa,Guam,Northern Mariana Islands,Puerto Rico,Virgin Islands,Alabama,Alabama,Alabama,Alabama,Alabama,...,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming,Grand Princess
combined_key,"American Samoa, US","Guam, US","Northern Mariana Islands, US","Puerto Rico, US","Virgin Islands, US","Autauga, Alabama, US","Baldwin, Alabama, US","Barbour, Alabama, US","Bibb, Alabama, US","Blount, Alabama, US",...,"Unassigned, Tennessee, US","Unassigned, Texas, US","Unassigned, Utah, US","Unassigned, Vermont, US","Unassigned, Virginia, US","Unassigned, Washington, US","Unassigned, West Virginia, US","Unassigned, Wisconsin, US","Unassigned, Wyoming, US","Grand Princess, US"
uid,16,316,580,630,850,84001001,84001003,84001005,84001007,84001009,...,84090047,84090048,84090049,84090050,84090051,84090053,84090054,84090055,84090056,84099999
iso2,AS,GU,MP,PR,VI,US,US,US,US,US,...,US,US,US,US,US,US,US,US,US,US
iso3,ASM,GUM,MNP,PRI,VIR,USA,USA,USA,USA,USA,...,USA,USA,USA,USA,USA,USA,USA,USA,USA,USA
code3,16,316,580,630,850,840,840,840,840,840,...,840,840,840,840,840,840,840,840,840,840
fips,60.0,66.0,69.0,72.0,78.0,1001.0,1003.0,1005.0,1007.0,1009.0,...,90047.0,90048.0,90049.0,90050.0,90051.0,90053.0,90054.0,90055.0,90056.0,99999.0
admin2,NaN,NaN,NaN,NaN,NaN,Autauga,Baldwin,Barbour,Bibb,Blount,...,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned,Unassigned,NaN
lat,-14.271000,13.444300,15.097900,18.220800,18.335800,32.539527,30.727750,31.868263,32.996421,33.982109,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
long,-170.132000,144.793700,145.673900,-66.590100,-64.896300,-86.644082,-87.722071,-85.387129,-87.125115,-86.567906,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
tskey,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US,...,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US,confirmed_US
domain,US,US,US,US,US,US,US,US,US,US,...,US,US,US,US,US,US,US,US,US,US
datum,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,...,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed
2020-01-22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-26,0,45,0,64,17,6,5,0,0,2,...,112,0,0,8,0,69,0,0,0,28
2020-03-27,0,51,0,79,19,6,5,0,0,4,...,172,0,0,6,0,67,0,61,0,28
2020-03-28,0,55,0,100,22,6,10,0,0,5,...,161,0,0,7,0,0,0,0,0,103
2020-03-29,0,56,0,127,0,6,15,0,0,5,...,190,0,0,7,0,125,0,0,0,103


In [46]:
# display the time-series keys for reference
tskeys

['confirmed_US',
 'confirmed_global',
 'deaths_US',
 'deaths_global',
 'recovered_global']

In [47]:
# Join the US and global confirmed-case frames on their shared date index.
# NOTE(review): the two frames' column MultiIndexes have different numbers of
# levels, and the displayed result has flattened tuple column labels instead
# of a MultiIndex -- confirm this is intended, or align the levels first.
d['confirmed_US']['df'].join(d['confirmed_global']['df'])



Unnamed: 0,"(0, US, American Samoa, American Samoa, US, 16, AS, ASM, 16, 60.0, nan, -14.270999999999999, -170.132, confirmed_US, US, confirmed)","(1, US, Guam, Guam, US, 316, GU, GUM, 316, 66.0, nan, 13.4443, 144.7937, confirmed_US, US, confirmed)","(2, US, Northern Mariana Islands, Northern Mariana Islands, US, 580, MP, MNP, 580, 69.0, nan, 15.0979, 145.6739, confirmed_US, US, confirmed)","(3, US, Puerto Rico, Puerto Rico, US, 630, PR, PRI, 630, 72.0, nan, 18.2208, -66.5901, confirmed_US, US, confirmed)","(4, US, Virgin Islands, Virgin Islands, US, 850, VI, VIR, 850, 78.0, nan, 18.3358, -64.8963, confirmed_US, US, confirmed)","(5, US, Alabama, Autauga, Alabama, US, 84001001, US, USA, 840, 1001.0, Autauga, 32.53952745, -86.64408227, confirmed_US, US, confirmed)","(6, US, Alabama, Baldwin, Alabama, US, 84001003, US, USA, 840, 1003.0, Baldwin, 30.72774991, -87.72207058, confirmed_US, US, confirmed)","(7, US, Alabama, Barbour, Alabama, US, 84001005, US, USA, 840, 1005.0, Barbour, 31.868263, -85.3871286, confirmed_US, US, confirmed)","(8, US, Alabama, Bibb, Alabama, US, 84001007, US, USA, 840, 1007.0, Bibb, 32.99642064, -87.12511459999999, confirmed_US, US, confirmed)","(9, US, Alabama, Blount, Alabama, US, 84001009, US, USA, 840, 1009.0, Blount, 33.98210918, -86.56790593, confirmed_US, US, confirmed)",...,"(246, Canada, Yukon, 64.2823, -135.0, confirmed_global, global, confirmed)","(247, Kosovo, nan, 42.602636, 20.902977, confirmed_global, global, confirmed)","(248, Burma, nan, 21.9162, 95.956, confirmed_global, global, confirmed)","(249, United Kingdom, Anguilla, 18.2206, -63.0686, confirmed_global, global, confirmed)","(250, United Kingdom, British Virgin Islands, 18.4207, -64.64, confirmed_global, global, confirmed)","(251, United Kingdom, Turks and Caicos Islands, 21.69400000000001, -71.7979, confirmed_global, global, confirmed)","(252, MS Zaandam, nan, 0.0, 0.0, confirmed_global, global, confirmed)","(253, Botswana, nan, -22.3285, 24.6849, confirmed_global, 
global, confirmed)","(254, Burundi, nan, -3.3731, 29.9189, confirmed_global, global, confirmed)","(255, Sierra Leone, nan, 8.460555000000001, -11.779889, confirmed_global, global, confirmed)"
2020-01-22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-24,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2020-01-26,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-03-26,0,45,0,64,17,6,5,0,0,2,...,3,71,0,0,0,0,0,0,0,0
2020-03-27,0,51,0,79,19,6,5,0,0,4,...,3,86,8,0,0,0,0,0,0,0
2020-03-28,0,55,0,100,22,6,10,0,0,5,...,4,91,8,2,2,4,2,0,0,0
2020-03-29,0,56,0,127,0,6,15,0,0,5,...,4,94,10,2,2,4,2,0,0,0


In [48]:
# ##### fignum = 0
# fig = plt.figure(fignum)
# plot_data = deaths_global.iloc[:, [0, 1, 2]]
# # plot_data.columns.names
# plot_regions = plot_data.columns[[0, 1, 2]].get_level_values('region')
# plt.plot(plot_data)
# plt.legend(plot_regions)