In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [38]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

import folium
from folium.plugins import HeatMap, HeatMapWithTime

from IPython.display import Markdown

In [4]:
# Directory that holds the raw time-series CSV files loaded below.
# The path is relative, so it resolves against the notebook's working directory.
data_dir = Path('../data/CSSEGISandData/COVID-19/csse_covid_19_data/csse_covid_19_time_series')

### Load all files from data directory

Load each raw data file into separate DataFrames. 

Make two sets:

    1. one large dataframe where we can add additional computations
    2. one that maps each type of data to its own DataFrame

In [160]:
# Every CSV is kept twice: tagged with its status in `all_data` (for the
# combined frame) and untouched in `all_mapped`, keyed by status.
all_data = []
all_mapped = {}
# sorted(): glob order is not guaranteed, and it decides the insertion order of
# `all_mapped` (and therefore the order in which the heat maps are rendered).
for data_file in sorted(data_dir.glob('*.csv')):
    # The status label is whatever follows the last '-' in the file stem.
    file_type = data_file.stem.split('-')[-1]
    df = pd.read_csv(data_file)
    all_mapped[file_type] = df.copy(True)
    df.loc[:,'health_status'] = file_type

    all_data.append(df)

# Fail with an actionable message instead of pd.concat's "No objects to concatenate".
if not all_data:
    raise FileNotFoundError(F"No CSV files found in {data_dir}")

df_all = pd.concat(all_data, ignore_index=True, sort=False)
# One row per location and status ...
df_all = df_all.set_index(['Province/State', 'Country/Region', 'Lat', 'Long'])
# ... then move the status into the columns, next to the date.
df_all = df_all.pivot(columns='health_status')
# Transpose so that rows are (date, status) and columns are locations.
df_all = df_all.sort_index(level=[1,0]).T
# The date labels come from the CSV header as strings; convert them to datetimes.
df_all.index = df_all.index.set_levels(pd.to_datetime(df_all.index.levels[0]), level=0)

Compute the *Active* cases as the difference between *Confirmed* and *Recovered*. Note that deaths are not subtracted, so the *Active* count still includes fatalities.

In [164]:
# Reshape so each status becomes a column: rows are (location levels..., date),
# columns are the health_status values.
df_all = df_all.sort_index(level=[0,1]).T.stack(level=0)
# Active = Confirmed - Recovered.
# NOTE(review): Deaths are not subtracted, so fatalities remain counted as
# active -- confirm whether this is intended.
df_all = df_all.assign(Active=df_all.loc[:,'Confirmed'] - df_all.loc[:,'Recovered'])
# Restore the original layout: stack the date level (position 4) back into the
# rows, order the row index as (date, status) and sort it.
df_all = df_all.T.stack(level=4).swaplevel(i=1,j=0).sort_index(level=[0,1])
# Show the last two dates (four status rows each).
df_all.tail(8)

Unnamed: 0_level_0,Province/State,"Adams, IN",Alabama,"Alachua, FL","Alameda County, CA",Alaska,Alberta,Anhui,"Anoka, MN","Arapahoe, CO",Arizona,...,Western Australia,"Whatcom, WA","Williamson County, TN",Wisconsin,"Worcester, MA",Wyoming,Xinjiang,"Yolo County, CA",Yunnan,Zhejiang
Unnamed: 0_level_1,Country/Region,US,US,US,US,US,Canada,China,US,US,US,...,Australia,US,US,US,US,US,China,US,China,China
Unnamed: 0_level_2,Lat,39.8522,32.3182,29.7938,37.6017,61.3707,53.9333,31.8257,45.3293,39.6203,33.7298,...,-31.9505,48.8787,35.9179,44.2685,42.4097,42.7560,41.1129,38.7646,24.9740,29.1832
Unnamed: 0_level_3,Long,-77.2865,-86.9023,-82.4944,-121.7195,-152.4044,-116.5765,117.2264,-93.2197,-104.3326,-111.4312,...,115.8605,-121.9719,-86.8622,-89.6165,-71.8571,-107.3025,85.2401,-121.9018,101.4870,120.0934
Unnamed: 0_level_4,health_status,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4,Unnamed: 20_level_4,Unnamed: 21_level_4,Unnamed: 22_level_4
2020-03-13,Active,0,5,0,0,1,29,6,0,0,8,...,14,0,0,18,0,1,3,0,4,18
2020-03-13,Confirmed,0,5,0,0,1,29,990,0,0,9,...,14,0,0,19,0,1,76,0,174,1215
2020-03-13,Deaths,0,0,0,0,0,0,6,0,0,0,...,1,0,0,0,0,0,3,0,2,1
2020-03-13,Recovered,0,0,0,0,0,0,984,0,0,1,...,0,0,0,1,0,0,73,0,170,1197
2020-03-14,Active,0,6,0,0,1,29,6,0,0,11,...,17,0,0,26,0,2,3,0,2,16
2020-03-14,Confirmed,0,6,0,0,1,29,990,0,0,12,...,17,0,0,27,0,2,76,0,174,1227
2020-03-14,Deaths,0,0,0,0,0,0,6,0,0,0,...,1,0,0,0,0,0,3,0,2,1
2020-03-14,Recovered,0,0,0,0,0,0,984,0,0,1,...,0,0,0,1,0,0,73,0,172,1211


## Common functions for creating time moving geographical heatmaps

Note: because China's numbers are so large, I use the logarithm of the case counts (normalized by the maximum) to compute the intensity of the *hot spots*.

In [166]:
def make_heatmap_data(data, use_global_max=False):
    """Convert a wide case-count table into input for ``HeatMapWithTime``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per location with the columns ``'Province/State'``,
        ``'Country/Region'``, ``'Lat'`` and ``'Long'``. Every remaining column
        label must be a date accepted by ``pd.to_datetime`` and hold a count.
    use_global_max : bool, default False
        If True, weights are normalised by the largest count in the whole
        table, so days are comparable with each other. If False, each day is
        normalised by its own largest count.

    Returns
    -------
    hm_data : list
        One entry per date in ascending order. Each entry is a list of
        ``[lat, long, weight]`` rows for the locations whose count is > 0,
        with ``weight = log1p(count) / log1p(max_count)``.
    hm_date_index : list of str
        The dates formatted as ``MM/DD/YYYY``, aligned with ``hm_data``.
    data_ts : pandas.DataFrame
        The date-indexed table (rows: dates, columns: ``(Lat, Long)``).
    """
    # Index by location, then transpose so that each row is one date.
    data_idx = data.set_index(['Province/State', 'Country/Region', 'Lat', 'Long'])
    data_ts = data_idx.T
    # Only the coordinates are needed for the map; drop the two name levels.
    data_ts.columns = data_ts.columns.droplevel([0, 1])
    data_ts.index = pd.to_datetime(data_ts.index)
    data_ts = data_ts.sort_index()

    hm_date_index = data_ts.index.strftime('%m/%d/%Y').to_list()

    # The global maximum must be on the same log scale as the per-location
    # values; dividing log-scaled values by the raw maximum would squash every
    # weight towards zero.
    global_max = np.nan
    if use_global_max:
        counts = data_ts.to_numpy(dtype=float)
        positive_counts = counts[counts > 0]
        if positive_counts.size:
            global_max = np.log1p(positive_counts.max())

    hm_data = []
    for _, day_counts in data_ts.iterrows():
        # Locations without cases (zero, negative or missing) are left off the map.
        positive = day_counts[day_counts > 0].astype(float)
        # log1p rather than log: a count of 1 keeps a non-zero weight, and a
        # day whose maximum is 1 no longer divides zero by zero.
        scaled = np.log1p(positive)
        max_value = global_max if use_global_max else scaled.max()
        hm_data.append(scaled.div(max_value).reset_index().to_numpy().tolist())

    return hm_data, hm_date_index, data_ts


def make_heatmap(data, caption='Results', global_max=False, tiles='stamentoner'):
    """Display an animated, per-day heat map for one case-count table.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide table in the layout expected by ``make_heatmap_data``.
    caption : str, default 'Results'
        Rendered as a level-2 markdown heading above the map.
    global_max : bool, default False
        Passed to ``make_heatmap_data`` as ``use_global_max``.
    tiles : str, default 'stamentoner'
        Base-map tile set handed to ``folium.Map``.
        NOTE(review): newer folium releases reportedly no longer bundle the
        Stamen tile sets -- confirm against the installed version and pass a
        different value here if the default is rejected.

    Returns
    -------
    None
        The heading and map are shown via ``display``. Nothing is returned, so
        a call that ends a cell does not render the map a second time.
    """
    # The third return value (the date-indexed frame) is not needed here.
    hm_data, hm_idx, _ = make_heatmap_data(data, global_max)

    # World view centred on (0, 0).
    hmap = folium.Map(location=[0, 0], tiles=tiles, zoom_start=2)
    hm_overlay = HeatMapWithTime(hm_data, index=hm_idx, max_opacity=0.8, auto_play=True)
    hmap.add_child(hm_overlay)

    display(Markdown(F"## {caption}"))
    display(hmap)

# Geographical Heatmaps


In [167]:
# Pick the 'Active' rows for every date, then drop the status level so the
# row index holds the dates only.
active_rows = pd.IndexSlice[:, ['Active']]
active_cases = df_all.loc[active_rows, :].droplevel(level=1)

# The heat-map helper wants one row per location with the location fields as
# ordinary columns, hence the transpose and reset_index.
active_by_location = active_cases.T.reset_index()
make_heatmap(active_by_location, caption='Active Cases by Day', global_max=False)

## Active Cases by Day

In [170]:
# Render one animated heat map per raw status table, in the order the tables
# were stored in `all_mapped`.
for key, status_frame in all_mapped.items():
    make_heatmap(status_frame, F'{key} Cases by Date')

## Recovered Cases by Date

## Deaths Cases by Date

## Confirmed Cases by Date