In [16]:
# %load make_data.py
import warnings
import pandas as pd
import glob
import os
import logging
import boto3
from botocore.exceptions import ClientError
import COVID19Py
import pandas
import datetime


# Silence pandas' chained-assignment ("SettingWithCopy") warnings.
# Older pandas releases expose the warning class as
# pandas.core.common.SettingWithCopyWarning, newer ones as
# pandas.errors.SettingWithCopyWarning, so look it up in both places instead
# of hard-coding one path. If this pandas build exposes neither, there is
# nothing to silence and the filter is skipped.
copy_warning_category = getattr(
    pd.errors, "SettingWithCopyWarning", None
) or getattr(pd.core.common, "SettingWithCopyWarning", None)
if copy_warning_category is not None:
    warnings.filterwarnings("ignore", category=copy_warning_category)


# Pull the full payload (including per-location timelines) from each data
# source. These run as soon as the cell/module is executed, CSBS first.
_csbs_client = COVID19Py.COVID19(data_source="csbs")
covid19_csbs = _csbs_client.getAll(timelines=True)
_jhu_client = COVID19Py.COVID19(data_source="jhu")
covid19_jhu = _jhu_client.getAll(timelines=True)


def upload_file(file_name, bucket, object_name=None):
    """Push a local file to an S3 bucket as a publicly readable object.

    :param file_name: Path of the local file to upload
    :param bucket: Name of the destination bucket
    :param object_name: Key to store the object under; defaults to file_name
    :return: True when the upload succeeded, False when a ClientError was
        raised (the error is logged, not re-raised)
    """
    # Fall back to the local file name as the S3 key.
    key = file_name if object_name is None else object_name

    client = boto3.client('s3')
    try:
        # The ACL makes the uploaded object world-readable.
        client.upload_file(
            file_name, bucket, key, ExtraArgs={'ACL': 'public-read'})
    except ClientError as err:
        logging.error(err)
        return False
    else:
        return True


def parse_api_json(json, source):
    """Flatten an API payload into pandas DataFrames.

    :param json: Decoded payload; must contain a 'locations' list whose items
        carry 'id', 'coordinates', 'last_updated', 'province', 'country_code',
        'country' and 'latest', and optionally 'county', 'state', 'timelines'
    :param source: Label written to the 'source' column of every row
    :return: ``(main_df, timeline_df)`` when at least one 'confirmed' timeline
        entry was found, otherwise just ``main_df``. Callers must handle both
        shapes.
    """
    snapshot_rows = []
    confirmed_rows = []
    death_rows = []
    recovered_rows = []
    # Any status other than 'confirmed' / 'deaths' is filed under recovered.
    known_buckets = {'confirmed': confirmed_rows, 'deaths': death_rows}

    for loc in json['locations']:
        latest = loc['latest']
        coords = loc['coordinates']
        row = {
            'id': loc['id'],
            'lat': coords['latitude'],
            'lon': coords['longitude'],
            'Date': pandas.to_datetime(loc['last_updated']),
            'province': loc['province'],
            'country_code': loc['country_code'],
            'country': loc['country'],
            'confirmed': latest['confirmed'],
            'deaths': latest['deaths'],
            'recovered': latest['recovered'],
            'county': '',
        }
        if 'county' in loc:
            row['county'] = loc['county']
        # 'state' is only added when the payload provides it.
        if 'state' in loc:
            row['state'] = loc['state']
        row['source'] = source
        snapshot_rows.append(row)

        if 'timelines' not in loc:
            continue
        for status, series in loc['timelines'].items():
            bucket = known_buckets.get(status, recovered_rows)
            for stamp, count in series['timeline'].items():
                bucket.append({'id': loc['id'],
                               'Date': pandas.to_datetime(stamp),
                               status: count})

    main_df = pandas.DataFrame(snapshot_rows)
    if not confirmed_rows:
        return main_df

    # Start from the confirmed series and join the other two on (id, Date);
    # a series that never appeared becomes a constant zero column.
    timeline_df = pandas.DataFrame(confirmed_rows)
    optional_series = (
        ('deaths', death_rows, 0),
        ('recovered', recovered_rows, 0.0),
    )
    for column, rows, filler in optional_series:
        if rows:
            timeline_df = timeline_df.merge(
                pandas.DataFrame(rows), on=['id', 'Date'])
        else:
            timeline_df[column] = filler

    # Attach the per-location descriptive columns to every timeline row.
    location_columns = ['id', 'lat', 'lon', 'province',
                        'country_code', 'country', 'source']
    timeline_df = timeline_df.merge(main_df[location_columns], on=['id'])

    return main_df, timeline_df


def per_x_cases(grouper, df, date_mapper):
    """Compute day-over-day changes of the cumulative counters per group.

    :param grouper: Name of the column in ``df`` to group by (e.g. 'country');
        rows whose value in that column is the empty string are ignored
    :param df: Frame with ``grouper``, 'Date', 'confirmed', 'deaths' and
        'recovered' columns holding cumulative counts
    :param date_mapper: Frame whose 'Date' column lists, in order, the dates
        to report on
    :return: DataFrame with one row per (group, date) and the columns
        ``grouper``, 'Date', 'New Cases', 'New Deaths', 'New Recovery'.
        For the first date 'New Cases' is the cumulative confirmed count on
        that date while 'New Deaths' and 'New Recovery' are reported as 0.
        A date with no rows for a group counts as 0 for every counter.
    """
    value_columns = ['confirmed', 'deaths', 'recovered']
    # Materialise the dates positionally so the function does not depend on
    # date_mapper having a default 0..n-1 index.
    dates = list(date_mapper['Date'])
    if not dates:
        return pd.DataFrame([])

    named = df[df[grouper] != ""]
    records = []
    for group in named[grouper].unique():
        rows = named[named[grouper] == group]
        # Sum ONLY the numeric counters, once per date. Summing whole rows
        # would also try to add up the Date and text columns.
        daily = (rows.groupby('Date')[value_columns].sum()
                 .reindex(dates, fill_value=0))
        confirmed = daily['confirmed'].to_numpy()
        deaths = daily['deaths'].to_numpy()
        recovered = daily['recovered'].to_numpy()

        records.append(
            {grouper: group, 'Date': dates[0],
             'New Cases': confirmed[0],
             'New Deaths': 0,
             'New Recovery': 0})
        # Each later date is reported as the difference to the previous one.
        day_changes = zip(dates[1:],
                          confirmed[1:] - confirmed[:-1],
                          deaths[1:] - deaths[:-1],
                          recovered[1:] - recovered[:-1])
        for current_date, new_cases, new_deaths, new_recovery in day_changes:
            records.append(
                {grouper: group, 'Date': current_date, 'New Cases': new_cases,
                 'New Deaths': new_deaths, 'New Recovery': new_recovery})
    return pd.DataFrame(records)


# Parse both API payloads. The JHU result is unpacked into two frames, so this
# line relies on parse_api_json returning (main_df, timeline_df), which it only
# does when the payload carried 'confirmed' timeline entries. The CSBS result
# is bound as a single object.
jhu_df, jhu_df_time = parse_api_json(covid19_jhu, 'JHU')
csbs_df = parse_api_json(covid19_csbs, 'CSBS')
# Previously published CSBS data, read back from the public S3 bucket.
csbs_df_old = pd.read_csv(
    'https://jordansdatabucket.s3-us-west-2.amazonaws.com/covid19data/csbs_df.csv.gz', index_col=0)

# Stack the freshly fetched CSBS rows on top of the previously published ones.
csbs_df =pandas.concat([csbs_df, csbs_df_old])
# Month_day_year stamp of the local date at run time.
dt = datetime.date.today()
datetime_string = dt.strftime('%m_%d_%Y')
# This will probably be a bug someday
# jhu_df_time = jhu_df_time[jhu_df_time['province'].str.split(
#     ', ').str.len() == 1]

# One row per distinct timeline date, plus a mm/dd/yy text rendering of it.
date_mapper = pd.DataFrame(
    jhu_df_time['Date'].unique(), columns=['Date'])
date_mapper['Date_text'] = date_mapper['Date'].dt.strftime('%m/%d/%y')

# Day-over-day changes of the JHU timeline, grouped by province and by country.
provence_df_per_day = per_x_cases('province', jhu_df_time, date_mapper)
country_df_per_day = per_x_cases('country', jhu_df_time, date_mapper)
print('Generated Data')

# Every output below is written under Data/; create the directory first so a
# fresh checkout does not fail on the first to_csv call.
os.makedirs('Data', exist_ok=True)

jhu_df.to_csv('Data/jhu_df.csv.gz', compression='gzip')
jhu_df_time.to_csv('Data/jhu_df_time.csv.gz', compression='gzip')
csbs_df.to_csv('Data/csbs_df.csv.gz', compression='gzip')
# Keep a dated copy of the CSBS data as it was BEFORE this run's rows were
# added (csbs_df_old is what was downloaded from the bucket).
csbs_df_old.to_csv('Data/csbs_df_Archive_{}.csv.gz'.format(datetime_string), compression='gzip')

provence_df_per_day.to_csv(
    'Data/provence_df_per_day.csv.gz', compression='gzip')
country_df_per_day.to_csv('Data/country_df_per_day.csv.gz', compression='gzip')


print('Syncing Data')
# NOTE(review): `ea` / `ExtraArgs` are not read by anything in this block
# (upload_file hard-codes the same ACL); kept in case another cell uses them.
ea = ExtraArgs = {'ACL': 'public-read'}
# Upload every gzipped CSV currently in Data/, not only the files written by
# this run.
gs = glob.glob('Data/*.csv.gz')
for file in gs:
    uploaded = upload_file(file, 'jordansdatabucket', os.path.join(
        'covid19data', os.path.basename(file)))
    # upload_file reports failure through its return value instead of raising,
    # so only announce success when it actually returned True.
    if uploaded:
        print("Uploaded " + os.path.basename(file))
    else:
        print("Failed to upload " + os.path.basename(file))


Generated Data
Syncing Data
Uploaded provence_df_per_day.csv.gz
Uploaded jhu_df_time.csv.gz
Uploaded country_df_per_day.csv.gz
Uploaded Merged_df.csv.gz
Uploaded csbs_df_Archive_03_25_2020.csv.gz
Uploaded per_day_stats_by_country.csv.gz
Uploaded jhu_df.csv.gz
Uploaded csbs_df.csv.gz
Uploaded per_day_stats_by_state.csv.gz
Uploaded per_day_stats_by_county.csv.gz


In [35]:
import sys
import warnings
import numpy as np
import pandas as pd
import argparse
import pprint
from datetime import date, timedelta
try:
    import dash
    import dash_core_components as dcc
    import dash_html_components as html
    from dash.exceptions import PreventUpdate
    import dash_table
    from dash.dependencies import Input, Output, State
    import plotly.graph_objects as go
    import plotly.express as px
    from plotly.subplots import make_subplots
except ImportError:
    sys.exit('Please install dash, e.g, pip install dash')


# NOTE: the custom style URL is immediately overridden by 'dark' below; the
# first assignment is kept only as a record of the alternative style.
mapbox_style = "mapbox://styles/plotlymapbox/cjvprkf3t1kns1cqjxuxmwixz"
mapbox_style = 'dark'
# Read the token from the first line of .mapbox_token. The context manager
# closes the file, and strip() drops the trailing newline that readlines()
# would otherwise leave at the end of the token.
with open('.mapbox_token') as token_file:
    mapbox_access_token = token_file.readline().strip()

# Import from S3: read the published gzipped CSVs back from the public bucket.
# The first CSV column is used as the index in every case.
jhu_df = pd.read_csv(
    'https://jordansdatabucket.s3-us-west-2.amazonaws.com/covid19data/jhu_df.csv.gz', index_col=0)
jhu_df_time = pd.read_csv(
    'https://jordansdatabucket.s3-us-west-2.amazonaws.com/covid19data/jhu_df_time.csv.gz', index_col=0)
csbs_df = pd.read_csv(
    'https://jordansdatabucket.s3-us-west-2.amazonaws.com/covid19data/csbs_df.csv.gz', index_col=0)
per_day_stats_by_country = pd.read_csv(
    'https://jordansdatabucket.s3-us-west-2.amazonaws.com/covid19data/country_df_per_day.csv.gz', index_col=0)
# Note: the "by state" frame is loaded from the provence_df_per_day file.
per_day_stats_by_state = pd.read_csv(
    'https://jordansdatabucket.s3-us-west-2.amazonaws.com/covid19data/provence_df_per_day.csv.gz', index_col=0)

# Re-parse the Date columns that were just read from CSV.
jhu_df_time['Date'] = pd.to_datetime(jhu_df_time['Date'])
jhu_df['Date'] = pd.to_datetime(jhu_df['Date'])
csbs_df['Date'] = pd.to_datetime(csbs_df['Date'])
# Give the JHU frame an (empty) 'state' column.
jhu_df['state'] = ""


# One row per distinct timeline date, plus a mm/dd/yy text rendering of it.
date_mapper = pd.DataFrame(
    jhu_df_time['Date'].unique(), columns=['Date'])
date_mapper['Date_text'] = date_mapper['Date'].dt.strftime('%m/%d/%y')
# min_date / max_date are the first and last INDEX labels of date_mapper
# (row positions for the default index), not the dates themselves.
min_date = date_mapper.index[0]
max_date = date_mapper.index[-1]

# Keep only the most recent CSBS snapshot when combining with JHU.
# Date.max() is the newest timestamp; sorting ascending and taking .iloc[0]
# (the previous implementation) selects the OLDEST one instead.
# NOTE(review): the filter below is an exact timestamp match, so it assumes all
# rows of one CSBS snapshot share the same 'Date' value - confirm.
latest_date = csbs_df['Date'].max()
merge = pd.concat([jhu_df, csbs_df[csbs_df['Date'] == latest_date]])
merge = merge.fillna('')
# Countries that ended up without a country code (kept for inspection).
problem_countries = merge[merge['country_code'] == '']['country'].tolist()
# NOTE(review): this blanks Namibia's country code. Presumably related to its
# ISO code 'NA' being read as a missing value from CSV - confirm the intent.
merge.loc[merge['country'] == 'Namibia', 'country_code'] = ''
centroid_mapper = pd.read_csv('country_centroids_az8.csv')
# Rows whose country code has no match in the centroid table (for inspection);
# the inner merge below drops them.
problem_states = merge[~merge['country_code'].isin(centroid_mapper['iso_a2'])]
new_merge = merge.merge(
    centroid_mapper, left_on='country_code', right_on='iso_a2')
# Rates are expressed as a percentage of the estimated population.
new_merge['case_rate'] = new_merge['confirmed']/new_merge['pop_est'] * 100
new_merge['death_rate'] = new_merge['deaths']/new_merge['pop_est'] * 100
new_merge['confirmed_no_death'] = new_merge['confirmed'] - new_merge['deaths']


# If a continent and its subregion have the same name, append ' Subregion' to
# the subregion so the two labels stay distinct.
new_merge.loc[new_merge['continent'] == new_merge['subregion'],
              'subregion'] = new_merge['subregion'] + ' Subregion'

# Remove the country-level US row (empty province): the CSBS rows merged in
# above already cover the US, so keeping it would double count.
new_merge_no_us = new_merge[~((new_merge['country'] == 'US') & (
    new_merge['province'] == ''))]



def build_hierarchical_dataframe(df, levels, value_column, color_columns=None):
    """
    Build a hierarchy of levels for Sunburst or Treemap charts.

    Levels are given starting from the bottom to the top of the hierarchy,
    ie the last level corresponds to the root.

    :param df: Source frame containing every column named in ``levels`` plus
        ``value_column``
    :param levels: Column names, innermost level first
    :param value_column: Column whose values are summed for each node
    :param color_columns: Accepted for interface compatibility; not used
    :return: DataFrame with the columns 'id', 'parent', 'value', 'color'.
        There is one row per node of every level, followed by a final 'total'
        row whose parent is the empty string. 'color' is NaN for the level
        rows and '' for the total row; callers fill it in afterwards.
    """
    ids, parents, values = [], [], []
    last_depth = len(levels) - 1
    for depth, level in enumerate(levels):
        # Aggregate only the value column; summing every column would also
        # try to add up text and date columns.
        grouped = df.groupby(levels[depth:])[value_column].sum().reset_index()
        ids.extend(grouped[level].tolist())
        if depth < last_depth:
            parents.extend(grouped[levels[depth + 1]].tolist())
        else:
            # Nodes of the outermost level hang off the synthetic root.
            parents.extend(['total'] * len(grouped))
        values.extend(grouped[value_column].tolist())

    colors = [float('nan')] * len(ids)

    # Synthetic root row carrying the grand total.
    ids.append('total')
    parents.append('')
    values.append(df[value_column].sum())
    colors.append('')

    # Build the frame in one go instead of growing it with DataFrame.append,
    # which newer pandas releases no longer provide.
    return pd.DataFrame(
        {'id': ids, 'parent': parents, 'value': values, 'color': colors},
        columns=['id', 'parent', 'value', 'color'])

def _sunburst_trace(value_column, total_label, colorscale, hovertemplate,
                    **trace_kwargs):
    """Build one Sunburst trace over ``new_merge_no_us`` for ``value_column``.

    :param value_column: Column to aggregate ('confirmed' or 'deaths')
    :param total_label: Text that replaces the synthetic 'total' root label
    :param colorscale: Plotly colorscale name for the sector colors
    :param hovertemplate: Hover text template for the trace
    :param trace_kwargs: Extra keyword arguments passed to ``go.Sunburst``
    :return: The configured ``go.Sunburst`` trace
    """
    # build_hierarchical_dataframe expects the innermost level first.
    levels = ['province', 'name', 'subregion', 'continent']
    hierarchy = build_hierarchical_dataframe(
        new_merge_no_us, levels, value_column)
    # Color each node by its value relative to the sum over all hierarchy rows.
    hierarchy['color'] = hierarchy['value'] / hierarchy['value'].sum()
    hierarchy = hierarchy.replace('total', total_label)

    # Midpoint of the colorscale: the mean per-country total as a share of the
    # overall total. Only the needed column is aggregated.
    per_country = new_merge_no_us.groupby('name')[value_column].sum()
    color_midpoint = per_country.mean() / new_merge_no_us[value_column].sum()

    return go.Sunburst(
        labels=hierarchy['id'],
        parents=hierarchy['parent'],
        values=hierarchy['value'],
        branchvalues='total',
        marker=dict(
            colors=hierarchy['color'],
            colorscale=colorscale,
            cmid=color_midpoint),
        hovertemplate=hovertemplate,
        name='',
        maxdepth=3,
        **trace_kwargs
    )


def plot_sunburst():
    """Return a figure with two side-by-side sunbursts: cases and deaths.

    Both charts are built from the module-level ``new_merge_no_us`` frame and
    drill down continent -> subregion -> country name -> province.

    :return: The assembled plotly figure
    """
    fig = make_subplots(
        1, 2, specs=[[{"type": "domain"}, {"type": "domain"}]],)

    # Left: confirmed cases (the only trace with radial inside text).
    fig.add_trace(_sunburst_trace(
        'confirmed',
        'Total<br>Cases',
        'RdBu_r',
        '<b>%{label} </b> <br> Confirmed Cases: %{value}',
        insidetextorientation='radial',
    ), 1, 1)

    # Right: deaths.
    fig.add_trace(_sunburst_trace(
        'deaths',
        'Total<br>Deaths',
        'reds',
        '<b>%{label} </b> <br> Confirmed Deaths: %{value}',
    ), 1, 2)

    fig.update_layout(
        uniformtext=dict(minsize=16, mode='hide'),
        # NOTE(review): four components inside 'rgb(...)'; if a transparent
        # background was intended this presumably should be 'rgba' - confirm.
        paper_bgcolor='rgb(0,0,0,0)',
        # title=dict(text='Total Confirmed Cases<br>Click to Expand',
        #            font=dict(color='white', size=24)),
        margin=dict(l=40, r=40, t=40, b=40)
    )

    return fig

plot_sunburst()

Timestamp('2020-03-24 00:01:00+0000', tz='UTC')

In [359]:
# Import from S3: read the published gzipped CSVs back from the public bucket.
# The first CSV column is used as the index in every case.
jhu_df = pd.read_csv(
    'https://jordansdatabucket.s3-us-west-2.amazonaws.com/covid19data/jhu_df.csv.gz', index_col=0)
jhu_df_time = pd.read_csv(
    'https://jordansdatabucket.s3-us-west-2.amazonaws.com/covid19data/jhu_df_time.csv.gz', index_col=0)
csbs_df = pd.read_csv(
    'https://jordansdatabucket.s3-us-west-2.amazonaws.com/covid19data/csbs_df.csv.gz', index_col=0)
per_day_stats_by_country = pd.read_csv(
    'https://jordansdatabucket.s3-us-west-2.amazonaws.com/covid19data/country_df_per_day.csv.gz', index_col=0)
# Note: the "by state" frame is loaded from the provence_df_per_day file.
per_day_stats_by_state = pd.read_csv(
    'https://jordansdatabucket.s3-us-west-2.amazonaws.com/covid19data/provence_df_per_day.csv.gz', index_col=0)

# Re-parse the Date columns that were just read from CSV.
jhu_df_time['Date'] = pd.to_datetime(jhu_df_time['Date'])
jhu_df['Date'] = pd.to_datetime(jhu_df['Date'])
csbs_df['Date'] = pd.to_datetime(csbs_df['Date'])
# Give the JHU frame an (empty) 'state' column.
jhu_df['state'] = ""


# One row per distinct timeline date, plus a mm/dd/yy text rendering of it.
date_mapper = pd.DataFrame(
    jhu_df_time['Date'].unique(), columns=['Date'])
date_mapper['Date_text'] = date_mapper['Date'].dt.strftime('%m/%d/%y')
# min_date / max_date are the first and last INDEX labels of date_mapper
# (row positions for the default index), not the dates themselves.
min_date = date_mapper.index[0]
max_date = date_mapper.index[-1]


# Combine JHU with ALL CSBS rows.
# NOTE(review): unlike a latest-snapshot filter, this keeps every CSBS row that
# is in csbs_df; if that frame holds several snapshots, counts will presumably
# be repeated - confirm this is intended for this cell.
merge = pd.concat([jhu_df, csbs_df])
merge = merge.fillna('')
# Countries that ended up without a country code (kept for inspection).
problem_countries = merge[merge['country_code'] == '']['country'].tolist()
# NOTE(review): this blanks Namibia's country code. Presumably related to its
# ISO code 'NA' being read as a missing value from CSV - confirm the intent.
merge.loc[merge['country'] == 'Namibia', 'country_code'] = ''
centroid_mapper = pd.read_csv('country_centroids_az8.csv')
# Rows whose country code has no match in the centroid table (for inspection);
# the inner merge below drops them.
problem_states = merge[~merge['country_code'].isin(centroid_mapper['iso_a2'])]
new_merge = merge.merge(
    centroid_mapper, left_on='country_code', right_on='iso_a2')
# Rates are expressed as a percentage of the estimated population.
new_merge['case_rate'] = new_merge['confirmed']/new_merge['pop_est'] * 100
new_merge['death_rate'] = new_merge['deaths']/new_merge['pop_est'] * 100
new_merge['confirmed_no_death'] = new_merge['confirmed'] - new_merge['deaths']

# If a continent and its subregion have the same name, append ' Subregion' to
# the subregion so the two labels stay distinct.
new_merge.loc[new_merge['continent'] == new_merge['subregion'],
              'subregion'] = new_merge['subregion'] + ' Subregion'



Unnamed: 0,id,lat,lon,Date,province,country_code,country,confirmed,deaths,recovered,...,name_len,long_len,abbrev_len,tiny,homepart,Longitude,Latitude,case_rate,death_rate,confirmed_no_death
0,0,33.000000,65.000000,2020-03-24 06:01:08.402656+00:00,,AF,Afghanistan,40,1,0,...,11,11,4,-99,1,66.004734,33.835231,0.000141,0.000004,39
1,1,41.153300,20.168300,2020-03-24 06:01:08.412527+00:00,,AL,Albania,104,4,0,...,7,7,4,-99,1,20.049834,41.142450,0.002858,0.000110,100
2,2,28.033900,1.659600,2020-03-24 06:01:08.417963+00:00,,DZ,Algeria,230,17,0,...,7,7,4,-99,1,2.617323,28.158938,0.000673,0.000050,213
3,3,42.506300,1.521800,2020-03-24 06:01:08.423338+00:00,,AD,Andorra,133,1,0,...,7,7,4,5,1,1.560544,42.542291,0.158545,0.001192,132
4,4,-11.202700,17.873900,2020-03-24 06:01:08.428696+00:00,,AO,Angola,3,0,0,...,6,6,4,-99,1,17.537368,-12.293361,0.000023,0.000000,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1504,233,12.116500,-61.679000,2020-03-24 06:01:09.466806+00:00,,GD,Grenada,1,0,0,...,7,7,5,-99,1,-61.682202,12.117250,0.001102,0.000000,1
1505,234,-18.665695,35.529562,2020-03-24 06:01:09.471494+00:00,,MZ,Mozambique,1,0,0,...,10,10,4,-99,1,35.533675,-17.273816,0.000005,0.000000,1
1506,235,34.802075,38.996815,2020-03-24 06:01:09.475908+00:00,,SY,Syria,1,0,0,...,5,5,5,-99,1,38.507882,35.025474,0.000005,0.000000,1
1507,236,-8.874217,125.727539,2020-03-24 06:01:09.480471+00:00,,TL,Timor-Leste,1,0,0,...,11,11,4,-99,1,125.844390,-8.828892,0.000088,0.000000,1


In [351]:
 # Exploratory call: fetch the JHU payload and display it.
 # NOTE(review): `timelines` is passed positionally as a bare name that is not
 # assigned anywhere in the visible code; the fetches at the top of this file
 # use the keyword form `timelines=True` - confirm which was intended.
 COVID19Py.COVID19(data_source="jhu").getAll(timelines)

{'latest': {'confirmed': 418678, 'deaths': 18625, 'recovered': 0},
 'locations': []}