# How do different countries differ in their COVID-19 growth rates? 
> Comparing how countries' trajectories of total cases resemble those of Italy, South Korea and Japan

In [90]:
# Imported in this cell because it is the first one executed: without it a
# fresh kernel ("Restart & Run All") fails here with NameError on HTML.
from IPython.display import HTML

# Render a form button that hides/shows every `div.input` element (the code
# inputs). code_toggle() also runs once when the document is ready, so the
# inputs start hidden.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [91]:
#hide
import pandas as pd
import altair as alt
from IPython.display import HTML
from IPython.display import HTML

In [92]:
#hide
# Johns Hopkins CSSE time series of confirmed cases: one row per
# province/country, one column per date.
url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
df = pd.read_csv(url)

# Use the short country name that the rest of the notebook refers to.
df['Country/Region'] = df['Country/Region'].replace({'Korea, South': 'South Korea'})

# Drop the rows labelled as a cruise ship rather than a country.
is_ship = df['Country/Region'].isin(['Cruise Ship'])
df = df[~is_ship]

# Every column that is not an identifier or a coordinate is a date column.
is_id_col = df.columns.isin(['Province/State', 'Country/Region', 'Lat', 'Long'])
dt_cols = df.columns[~is_id_col]

In [93]:
#hide
# Sum the provinces of each country, then turn the wide table (one column per
# date) into a long one (one row per country/date pair).
country_totals = df.groupby('Country/Region')[dt_cols].sum()
dff = country_totals.stack().reset_index(name='Confirmed Cases')
dff = dff.rename(columns={'level_1': 'Date', 'Country/Region': 'Country'})

# The date column labels look like "1/22/20".
dff['Date'] = pd.to_datetime(dff['Date'], format='%m/%d/%y')

In [94]:
#hide
# Only countries with at least this many confirmed cases on LAST_DATE are kept.
MIN_CASES = 700

# Newest date column that holds at least one non-zero count (the last column
# may be empty); falls back to the last column when every column is empty.
LAST_DATE = next(
    (col for col in dt_cols[::-1] if df[col].fillna(0).ne(0).any()),
    dt_cols[-1],
)

# Names of the qualifying countries, largest outbreak first.
latest_rows = dff[dff['Date'].eq(LAST_DATE) & dff['Confirmed Cases'].ge(MIN_CASES)]
countries = latest_rows.sort_values(by='Confirmed Cases', ascending=False)['Country'].values

In [84]:
#hide
# Day 0 of each country's trajectory is the first date on which it reports at
# least this many confirmed cases.
SINCE_CASES_NUM = 100
# Work on a copy restricted to the countries selected above.
dff2 = dff[dff['Country'].isin(countries)].copy()
# F flags the rows at/above the threshold. With Date as the index, idxmax per
# country returns the date of the first True row, broadcast to every row of
# that country by transform.
days_since = (dff2.assign(F=dff2['Confirmed Cases'].ge(SINCE_CASES_NUM))
              .set_index('Date')
              .groupby('Country')['F'].transform('idxmax'))
# .values drops the Date index so the subtraction is positional (row by row).
dff2['Days since 100 cases'] = (dff2['Date'] - days_since.values).dt.days.values
# Discard the rows that precede each country's day 0.
dff2 = dff2[dff2['Days since 100 cases'].ge(0)]

In [86]:
#hide
def get_country_colors(x):
    """Return the chart color assigned to country name ``x``.

    Names without a dedicated entry get the fallback color '#C1B7AD'.
    """
    fallback = '#C1B7AD'
    # Grouped by color so that shared colors are visible at a glance.
    names_by_color = {
        'black': ['Italy'],
        'red': ['China'],
        '#A1BA59': ['Iran'],
        '#E45756': ['South Korea'],
        '#F58518': ['Spain', 'France'],
        '#9D755D': ['Germany', 'Switzerland'],
        '#2495D3': ['US', 'United Kingdom'],
        '#9467bd': ['Japan'],
        fallback: ['Norway', 'Netherlands', 'Sweden', 'Belgium', 'Denmark', 'Austria'],
    }
    color_of = {}
    for color, names in names_by_color.items():
        for name in names:
            color_of[name] = color
    return color_of.get(x, fallback)

In [87]:
#hide_input
# Reference countries that every comparison chart selects by default.
baseline_countries = ['Italy', 'South Korea', 'Japan']
# Most recent date present in the filtered data; used to place the end-of-line labels.
max_date = dff2['Date'].max()
# Explicit color scale: one domain entry per country and its matching color.
color_domain = list(dff2['Country'].unique())
color_range = [get_country_colors(name) for name in color_domain]

def make_since_chart(highlight_countries=None, baseline_countries=baseline_countries):
    """Build the layered "days since 100 cases" comparison chart from ``dff2``.

    Layers, in drawing order: a dashed reference line starting at 100 cases and
    growing 33% per day, its text label at the last day, one line (with points)
    per country, and a bold country-name label on each country's ``max_date`` row.
    Clicking the legend changes which countries are drawn at full opacity.

    Parameters
    ----------
    highlight_countries : sequence of str, optional
        Countries selected initially in addition to the baseline ones.
        ``None`` or an empty sequence highlights only the baseline countries.
    baseline_countries : sequence of str
        Countries that are always part of the initial selection.

    Returns
    -------
    The layered Altair chart with its title set.
    """
    # Copy into fresh lists: avoids a shared mutable default and lets callers
    # pass any sequence type (the two are concatenated below).
    highlight_countries = list(highlight_countries) if highlight_countries else []
    baseline_countries = list(baseline_countries)

    selection = alt.selection_multi(fields=['Country'], bind='legend', 
                                    init=[{'Country': x} for x in highlight_countries + baseline_countries])

    base = alt.Chart(dff2, width=550).encode(
        x='Days since 100 cases:Q',
        y=alt.Y('Confirmed Cases:Q', scale=alt.Scale(type='log')),
        color=alt.Color(
            'Country:N',
            scale=alt.Scale(domain=color_domain, range=color_range),
            legend=alt.Legend(columns=len(color_domain)//18+1, symbolLimit=len(color_domain))),
        tooltip=list(dff2),
        opacity=alt.condition(selection, alt.value(1), alt.value(0.05))
    )
    # Plain int so that range() and the filter expression below do not depend
    # on a numpy scalar type.
    max_day = int(dff2['Days since 100 cases'].max())
    # Reference trajectory: 100 cases on day 0, multiplied by 1.33 every day.
    ref = pd.DataFrame([[x, 100*1.33**x] for x in range(max_day+1)], columns=['Days since 100 cases', 'Confirmed Cases'])
    base_ref = alt.Chart(ref).encode(x='Days since 100 cases:Q', y='Confirmed Cases:Q')

    # With nothing highlighted the original title read "Compare  trajectory
    # with ..." (empty join); use a title that names only the baseline countries.
    if highlight_countries:
        title = f"Compare {', '.join(highlight_countries)} trajectory with {', '.join(baseline_countries)}"
    else:
        title = f"Trajectories of {', '.join(baseline_countries)}"

    return (
        base_ref.mark_line(color='black', opacity=.5, strokeDash=[3,3]) +
        # Label only the last point of the reference line.
        base_ref.transform_filter(
            alt.datum['Days since 100 cases'] >= max_day
        ).mark_text(dy=-6, align='right', fontSize=10, text='33% Daily Growth') +
        base.mark_line(point=True).add_selection(selection) + 
        # Country names only on the rows of the latest date (epoch milliseconds).
        base.transform_filter(
            alt.datum['Date'] >= int(max_date.timestamp() * 1000)
        ).mark_text(dy=-8, align='right', fontWeight='bold').encode(text='Country:N')
    ).properties(
        title=title
    )

## Learning from Italy, South Korea & Japan

Italy, South Korea & Japan are three countries that show different growth rates, and different ways in which those rates evolved over time. 

I have included **China**'s statistics, however, the authenticity of its toll can be considered dubious. I recommend considering the case of **China** as an outlier, though, note that several other countries have similar issues of inaccurate reported statistics.

**South Korea** flattened its growth about 2 weeks after reaching 100 cases. **Italy** continued to grow after the 3rd week.

**US** is quickly following Italy's path.

Since the number of confirmed cases is plotted on a logarithmic scale, a trajectory that follows the **33% Daily Growth line** and looks linear actually indicates exponential growth in the spread. 
For example, **China** and **South Korea**'s flat curve indicates a decrease in new confirmed cases.

<small>Click (Shift+ for multiple) on Countries legend to filter the visualization.</small>

In [101]:
#hide_input
# LAST_DATE is a column label such as "3/28/20"; parse it with the same explicit
# format used for the Date column instead of relying on format inference.
# "%B %d, %Y" renders e.g. "March 28, 2020" (the previous "%B, %d %Y" put the
# comma after the month: "March, 28 2020").
last_updated = pd.to_datetime(LAST_DATE, format='%m/%d/%y').strftime('%B %d, %Y')
HTML(f'<small class="float-right">Last Updated on {last_updated}</small>')

In [102]:
#hide_input
# No extra countries highlighted: only the default baseline countries are pre-selected.
chart = make_since_chart()
chart

In [103]:
#hide_input
# Pre-select Spain and Germany next to the baseline countries.
chart2 = make_since_chart(['Spain', 'Germany'])
chart2

In [104]:
#hide_input
# Pre-select the US and France next to the baseline countries.
chart3 = make_since_chart(['US', 'France'])
chart3

In [105]:
#hide_input
# Pre-select Germany and the United Kingdom next to the baseline countries.
chart4 = make_since_chart(['Germany', 'United Kingdom'])
chart4

Select a country from the drop-down list below to show its trajectory in the visualization.

In [106]:
#hide_input
# Encodings shared by every data layer of the dropdown-driven chart.
log_cases_axis = alt.Y('Confirmed Cases:Q', scale=alt.Scale(type='log'))
country_color = alt.Color('Country:N', scale=alt.Scale(domain=color_domain, range=color_range), legend=None)
base = alt.Chart(dff2, width=600).encode(
    x='Days since 100 cases:Q',
    y=log_cases_axis,
    color=country_color,
    tooltip=['Country', 'Date', 'Confirmed Cases', 'Days since 100 cases']
)

# Dropdown listing every selected country except the baseline ones; starts on the US.
dropdown_options = sorted(set(countries) - set(baseline_countries))
country_selection = alt.selection_single(
    name='Select', fields=['Country'], 
    bind=alt.binding_select(options=dropdown_options),
    init={'Country': 'US'})

# Keeps only the rows of the latest date (compared in epoch milliseconds).
date_filter = alt.datum['Date'] >= int(max_date.timestamp() * 1000)
# base2: baseline countries; base3: the chosen country; base4: its latest row.
base2 = base.transform_filter(alt.FieldOneOfPredicate('Country', baseline_countries))
base3 = base.transform_filter(country_selection)
base4 = base3.transform_filter(date_filter)

# Reference trajectory: 100 cases on day 0, multiplied by 1.33 every day.
max_day = dff2['Days since 100 cases'].max()
ref_days = list(range(max_day+1))
ref = pd.DataFrame({
    'Days since 100 cases': ref_days,
    'Confirmed Cases': [100*1.33**x for x in ref_days],
})
base_ref = alt.Chart(ref).encode(x='Days since 100 cases:Q', y='Confirmed Cases:Q')
base_ref_f = base_ref.transform_filter(alt.datum['Days since 100 cases'] >= max_day)

# Individual layers, named in the order in which they are stacked.
ref_line = base_ref.mark_line(color='black', opacity=.5, strokeDash=[3,3])
ref_label = base_ref_f.mark_text(dy=-6, align='right', fontSize=10, text='33% Daily Growth')
baseline_lines = base2.mark_line(point=True, tooltip=True)
selected_line = base3.mark_line(point={'size':50}, tooltip=True)
baseline_labels = base2.transform_filter(date_filter).mark_text(dy=-8, align='right').encode(text='Country:N')
selected_name = base4.mark_text(dx=8, align='left', fontWeight='bold').encode(text='Country:N')
selected_count = base4.mark_text(dx=8, dy=12, align='left', fontWeight='bold').encode(text='Confirmed Cases:Q')

layered = (ref_line + ref_label + baseline_lines + selected_line +
           baseline_labels + selected_name + selected_count)
chart5 = layered.add_selection(country_selection).properties(
    title=f"Country's Trajectory compared to {', '.join(baseline_countries)}"
)
chart5

[^1]: Source: ["2019 Novel Coronavirus COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE"](https://systems.jhu.edu/research/public-health/ncov/) [GitHub repository](https://github.com/CSSEGISandData/COVID-19). Link to [original notebook](https://github.com/pratapvardhan/notebooks/blob/master/covid19/covid19-compare-country-trajectories.ipynb).

# Estimating Future Infections from today's infections
> Future infections of COVID-19 do not rely on time, but on **today**'s number of deaths and infected (fatality rate and contract rate)

In [149]:
#hide
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta, datetime, date
%config InlineBackend.figure_format = 'retina'

# Width and height handed to .properties() by the charts built in plot().
chart_width = 550
chart_height= 400

In [150]:
#hide
def plot(data, type1, levels):
    """Build an interactive Altair chart from per-country estimate frames.

    Parameters
    ----------
    data : list of pandas.DataFrame
        One frame per country. Each frame must hold ``location`` and
        ``total_infected``; ``"line"`` charts also read ``total_infected_lower``
        and ``total_infected_upper``; ``"scatter"`` charts also read ``total_cases``.
    type1 : str
        ``"line"``: estimated infected against the row position inside each
        country frame, with a shaded lower/upper band.
        ``"scatter"``: estimated infected against confirmed cases, with a dashed
        line where both are equal.
    levels : bool
        Selects the axis titles only ("Total"/"Infected"/"Cases" when truthy,
        the "Per Million" variants otherwise); the data is not rescaled here.

    Returns
    -------
    The layered, configured Altair chart.

    Raises
    ------
    ValueError
        If ``type1`` is not ``"line"`` or ``"scatter"``, or ``data`` is empty.
    """
    if type1 not in ("line", "scatter"):
        # Previously an unknown type fell through and silently returned None.
        raise ValueError(f'type1 must be "line" or "scatter", got {type1!r}')
    if len(data) == 0:
        raise ValueError("data must contain at least one country frame")

    # Iterate over the frames actually passed in (not the global `countries`
    # list), so that the function works for any number of frames.
    frames = []
    for country_frame in data:
        frame = country_frame.reset_index()   # new frame; the caller's is untouched
        frame['n_days'] = frame.index         # 0, 1, 2, ... within the country
        if type1 == "scatter":
            frame['cases'] = frame["total_cases"]
        frame['infected'] = frame["total_infected"]
        frames.append(frame)
    # Single concat instead of growing the frame inside a loop.
    source = pd.concat(frames, axis=0)

    scales = alt.selection_interval(bind='scales')
    selection = alt.selection_multi(fields=['location'], bind='legend')
    country_color = alt.Color('location:N', legend=alt.Legend(title="Country", labelFontSize=15, titleFontSize=17),
                              scale=alt.Scale(scheme='tableau20'))

    if type1 == "line":
        ylabel = "Total" if levels else "Per Million"
        base = alt.Chart(source, title =  "Estimated Infected Population By Country").encode(
            x = alt.X('n_days:Q', title = "Days since outbreak"),
            y = alt.Y("infected:Q",title = ylabel),
            color = country_color
        )

        # Band between the lower and upper estimates.
        shades = base.mark_area().encode(
            x='n_days:Q',
            y='total_infected_lower:Q',
            y2='total_infected_upper:Q',
            opacity = alt.condition(selection, alt.value(0.2), alt.value(0.05))
        )

        lines = base.mark_line().encode(
            opacity = alt.condition(selection, alt.value(1), alt.value(0.1))
        ).add_selection(
            scales
        ).add_selection(
            selection
        ).properties(
            width=chart_width,
            height=chart_height
        )
        layered = lines + shades
    else:
        if levels:
            ylabel = "Infected"
            xlabel = "Cases"
        else:
            ylabel = "Per Million Infected"
            xlabel = "Per Million Cases"

        # y == x reference values for the dashed 45-degree line.
        source["45_line"] = source["cases"]

        base = alt.Chart(source, title = "COVID-19 Cases VS Infected").encode(
            x = alt.X('cases:Q', title = xlabel),
            y = alt.Y("infected:Q",title = ylabel),
            color = country_color,
            opacity = alt.condition(selection, alt.value(1), alt.value(0.1))
        )

        scatter = base.mark_point().add_selection(
            scales
        ).add_selection(
            selection
        ).properties(
            width=chart_width,
            height=chart_height
        )

        line_45 = alt.Chart(source).encode(
            x = "cases:Q",
            y = alt.Y("45_line:Q",  scale=alt.Scale(domain=(0, max(source["infected"])))),
        ).mark_line(color="grey", strokeDash=[3,3])
        layered = scatter + line_45

    return (
        layered
        .configure_title(fontSize=20)
        .configure_axis(labelFontSize=15,titleFontSize=18)
    )

In [159]:
#hide 
def load_jhu_series(url, value_name):
    """Download one JHU CSSE global time-series CSV and return it in long form.

    Parameters
    ----------
    url : str
        Location of a CSV with ``Province/State``, ``Country/Region``, ``Lat``,
        ``Long`` and one column per date (labels such as "1/22/20").
    value_name : str
        Name given to the column holding the per-country totals.

    Returns
    -------
    pandas.DataFrame
        Columns ``location``, ``date`` (datetime) and ``value_name``, one row
        per country and date, sorted by date.
    """
    # `error_bad_lines` is no longer passed: the option was removed in pandas 2.0,
    # and the confirmed-cases file is already read without it earlier in the notebook.
    raw = pd.read_csv(url)
    long_form = raw.drop(columns=["Lat", "Long"]).melt(id_vars=["Province/State", "Country/Region"])
    # Sum only the numeric `value` column so the text column Province/State is
    # never handed to sum().
    totals = long_form.groupby(["Country/Region", "variable"], as_index=False)["value"].sum()
    totals = totals.rename(columns={"Country/Region": "location", "variable": "date", "value": value_name})
    totals["date"] = pd.to_datetime(totals["date"], format="%m/%d/%y")
    totals = totals.sort_values(by="date")
    # Country names used by the rest of this analysis.
    totals.loc[totals.location == "US", "location"] = "United States"
    totals.loc[totals.location == "Korea, South", "location"] = "South Korea"
    return totals


# Get data on deaths D_t
data = load_jhu_series(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv",
    "total_deaths")

# Get data on confirmed cases and clean it the same way
data_cases = load_jhu_series(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv",
    "total_cases")
# Countries for which infected-population estimates are computed
countries = ["China", "Italy", "Spain", "France", "United Kingdom", "Germany", 
             "Portugal", "United States", "Singapore","South Korea", "Japan", 
             "Brazil","Iran", "India", "Switzerland", "Canada", "Australia"]

data_final = pd.merge(data,
                 data_cases
                 )
# Case fatality rate: cumulative deaths over cumulative confirmed cases.
data_final["CFR"] = data_final["total_deaths"]/data_final["total_cases"]

LAG_DAYS = 8       # deaths and CFR are read this many days ahead of the date being estimated
GROWTH_WINDOW = 4  # number of latest lag-based growth rates averaged for the projection


def estimate_infected(frame, column, locations, cfr_scale=1.0):
    """Fill ``frame[column]`` with an infected-population estimate per location.

    For every date except the last ``LAG_DAYS`` ones, the estimate is
    ``total_deaths / (CFR * cfr_scale)`` read ``LAG_DAYS`` days later.
    NOTE: since CFR is total_deaths / total_cases, this is algebraically
    total_cases / cfr_scale at the later date wherever total_deaths > 0.
    The last ``LAG_DAYS`` dates are then projected forward one day at a time
    as ``previous * (1 + g)``, where ``g`` is the mean of the ``GROWTH_WINDOW``
    most recent log-differences of the lag-based estimates.

    Assumes every location has exactly one row per consecutive day, so that
    "``LAG_DAYS`` rows later" and "``LAG_DAYS`` days later" coincide.

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs ``location``, ``date``, ``total_deaths`` and ``CFR``.
    column : str
        Name of the estimate column to (re)create.
    locations : list of str
        Locations to estimate; all other rows keep NaN in ``column``.
    cfr_scale : float
        Factor applied to the CFR (below 1 raises the estimate, above 1 lowers it).

    Returns
    -------
    pandas.DataFrame
        A copy sorted by location and date with a fresh index, holding ``column``
        and ``infected_g`` (row-to-row log-difference of the lag-based estimate).
    """
    frame = frame.sort_values(by=['location', 'date']).reset_index(drop=True)
    frame[column] = np.nan  # np.nan rather than np.NaN, which NumPy 2.0 removed

    # One vectorised pass instead of one full-frame lookup per (location, date).
    implied = frame['total_deaths'] / (frame['CFR'] * cfr_scale)
    lagged = implied.groupby(frame['location']).shift(-LAG_DAYS)
    tracked = frame['location'].isin(locations)
    frame.loc[tracked, column] = lagged[tracked]

    # Estimate growth rate of infected, g
    frame['infected_g'] = np.log(frame[column]).diff()

    # Estimate number of infected given g: fill the trailing LAG_DAYS rows,
    # oldest first, each one from the row just before it.
    for name in locations:
        rows = frame.index[frame['location'] == name]
        growth = frame.loc[rows, 'infected_g'].iloc[-(LAG_DAYS + GROWTH_WINDOW):-LAG_DAYS].mean()
        for pos in range(-LAG_DAYS, 0):
            frame.loc[rows[pos], column] = frame.loc[rows[pos - 1], column] * (1 + growth)
    return frame


# Central estimate, then the bounds obtained by scaling the CFR by 0.7 and 1.3.
data_final = estimate_infected(data_final, "total_infected", countries)
# Upper Bound
data_final = estimate_infected(data_final, "total_infected_upper", countries, cfr_scale=0.7)
# Lower Bound
data_final = estimate_infected(data_final, "total_infected_lower", countries, cfr_scale=1.3)

# The lower bound is never allowed to fall below the confirmed case count.
below_cases = data_final.total_infected_lower < data_final.total_cases
data_final.loc[below_cases, "total_infected_lower"] = data_final.loc[below_cases, "total_cases"]

# Date of the last row of the sorted frame (kept as a notebook-level name).
today = data_final.date.iloc[-1]


# Columns needed by the line chart and the summary table below.
data_pc = data_final[['location', 'date', 'total_infected', 'total_infected_lower', 'total_infected_upper']].copy()

# Countries shown in the line chart.
countries = ["China", "Italy", "Spain", "France", "United Kingdom", "Germany", 
              "United States", "South Korea","Iran"]
data_countries = []

# Get each country time series, keeping only rows whose estimate exceeds 1.
# (The loop that re-assigned total_infected to itself was a no-op and is gone.)
filter1 = data_pc["total_infected"] > 1
data_countries_pc = [data_pc[(data_pc["location"] == name) & filter1] for name in countries]



## Estimated Infected Population By Country

> Click (Shift+ for multiple) on countries in the legend to filter the visualization. 

In [160]:
#hide_input
# Plot estimated absolute number of infected
# ("line" draws one line per country plus the lower/upper band; True selects the "Total" y-axis title)
plot1 = plot(data_countries_pc, "line", True)
#plot1.save("../images/covid-estimate-infections.png")
plot1

Latest Country Estimates

In [161]:
#hide_input    
label = 'Estimated Infected'
metric_name = f'{label}'

# Stack the per-country frames and keep the rows from 1 March 2020 onwards.
temp = pd.concat(data_countries_pc)
temp = temp[temp.date >= pd.Timestamp('2020-03-01')]

# Rename by column name rather than by position, so that a change in column
# order cannot silently mislabel the table.
temp = temp.rename(columns={
    'location': 'Country',
    'date': 'Date',
    'total_infected': metric_name,
    'total_infected_lower': "Lower Bound Estimates",
    'total_infected_upper': "Upper Bound Estimates",
})

# Format as thousands-separated whole numbers. Each column is replaced outright:
# writing strings into a float column through .loc is an incompatible-dtype
# assignment that recent pandas versions deprecate.
for column in (metric_name, "Lower Bound Estimates", "Upper Bound Estimates"):
    temp[column] = temp[column].round(0).map('{:,.0f}'.format)

# Last available row of each country.
temp.groupby('Country').last()

Unnamed: 0_level_0,Date,Estimated Infected,Lower Bound Estimates,Upper Bound Estimates
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
China,2020-03-28,82821,81999,118315
France,2020-03-28,101555,78120,145079
Germany,2020-03-28,164317,126397,234738
Iran,2020-03-28,69992,53840,99988
Italy,2020-03-28,161954,124580,231363
South Korea,2020-03-28,10420,9478,14885
Spain,2020-03-28,227035,174642,324335
United Kingdom,2020-03-28,68649,52807,98070
United States,2020-03-28,536048,412344,765782


## Infected vs. number of confirmed cases
> Allows you to compare how well countries have been tracking the true number of infected people. The smaller the deviation from the dashed line (45-degree line), the better the job a country is doing at tracking the true number of infected people.

In [162]:
#hide_input
# Plot it using Altair
# Columns needed by the scatter chart and the summary table below.
data_pc = data_final[['location', 'date', 'total_cases', 'total_infected']].copy()

# Countries shown in the scatter chart.
countries = ["China", "Italy", "Spain", "France", "United Kingdom", "Germany", 
             "Portugal", "United States", "Singapore","South Korea", "Japan", 
             "Brazil","Iran"]
data_countries = []

# Get each country time series, keeping only rows whose estimate exceeds 1.
# (The loop that re-assigned total_infected and total_cases to themselves was
# a no-op and is gone.)
filter1 = data_pc["total_infected"] > 1
data_countries_pc = [data_pc[(data_pc["location"] == name) & filter1] for name in countries]

# Estimated infected against confirmed cases; True selects the "Infected"/"Cases" axis titles.
plot(data_countries_pc, "scatter", True)

Latest Observed vs. Estimate of Infected Cases

In [163]:
#hide_input
label1 = 'Observed Cases'
label2 = 'Estimated Infected'
metric_name1 = f'{label1}'
metric_name2 = f'{label2}'

# Stack the per-country frames and keep the rows from 1 March 2020 onwards.
temp = pd.concat(data_countries_pc)
temp = temp[temp.date >= pd.Timestamp('2020-03-01')]

# Rename by column name rather than by position, so that a change in column
# order cannot silently mislabel the table.
temp = temp.rename(columns={
    'location': 'Country',
    'date': 'Date',
    'total_cases': metric_name1,
    'total_infected': metric_name2,
})

# Format as thousands-separated whole numbers. Each column is replaced outright:
# writing strings into a numeric column through .loc is an incompatible-dtype
# assignment that recent pandas versions deprecate.
for column in (metric_name1, metric_name2):
    temp[column] = temp[column].round(0).map('{:,.0f}'.format)

# Last available row of each country.
temp.groupby('Country').last()

Unnamed: 0_level_0,Date,Observed Cases,Estimated Infected
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brazil,2020-03-28,3904,10989
China,2020-03-28,81999,82821
France,2020-03-28,38105,101555
Germany,2020-03-28,57695,164317
Iran,2020-03-28,35408,69992
Italy,2020-03-28,92472,161954
Japan,2020-03-28,1693,3312
Portugal,2020-03-28,5170,21621
Singapore,2020-03-28,802,1606
South Korea,2020-03-28,9478,10420


Analysis conducted by [Joao B. Duarte](https://www.jbduarte.com). 

1. [2019 Novel Coronavirus COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE](https://systems.jhu.edu/research/public-health/ncov/) [GitHub repository](https://github.com/CSSEGISandData/COVID-19). 

2. [Feenstra, Robert C., Robert Inklaar and Marcel P. Timmer (2015), "The Next Generation of the Penn World Table" American Economic Review, 105(10), 3150-3182](https://www.rug.nl/ggdc/productivity/pwt/related-research)
