# Open COVID-19 Analysis 

This notebook is meant to serve as a starter for anyone interested in diving into the most recent data surrounding the global 2019 novel coronavirus (COVID-19) outbreak.

Datasets:
- Epidemiology Data: https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases
- Population Data: https://population.un.org/wpp/Download/Standard/Population/

In [1]:
import pandas as pd
import cufflinks as cf
import plotly.graph_objs as go
import plotly.express as px
import numpy as np
from scipy.optimize import curve_fit
from typing import List
from IPython.display import display
cf.go_offline()

In [2]:
# All three time-series CSVs live in the same directory of the
# CSSEGISandData/COVID-19 GitHub repository; only the file name differs.
CSSE_TIME_SERIES_DIR = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
)
confirmed_url = CSSE_TIME_SERIES_DIR + "time_series_19-covid-Confirmed.csv"
deaths_url = CSSE_TIME_SERIES_DIR + "time_series_19-covid-Deaths.csv"
recovered_url = CSSE_TIME_SERIES_DIR + "time_series_19-covid-Recovered.csv"

In [3]:
# Download the confirmed / deaths / recovered CSVs, in that order, into raw DataFrames.
raw_confirmed, raw_deaths, raw_recovered = (
    pd.read_csv(csv_url) for csv_url in (confirmed_url, deaths_url, recovered_url)
)

In [4]:
# Load the population table and key it by country: the "Country" column
# becomes the index and is removed from the regular columns.
total_pop = pd.read_csv("total_pop.csv").set_index("Country")
def current_pop(country: str):
    """Return the "2020" population figure of `country` as an integer.

    The "2020" entry of the country's row in `total_pop` is read as a string,
    its spaces are removed, and the parsed integer is multiplied by 1000
    (presumably the table stores thousands of people -- confirm against the CSV).

    Raises KeyError when `country` is not a row label of `total_pop`.
    """
    raw_2020 = total_pop.loc[country, "2020"]
    thousands = int(raw_2020.replace(" ", ""))
    return thousands * 1000

In [5]:
def clean_country_names(df):
    """Rename selected country labels in the index of `df`, modifying `df` in place.

    Applied mapping:
        'Korea, South'               -> 'South Korea'
        'Iran (Islamic Republic of)' -> 'Iran'
        'China'                      -> 'Mainland China'

    Index labels that are not listed are left untouched. Returns None.
    """
    index_aliases = {
        'Korea, South': 'South Korea',
        'Iran (Islamic Republic of)': 'Iran',
        'China': 'Mainland China',
    }
    df.rename(index=index_aliases, inplace=True)
def country_agg(raw_df):
    """Aggregate a raw time-series table into one row per country.

    Steps (the input frame itself is not modified):
      1. drop the "Province/State", "Lat" and "Long" columns;
      2. sum the remaining columns per "Country/Region";
      3. drop the "Cruise Ship" row, which is not associated with a country;
      4. normalise country labels with `clean_country_names`.

    Args:
        raw_df: DataFrame with "Province/State", "Country/Region", "Lat" and
            "Long" columns plus the value columns to be summed.

    Returns:
        A new DataFrame indexed by (cleaned) country name.

    Raises:
        KeyError: if one of the three dropped columns or "Country/Region" is missing.
    """
    per_country = (
        raw_df
        .drop(columns=["Province/State", "Lat", "Long"])
        .groupby(by="Country/Region")
        .sum()
        # Drop cases not associated with a country. errors="ignore" keeps the
        # aggregation working for tables that have no "Cruise Ship" row.
        .drop(index=["Cruise Ship"], errors="ignore")
    )
    clean_country_names(per_country)
    return per_country

In [6]:
# Collapse each raw table to per-country totals (same order as before:
# confirmed, deaths, recovered).
confirmed, deaths, recovered = map(country_agg, (raw_confirmed, raw_deaths, raw_recovered))
# Infected = confirmed cases minus those recorded as recovered or dead.
infected = confirmed.sub(recovered).sub(deaths)

In [7]:
# Top countries by number of confirmed cases
peak_confirmed = confirmed.max(axis=1)  # largest value found in each country's row
peak_confirmed.sort_values(ascending=False).head(20)

Country/Region
Mainland China    81033
Italy             27980
Iran              14991
Spain              9942
South Korea        8236
Germany            7272
France             6650
US                 4632
Switzerland        2200
United Kingdom     1551
Netherlands        1414
Norway             1333
Sweden             1103
Belgium            1058
Austria            1018
Denmark             932
Japan               839
Malaysia            566
Qatar               439
Canada              415
dtype: int64

In [8]:
def sird_plot(country_name: str, pop_prop: bool = True):
    """Plot the "Infected", "Recovered" and "Dead" series of one country.

    Args:
        country_name: Row label of the country in the `infected`, `recovered`
            and `deaths` tables (and in `total_pop` when `pop_prop` is True).
        pop_prop: When True, every series is divided by
            `current_pop(country_name) / 100000`, i.e. shown per 100,000 people;
            when False the series are divided by 1.

    The chart is drawn with `.iplot`; nothing is returned.
    """
    if pop_prop:
        divisor = current_pop(country_name) / 100000
        y_title = "Count per 100,000 People"
    else:
        divisor = 1
        y_title = "Count"

    labels = ["Infected", "Recovered", "Dead"]
    tables = [infected, recovered, deaths]
    rows = [table.loc[country_name] / divisor for table in tables]
    plot_data = pd.DataFrame(rows, index=labels).T
    plot_data.iplot(title="COVID-19 Outbreak: " + country_name, yTitle=y_title)
def _country_metric_plot(metric_df, cnames: List[str], pop_prop: bool, title: str):
    """Draw one subplot per country for a single per-country table.

    Args:
        metric_df: Table indexed by country name (e.g. `confirmed` or `deaths`).
        cnames: Countries to plot; each becomes one column / subplot.
        pop_prop: When True, each country's series is divided by
            `current_pop(country) / 100000` (values per 100,000 people).
        title: Figure title.
    """
    col_data = []
    for c in cnames:
        pop_size = current_pop(c) / 100000 if pop_prop else 1
        col_data.append(metric_df.loc[c] / pop_size)
    plot_data = pd.DataFrame(col_data, index=cnames).T
    plot_data.iplot(subplots=True, subplot_titles=True, shared_yaxis=True, title=title)


def country_confirmed_plot(cnames: List[str], pop_prop: bool = True):
    """Plot confirmed cases for each country in `cnames`, one subplot per country.

    With `pop_prop=True` the values are shown per 100,000 people, otherwise as
    raw counts. Nothing is returned.
    """
    title = "COVID-19 Confirmed Cases Per 100,000 People" if pop_prop else "COVID-19 Confirmed Cases"
    _country_metric_plot(confirmed, cnames, pop_prop, title)


def country_deaths_plot(cnames: List[str], pop_prop: bool = True):
    """Plot deaths for each country in `cnames`, one subplot per country.

    With `pop_prop=True` the values are shown per 100,000 people, otherwise as
    raw counts. Nothing is returned.
    """
    title = "COVID-19 Deaths Per 100,000 People" if pop_prop else "COVID-19 Deaths"
    _country_metric_plot(deaths, cnames, pop_prop, title)

In [9]:
# Compare the four countries with the largest summed confirmed counts, plus the US.
top_countries = list(confirmed.T.sum().sort_values(ascending=False).head(4).index)
# Only add the US when it is not already in the top four; a repeated name
# would produce a duplicated column in the plotted table.
if "US" not in top_countries:
    top_countries.append("US")
country_confirmed_plot(top_countries, pop_prop=False)
country_confirmed_plot(top_countries, pop_prop=True)
country_deaths_plot(top_countries, pop_prop=False)
country_deaths_plot(top_countries, pop_prop=True)

In [10]:
# Raw-count infected/recovered/dead chart for each of the ten countries with
# the largest summed confirmed counts.
top_ten = confirmed.sum(axis=1).sort_values(ascending=False).head(10).index
for country in top_ten:
    sird_plot(country, pop_prop=False)

# Regression Analysis

## Logistic Modeling of Outbreaks

The following models show logistic regressions using the most recent data for each country's outbreak. 95% confidence intervals are provided (if not shown, the intervals are too large to depict, and models may be highly inaccurate).  

<img src="http://andymath.com/wp-content/uploads/2019/08/Logistic-Function.jpg" width="240px" style="float: left">

In [11]:
# Removes rows prior to onset of outbreak and sets index to days since outbreak began in country
def confirmed_by_days(country_name: str):
    """Return the confirmed series of `country_name`, re-indexed by outbreak day.

    Entries before the running total of the country's row in `confirmed` first
    becomes positive are discarded; the remaining values get a fresh 0..n-1
    integer index, so index 0 is the first day with a confirmed case.
    """
    daily_confirmed = confirmed.loc[country_name]
    outbreak_started = daily_confirmed.cumsum() > 0
    return daily_confirmed[outbreak_started].reset_index(drop=True)
def dead_by_days(country_name: str):
    """Return the deaths series of `country_name`, re-indexed from the first death.

    Entries before the running total of the country's row in `deaths` first
    becomes positive are discarded; the remaining values get a fresh 0..n-1
    integer index, so index 0 is the first day with a recorded death.

    Fixes the previous body, which read the `confirmed` table and referenced an
    undefined `c_series`, raising NameError on every call.
    """
    d_series = deaths.T[country_name]
    d_series = d_series[d_series.cumsum() > 0]
    return d_series.reset_index(drop=True)

In [12]:
def logist_reg(country_name: str, forecast_days: int = 5, show_ci: bool = True):
    """Fit and plot a logistic curve for one country's confirmed cases per 100,000 people.

    The curve a / (1 + exp(-b * (x - c))) is fitted with `curve_fit` to the
    output of `confirmed_by_days`, scaled to cases per 100,000 people, with the
    parameters bounded to 0 <= a <= 200, 0 <= b <= 1 and 0 <= c <= 100. The
    fitted curve is evaluated for every observed day plus `forecast_days`
    further days and plotted next to the actual values.

    Args:
        country_name: Row label of the country in `confirmed` / `total_pop`.
        forecast_days: Number of days past the observed data to extrapolate.
        show_ci: When True, also draw "Upper"/"Lower" curves obtained by moving
            the estimates by two standard deviations. They are only drawn when
            the standard deviations of (a, b, c) are below (20, 10, 5).

    Returns:
        `(param_est, param_stdev)`, both arrays ordered (a, b, c). When a
        RuntimeError is raised (e.g. the fit does not converge) a message is
        printed and `(None, None)` is returned, so the result always has the
        same two-element shape.
    """
    def logist_fn(x, a, b, c):
        return a / (1 + np.exp(-b*(x-c)))

    reg_data = 100000 * confirmed_by_days(country_name) / current_pop(country_name)
    try:
        param_est, param_cov = curve_fit(logist_fn, reg_data.index.values, reg_data.values, bounds=(0, [200, 1, 100]))
        # compute 1x StDev errors for params
        param_stdev = np.sqrt(np.diag(param_cov))
        days = np.arange(len(reg_data.index) + forecast_days)
        pred_data = logist_fn(days, *param_est)
        formula_str = "%0.2f / (1 + exp(-%0.2f(x-%0.2f)))" % (param_est[0], param_est[1], param_est[2])

        # Default: prediction (dashed) against the actual values (solid).
        rows = [pred_data, reg_data]
        labels = [formula_str, "Actual"]
        dash = ["dash", "solid"]
        if show_ci and np.all(np.less(param_stdev, [20, 10, 5])):
            # The midpoint deviation is negated: "Upper" uses a larger a and b
            # but an earlier midpoint (c - 2*sd); "Lower" is the mirror image.
            plot_stdev = param_stdev * [1, 1, -1]
            pred_data_upper = logist_fn(days, *np.maximum([0,0,0], param_est + 2*plot_stdev))
            pred_data_lower = logist_fn(days, *np.maximum([0,0,0], param_est - 2*plot_stdev))
            rows = [pred_data, reg_data, pred_data_upper, pred_data_lower]
            labels = [formula_str, "Actual", "Upper", "Lower"]
            dash = ["solid", "solid", "dash", "dash"]

        pd.DataFrame(rows, index=labels).T.iplot(
            title="Forecasting COVID-19 Confirmed Cases: " + country_name,
            xTitle="Days since first confirmed case in country",
            yTitle="# of confirmed cases per 100,000 people",
            dash=dash,
            legend="bottom"
        )
        return (param_est, param_stdev)
    except RuntimeError:
        print("Failed to generate adequate regression for: " + country_name)
        return (None, None)
# Fit the ten countries with the largest summed confirmed counts, keeping each
# fit's parameter estimates and standard deviations (None when a fit failed).
logist_param_est = []
logist_param_stdev = []
country_list = list(confirmed.T.sum().sort_values(ascending=False).head(10).index)
for c in country_list:
    res = logist_reg(c)
    logist_param_est.append(res[0])
    logist_param_stdev.append(res[1])

param_names = ["Max Value", "Growth Rate", "Midpoint"]
# A failed fit is shown as a row of NaN instead of breaking the summary tables.
missing_row = [np.nan] * len(param_names)

print("Logistic Regression Parameter Estimates")
display(pd.DataFrame(
    [missing_row if est is None else list(est) for est in logist_param_est],
    index=country_list, columns=param_names))

# Estimate +/- 2 standard deviations, floored at 0, as (lower, upper) tuples.
# The zip is materialised so the list can be inspected again after display.
logist_param_95ci = [
    missing_row if est is None or stdev is None else list(zip(
        np.maximum([0, 0, 0], est - 2*stdev).round(5),
        np.maximum([0, 0, 0], est + 2*stdev).round(5)))
    for est, stdev in zip(logist_param_est, logist_param_stdev)
]
print("")
print("Logistic Regression Parameter 95% Confidence Intervals")
display(pd.DataFrame(logist_param_95ci, index=country_list, columns=param_names))

Logistic Regression Parameter Estimates


Unnamed: 0,Max Value,Growth Rate,Midpoint
Mainland China,5.621381,0.222185,17.754593
Italy,153.108563,0.212518,48.869368
South Korea,15.865332,0.342034,39.597651
Iran,21.91837,0.25633,20.794181
Spain,36.915166,0.439243,43.319803
Germany,79.750906,0.277325,56.53723
France,49.671129,0.250614,57.615699
US,8.970287,0.28899,59.878124
Japan,1.392656,0.112411,54.507456
Switzerland,86.486694,0.313197,22.50919



Logistic Regression Parameter 95% Confidence Intervals


Unnamed: 0,Max Value,Growth Rate,Midpoint
Mainland China,"(5.55258, 5.69018)","(0.20812, 0.23625)","(17.42408, 18.08511)"
Italy,"(84.9907, 221.22643)","(0.19376, 0.23127)","(45.63284, 52.10589)"
South Korea,"(15.74253, 15.98813)","(0.33274, 0.35133)","(39.49991, 39.69539)"
Iran,"(19.33665, 24.50009)","(0.2259, 0.28676)","(19.68586, 21.9025)"
Spain,"(27.2764, 46.55393)","(0.37827, 0.50022)","(42.12732, 44.51229)"
Germany,"(0.0, 216.99867)","(0.24757, 0.30709)","(48.85053, 64.22393)"
France,"(0.0, 115.71834)","(0.21126, 0.28997)","(50.28548, 64.94592)"
US,"(4.22856, 13.71201)","(0.2751, 0.30288)","(57.47093, 62.28531)"
Japan,"(1.1523, 1.63301)","(0.10531, 0.11951)","(51.73469, 57.28023)"
Switzerland,"(0.0, 221.31945)","(0.20998, 0.41641)","(14.69217, 30.32621)"


### Retrospective Logistic Regression

The following code goes back in time and, for each of the most recent days (the last 15 by default), performs a logistic regression using only the data that was available before that day. See how the predictions evolve over time as we gain more information on the trajectory of each country's outbreak!

In [13]:
#Generate animated plots showing the change in forecast as each day's stats are added
def logist_reg_retro(country_name: str, retrospect_days: int = 15, forecast_days: int = 10, show_ci: bool = True):
    """Animate how the logistic forecast for one country changes day by day.

    For each of the last `retrospect_days` observed days, a logistic curve
    a / (1 + exp(-b * (x - c))) is fitted to the confirmed cases per 100,000
    people recorded before that day, then evaluated over every observed day
    plus `forecast_days` further days. Each fit becomes one frame
    ("Prediction Day") of a plotly express line animation.

    Args:
        country_name: Row label of the country in `confirmed` / `total_pop`.
        retrospect_days: How many of the most recent days to re-fit. Values
            larger than the available history are clamped so that every fit
            sees at least one data point.
        forecast_days: Number of days past the observed data to extrapolate.
        show_ci: When True, add "Upper"/"Lower" curves (estimates moved by two
            standard deviations) for fits whose parameter standard deviations
            are below (100, 50, 25); other fits leave those curves empty.

    Nothing is returned. A fit that raises RuntimeError is reported with a
    message and skipped; if no fit succeeds, a message is printed and no
    figure is shown.
    """
    def logist_fn(x, a, b, c):
        return a / (1 + np.exp(-b*(x-c)))

    columns = ["Outbreak Day", "Predicted", "Actual", "Upper", "Lower", "Prediction Day"]
    reg_data = 100000 * confirmed_by_days(country_name) / current_pop(country_name)
    n_observed = len(reg_data.index)
    # Clamp the first prediction day to 1: a start of 0 would fit an empty
    # series and a negative start would slice days off the END of the data.
    first_day = max(n_observed - retrospect_days, 1)

    frames = []  # one frame per successful fit, concatenated once after the loop
    for forecast_day in range(first_day, n_observed):
        limited_reg_data = reg_data[:forecast_day]
        try:
            param_est, param_cov = curve_fit(logist_fn, limited_reg_data.index.values, limited_reg_data.values, bounds=(0, [200, 1, 100]))
            #compute 1x StDev errors for params
            param_stdev = np.sqrt(np.diag(param_cov))
            pred_data = logist_fn(range(n_observed + forecast_days), *param_est)
            if show_ci and np.all(np.less(param_stdev, [100, 50, 25])):
                # Midpoint deviation is negated so "Upper" uses the earlier midpoint.
                plot_stdev = param_stdev * [1, 1, -1]
                pred_data_upper = logist_fn(range(n_observed + forecast_days), *np.maximum([0,0,0], param_est + 2*plot_stdev))
                pred_data_lower = logist_fn(range(n_observed + forecast_days), *np.maximum([0,0,0], param_est - 2*plot_stdev))
            else:
                pred_data_upper = [None]*len(pred_data)
                pred_data_lower = [None]*len(pred_data)
            # "Outbreak Day" always spans the full prediction range, so the
            # forecast days keep an x value with or without the interval curves.
            forecast_data = pd.DataFrame([range(len(pred_data)), pred_data, reg_data, pred_data_upper, pred_data_lower, [forecast_day]*len(pred_data)], index=columns).T
            frames.append(forecast_data)
        except RuntimeError:
            print("Failed to generate adequate regression for: " + country_name)

    if not frames:
        # Without frames plotly creates no animation controls to adjust below.
        print("No retrospective forecasts to plot for: " + country_name)
        return

    # pd.concat replaces the row-by-row DataFrame.append, which pandas 2.0 removed.
    plot_data = pd.concat(frames)
    melt_plot_data = pd.melt(plot_data, id_vars=["Outbreak Day", "Prediction Day"], value_vars=["Predicted", "Actual", "Upper", "Lower"])
    melt_plot_data.columns = ["Outbreak Day", "Prediction Day", "Trend Line", "# of confirmed cases per 100,000 people"]
    fig = px.line(melt_plot_data, x="Outbreak Day", y="# of confirmed cases per 100,000 people", title="Forecasting Retrospective: " + country_name,
            line_group="Trend Line", color="Trend Line", animation_frame="Prediction Day")
    # Play-button settings: no transition between frames, 250 per frame.
    fig["layout"]["updatemenus"][0]["buttons"][0]["args"][1]["transition"]["duration"] = 0
    fig["layout"]["updatemenus"][0]["buttons"][0]["args"][1]["frame"]["duration"] = 250
    fig.show()
# One retrospective forecast animation per country in `country_list`.
for country in country_list:
    logist_reg_retro(country)


## Exponential Modeling of Early Outbreaks

In [14]:
def exp_reg(country_name: str, forecast_days: int = 5):
    """Fit and plot an exponential curve for one country's confirmed cases per 100,000 people.

    The curve a * exp(b * x) is fitted with `curve_fit` (both parameters
    bounded to be non-negative) to the output of `confirmed_by_days`, scaled
    to cases per 100,000 people. The fitted curve is evaluated for every
    observed day plus `forecast_days` further days and plotted next to the
    actual values.

    Args:
        country_name: Row label of the country in `confirmed` / `total_pop`.
        forecast_days: Number of days past the observed data to extrapolate
            (defaults to 5, the horizon that used to be hard-coded).

    Nothing is returned. When a RuntimeError is raised (e.g. the fit does not
    converge) a message is printed instead of a chart.
    """
    def exp_fn(x, a, b):
        return a * np.exp(b * x)

    reg_data = 100000 * confirmed_by_days(country_name) / current_pop(country_name)
    try:
        popt, pcov = curve_fit(exp_fn, reg_data.index.values, reg_data.values, bounds=(0, [np.inf, np.inf]))
        days = np.arange(len(reg_data.index) + forecast_days)
        pred_data = exp_fn(days, *popt)
        formula_str = "%0.2e*exp(%0.2f*x)" % (popt[0], popt[1])
        pd.DataFrame([pred_data, reg_data], index=[formula_str, "Actual"]).T.iplot(
            title="Forecasting COVID-19 Confirmed Cases: " + country_name,
            xTitle="Days since first confirmed case in country",
            yTitle="# of confirmed cases per 100,000 people",
            legend="bottom"
        )
    except RuntimeError:
        print("Failed to generate adequate regression for: " + country_name)
# Exponential fit for each of the ten countries with the largest summed confirmed counts.
exp_fit_countries = confirmed.sum(axis=1).sort_values(ascending=False).head(10).index
for country in exp_fit_countries:
    exp_reg(country)