# Open COVID-19 Analysis 

This notebook is meant to serve as a starter for anyone interested in diving into the most recent data surrounding the global 2019 novel coronavirus (COVID-19) outbreak.

Datasets:
- Epidemiology Data: https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases
- Population Data: https://population.un.org/wpp/Download/Standard/Population/

In [1]:
import pandas as pd
import cufflinks as cf
import plotly.graph_objs as go
import numpy as np
from scipy.optimize import curve_fit
from typing import List
# Switch cufflinks to offline mode; the DataFrame.iplot() calls further down go through cufflinks.
# NOTE(review): presumably this is what lets the plots render inline without a Plotly account — confirm.
cf.go_offline()

In [2]:
# Time-series CSVs hosted in the CSSEGISandData/COVID-19 GitHub repository.
# NOTE(review): upstream may have renamed or archived the "time_series_19-covid-*" files —
# confirm these URLs still resolve before relying on a fresh run.
CSSE_TIME_SERIES_BASE = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
)
confirmed_url = CSSE_TIME_SERIES_BASE + "time_series_19-covid-Confirmed.csv"
deaths_url = CSSE_TIME_SERIES_BASE + "time_series_19-covid-Deaths.csv"
recovered_url = CSSE_TIME_SERIES_BASE + "time_series_19-covid-Recovered.csv"

In [3]:
# Download the three raw time-series tables (needs network access), in the order
# confirmed -> deaths -> recovered.
raw_confirmed, raw_deaths, raw_recovered = (
    pd.read_csv(source_url)
    for source_url in (confirmed_url, deaths_url, recovered_url)
)

In [4]:
# Load the population table and key it by country: the "Country" column becomes
# the index and is removed from the data columns.
total_pop = pd.read_csv("total_pop.csv").set_index("Country")
def current_pop(country: str) -> int:
    """Return the 2020 population of ``country`` as an integer head count.

    Reads the ``"2020"`` column of the module-level ``total_pop`` table for the
    row labelled ``country``, removes any space characters from the value
    (the table apparently uses spaces as thousands separators) and multiplies
    the result by 1000.
    NOTE(review): the x1000 implies the table stores thousands of people —
    confirm against total_pop.csv.

    Raises:
        KeyError: if ``country`` is not a row label of ``total_pop``.
        ValueError: if the stored value is not an integer once spaces are removed.
    """
    # Direct label lookup: avoids transposing the whole table on every call
    # and the chained ``[...][...]`` indexing that came with it.
    raw_value = total_pop.loc[country, "2020"]
    # str() keeps this working whether read_csv parsed the column as text
    # ("1 439 324") or as plain integers.
    return int(str(raw_value).replace(" ", "")) * 1000

In [5]:
def country_agg(raw_df):
    """Aggregate a raw per-province time-series table to one row per country.

    Args:
        raw_df: DataFrame with ``"Province/State"``, ``"Country/Region"``,
            ``"Lat"`` and ``"Long"`` columns plus one numeric column per date.
            It is not modified.

    Returns:
        A new DataFrame indexed by ``"Country/Region"`` whose date columns hold
        the sum over all of that country's rows. A row labelled ``"Others"``
        (cases not associated with a country) is removed when present.

    Raises:
        KeyError: if any of the three dropped metadata columns is missing.
    """
    per_country = (
        raw_df
        .drop(columns=["Province/State", "Lat", "Long"])
        .groupby(by="Country/Region")
        .sum()
    )
    # The "Others" bucket is not guaranteed to be in every feed; ignore its
    # absence instead of raising KeyError.
    return per_country.drop(index=["Others"], errors="ignore")

In [6]:
# Collapse each raw table to one row per country.
confirmed, deaths, recovered = (
    country_agg(raw_table)
    for raw_table in (raw_confirmed, raw_deaths, raw_recovered)
)
# Infected = confirmed cases minus recovered and dead ones.
infected = confirmed - recovered - deaths

In [7]:
# Twenty countries with the largest confirmed-case value across all dates
peak_confirmed = confirmed.max(axis=1)
peak_confirmed.sort_values(ascending=False).head(20)

Country/Region
Mainland China    80735
Italy              9172
South Korea        7478
Iran               7161
France             1209
Germany            1176
Spain              1073
US                  605
Japan               511
Switzerland         374
Netherlands         321
UK                  321
Sweden              248
Belgium             239
Norway              205
Singapore           150
Austria             131
Malaysia            117
Hong Kong           115
Bahrain              95
dtype: int64

In [8]:
def sird_plot(country_name: str, pop_prop: bool = True):
    """Plot the infected, recovered and dead time series of one country.

    Args:
        country_name: row label looked up in the module-level ``infected``,
            ``recovered`` and ``deaths`` tables.
        pop_prop: when True, every series is divided by
            ``current_pop(country_name) / 100000`` so the chart shows counts
            per 100,000 people; when False the raw counts are plotted.
    """
    if pop_prop:
        scale = current_pop(country_name) / 100000
        y_label = "Count per 100,000 People"
    else:
        scale = 1
        y_label = "Count"

    curves = [
        infected.loc[country_name] / scale,
        recovered.loc[country_name] / scale,
        deaths.loc[country_name] / scale,
    ]
    # Build one row per curve, then transpose so each curve is a column.
    plot_data = pd.DataFrame(curves, index=["Infected", "Recovered", "Dead"]).T
    plot_data.iplot(title="COVID-19 Outbreak: " + country_name, yTitle=y_label)
def country_plot(cnames: List[str], pop_prop: bool = True):
    """Draw one confirmed-cases subplot per country, sharing the y axis.

    Args:
        cnames: country labels to look up in the module-level ``confirmed`` table.
        pop_prop: when True, each country's series is divided by
            ``current_pop(country) / 100000`` (cases per 100,000 people);
            when False the raw counts are plotted.
    """
    def scaled_series(name):
        divisor = current_pop(name) / 100000 if pop_prop else 1
        return confirmed.loc[name] / divisor

    # One row per country, transposed so that each country becomes a column.
    plot_data = pd.DataFrame([scaled_series(name) for name in cnames], index=cnames).T
    if pop_prop:
        chart_title = "COVID-19 Confirmed Cases Per 100,000 People"
    else:
        chart_title = "COVID-19 Confirmed Cases"
    plot_data.iplot(subplots=True, subplot_titles=True, shared_yaxis=True, title=chart_title)

In [9]:
# Four highest-ranked countries, plus the US for comparison.
# NOTE(review): the ranking sums each country's values over every date, whereas the
# "top countries" listing earlier in the notebook ranks by max; if the columns are
# cumulative counts the sum favours long-running outbreaks — confirm which is intended.
top_countries = list(confirmed.T.sum().sort_values(ascending=False).head(4).index)
# Only add the US when it is not already ranked, otherwise it would appear twice
# in the list and produce a duplicate column/subplot.
if "US" not in top_countries:
    top_countries.append("US")
country_plot(top_countries, pop_prop=False)
country_plot(top_countries, pop_prop=True)

In [10]:
# Raw-count infected/recovered/dead chart for each of the ten highest-ranked countries.
# NOTE(review): ranking is by the sum over all dates, not by the latest/max value — confirm intended.
ranked_by_total = confirmed.sum(axis=1).sort_values(ascending=False)
for c in ranked_by_total.index[:10]:
    sird_plot(c, pop_prop=False)

# Regression Analysis

## Exponential Modeling of Early Outbreaks

In [11]:
def confirmed_by_days(country_name: str):
    """Return a country's confirmed-case series indexed by days since its outbreak began.

    Entries before the running total of the series first becomes positive are
    dropped, and the remaining values are renumbered 0, 1, 2, ... in place of
    the original date labels.
    """
    counts = confirmed.loc[country_name]
    outbreak_started = counts.cumsum() > 0
    return counts[outbreak_started].reset_index(drop=True)
def exp_reg(country_name: str, forecast_days: int = 5):
    """Fit ``a * exp(b * x)`` to a country's confirmed cases and plot fit vs. actual.

    The fitted quantity is confirmed cases per 100,000 people, with ``x`` the
    number of days returned by ``confirmed_by_days``. Both parameters are
    constrained to be non-negative.

    Args:
        country_name: country label understood by ``confirmed_by_days`` and
            ``current_pop``.
        forecast_days: how many days past the last observation the fitted
            curve is extended in the plot (default 5).

    Returns:
        None. The chart is rendered as a side effect; if the fit raises
        ``RuntimeError`` a message is printed instead.
    """
    def exp_fn(x, a, b):
        return a * np.exp(b * x)

    reg_data = 100000 * confirmed_by_days(country_name) / current_pop(country_name)
    try:
        # Only the optimal parameters are needed; the covariance is discarded.
        popt, _ = curve_fit(exp_fn, reg_data.index.values, reg_data.values, bounds=(0, [np.inf, np.inf]))
        # Evaluate on an explicit ndarray rather than a bare range object, so the
        # model function does not depend on numpy coercing the range implicitly.
        forecast_x = np.arange(len(reg_data.index) + forecast_days)
        pred_data = exp_fn(forecast_x, *popt)
        formula_str = "%0.2e*exp(%0.2f*x)" % (popt[0], popt[1])
        # The prediction is longer than the observations; the extra days show up
        # as missing values in the "Actual" column.
        pd.DataFrame([pred_data, reg_data], index=[formula_str, "Actual"]).T.iplot(
            title="Forecasting COVID-19 Confirmed Cases: " + country_name,
            xTitle="Days since first confirmed case in country",
            yTitle="# of confirmed cases per 100,000 people",
            legend="bottom"
        )
    except RuntimeError:
        print("Failed to generate adequate regression for: " + country_name)
# Exponential fit for each of the ten highest-ranked countries.
# NOTE(review): ranking is by the sum over all dates, not by the latest/max value — confirm intended.
ranked_for_exp = confirmed.sum(axis=1).sort_values(ascending=False)
for c in ranked_for_exp.index[:10]:
    exp_reg(c)

## Logistic Modeling of Outbreaks

<img src="http://andymath.com/wp-content/uploads/2019/08/Logistic-Function.jpg" width="240px" style="float: left">

In [12]:
def logist_reg(country_name: str, forecast_days: int = 5, upper_bounds=(200, 1, 100)):
    """Fit a logistic curve to a country's confirmed cases and plot fit vs. actual.

    The model is ``a / (1 + exp(-b * (x - c)))`` fitted to confirmed cases per
    100,000 people, with ``x`` the number of days returned by
    ``confirmed_by_days``.

    Args:
        country_name: country label understood by ``confirmed_by_days`` and
            ``current_pop``.
        forecast_days: how many days past the last observation the fitted
            curve is extended in the plot (default 5).
        upper_bounds: upper limits for ``(a, b, c)``, i.e. maximum value,
            growth rate and midpoint; every lower limit is 0. A fitted value
            sitting at one of these limits means the optimiser hit the cap,
            so it should not be read as an estimate.

    Returns:
        The fitted parameters ``[a, b, c]`` as returned by ``curve_fit``, or
        ``[None, None, None]`` when the fit raises ``RuntimeError``.
    """
    def logist_fn(x, a, b, c):
        return a / (1 + np.exp(-b*(x-c)))

    reg_data = 100000 * confirmed_by_days(country_name) / current_pop(country_name)
    try:
        # Only the optimal parameters are needed; the covariance is discarded.
        popt, _ = curve_fit(logist_fn, reg_data.index.values, reg_data.values, bounds=(0, list(upper_bounds)))
        # Evaluate on an explicit ndarray rather than a bare range object.
        forecast_x = np.arange(len(reg_data.index) + forecast_days)
        pred_data = logist_fn(forecast_x, *popt)
        formula_str = "%0.2f / (1 + exp(-%0.2f(x-%0.2f)))" % (popt[0], popt[1], popt[2])
        # The prediction is longer than the observations; the extra days show up
        # as missing values in the "Actual" column.
        pd.DataFrame([pred_data, reg_data], index=[formula_str, "Actual"]).T.iplot(
            title="Forecasting COVID-19 Confirmed Cases: " + country_name,
            xTitle="Days since first confirmed case in country",
            yTitle="# of confirmed cases per 100,000 people",
            legend="bottom"
        )
        return popt
    except RuntimeError:
        print("Failed to generate adequate regression for: " + country_name)
        return [None, None, None]
# Logistic fit for each of the ten highest-ranked countries; the fitted parameters
# are collected in the same order as country_list.
# NOTE(review): ranking is by the sum over all dates, not by the latest/max value — confirm intended.
country_list = confirmed.sum(axis=1).sort_values(ascending=False).index[:10].tolist()
country_logist_params = [logist_reg(name) for name in country_list]

In [13]:
# Tabulate the fitted logistic parameters, one row per country.
logist_param_names = ["Max Value", "Growth Rate", "Midpoint"]
pd.DataFrame(data=country_logist_params, index=country_list, columns=logist_param_names)

Unnamed: 0,Max Value,Growth Rate,Midpoint
Mainland China,5.6102,0.222642,17.743017
South Korea,15.51795,0.355668,39.414627
Italy,68.921787,0.253982,43.027365
Iran,10.498977,0.444037,15.633968
Japan,1.283863,0.112709,53.610422
France,2.580366,0.50694,42.85097
Germany,2.024616,0.45783,40.130169
Spain,199.999999,0.336729,50.358467
US,199.703562,0.244796,75.41916
Singapore,2.489016,0.109172,26.344688
