# Open COVID-19 Analysis 

This notebook is meant to serve as a starter for anyone interested in diving into the most recent data surrounding the global 2019 novel coronavirus (COVID-19) outbreak.

Datasets:
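- Epidemiology Data: https://data.humdata.org/dataset/novel-coronavirus-2019-ncov-cases (the code below downloads the confirmed, deaths, and recovered time series directly from the `CSSEGISandData/COVID-19` GitHub repository)
- Population Data: https://population.un.org/wpp/Download/Standard/Population/ (the code below reads population figures from a local `total_pop.csv`, which must have a `Country` column and a `2020` column)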

In [1]:
import pandas as pd
import cufflinks as cf
import plotly.graph_objs as go
from sklearn.linear_model import LogisticRegression
from typing import List
cf.go_offline()

In [2]:
# The three time series share one directory in the CSSEGISandData/COVID-19
# repository and differ only in the trailing metric name.
# NOTE(review): these track the `master` branch — confirm the
# `time_series_19-covid-*` files are still published upstream.
csse_time_series_prefix = (
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/time_series_19-covid-"
)
confirmed_url = csse_time_series_prefix + "Confirmed.csv"
deaths_url = csse_time_series_prefix + "Deaths.csv"
recovered_url = csse_time_series_prefix + "Recovered.csv"

In [3]:
# Download the three raw CSVs into DataFrames, in the same order as before
# (confirmed, deaths, recovered). Country-level aggregation happens later.
raw_confirmed, raw_deaths, raw_recovered = (
    pd.read_csv(url) for url in (confirmed_url, deaths_url, recovered_url)
)

In [4]:
# Load the population table keyed by country name. `set_index` moves the
# "Country" column into the index, so it no longer appears among the columns.
total_pop = pd.read_csv("total_pop.csv").set_index("Country")
def current_pop(country: str) -> int:
    """Return the 2020 population figure for ``country`` from ``total_pop``.

    The "2020" cell is read as a string, any spaces inside it are removed,
    and the resulting integer is multiplied by 1000.

    Args:
        country: Row label in ``total_pop`` (its index holds the values of
            the original "Country" column).

    Returns:
        The parsed "2020" value times 1000.

    Raises:
        KeyError: If ``country`` is not a row label of ``total_pop``.
    """
    # One label-based lookup instead of transposing the whole table on each call.
    raw_value = total_pop.loc[country, "2020"]
    return int(raw_value.replace(" ", "")) * 1000

In [5]:
def country_agg(raw_df):
    """Aggregate a raw time-series frame to one row per country/region.

    The "Province/State", "Lat" and "Long" columns are removed, the remaining
    columns are summed within each "Country/Region" group, and the "Others"
    row (cases not associated with a country) is dropped when present.

    Args:
        raw_df: DataFrame containing at least the columns "Province/State",
            "Country/Region", "Lat" and "Long". It is not modified.

    Returns:
        A new DataFrame indexed by "Country/Region" holding the per-group sums.

    Raises:
        KeyError: If any of "Province/State", "Lat" or "Long" is missing.
    """
    return (
        raw_df
        .drop(columns=["Province/State", "Lat", "Long"])
        .groupby(by="Country/Region")
        .sum()
        # errors="ignore": a feed without an "Others" row must not raise.
        .drop(index=["Others"], errors="ignore")
    )

In [6]:
# Collapse each raw series to one row per country (confirmed, deaths, recovered).
confirmed, deaths, recovered = (
    country_agg(raw) for raw in (raw_confirmed, raw_deaths, raw_recovered)
)
# Confirmed cases with recoveries and deaths subtracted.
infected = confirmed.sub(recovered).sub(deaths)

In [7]:
# Rank countries by the sum of every column in their row; show the top 20.
# NOTE(review): if the columns are cumulative daily counts, this sum is a
# ranking score rather than a case total — confirm the intended metric.
(
    confirmed
    .sum(axis=1)
    .sort_values(ascending=False)
    .head(20)
)

Country/Region
Mainland China    2312052
South Korea         58078
Italy               35041
Iran                30003
Japan                5309
France               4411
Germany              4317
Singapore            2817
US                   2657
Spain                2574
Hong Kong            2443
Thailand             1392
UK                   1213
Switzerland          1184
Malaysia             1036
Taiwan               1012
Australia             926
Netherlands           761
Sweden                698
Norway                673
dtype: int64

In [8]:
def sird_plot(country_name: str, pop_prop: bool = True):
    """Plot the infected, recovered and dead series for one country.

    Args:
        country_name: Row label present in the ``infected``, ``recovered``
            and ``deaths`` frames (and in ``total_pop`` when ``pop_prop``).
        pop_prop: When True, values are expressed as a percentage of the
            country's population (``current_pop``) and the y-axis ticks carry
            a '%' suffix. When False, the values are plotted as-is.
    """
    def as_plotted(series):
        # The '%' tick suffix requires percentages, so scale by 100 exactly as
        # country_plot does; previously raw fractions were labelled as percent.
        if pop_prop:
            return 100 * series / current_pop(country_name)
        return series

    plot_data = pd.DataFrame(
        [
            as_plotted(infected.loc[country_name]),
            as_plotted(recovered.loc[country_name]),
            as_plotted(deaths.loc[country_name]),
        ],
        index=["Infected", "Recovered", "Dead"],
    ).T
    title = "COVID-19 Outbreak: " + country_name
    if pop_prop:
        plot_layout = go.Layout(
            title=title,
            yaxis={'showexponent': 'all', 'exponentformat': 'E', 'rangemode': 'tozero', 'ticksuffix': '%'},
            xaxis={'visible': True},
        )
        plot_data.iplot(layout=plot_layout)
    else:
        plot_data.iplot(title=title)
def country_plot(cnames: List[str], pop_prop: bool = True):
    """Plot confirmed cases for several countries as stacked subplots.

    Args:
        cnames: Row labels of ``confirmed`` to plot, one subplot each, in
            the given order.
        pop_prop: When True, each series is expressed as a percentage of that
            country's population (``current_pop``) and the y-axis ticks carry
            a '%' suffix. When False, the confirmed values are plotted as-is.
    """
    col_data = []
    for name in cnames:
        series = confirmed.loc[name]
        if pop_prop:
            # Only percentages get the factor of 100; it was previously
            # applied to the raw values too, inflating them 100x.
            series = 100 * series / current_pop(name)
        col_data.append(series)
    plot_data = pd.DataFrame(col_data, index=cnames).T

    subplot_opts = dict(subplots=True, subplot_titles=True, shared_yaxis=True)
    if pop_prop:
        plot_layout = go.Layout(
            title="COVID-19 Confirmed Cases as % of Population By Country",
            yaxis={'showexponent': 'all', 'exponentformat': 'E', 'rangemode': 'tozero', 'ticksuffix': '%'},
            xaxis={'visible': True},
        )
        plot_data.iplot(layout=plot_layout, **subplot_opts)
    else:
        plot_data.iplot(**subplot_opts)

In [9]:
# The four countries with the largest row sums in `confirmed`, plus the US.
top_countries = list(
    confirmed.sum(axis=1).sort_values(ascending=False).head(4).index
)
# Avoid a duplicate subplot/column when the US is already among the top four.
if "US" not in top_countries:
    top_countries.append("US")
country_plot(top_countries, pop_prop=True)

In [10]:
# One raw-count outbreak plot for each of the ten countries with the largest
# row sums in `confirmed`.
ten_largest = confirmed.sum(axis=1).sort_values(ascending=False).head(10)
for country in ten_largest.index:
    sird_plot(country, pop_prop=False)