In [1]:
# strongly inspired by https://towardsdatascience.com/analyzing-coronavirus-covid-19-data-using-pandas-and-plotly-2e34fe2c4edc
## Import Libraries
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import PolynomialFeatures 
plt.rcParams['figure.figsize'] = [15, 5]
from IPython import display
from ipywidgets import interact, widgets
import datetime

## Read Data for Cases, Deaths and Recoveries
srcconfirmed="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"
srcrecovered="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
srcdeaths="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv"

Cases_confirmed=pd.read_csv(srcconfirmed)
Cases_recovered=pd.read_csv(srcrecovered)
Cases_deaths=pd.read_csv(srcdeaths)

print("Data source: https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/")
print("Retrieved on: " + str(datetime.datetime.now()))

Data source: https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/
Retrieved on: 2020-04-07 00:10:27.188070


In [2]:
def fix_date(df):
    return datetime.datetime.strptime(df["date"], '%m/%d/%y').strftime('%Y-%m-%d')

def fix_dataset(cases, value_name):
    cases = cases.rename(columns={'Province/State':'label', 'Country/Region':'parent'})
    cases.drop(['Lat', 'Long'], axis=1, inplace=True)
    cases = cases.melt(id_vars=['label','parent'], var_name="date", value_name=value_name)
    cases.fillna(value="", inplace=True)
    cases["date"] = cases.apply(fix_date, axis=1)
    cases.set_index(["parent","label","date"], inplace=True)
    return cases

# Transform
Cases_confirmed_clean = fix_dataset(Cases_confirmed, "confirmed")
Cases_recovered_clean = fix_dataset(Cases_recovered, "recovered")
Cases_deaths_clean = fix_dataset(Cases_deaths, "deaths")

Cases_raw = Cases_deaths_clean.merge(Cases_confirmed_clean, left_on=['parent','label','date'], right_on=['parent','label','date'])
Cases_raw = Cases_raw.merge(Cases_recovered_clean, left_on=['parent','label','date'], right_on=['parent','label','date'])
Cases_raw.loc['US']

Unnamed: 0_level_0,Unnamed: 1_level_0,deaths,confirmed,recovered
label,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,2020-01-22,0,1,0
,2020-01-23,0,1,0
,2020-01-24,0,2,0
,2020-01-25,0,2,0
,2020-01-26,0,5,0
,...,...,...,...
,2020-04-01,4757,213372,8474
,2020-04-02,5926,243453,9001
,2020-04-03,7087,275586,9707
,2020-04-04,8407,308850,14652


In [5]:
def add_derived(Cases):
#    Cases.set_index('date', inplace=True)
#    Cases['active'] = Cases['confirmed']-Cases['deaths']-Cases['recovered']
    Cases['lethality'] = np.round(Cases['deaths']/Cases['confirmed'], 3)
    
#    Cases.drop(['derivedConfirmed_'], axis=1, inplace=True)
    return Cases

# Get Daily Data
#Cases_diff = Cases_raw.groupby(['parent','label','date'])
#Cases_diff = Cases_diff.sum()
#Cases_diff = Cases_diff.diff().fillna(0)
#Cases_diff = Cases_diff.rename(columns={"confirmed":"confirmed_new","deaths":"deaths_new","recovered":"recovered_new"})
#Cases = Cases_raw.merge(Cases_diff, left_on=['parent','label','date'], right_on=['parent','label','date'])

Cases = Cases_raw
Cases.loc['Germany']
#Cases

Unnamed: 0_level_0,Unnamed: 1_level_0,deaths,confirmed,recovered
label,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,2020-01-22,0,0,0
,2020-01-23,0,0,0
,2020-01-24,0,0,0
,2020-01-25,0,0,0
,2020-01-26,0,0,0
,...,...,...,...
,2020-04-01,920,77872,18700
,2020-04-02,1107,84794,22440
,2020-04-03,1275,91159,24575
,2020-04-04,1444,96092,26400


In [17]:
# Shift data
#CasesShiftedB = Cases.tail(0)
#for country in Cases.droplevel('date').index.unique().tolist():
#    firstcase = Cases.loc[country]['deaths'].reset_index().set_index('date')
#    firstcase = firstcase[firstcase.ne(0)].dropna().reset_index()
#    firstcase['parent'] = country
#    firstcase = firstcase.set_index(['parent', 'label', 'date'])
#    CasesShiftedB = CasesShiftedB.append(firstcase)
#CasesShiftedB

#CasesS = {}

#for place in Cases.droplevel('date').index.unique().tolist():
#    firstcase = Cases.loc[place]['deaths'].reset_index().set_index('date')
#    firstcase = firstcase[firstcase.ne(0)]
#    CasesS[place] = firstcase

#CasesS

def add_day(df):
    firstcase = Cases.loc[place]['deaths'].reset_index().set_index('date')
    Cases['day'] = Cases.loc[place][firstcase.ne(0)]
    return day

def combine_projectedDeaths(row):
    if row['deaths'] == False:
        return row['projectedDeaths']
    return row['deaths']

def add_simulated(Cases):
    Cases['deaths_bak'] = Cases['deaths']
    Cases['deaths'] = Cases.apply(combine_projectedDeaths, axis=1)
    Cases['derivedConfirmed_'] = np.round(Cases['deaths'] / 0.03, 0) # mortality
    Cases['derivedConfirmed'] = Cases['derivedConfirmed_'].shift(periods=-14) # time to death after turning infectious
    
#    Cases['derivedRecovered_'] = Cases['derivedConfirmed_'].shift(periods=-3) # periods = time to death + time to recovery
#    Cases['derivedRecovered'] = Cases['derivedRecovered_'] - Cases['deaths']
    
#    Cases['derivedActive'] = Cases['derivedConfirmed_'].shift(periods=-12) - Cases['deaths'] - Cases['derivedRecovered'] # periods = time to death + incubation period
    Cases['derivedLethality'] = np.round(Cases['deaths']/Cases['derivedConfirmed'], 3)
    Cases['testRate'] = np.minimum(1,np.round(Cases['confirmed']/Cases['derivedConfirmed'], 3))
    
    Cases['deaths'] = Cases['deaths_bak']
    #Cases.drop(['derivedConfirmed_','derivedRecovered_','deaths_bak'], axis=1, inplace=True)
    Cases.drop(['derivedConfirmed_','deaths_bak'], axis=1, inplace=True)
    return Cases

#Cases_sim = add_simulated(Cases.loc['Germany'].droplevel('label'))
firstrecordeddate = Cases.reset_index().head(1)['date'].values[0]
lastrecordeddate = Cases.reset_index().tail(1)['date'].values[0]
startdate = datetime.datetime.strptime(lastrecordeddate, '%Y-%m-%d')
projectionstartdate = (startdate + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
enddate = (startdate + datetime.timedelta(days=(90 - 15))).strftime('%Y-%m-%d')

def extenddata(cases, title=""):
    cases = cases.reset_index().groupby(["date"]).sum() #.reset_index().set_index('date')
    y = cases['deaths'].values
    x = range(0, y.size)

    # calculate polynomial
    z = np.polyfit(x[-7:], y[-7:], 2)
#    z = np.polyfit(x, y, 2)
    f = np.poly1d(z)
    print(f)

    # calculate new x's and y's
    lookahead = 90
    cutoffsimulated = -15 # days we calculate in the simulation, but later don't display
    cutoffpolynomial = -14 # days we calculate in the polynomial, but later don't display
    x_new = np.linspace(x[-1], x[-1]+lookahead, lookahead)
    y_new = f(x_new)
    
    # prepare new date range
    startdate = datetime.datetime.strptime(cases.index[-1], '%Y-%m-%d')
    x_new_dates = [] # in the fit graph we need to start one day early for it to look smooth
    last_val = cases['deaths'].values[-1]
    for x_delta in range(1,lookahead): # generate future dates
        date = (startdate + datetime.timedelta(days=x_delta)).strftime('%Y-%m-%d')
        x_new_dates.append(date)
        val = max(0,np.round(f(x[-1]+1+x_delta)),0)
        
        # once we are over the peak deaths, the curve ends and goes linear
        if val < last_val:
            val = last_val
        last_val = val
        
        cases = cases.append(pd.DataFrame({'date':[date], 'deaths': False, 'projectedDeaths':[val]}).set_index("date"))
#        print('delta '+date+" y "+str(f(x[-1]+1+x_delta)))

    # cut displayed data and add simulated numbers
    firstdate = cases.index[1]
    enddate = (startdate + datetime.timedelta(days=(lookahead + cutoffsimulated))).strftime('%Y-%m-%d')
#    date = (startdate + datetime.timedelta(days=1)).strftime('%Y-%m-%d')
#    cases = add_simulated(cases)[date:enddate].fillna(0)
    cases = add_derived(cases)
    cases = add_simulated(cases)[firstdate:enddate].fillna(0)
#    y_new = y_new[:cutoffpolynomial]

    # export data
    if title != "":
        cases.to_csv("sim_"+title+".csv")
        cases.to_html("sim_"+title+".html")
#        cases.to_excel("output.xslx")

    # in the fit graph we need to start one day early for it to look smooth
    #cases = cases.append(pd.DataFrame({'date':[startdate], 'deaths':[f(x[-1]+1)]}).set_index("date"))
        
    return cases

GlobalTotals = Cases.reset_index().groupby('date').sum()
#GlobalTotals = extenddata(GlobalTotals)
#GlobalTotals.loc["2020-03-01":"2020-03-14"]
#GlobalTotals
#Cases.loc['US']
extenddata(Cases.loc['China'])
#extenddata(Cases.loc['Germany']).loc[lastrecordeddate]
#extenddata(Cases.loc['China'].reset_index().groupby(["label", "date"]).sum())
#extenddata(Cases.loc['Germany']).loc[projectionstartdate]
#Cases.loc['Germany']

         2
-0.1071 x + 19.75 x + 2459


Unnamed: 0_level_0,deaths,confirmed,recovered,projectedDeaths,lethality,derivedConfirmed,derivedLethality,testRate
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-23,18,643.0,30.0,0.0,0.028,21100.0,0.001,0.030
2020-01-24,26,920.0,36.0,0.0,0.028,23933.0,0.001,0.038
2020-01-25,42,1406.0,39.0,0.0,0.030,26833.0,0.002,0.052
2020-01-26,56,2075.0,49.0,0.0,0.027,30167.0,0.002,0.069
2020-01-27,82,2877.0,58.0,0.0,0.029,33733.0,0.002,0.085
...,...,...,...,...,...,...,...,...
2020-06-15,0,0.0,0.0,3369.0,0.000,112300.0,0.030,0.000
2020-06-16,0,0.0,0.0,3369.0,0.000,112300.0,0.030,0.000
2020-06-17,0,0.0,0.0,3369.0,0.000,112300.0,0.030,0.000
2020-06-18,0,0.0,0.0,3369.0,0.000,112300.0,0.030,0.000


In [28]:
%matplotlib inline
plt.style.use('seaborn-dark')

Cases = Cases.reset_index().set_index(['parent','label'])

def plotData(cases, title):
    cases = cases.groupby('date').sum().reset_index().set_index('date')
    sim = extenddata(cases, title)
#    sim = extenddata(cases[firstrecordeddate:"2020-04-04"], title)
    projected = sim[lastrecordeddate:]
    
    fig = make_subplots(rows=3, cols=2,shared_xaxes=True,
                        specs=[[{}, {}],[{},{}],
                           [{"colspan": 2}, None]],
                        subplot_titles=('Total Confirmed Cases','Active Cases','Deaths','Recoveries','Death to Cases Ratio'))
    fig = go.Figure()

    fig.add_trace(go.Scatter(x=cases.index,y=cases['confirmed'],
                             mode='lines',
                             name='Confirmed Cases',
                             line=dict(color='red',width=2)))
#                             row=1,col=1)
    #fig.add_trace(go.Scatter(x=cases.index,y=cases['active'],
    #                         mode='lines',
    #                         name='Active Cases',
    #                         line=dict(color='purple',width=2)))
#                             row=1,col=2)
    #fig.add_trace(go.Scatter(x=cases.index,y=cases['recovered'],
    #                         mode='lines',
    #                         name='Recoveries',
    #                         line=dict(color='green',width=2)))
#                             row=1,col=1)
    fig.add_trace(go.Scatter(x=cases.index,y=cases['deaths'],
                             mode='lines',
                             name='Deaths',
                             line=dict(color='black',width=2)))
#                             row=1,col=1)
    fig.add_trace(go.Scatter(x=projected.index,y=projected['projectedDeaths'],
                             mode='lines',
                             name='Projected deaths',
                             line=dict(color='black',width=2,dash='dot')))
#                             row=1,col=1)
        
    fig.add_trace(go.Scatter(x=sim.index,y=sim['derivedConfirmed'],
                             mode='lines',
                             name='Derived Cases',
                             line=dict(color='red',width=2,dash='dot')))
                             #row=1,col=1)
    #fig.add_trace(go.Scatter(x=sim.index,y=sim['derivedActive'],
    #                         mode='lines',
    #                         name='Derived Active Cases',
    #                         line=dict(color='purple',width=2,dash='dot')))
                             #row=1,col=2)
    #fig.add_trace(go.Scatter(x=sim.index,y=sim['derivedRecovered'],
    #                         mode='lines',
    #                         name='Derived Recoveries',
    #                         line=dict(color='green',width=2,dash='dot')))
                             #row=2,col=2)

    fig.update_layout(showlegend=True)
    fig.update_layout(title=title,
                       yaxis_title='Cases',
#                       yaxis_title='Cases (log)', yaxis_type="log",
                       xaxis_title='Date')

    fig.show()
    fig.update_layout(title=title,
                       yaxis_title='Cases (log)', yaxis_type="log",
                       xaxis_title='Date')

    fig.show()
#    fig.write_image("test.png")

#plotData(GlobalTotals, 'Global')
#plotData(Cases.loc["US"], 'US')
plotData(Cases.loc['United Kingdom'], 'United Kingdom')
#plotData(Cases.loc["China"].groupby('date').sum(), 'China')
#plotData(Cases.loc['China'], 'China')
#plotData(Cases.loc['Italy'], 'Italy')
#plotData(Cases.loc['Germany'], 'Germany')
#plotData(Cases.loc['Sweden'], 'Sweden')
plotData(Cases.loc['Belgium'], 'Belgium')

       2
25.74 x - 3051 x + 8.983e+04


        2
-1.869 x + 418 x - 1.926e+04


In [8]:
Cases.loc['United Kingdom']

Unnamed: 0_level_0,date,deaths,confirmed,recovered
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bermuda,2020-01-22,0,0,0
Cayman Islands,2020-01-22,0,0,0
Channel Islands,2020-01-22,0,0,0
Gibraltar,2020-01-22,0,0,0
Isle of Man,2020-01-22,0,0,0
...,...,...,...,...
,2020-04-05,4934,47806,135
Anguilla,2020-04-05,0,3,0
British Virgin Islands,2020-04-05,0,3,0
Turks and Caicos Islands,2020-04-05,1,5,0


In [9]:
Cases.loc['Germany']

Unnamed: 0_level_0,date,deaths,confirmed,recovered
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,2020-01-22,0,0,0
,2020-01-23,0,0,0
,2020-01-24,0,0,0
,2020-01-25,0,0,0
,2020-01-26,0,0,0
...,...,...,...,...
,2020-04-01,920,77872,18700
,2020-04-02,1107,84794,22440
,2020-04-03,1275,91159,24575
,2020-04-04,1444,96092,26400


In [19]:
countries = ["France","Italy","Spain","US","United Kingdom","Iran","China","Netherlands","Germany","Belgium","Sweden","Belgium"]
sims_lastrecordeddate = {}
sims_enddate = {}
columns = {}
for place in countries:
    print(place)
    sim = extenddata(Cases.loc[place], place)
    sims_lastrecordeddate[place] = sim.loc[lastrecordeddate]
    sims_enddate[place] = sim.loc[enddate]
#    if "label" not in sims_lastrecordeddate[place].index.values:
#        sims_lastrecordeddate[place] = sims_lastrecordeddate[place].reset_index().groupby("date").sum().loc[lastrecordeddate]
#        sims_enddate[place] = sims_enddate[place].drop("label")
#    else:
#        sims_lastrecordeddate[place] = sims_lastrecordeddate[place].drop("label")
#        sims_enddate[place] = sims_enddate[place].drop("label")
    columns = sims_lastrecordeddate[place].index
    sims_lastrecordeddate[place] = sims_lastrecordeddate[place].values
    sims_enddate[place] = sims_enddate[place].values
    print("ok")
    
pd_sims = pd.DataFrame(sims_lastrecordeddate, columns = sims_lastrecordeddate.keys())
pd_sims.insert(0, 'name', columns)
pd_sims.set_index('name', inplace=True)
pd_sims_enddate = pd.DataFrame(sims_enddate, columns = sims_enddate.keys())
pd_sims_enddate.insert(0, 'name', columns)
pd_sims_enddate.set_index('name', inplace=True)

France
       2
27.79 x - 3026 x + 8.012e+04
ok
Italy
        2
-21.17 x + 3730 x - 1.442e+05
ok
Spain
        2
-16.21 x + 3144 x - 1.311e+05
ok
US
       2
44.63 x - 5219 x + 1.515e+05
ok
United Kingdom
       2
25.74 x - 3051 x + 8.983e+04
ok
Iran
       2
2.024 x - 148 x + 3466
ok
China
         2
-0.1071 x + 19.75 x + 2459
ok
Netherlands
        2
-2.131 x + 454.9 x - 2.021e+04
ok
Germany
       2
1.571 x - 62.07 x - 2414
ok
Belgium
        2
-1.869 x + 418 x - 1.926e+04
ok
Sweden
        2
-3.429 x + 532.2 x - 2.02e+04
ok


In [20]:
pd_sims

Unnamed: 0_level_0,France,Italy,Spain,US,United Kingdom,Iran,China,Netherlands,Germany,Belgium,Sweden
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
deaths,8093.0,15887.0,12641.0,9619.0,4943.0,3603.0,3333.0,1771.0,1584.0,1447.0,401.0
confirmed,93773.0,128948.0,131646.0,337072.0,48436.0,58226.0,82602.0,17953.0,100123.0,19691.0,6830.0
recovered,16349.0,21815.0,38080.0,17448.0,229.0,19736.0,77207.0,257.0,28700.0,3751.0,205.0
projectedDeaths,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lethality,0.086,0.123,0.096,0.029,0.102,0.062,0.04,0.099,0.016,0.073,0.059
derivedConfirmed,1030033.0,671367.0,673167.0,1350300.0,738500.0,210967.0,112267.0,113133.0,150300.0,104633.0,15000.0
derivedLethality,0.008,0.024,0.019,0.007,0.007,0.017,0.03,0.016,0.011,0.014,0.027
testRate,0.091,0.192,0.196,0.25,0.066,0.276,0.736,0.159,0.666,0.188,0.455


In [21]:
totalcases = Cases.max(level=0)['deaths'].reset_index().set_index('parent')
totalcases = totalcases.sort_values(by='deaths',ascending=False)
top10 = totalcases.head(10)
fig = go.Figure(go.Bar(x=top10.index, y=top10['deaths'],
                      text=top10['deaths'],
            textposition='outside'))
fig.update_layout(title_text='Top 10 Countries by Deaths '+lastrecordeddate)
fig.update_yaxes(showticklabels=False)

fig.show()

In [22]:
#pd_sims_enddate
pd_sims

Unnamed: 0_level_0,France,Italy,Spain,US,United Kingdom,Iran,China,Netherlands,Germany,Belgium,Sweden
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
deaths,8093.0,15887.0,12641.0,9619.0,4943.0,3603.0,3333.0,1771.0,1584.0,1447.0,401.0
confirmed,93773.0,128948.0,131646.0,337072.0,48436.0,58226.0,82602.0,17953.0,100123.0,19691.0,6830.0
recovered,16349.0,21815.0,38080.0,17448.0,229.0,19736.0,77207.0,257.0,28700.0,3751.0,205.0
projectedDeaths,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
lethality,0.086,0.123,0.096,0.029,0.102,0.062,0.04,0.099,0.016,0.073,0.059
derivedConfirmed,1030033.0,671367.0,673167.0,1350300.0,738500.0,210967.0,112267.0,113133.0,150300.0,104633.0,15000.0
derivedLethality,0.008,0.024,0.019,0.007,0.007,0.017,0.03,0.016,0.011,0.014,0.027
testRate,0.091,0.192,0.196,0.25,0.066,0.276,0.736,0.159,0.666,0.188,0.455


In [26]:
totalcases = pd_sims.loc['testRate'].sort_values(ascending=False)
top10 = totalcases.head(10)
fig = go.Figure(go.Bar(x=top10.index, y=top10.values,
                      text=top10.values,
            textposition='outside'))
fig.update_layout(title_text='Top Countries by Test Rate '+lastrecordeddate)
fig.update_yaxes(showticklabels=False)

fig.show()

In [24]:
totalcases = pd_sims.loc['derivedConfirmed'].sort_values(ascending=False)
top10 = totalcases.head(10)
fig = go.Figure(go.Bar(x=top10.index, y=top10.values,
                      text=top10.values,
            textposition='outside'))
fig.update_layout(title_text='Top Countries by Derived Cases '+lastrecordeddate)
fig.update_yaxes(showticklabels=False)

fig.show()

In [25]:
totalcases = pd_sims_enddate.loc['projectedDeaths'].sort_values(ascending=False)
top10 = totalcases.head(10)
fig = go.Figure(go.Bar(x=top10.index, y=top10.values,
                      text=top10.values,
            textposition='outside'))
fig.update_layout(title_text='Top Countries by Projected Deaths by '+enddate)
fig.update_yaxes(showticklabels=False)

fig.show()