In [3]:
import pandas as pd
import os
import numpy as np
import csv
#import geopandas as gpd
from bokeh.plotting import figure, show, ColumnDataSource

**Transform John hopkins data to a single file**

In [2]:
dfConfirmed = pd.read_csv("/project_data/data_asset/sun/casenumbers/time_series_covid19_confirmed_global.csv").fillna(0)
dfDeaths = pd.read_csv("/project_data/data_asset/sun/casenumbers/time_series_covid19_deaths_global.csv").fillna(0)
dfRecovered = pd.read_csv("/project_data/data_asset/sun/casenumbers/time_series_covid19_recovered_global.csv").fillna(0)


dfMapping = pd.read_csv("/project_data/data_asset/country_name_mapping.csv")
#dfMapping

In [3]:
all_countries = set(dfConfirmed["Country/Region"])

def compute_days_since_double_cases(df):
    """
    compute_days_since_double_cases
    
    Computes the days, as a floating point number, the number of active cases have doubled.
    """
    ddf = df.copy()
    ddf.index = ddf.date
    xdf = ddf[ddf.active>5].resample("1H").mean().interpolate().reindex()

    ddf["ddbl"] = 0.

    for i,row in ddf[ddf.active>0].iloc[::-1].iterrows():
        if len(xdf[xdf.active < row.active/2]) > 0:
            half_dt = xdf[xdf.active < row.active/2].index[-1]
            deltat = row.date-half_dt
            ddbl = deltat.total_seconds()/86400.
            if ddbl > 0.:
                ddf.at[i,"ddbl"] = ddbl
        else:
            ddf.at[i,"ddbl"] = 0.
    return ddf.replace([np.inf, -np.inf], np.nan).fillna(0.)["ddbl"]


acdsCountries = {}
#caw gdf = gpd.read_file("/project_data/data_asset/sun/www.naturalearthdata.com/ne_50m_admin_0_countries.shp")[['ADMIN', 'ADM0_A3', 'geometry']]

dfAllCountries = pd.DataFrame()

#wProgress = IntProgress(min=0,max=len(all_countries))
#display(wProgress)

for country in all_countries:
    #wProgress.description = country
    dfCountry = dfConfirmed[dfConfirmed["Country/Region"] == country].transpose()
    columns = list(dfCountry.columns)
    dfCountry["date"] = pd.to_datetime(dfCountry.index,errors="coerce")
    dfCountry = dfCountry.dropna()
    dfCountry["confirmed"] = dfCountry[columns].sum(axis=1).astype(int)

    for c in columns:
        del dfCountry[c]
    
    ddf = dfDeaths[dfDeaths["Country/Region"] == country].transpose()
    columns = list(ddf.columns)
    ddf["date"] = pd.to_datetime(ddf.index,errors="coerce")
    ddf = ddf.dropna()
    ddf["deaths"] = ddf[columns].sum(axis=1).astype(int)

    for c in columns:
        del ddf[c]

    dfCountry = dfCountry.join(ddf,rsuffix = "_tmp")
    del dfCountry["date_tmp"]

    ddf = dfRecovered[dfRecovered["Country/Region"] == country].transpose()
    columns = list(ddf.columns)
    ddf["date"] = pd.to_datetime(ddf.index,errors="coerce")
    ddf = ddf.dropna()
    ddf["recovered"] = ddf[columns].sum(axis=1).astype(int)

    for c in columns:
        del ddf[c]
    
    dfCountry = dfCountry.join(ddf,rsuffix = "_tmp")
    del dfCountry["date_tmp"]

    dfCountry.replace([np.inf, -np.inf], np.nan).fillna(0.,inplace=True)

    dfCountry["new_cases"] = dfCountry.confirmed.diff()
    dfCountry["growth_rate_3"] = dfCountry[["confirmed"]].pct_change(periods=3)
    dfCountry["growth_rate_7"] = dfCountry[["confirmed"]].pct_change(periods=7)
    dfCountry["growth_rate_14"] = dfCountry[["confirmed"]].pct_change(periods=14)

    dfCountry["active"] = dfCountry.confirmed-dfCountry.deaths-dfCountry.recovered
    dfCountry["new_infected"] = dfCountry.active.diff()
    dfCountry["infection_rate_3"] = dfCountry[["active"]].pct_change(periods=3)
    dfCountry["infection_rate_7"] = dfCountry[["active"]].pct_change(periods=7)
    dfCountry["infection_rate_14"] = dfCountry[["active"]].pct_change(periods=14)
    
    dfCountry["goal_2"] = 2

    relevant = dfCountry[dfCountry.confirmed > 0].index
    last_update = dfCountry.loc[relevant].date.max()
    acdsCountries[country] = ColumnDataSource(dfCountry.replace([np.inf, -np.inf], np.nan).fillna(0.))#.loc[relevant])
    
    #if len(dfCountryAlignment[dfCountryAlignment["Country/Region"]==country]) > 0:
    if len(dfMapping[dfMapping["name"]==country]) > 0:
        days_double = compute_days_since_double_cases(dfCountry)
        #ADM0_A3 = dfCountryAlignment[dfCountryAlignment["Country/Region"]==country]["ADM0_A3"].values[0]
        ADM0_A3 = dfMapping[dfMapping.name == country].ADM0_A3.values[0]
        ISO_3_code_i = dfMapping[dfMapping.name == country].ISO_3_code_i.values[0]
        #if dfCountry["active"].iloc[-1] > 0:
        #    active_log10 = np.log10(dfCountry["active"].iloc[-1])
        #else:
        #    active_log10 = 0
        dfCountry["active_log10"] = dfCountry[["active"]].applymap(np.log10)
        dfAllCountries = dfAllCountries.append(pd.DataFrame({"days_double":days_double,
                                                "date":dfCountry["date"],
                                                "ADM0_A3":[ADM0_A3 for i in range(len(dfCountry))],
                                                "ISO_3_code_i":[ISO_3_code_i for i in range(len(dfCountry))],
                                                "country":[country for i in range(len(dfCountry))],
                                                "new_cases":dfCountry["new_cases"],#.iloc[-1],
                                                "growth_rate_7":dfCountry["growth_rate_7"],#.iloc[-1],
                                                "growth_rate_14":dfCountry["growth_rate_14"],#.iloc[-1],
                                                "active":dfCountry["active"],#.iloc[-1],
                                                "active_log10":dfCountry["active_log10"],
                                                "new_infected":dfCountry["new_infected"],#.iloc[-1],
                                                "infection_rate_7":dfCountry["infection_rate_7"],#.iloc[-1],
                                                "infection_rate_14":dfCountry["infection_rate_14"],#.iloc[-1]},
                                                "deaths":dfCountry["deaths"],
                                                "recovered":dfCountry["recovered"],
                                                "confirmed":dfCountry["confirmed"]},
                                               ),ignore_index=True)
    #wProgress.value += 1
    
#caw gdf = gdf.merge(dfAllCountries,on="ADM0_A3")

  return lib.map_infer(x.astype(object).values, func)


In [4]:
dfAllCountries.head(5)

Unnamed: 0,days_double,date,ADM0_A3,ISO_3_code_i,country,new_cases,growth_rate_7,growth_rate_14,active,active_log10,new_infected,infection_rate_7,infection_rate_14,deaths,recovered,confirmed
0,0.0,2020-01-22,BFA,854,Burkina Faso,,,,0,-inf,,,,0,0,0
1,0.0,2020-01-23,BFA,854,Burkina Faso,0.0,,,0,-inf,0.0,,,0,0,0
2,0.0,2020-01-24,BFA,854,Burkina Faso,0.0,,,0,-inf,0.0,,,0,0,0
3,0.0,2020-01-25,BFA,854,Burkina Faso,0.0,,,0,-inf,0.0,,,0,0,0
4,0.0,2020-01-26,BFA,854,Burkina Faso,0.0,,,0,-inf,0.0,,,0,0,0


In [5]:
!ls /project_data/data_asset/mercury/
#!mkdir /project_data/data_asset/mercury/casenumbers

casenumbers  geo  ws3


In [6]:
dfAllCountries.to_parquet("/project_data/data_asset/mercury/casenumbers/johns_hopkins_casenumbers_all_countries.parquet")
dfAllCountries.to_csv("/project_data/data_asset/mercury/casenumbers/johns_hopkins_casenumbers_all_countries.csv",index=False,quoting=csv.QUOTE_NONNUMERIC)

**Preprocess the RKI data**
Data Catalogue: https://www.arcgis.com/home/item.html?id=dd4580c810204019a7b8eb3e0b329dd6

Validation of cases (Comparison of the different dates, calculation methods with the columns "New x")
Calculation of cumulated numbers
df_rki.columns.values: ['IdBundesland', 'Bundesland', 'Landkreis', 'Altersgruppe', 'Geschlecht', 'AnzahlFall', 'AnzahlTodesfall', 'ObjectId', 'Meldedatum', 'IdLandkreis', 'Datenstand', 'NeuerFall', 'NeuerTodesfall', 'Refdatum', 'NeuGenesen', 'AnzahlGenesen']

NeuerFall: 0: Fall ist in der Publikation für den aktuellen Tag und in der für den Vortag enthalten 1: Fall ist nur in der aktuellen Publikation enthalten -1: Fall ist nur in der Publikation des Vortags enthalten damit ergibt sich: Anzahl Fälle der aktuellen Publikation als Summe(AnzahlFall), wenn NeuerFall in (0,1); Delta zum Vortag als Summe(AnzahlFall) wenn NeuerFall in (-1,1) Delta zum Vortag als Summe(AnzahlFall) wenn NeuerFall in (-1,1)

**To make the data comparable with CSSEGI data**
- we use the reporting data ("Meldedatum") and not the symptoms-onset date ("Refdatum")
- number of new cases per day
- cumulative new cases

Use df_rki_melde for reporting date 

Use df_rki_sum for symptoms-onset date

Use df_rki_val for getting actual cases more granular.

In [4]:
df_rki = pd.read_csv("/project_data/data_asset/sun/casenumbers/rki_covid19.csv")

In [5]:
# Validation and check of published numbers
# For actual cases
df_rki_val=df_rki[((df_rki["NeuerFall"]==0) | (df_rki["NeuerFall"]==1))]

# For the difference to the day before:
df_rki_diff=df_rki[((df_rki["NeuerFall"]==-1) | (df_rki["NeuerFall"]==1))]

# Summation overall Germany
df_rki_sum=df_rki_val.groupby(["Refdatum"],as_index=False)[["AnzahlFall"]].sum()
df_rki_sum.set_index("Refdatum", inplace=True, drop=True)
df_rki_sum.index=pd.to_datetime(df_rki_sum.index, format="%Y-%m-%d")
df_rki_sum.sort_index(inplace=True)
df_rki_sum["CumFall"]=np.cumsum(df_rki_sum["AnzahlFall"])

df_rki_melde=df_rki_val.groupby(["Meldedatum"],as_index=False)[["AnzahlFall"]].sum()
df_rki_melde.set_index("Meldedatum", inplace=True, drop=True)
df_rki_melde.index=pd.to_datetime(df_rki_melde.index, format="%Y-%m-%d")
df_rki_melde.sort_index(inplace=True)
df_rki_melde["CumFall"]=np.cumsum(df_rki_melde["AnzahlFall"])

# Calculation of checking values with dashboard:
# https://experience.arcgis.com/experience/478220a4c454480e823b17327b2bf1d4
df_rki_diff_sum=df_rki_diff.groupby(["Refdatum"],as_index=False)[["AnzahlFall","AnzahlTodesfall","AnzahlGenesen"]].sum()
df_rki_diff_sum.set_index("Refdatum", inplace=True, drop=True)
df_rki_diff_sum.index=pd.to_datetime(df_rki_diff_sum.index, format="%Y-%m-%d")
df_rki_diff_sum.sort_index(inplace=True)

print("Last data entry: ",df_rki_sum.index[-1])
print("Numbers for Validation with RKI Dashboard: https://experience.arcgis.com/experience/478220a4c454480e823b17327b2bf1d4")
print("Cum sum confirmed cases: ",df_rki_sum.iloc[-1, 1])
print("Difference to day before: ",df_rki_diff["AnzahlFall"].sum())
temp=df_rki[((df_rki["NeuerTodesfall"]==1) | (df_rki["NeuerTodesfall"]==0))].groupby(["Refdatum"],as_index=False)["AnzahlTodesfall"].sum().set_index("Refdatum", drop=True).sort_index()
print("Cum sum fatalities: ", np.cumsum(temp["AnzahlTodesfall"])[-1])
temp=df_rki[((df_rki["NeuerTodesfall"]==1) | (df_rki["NeuerTodesfall"]==-1))].groupby(["Refdatum"],as_index=False)["AnzahlTodesfall"].sum().set_index("Refdatum", drop=True).sort_index()
print("Difference to day before: ",np.cumsum(temp["AnzahlTodesfall"])[-1])
temp=df_rki[((df_rki["NeuGenesen"]==1) | (df_rki["NeuGenesen"]==0))].groupby(["Refdatum"],as_index=False)["AnzahlGenesen"].sum().set_index("Refdatum", drop=True).sort_index()
print("Cum sum recovered: ", np.cumsum(temp["AnzahlGenesen"])[-1])
temp=df_rki[((df_rki["NeuGenesen"]==1) | (df_rki["NeuGenesen"]==-1))].groupby(["Refdatum"],as_index=False)["AnzahlGenesen"].sum().set_index("Refdatum", drop=True).sort_index()
print("Difference to day before: ",np.cumsum(temp["AnzahlGenesen"])[-1])

Last data entry:  2020-06-17 00:00:00
Numbers for Validation with RKI Dashboard: https://experience.arcgis.com/experience/478220a4c454480e823b17327b2bf1d4
Cum sum confirmed cases:  187764
Difference to day before:  580
Cum sum fatalities:  8856
Difference to day before:  26
Cum sum recovered:  174062
Difference to day before:  441


In [7]:
df_rki_sum.to_csv("/project_data/data_asset/mercury/casenumbers/RKI_ConfirmedCases_SymptomsOnsetDate.csv",index_label=False)
df_rki_melde.to_csv("/project_data/data_asset/mercury/casenumbers/RKI_ConfirmedCases_ReportingDate.csv",index_label=False)
df_rki_melde.head()

Unnamed: 0_level_0,AnzahlFall,CumFall
Meldedatum,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-28,2,2
2020-01-29,2,4
2020-01-31,3,7
2020-02-03,1,8
2020-02-04,4,12
