In [821]:
#pip installs
%pip install plotly
%pip install statsmodels

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [822]:
#libraries
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#import seaborn as sns

#plotting
import plotly.express as px
import plotly.offline as py

#linear regression
from sklearn.linear_model import LinearRegression
import datetime as dt

#prediction
from statsmodels.tsa.arima_model import ARIMA

In [823]:
#load data from our csv file
#https://github.com/J535D165/CoronaWatchNL/tree/master/data-geo/data-national
cov_data = pd.read_csv('nl_cov_data/dutch_covid_data.csv')

#replace all empty values with 0's
cov_data = cov_data.fillna(0)

#remove row with column names
cov_data = cov_data.iloc[1: , :]

#get contaminations row only
cov_bes = cov_data.iloc[2::3, :]

#get deaths row only
cov_dea = cov_data.iloc[1::3, :]

#get hospital row only
cov_hos = cov_data.iloc[0::3, :]

#duplicates
cov_cum = cov_bes
dea_cum = cov_dea
hos_cum = cov_hos

#get date column
cov_bes_date = cov_bes.loc[:,"Datum"]
cov_dea_date = cov_dea.loc[:,"Datum"]
cov_hos_date = cov_hos.loc[:,"Datum"]

#convert to datetime
cov_bes_date = pd.to_datetime(cov_bes_date)
cov_dea_date = pd.to_datetime(cov_dea_date)
cov_hos_date = pd.to_datetime(cov_hos_date)

#get cumulative contaminations/other stats
cov_cum = cov_cum.loc[:,"AantalCumulatief"]
dea_cum = dea_cum.loc[:,"AantalCumulatief"]
hos_cum = hos_cum.loc[:,"AantalCumulatief"]

#get amount column
cov_bes = cov_bes.loc[:,"Aantal"]
cov_dea = cov_dea.loc[:,"Aantal"]
cov_hos = cov_hos.loc[:,"Aantal"]

#convert to numpy array
num_bes = np.array(cov_bes)
num_cum = np.array(cov_cum)
num_dea = np.array(cov_dea)
num_hos = np.array(cov_hos)
num_dea_cum = np.array(dea_cum)
num_hos_cum = np.array(hos_cum)

In [824]:
# plot contaminations and cumulative contaminations
contaminations = px.bar(x=cov_bes_date, y=num_bes, height=550, 
               title='Contaminations per day')
contaminations.show()

contaminations_cum = px.bar(x=cov_bes_date, y=num_cum, height=550, 
               title='Cumulative contaminations in the Netherlands')
contaminations_cum.show()

In [825]:
# plot deaths and cumulative deaths
deaths = px.bar(x=cov_dea_date, y=num_dea, height=550, 
               title='Deaths per day')
deaths.show()

deaths_cum = px.bar(x=cov_dea_date, y=num_dea_cum, height=550, 
               title='Cumulative deaths in the Netherlands')
deaths_cum.show()

In [826]:
# plot deaths and cumulative deaths
hospital_num = px.bar(x=cov_hos_date, y=num_hos, height=550, 
               title='People entering the hospital per day')
hospital_num.show()

hospital_cum = px.bar(x=cov_hos_date, y=num_hos_cum, height=550, 
               title='Cumulative hospital intakes in the Netherlands')
hospital_cum.show()

When we want to say something about efficiency of measures and lockdowns, we first need to maintain a list of dates in which the covid regulations changed by big amounts.


12-03-2020: thuiswerkadvies, evenementen afgelast

13-03-2020: vliegverbod bepaalde landen

15-03-2020: sluiting horeca en scholen

23-03-2020: begin intelligente lockdown

11-05-2020: versoepelen contactberoepen

27-05-2020: diverse versoepelingen

30-06-2020: diverse versoepelingen

18-08-2020: iets meer maatregelen

27-09-2020: iets meer maatregelen

13-10-2020: lockdown, zware maatregelen

29-04-2020: versoepelingen


In [827]:
#load data from our csv file
#from https://ourworldindata.org/grapher/covid-stringency-index
stri_data = pd.read_csv('nl_cov_data/covid-stringency-index.csv')

#filter out non dutch values
stri_data = stri_data[stri_data.Entity == "Netherlands"]

#get date data
stri_date = stri_data.loc[:,"Day"]


#convert to datetime
stri_date = pd.to_datetime(stri_date)

#get stringency index data
stri_index = stri_data.loc[:,"stringency_index"]

# plot stringency index
stringency = px.bar(x=stri_date, y=stri_index, height=550, 
               title='Graph of stringency index in the Netherlands')
stringency.show()

We know the virus has a certain incubation time, hence we can only see the effect 7 days later. 
Lets take the first intelligent lockdown on the 23th of march, including an incubation time of 7 days.

In [828]:

#filter date
lock1_bes = cov_bes_date.loc[cov_bes_date <= '2020-03-29']
lock1_size = len(lock1_bes)
lock1_num = num_bes[:lock1_size]

#show pre lockdown data
contaminations = px.line(x=lock1_bes, y=lock1_num, height=550, 
               title='Contaminations per day', markers=True)
contaminations.show()


In [829]:
groeifactoren = []

# calculating the growth factor
def groeifactor(delta_cn, delta_cn_old):
    """Returns the number of confirmed cases, Growth Factor=ΔC𝑛/ΔC𝑛−1"""
    return(delta_cn/delta_cn_old)

for i in range(len(cov_bes)):
    #we skip the first element, at is has no predecessor
    if i != 0:
        groeifact = groeifactor(num_bes[i], num_bes[i-1])
        groeifactoren.append(groeifact)

#convert list to numpy array
groeifactoren = np.array(groeifactoren)

#remove first element of dates as this has no r number
cov_bes_date_g = cov_bes_date[1:]

g_grafiek = px.bar(x=cov_bes_date_g, y=groeifactoren, height=550, 
               title='Evolutie groeifactor covid-19')
g_grafiek.show()

In [830]:
#there is one less data point in our dataset for the first day of contaminations, we adjust this difference for synchronous data
num_bes = num_bes[7:]
num_dea = num_dea[8:]
num_hos = num_hos[8:]
cov_bes_date_g = cov_bes_date_g[6:]

In [831]:
#remove the 25th of march as this data was not collected properly by the RIVM due to technical problems
outlier = int(cov_bes_date_g.index[cov_bes_date_g == '2021-03-25'].tolist()[0] / 3)
num_bes = np.delete(num_bes, 383)
num_dea = np.delete(num_dea, 383)
num_hos = np.delete(num_hos, 383)
cov_bes_date_g = cov_bes_date_g.loc[cov_bes_date_g != '2021-03-25']

# #DEBUG: check if data is synched
# print(len(num_bes))
# print(len(num_dea))
# print(len(num_hos))
# print(len(cov_bes_date_g))

In [832]:
#update the stringency index list to fit our time period and remove the 24th and 25th of march
#timeframe = 2020-03-12 until 2020-04-17

#get boundaries and exemptions indexes from dataset
begin = stri_date.index[stri_date == '2020-03-06'].tolist()[0] #81944
end = stri_date.index[stri_date == '2021-04-17'].tolist()[0] #82351
ma24 = stri_date.index[stri_date == '2021-03-24'].tolist()[0] #82327

#adjust timeframe of stringency index dates to contaminations timeframe
stri_date = stri_date.loc[stri_date >= '2020-03-06']
stri_date = stri_date.loc[stri_date <= '2021-04-17']

#adjust timeframe of stringency index dates to remove wrong data points in other dataset
stri_date = stri_date.loc[stri_date != '2021-03-24']
stri_date = stri_date.loc[stri_date != '2021-03-25']

#adjust the stringency index data to above frame
stri_index = stri_index.iloc[45:]
stri_index = stri_index.iloc[:408]

#remove missing elements from dataframe in context
stri_index = stri_index.drop(stri_index.index[384])
stri_index = stri_index.drop(stri_index.index[384])


In [833]:
#aquire and adjust data about the R number
# The data about the groeifactor was gotten from https://data.rivm.nl/covid-19/COVID-19_reproductiegetal.json
# note that this data is in .json format, we have converted it using http://convertcsv.com/json-to-csv.htm
g_data = pd.read_csv('nl_cov_data/reproductiegetallen.csv')

#get date data
g_date = g_data.loc[:,"Date"]

#convert to datetime
g_date = pd.to_datetime(g_date)

#get average R data
g_data = g_data.loc[:,"Rt_avg"]
# TODO: #2 filter above data on timeframe and with removal of exclusions

In [834]:
#update the reprodutctiegetallen list to fit our time period and remove the 24th and 25th of march
#timeframe = 2020-03-12 until 2020-04-17

#get boundaries and exemptions indexes from dataset
begin = g_date.index[g_date == '2020-03-06'].tolist()[0] #18
end = g_date.index[g_date == '2021-04-17'].tolist()[0] #425
ma24 = g_date.index[g_date == '2021-03-24'].tolist()[0] #401 

#adjust timeframe of reproductiegetallen dates to contaminations timeframe
g_date = g_date.loc[g_date >= '2020-03-06']
g_date = g_date.loc[g_date <= '2021-04-17']

#adjust timeframe of reproductiegetallen dates to remove wrong data points in other dataset
g_date = g_date.loc[g_date != '2021-03-24']
g_date = g_date.loc[g_date != '2021-03-25']

#adjust the reproductiegetallen data to above frame
g_data = g_data.iloc[18:]
g_data = g_data.iloc[:408]

#remove missing elements from dataframe in context
g_data = g_data.drop(g_data.index[401])
g_data = g_data.drop(g_data.index[401])

406


In [840]:
#DEBUG
#assure all data is of equal size
print(len(num_bes))
print(len(num_dea))
print(len(num_hos))
print(len(cov_bes_date_g))
print(len(g_data))
print(len(g_date))
print(len(stri_index))
print(len(stri_date))

#DATA explanation
# num_bes = amount of contaminations due to covid-19 in the Netherlands
# num_dea = amount of deaths due to covid-19 in the Netherlands
# num_hos = amount of people taken into the hospital due to covid-19 in the Netherlands
# stri_index = stringency of covid-19 measures in the Netherlands
# g_data = R measure of infectivity covid-19 in the Netherlands
# cov_bes_date_g/g_date/stri_date = timeframe used for our work

406
406
406
406
406
406
406
406


In [843]:
#Before making a model it is very neccessar to take some weekly averages for our model to use for decision making
#TODO: get averages for all our data points as data
#TODO: get deltas for all our data points as data
avg_r = g_data.rolling(7).mean()
print(avg_r)

#linear regression model
# we want to predict the stringency index
# we want to predict this using contaminations, hospital admissions, deaths and the R number

#we want to make a pandas dataframe X of the input data

#we want to make a pandas dataframe Y of the output data
Y = pd.DataFrame()

#TODO: predict stringency index based on inputs: besmettingen, ziekenhuisopnames, doden en R getal
#TODO: #2 algoritme 1: neural network
#TODO: #3 algoritme 2: C4.5
#TODO: #4 optimaliseer beide algoritmes
#TODO: #5 vergelijk en schrijf paper van 6-8 kantjes

18          NaN
19          NaN
20          NaN
21          NaN
22          NaN
         ...   
421    1.038571
422    1.034286
423    1.034286
424    1.034286
425    1.031429
Name: Rt_avg, Length: 406, dtype: float64
