In [1]:
# As usual, importing the libraries we need
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm
from datetime import timedelta
from scipy.stats import pearsonr, spearmanr

In [4]:
# Clean the data: combine the Danish hospitalization counts, the regional
# metadata (ISO codes, population) and the weather readings into one frame
# keyed by date and ISO 3166-2 region code.
corona_df = pd.read_csv("../fyp2022p0105/data/raw/corona/dk_corona.csv", sep = "\t")

# Read the JSON with an explicit encoding so the result does not depend on the
# platform's default text encoding.
with open("../fyp2022p0105/data/raw/metadata/dk_metadata.json", 'r', encoding = "utf-8") as f:
    country_metadata = json.load(f)

# "country_metadata" holds one record per region; iterate the records directly
# rather than indexing them by position.
region_records = country_metadata["country_metadata"]
region_map = {rec["covid_region_code"]: rec["iso3166-2_code"] for rec in region_records}
population_map = {rec["iso3166-2_code"]: rec["population"] for rec in region_records}

corona_df["region"] = corona_df["region_code"].map(region_map)
corona_df["population"] = corona_df["region"].map(population_map)
# Note: "cases_pc" is computed from "hospitalized_addition", i.e. it is the
# hospitalization count divided by the region's population.
corona_df["cases_pc"] = corona_df["hospitalized_addition"] / corona_df["population"]

weather_df = pd.read_csv("../fyp2022p0105/data/raw/weather/weather.csv")

# Shift the temperature by 273.15 (presumably Kelvin -> Celsius; confirm the
# unit of the raw file), then keep only rows whose region code starts with "DK".
weather_df["TemperatureAboveGround"] = weather_df["TemperatureAboveGround"] - 273.15
weather_df = weather_df[weather_df["iso3166-2"].str.startswith("DK")]

# Default (inner) merge: only date/region pairs present in both sources survive.
df = corona_df.merge(weather_df, left_on = ["date", "region"], right_on = ["date", "iso3166-2"])
# "iso3166-2" from the weather frame now identifies the region, so the two
# corona-side region columns are redundant.
df = df.drop(["region_code", "region"], axis = 1)

df

Unnamed: 0,date,hospitalized_addition,population,cases_pc,iso3166-2,RelativeHumiditySurface,SolarRadiation,Surfacepressure,TemperatureAboveGround,Totalprecipitation,UVIndex,WindSpeed
0,2020-03-01,1,1846023,5.417051e-07,DK-84,79.371362,3.383109e+06,2.370635e+06,5.064128,0.000764,2.595843,6.668466
1,2020-03-02,0,1846023,0.000000e+00,DK-84,86.574612,3.303007e+06,2.380293e+06,4.470362,0.001416,4.286374,2.475038
2,2020-03-03,1,1846023,5.417051e-07,DK-84,93.285949,9.690623e+04,2.395165e+06,3.884757,0.002084,1.676674,2.345198
3,2020-03-04,0,1846023,0.000000e+00,DK-84,86.105840,3.227602e+06,2.407377e+06,4.677848,0.000926,4.771363,4.631544
4,2020-03-05,1,1846023,5.417051e-07,DK-84,86.688654,2.998848e+06,2.403363e+06,3.949029,0.000420,4.919169,2.801289
...,...,...,...,...,...,...,...,...,...,...,...,...
1755,2021-02-11,1,589936,1.695099e-06,DK-81,73.558470,3.624393e+06,2.475768e+06,-6.216205,0.000383,1.495042,4.113037
1756,2021-02-12,1,589936,1.695099e-06,DK-81,74.618363,4.379149e+06,2.491939e+06,-6.035219,0.000006,1.992372,1.915713
1757,2021-02-13,1,589936,1.695099e-06,DK-81,76.532522,4.910543e+06,2.494230e+06,-4.408170,0.000000,2.279176,1.357024
1758,2021-02-14,1,589936,1.695099e-06,DK-81,74.459283,4.752374e+06,2.484782e+06,-3.379998,0.000000,2.772693,2.861502


In [12]:
# Let's load Our World in Data, do some data selection, and merge it
owid_df = pd.read_csv("../fyp2022p0105/data/raw/corona/covid-data.csv")
# Keep the Denmark rows ("DNK") and only the two columns needed; rows with a
# missing date or stringency value are dropped.
owid_df = owid_df.loc[owid_df["iso_code"] == "DNK", ["date", "stringency_index"]].dropna()

# Make the cell safe to re-run: if "stringency_index" is already in df from a
# previous execution, merging again would create stringency_index_x/_y and the
# plain column name used by the regression would disappear.
df = df.drop(columns = ["stringency_index"], errors = "ignore").merge(owid_df, on = "date")
df

FileNotFoundError: [Errno 2] No such file or directory: '../fyp2022p0105/data/raw/corona/covid-data.csv'

In [8]:
# Let's get some regional data on Denmark and merge it
hdi_df = pd.read_csv("../fyp2022p0105/data/raw/metadata/dk_hdi.csv").drop("Region", axis = 1)

# Make the cell safe to re-run: drop an "HDI" column left over from a previous
# execution so the merge does not produce HDI_x/HDI_y.
# NOTE(review): assumes "HDI" is the column contributed by dk_hdi.csv (a later
# regression reads df["HDI"]) - confirm against the file.
df = df.drop(columns = ["HDI"], errors = "ignore").merge(hdi_df, on = "iso3166-2")
df

FileNotFoundError: [Errno 2] No such file or directory: '../fyp2022p0105/data/raw/metadata/dk_hdi.csv'

In [7]:
# First question (naive check): did the national-level stringency have an effect on hospitalizations?
Xs = ['TemperatureAboveGround', 'UVIndex']
# Adds the "const" column that every regression below uses as intercept.
df = sm.add_constant(df)
Xs.append("const")

# Region fixed effects: one 0/1 indicator column per region, leaving out
# "DK-81" as the reference category. Iterate in sorted order so the regressor
# order (and the summary table) is the same in every kernel session; iterating
# a set of strings is not order-stable across sessions.
for region in sorted(df["iso3166-2"].unique()):
    if region != "DK-81":
        df[region] = (df["iso3166-2"] == region).astype(int)
        Xs.append(region)

Xs.append("stringency_index")
# log1p(x) computes log(1 + x) without the precision loss that 1 + x incurs
# for very small x.
# Standard errors are clustered by region.
est = sm.OLS(np.log1p(df["cases_pc"]), df[Xs], hasconst = True).fit(cov_type = "cluster", cov_kwds = {"groups": df["iso3166-2"]}, use_t = True)
print(est.summary())

  x = pd.concat(x[::order], 1)


KeyError: "['stringency_index'] not in index"

In [None]:
# Second question (naive check): what is the HDI impact on covid?
Xs = ['TemperatureAboveGround', 'UVIndex', "const", "HDI"]

# The intercept column is normally created by an earlier cell; create it here
# if it is missing so this cell does not depend on that cell having run.
if "const" not in df:
    df["const"] = 1.0

# log1p(x) computes log(1 + x) without the precision loss that 1 + x incurs
# for very small x.
# Standard errors are clustered by region.
est = sm.OLS(np.log1p(df["cases_pc"]), df[Xs], hasconst = True).fit(cov_type = "cluster", cov_kwds = {"groups": df["iso3166-2"]}, use_t = True)
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:               cases_pc   R-squared:                       0.297
Model:                            OLS   Adj. R-squared:                  0.296
Method:                 Least Squares   F-statistic:                     19.83
Date:                Mon, 07 Feb 2022   Prob (F-statistic):            0.00728
Time:                        16:39:16   Log-Likelihood:                 18527.
No. Observations:                1760   AIC:                        -3.705e+04
Df Residuals:                    1756   BIC:                        -3.702e+04
Df Model:                           3                                         
Covariance Type:              cluster                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
TemperatureAboveGround -2.75

In [None]:
# Final question (naive check): what intervention did what, and let's take into account one week delay.

# NOTE(review): this path is rooted at "../data" while the other cells read
# from "../fyp2022p0105/data" - confirm which root is intended.
npi_df = pd.read_csv("../data/raw/corona/OxCGRT_latest.csv")
varlist = ["Date", "C1_School closing", "C2_Workplace closing", "C3_Cancel public events", "C4_Restrictions on gatherings", "C5_Close public transport",
           "C6_Stay at home requirements", "C7_Restrictions on internal movement", "C8_International travel controls"]
# The intervention columns, i.e. everything in varlist except the date.
npi_cols = [col for col in varlist if col != "Date"]

# Select Denmark's rows and the needed columns in one .loc call and take a copy,
# so the column assignments below act on an independent frame (the chained
# [mask][columns] form triggers pandas' SettingWithCopy warning).
npi_df = npi_df.loc[npi_df["CountryCode"] == "DNK", varlist].dropna().copy()

# "Date" is written as YYYYMMDD; parse its first eight characters directly and
# shift by seven days so each intervention value lines up with the
# observations one week later.
npi_df["date"] = pd.to_datetime(npi_df["Date"].astype(str).str[:8], format = "%Y%m%d") + timedelta(days = 7)
npi_df = npi_df.drop("Date", axis = 1)
# Collapse each intervention level to a 0/1 flag: 1 when the level is above 0.
npi_df = (npi_df.set_index("date") > 0).astype(int).reset_index()
# Back to "YYYY-MM-DD" strings so the key matches df["date"].
npi_df["date"] = npi_df["date"].dt.strftime("%Y-%m-%d")

# Make the cell safe to re-run: drop intervention columns left over from a
# previous execution so the merge does not create _x/_y duplicates.
df = df.drop(columns = npi_cols, errors = "ignore").merge(npi_df, on = "date")
df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,const,date,hospitalized_addition,population,cases_pc,iso3166-2,RelativeHumiditySurface,SolarRadiation,Surfacepressure,TemperatureAboveGround,...,DK-84,DK-83,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,C8_International travel controls
0,1.0,2020-03-01,1,1846023,5.417051e-07,DK-84,79.371362,3.383109e+06,2.370635e+06,5.064128,...,1,0,0,0,0,0,0,0,0,0
1,1.0,2020-03-01,0,837359,0.000000e+00,DK-85,80.658130,3.376985e+06,2.370150e+06,5.111854,...,0,0,0,0,0,0,0,0,0,0
2,1.0,2020-03-01,0,1223105,0.000000e+00,DK-83,80.691986,3.601074e+06,2.360351e+06,5.451110,...,0,1,0,0,0,0,0,0,0,0
3,1.0,2020-03-01,0,1326340,0.000000e+00,DK-82,83.944243,4.346587e+06,2.351536e+06,4.795169,...,0,0,0,0,0,0,0,0,0,0
4,1.0,2020-03-01,0,589936,0.000000e+00,DK-81,83.830573,4.183014e+06,2.351863e+06,4.667017,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1755,1.0,2021-02-15,4,1846023,2.166820e-06,DK-84,78.648719,4.083331e+05,2.464868e+06,-1.458766,...,1,0,1,1,1,1,0,1,0,1
1756,1.0,2021-02-15,7,837359,8.359616e-06,DK-85,78.951895,1.382502e+04,2.461460e+06,-1.277317,...,0,0,1,1,1,1,0,1,0,1
1757,1.0,2021-02-15,5,1223105,4.087956e-06,DK-83,70.380505,6.083510e+04,2.450499e+06,-0.896892,...,0,1,1,1,1,1,0,1,0,1
1758,1.0,2021-02-15,3,1326340,2.261863e-06,DK-82,72.866680,3.564665e+04,2.447235e+06,-0.881744,...,0,0,1,1,1,1,0,1,0,1


In [None]:
# Regress log(1 + cases_pc) on the weather controls, the eight intervention
# flags and region fixed effects.
Xs = ['const', 'TemperatureAboveGround', 'UVIndex',
      "C1_School closing", "C2_Workplace closing", "C3_Cancel public events", "C4_Restrictions on gatherings", "C5_Close public transport",
      "C6_Stay at home requirements", "C7_Restrictions on internal movement", "C8_International travel controls"]

# Region fixed effects: one 0/1 indicator column per region, leaving out
# "DK-81" as the reference category. Sorted iteration keeps the regressor
# order identical in every kernel session; iterating a set of strings does not.
for region in sorted(df["iso3166-2"].unique()):
    if region != "DK-81":
        df[region] = (df["iso3166-2"] == region).astype(int)
        Xs.append(region)

# log1p(x) computes log(1 + x) without the precision loss that 1 + x incurs
# for very small x.
# Standard errors are clustered by region.
est = sm.OLS(np.log1p(df["cases_pc"]), df[Xs], hasconst = True).fit(cov_type = "cluster", cov_kwds = {"groups": df["iso3166-2"]}, use_t = True)
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:               cases_pc   R-squared:                       0.462
Model:                            OLS   Adj. R-squared:                  0.457
Method:                 Least Squares   F-statistic:                     3.907
Date:                Mon, 07 Feb 2022   Prob (F-statistic):              0.108
Time:                        16:39:17   Log-Likelihood:                 18761.
No. Observations:                1760   AIC:                        -3.749e+04
Df Residuals:                    1745   BIC:                        -3.741e+04
Df Model:                          14                                         
Covariance Type:              cluster                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------




In [None]:
# Always be on the lookout for potential confounders
# For instance, not as many tests/hospitalizations done in weekends and holidays so let's control for that
# (only weekends are flagged here: weekday 5 = Saturday, 6 = Sunday; holidays are not).
df["weekend"] = (pd.to_datetime(df["date"], format = "%Y-%m-%d").dt.weekday >= 5).astype(int)
# Guard the append so re-running this cell does not add "weekend" a second
# time, which would put the same column in the design matrix twice.
if "weekend" not in Xs:
    Xs.append("weekend")

# log1p(x) computes log(1 + x) without the precision loss that 1 + x incurs
# for very small x.
# Standard errors are clustered by region.
est = sm.OLS(np.log1p(df["cases_pc"]), df[Xs], hasconst = True).fit(cov_type = "cluster", cov_kwds = {"groups": df["iso3166-2"]}, use_t = True)
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:               cases_pc   R-squared:                       0.464
Model:                            OLS   Adj. R-squared:                  0.460
Method:                 Least Squares   F-statistic:                     3.669
Date:                Mon, 07 Feb 2022   Prob (F-statistic):              0.118
Time:                        16:39:17   Log-Likelihood:                 18765.
No. Observations:                1760   AIC:                        -3.750e+04
Df Residuals:                    1744   BIC:                        -3.741e+04
Df Model:                          15                                         
Covariance Type:              cluster                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


