In [1]:
# As usual, importing the libraries we need
import json
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import pearsonr, spearmanr

In [2]:
# LOAD AND CLEAN DATA

# Load the corona data (tab-separated file)
corona_df = pd.read_csv("../fyp2022p0105/data/raw/corona/dk_corona.csv", sep = "\t")

# Load the json file with country metadata
with open("../fyp2022p0105/data/raw/metadata/dk_metadata.json", 'r') as f:
    country_metadata = json.load(f)

# The metadata records live in a list under the "country_metadata" key; iterate
# over the records directly instead of indexing the list by position.
region_records = country_metadata["country_metadata"]

# Map each covid region code to the corresponding ISO 3166-2 region code
region_map = {record["covid_region_code"]: record["iso3166-2_code"] for record in region_records}
# Add a new column 'region' holding the ISO 3166-2 code of each row's region
corona_df["region"] = corona_df["region_code"].map(region_map)

# Map each ISO 3166-2 region code to the population of that region
population_map = {record["iso3166-2_code"]: record["population"] for record in region_records}
# Add population column to dataframe
corona_df["population"] = corona_df["region"].map(population_map)
# Per-capita column: new hospitalisations divided by the region's population
# (the column is called 'cases_pc', but it is built from 'hospitalized_addition')
corona_df["cases_pc"] = corona_df["hospitalized_addition"] / corona_df["population"]

# Load weather data
weather_df = pd.read_csv("../fyp2022p0105/data/raw/weather/weather.csv")

# Transform temperature from kelvin to celsius
weather_df["TemperatureAboveGround"] = weather_df["TemperatureAboveGround"] - 273.15
# Keep only the weather rows whose region code starts with "DK"
weather_df = weather_df[weather_df["iso3166-2"].str.startswith("DK")]

# Merge weather and corona data on date and region (default inner join, so only
# date/region pairs present in both frames are kept), then drop the region
# columns that are redundant with 'iso3166-2'
df = corona_df.merge(weather_df, left_on = ["date", "region"], right_on = ["date", "iso3166-2"])
df = df.drop(["region_code", "region"], axis = 1)

# Sanity check on the dataframe
df

Unnamed: 0,date,hospitalized_addition,population,cases_pc,iso3166-2,RelativeHumiditySurface,SolarRadiation,Surfacepressure,TemperatureAboveGround,Totalprecipitation,UVIndex,WindSpeed
0,2020-03-01,1,1846023,5.417051e-07,DK-84,79.371362,3.383109e+06,2.370635e+06,5.064128,0.000764,2.595843,6.668466
1,2020-03-02,0,1846023,0.000000e+00,DK-84,86.574612,3.303007e+06,2.380293e+06,4.470362,0.001416,4.286374,2.475038
2,2020-03-03,1,1846023,5.417051e-07,DK-84,93.285949,9.690623e+04,2.395165e+06,3.884757,0.002084,1.676674,2.345198
3,2020-03-04,0,1846023,0.000000e+00,DK-84,86.105840,3.227602e+06,2.407377e+06,4.677848,0.000926,4.771363,4.631544
4,2020-03-05,1,1846023,5.417051e-07,DK-84,86.688654,2.998848e+06,2.403363e+06,3.949029,0.000420,4.919169,2.801289
...,...,...,...,...,...,...,...,...,...,...,...,...
1755,2021-02-11,1,589936,1.695099e-06,DK-81,73.558470,3.624393e+06,2.475768e+06,-6.216205,0.000383,1.495042,4.113037
1756,2021-02-12,1,589936,1.695099e-06,DK-81,74.618363,4.379149e+06,2.491939e+06,-6.035219,0.000006,1.992372,1.915713
1757,2021-02-13,1,589936,1.695099e-06,DK-81,76.532522,4.910543e+06,2.494230e+06,-4.408170,0.000000,2.279176,1.357024
1758,2021-02-14,1,589936,1.695099e-06,DK-81,74.459283,4.752374e+06,2.484782e+06,-3.379998,0.000000,2.772693,2.861502


In [3]:
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() -- doing so emits a stray "None" line after the report.
df.info()
# The dtypes Series is the last expression, so it is shown as the cell output
df.dtypes

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1760 entries, 0 to 1759
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   date                     1760 non-null   object 
 1   hospitalized_addition    1760 non-null   int64  
 2   population               1760 non-null   int64  
 3   cases_pc                 1760 non-null   float64
 4   iso3166-2                1760 non-null   object 
 5   RelativeHumiditySurface  1760 non-null   float64
 6   SolarRadiation           1760 non-null   float64
 7   Surfacepressure          1760 non-null   float64
 8   TemperatureAboveGround   1760 non-null   float64
 9   Totalprecipitation       1760 non-null   float64
 10  UVIndex                  1760 non-null   float64
 11  WindSpeed                1760 non-null   float64
dtypes: float64(8), int64(2), object(2)
memory usage: 178.8+ KB
None


date                        object
hospitalized_addition        int64
population                   int64
cases_pc                   float64
iso3166-2                   object
RelativeHumiditySurface    float64
SolarRadiation             float64
Surfacepressure            float64
TemperatureAboveGround     float64
Totalprecipitation         float64
UVIndex                    float64
WindSpeed                  float64
dtype: object

In [4]:
# Weather variables used as regressors
Xs = [
    'RelativeHumiditySurface',
    'SolarRadiation',
    'Surfacepressure',
    'TemperatureAboveGround',
    'Totalprecipitation',
    'UVIndex',
    'WindSpeed',
]

# Prepare for a multivariate linear regression with statsmodels. The library
# requires us to supply a constant variable ourselves so that an intercept can
# be estimated: list "const" among the regressors and add it to the dataframe.
Xs.append("const")
df = sm.add_constant(df)

  x = pd.concat(x[::order], 1)


In [5]:
# Baseline model: multivariate linear regression of the raw daily
# 'hospitalized_addition' counts on the selected features plus the constant.
est = sm.OLS(
    endog = df["hospitalized_addition"],
    exog = df[Xs],
    hasconst = True,
).fit()
# Lots to unpack in the summary, but let's focus on the basics:
#  * R-squared (top-right) measures prediction quality, i.e. how much of the
#    daily variation in the number of cases the model explains.
#  * The "P>|t|" column holds the p-value of each variable (not Bonferroni
#    corrected) *when all the other variables are kept constant*.
# For instance, this regression tells us that varying SolarRadiation doesn't
# tell us anything interesting if everything else is held constant.
print(est.summary())

                              OLS Regression Results                             
Dep. Variable:     hospitalized_addition   R-squared:                       0.191
Model:                               OLS   Adj. R-squared:                  0.188
Method:                    Least Squares   F-statistic:                     59.04
Date:                   Tue, 22 Feb 2022   Prob (F-statistic):           2.93e-76
Time:                           12:55:28   Log-Likelihood:                -6669.8
No. Observations:                   1760   AIC:                         1.336e+04
Df Residuals:                       1752   BIC:                         1.340e+04
Df Model:                              7                                         
Covariance Type:               nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------

In [6]:
# We now log-transform the number of cases.
# np.log1p(x) computes log(1 + x): adding 1 avoids taking the log of 0 on days
# without new hospitalisations, and log1p is the dedicated NumPy routine for it.
est = sm.OLS(np.log1p(df["hospitalized_addition"]), df[Xs], hasconst = True).fit()
# Many things change, but for now let's only focus on the fact that the
# R-squared has much improved, i.e. this model is more powerful.
print(est.summary())

                              OLS Regression Results                             
Dep. Variable:     hospitalized_addition   R-squared:                       0.433
Model:                               OLS   Adj. R-squared:                  0.431
Method:                    Least Squares   F-statistic:                     191.2
Date:                   Tue, 22 Feb 2022   Prob (F-statistic):          1.09e-210
Time:                           12:55:28   Log-Likelihood:                -2201.4
No. Observations:                   1760   AIC:                             4419.
Df Residuals:                       1752   BIC:                             4463.
Df Model:                              7                                         
Covariance Type:               nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------

In [7]:
# Select the regressors again: the weather variables plus the constant.
# NOTE(review): this repeats the setup of the earlier feature-selection cell;
# df already carries a "const" column at this point, so add_constant is
# presumably a no-op here -- confirm against the statsmodels documentation.
Xs = [
    'RelativeHumiditySurface',
    'SolarRadiation',
    'Surfacepressure',
    'TemperatureAboveGround',
    'Totalprecipitation',
    'UVIndex',
    'WindSpeed',
]
Xs.append("const")
df = sm.add_constant(df)

  x = pd.concat(x[::order], 1)


In [8]:
# Run the multivariate linear regression on the per-capita variable.
# np.log1p(x) computes log(1 + x) without first forming 1 + x, which keeps full
# floating-point precision for the tiny per-capita values (order 1e-6 in the
# dataframe preview above); np.log(x + 1) loses digits in that range.
# NOTE(review): for values this small log(1 + x) is almost exactly x, so the
# transform is close to the identity here -- confirm this is the intended model.
est = sm.OLS(np.log1p(df["cases_pc"]), df[Xs], hasconst = True).fit()
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:               cases_pc   R-squared:                       0.283
Model:                            OLS   Adj. R-squared:                  0.280
Method:                 Least Squares   F-statistic:                     98.63
Date:                Tue, 22 Feb 2022   Prob (F-statistic):          1.19e-121
Time:                        12:55:30   Log-Likelihood:                 18508.
No. Observations:                1760   AIC:                        -3.700e+04
Df Residuals:                    1752   BIC:                        -3.696e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
RelativeHumiditySurface  3

In [9]:
# Here we add "dummy" variables: a region fixed effect that identifies which rows
# belong to which region. These dummies absorb every possible omitted variable
# that distinguishes a region from all other regions.
# "DK-81" gets no dummy of its own: it is the reference region whose level is
# captured by the constant (a dummy for every region would be collinear with it).
regions = ["const",]

# sorted() makes the order of the dummy columns deterministic; iterating a bare
# set of strings can yield a different order from one kernel session to the next.
for region in sorted(set(df["iso3166-2"])):
    if region != "DK-81":
        df[region] = (df["iso3166-2"] == region).astype(int)
        regions.append(region)
        # Xs is defined in an earlier cell and extended in place here, so guard
        # against adding the same dummy twice when this cell is re-run.
        if region not in Xs:
            Xs.append(region)

In [10]:
# Regress the per-capita variable on the constant and the region dummies only.
# np.log1p(x) computes log(1 + x) accurately for the very small per-capita
# values, where np.log(x + 1) loses floating-point precision.
est = sm.OLS(np.log1p(df["cases_pc"]), df[regions], hasconst = True).fit()
# Let's first see how regions did overall.
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:               cases_pc   R-squared:                       0.097
Model:                            OLS   Adj. R-squared:                  0.095
Method:                 Least Squares   F-statistic:                     47.23
Date:                Tue, 22 Feb 2022   Prob (F-statistic):           9.40e-38
Time:                        12:55:31   Log-Likelihood:                 18306.
No. Observations:                1760   AIC:                        -3.660e+04
Df Residuals:                    1755   BIC:                        -3.657e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.694e-06   3.93e-07      9.408      0.0

In [11]:
# Full model: every column listed in Xs (weather variables, constant and region
# dummies) as regressors.
# np.log1p(x) computes log(1 + x) accurately for the very small per-capita
# values, where np.log(x + 1) loses floating-point precision.
est = sm.OLS(np.log1p(df["cases_pc"]), df[Xs], hasconst = True).fit()
# We don't really care about the coefficients or p-values of the dummy variables,
# but they keep fixed the actions of local governments when these differ from
# national counter-measures.
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:               cases_pc   R-squared:                       0.396
Model:                            OLS   Adj. R-squared:                  0.392
Method:                 Least Squares   F-statistic:                     104.0
Date:                Tue, 22 Feb 2022   Prob (F-statistic):          3.90e-182
Time:                        12:55:31   Log-Likelihood:                 18659.
No. Observations:                1760   AIC:                        -3.729e+04
Df Residuals:                    1748   BIC:                        -3.723e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
RelativeHumiditySurface  1

In [12]:
# Repeat the linear regression using clustered standard errors, with the rows
# grouped (clustered) by region.
# np.log1p(x) computes log(1 + x) accurately for the very small per-capita
# values, where np.log(x + 1) loses floating-point precision.
# NOTE(review): there are only five regions, hence five clusters; cluster-robust
# standard errors are commonly considered unreliable with so few clusters --
# confirm that this inference is appropriate.
est = sm.OLS(np.log1p(df["cases_pc"]), df[Xs], hasconst = True).fit(
    cov_type = "cluster",
    cov_kwds = {"groups": df["iso3166-2"]},
    use_t = True,
)
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:               cases_pc   R-squared:                       0.396
Model:                            OLS   Adj. R-squared:                  0.392
Method:                 Least Squares   F-statistic:                     7789.
Date:                Tue, 22 Feb 2022   Prob (F-statistic):           4.94e-08
Time:                        12:55:31   Log-Likelihood:                 18659.
No. Observations:                1760   AIC:                        -3.729e+04
Df Residuals:                    1748   BIC:                        -3.723e+04
Df Model:                          11                                         
Covariance Type:              cluster                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
RelativeHumiditySurface  1

