# Understanding Covid19 Case Counts using machine learning

In [3]:
# used library imports
import pandas as pd

### Covid Data from Our World in Data

Last downloaded: 16th of May 2024

https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.csv

In [4]:
df = pd.read_csv('resources/owid-covid-data.csv')

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397677 entries, 0 to 397676
Data columns (total 67 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   iso_code                                    397677 non-null  object 
 1   continent                                   378618 non-null  object 
 2   location                                    397677 non-null  object 
 3   date                                        397677 non-null  object 
 4   total_cases                                 358581 non-null  float64
 5   new_cases                                   386444 non-null  float64
 6   new_cases_smoothed                          385214 non-null  float64
 7   total_deaths                                336105 non-null  float64
 8   new_deaths                                  386794 non-null  float64
 9   new_deaths_smoothed                         385564 non-null  float64
 

### Inspecting the Data

Findings from a first look at the data:
- need to decide wether to use smoothed variables or not, can't use both
- total tests probably redundant
- total vaccinations are redundant if we have people vaccinated, fully vaccinated and boosters
- some columns have high numbers of missing values, like handwashing_facilities and smokers
- for icu and hospital patients, per million might be better, due to comparability



In [5]:
df

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-08,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-09,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397672,ZWE,Africa,Zimbabwe,2024-04-24,266359.0,0.0,0.0,5740.0,0.0,0.0,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
397673,ZWE,Africa,Zimbabwe,2024-04-25,266359.0,0.0,0.0,5740.0,0.0,0.0,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
397674,ZWE,Africa,Zimbabwe,2024-04-26,266359.0,0.0,0.0,5740.0,0.0,0.0,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
397675,ZWE,Africa,Zimbabwe,2024-04-27,266359.0,0.0,0.0,5740.0,0.0,0.0,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,


Selecting a subset of columns that can be used for predictions

In [6]:
df_subset = df[['continent','location','date','new_cases_per_million','new_deaths_per_million','reproduction_rate','icu_patients_per_million','hosp_patients_per_million','new_tests_per_thousand','people_vaccinated_per_hundred','people_fully_vaccinated_per_hundred','total_boosters_per_hundred','stringency_index','population_density','median_age','aged_65_older','aged_70_older','gdp_per_capita','cardiovasc_death_rate','diabetes_prevalence','female_smokers','male_smokers','hospital_beds_per_thousand','life_expectancy','human_development_index',]]
df_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397677 entries, 0 to 397676
Data columns (total 25 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   continent                            378618 non-null  object 
 1   location                             397677 non-null  object 
 2   date                                 397677 non-null  object 
 3   new_cases_per_million                386444 non-null  float64
 4   new_deaths_per_million               386794 non-null  float64
 5   reproduction_rate                    184817 non-null  float64
 6   icu_patients_per_million             38837 non-null   float64
 7   hosp_patients_per_million            40377 non-null   float64
 8   new_tests_per_thousand               75403 non-null   float64
 9   people_vaccinated_per_hundred        79959 non-null   float64
 10  people_fully_vaccinated_per_hundred  76865 non-null   float64
 11  total_booster

In [7]:
df_subset.isnull().sum()

continent                               19059
location                                    0
date                                        0
new_cases_per_million                   11233
new_deaths_per_million                  10883
reproduction_rate                      212860
icu_patients_per_million               358840
hosp_patients_per_million              357300
new_tests_per_thousand                 322274
people_vaccinated_per_hundred          317718
people_fully_vaccinated_per_hundred    320812
total_boosters_per_hundred             345300
stringency_index                       200385
population_density                      59412
median_age                              83279
aged_65_older                           94084
aged_70_older                           86431
gdp_per_capita                          89356
cardiovasc_death_rate                   88789
diabetes_prevalence                     72815
female_smokers                         165785
male_smokers                      

overall there are a lot of null values especially for vaccinations, so we need to check if that's because some countries just don't report vaccinations, or if those values are all from a time where there were no vaccinations, also if there are gaps in the reports, so fill them with 0 till the first vaccination and then last observation carried forward

In [8]:
df_subset['continent'].unique()

array(['Asia', nan, 'Europe', 'Africa', 'Oceania', 'North America',
       'South America'], dtype=object)

In [9]:
df_cleaned = df_subset.dropna(subset=['continent'])
df_cleaned.loc[:, 'new_cases_per_million']= df_cleaned['new_cases_per_million'].fillna(0)
df_cleaned.loc[:, 'new_deaths_per_million']= df_cleaned['new_deaths_per_million'].fillna(0)
df_cleaned.loc[:, 'reproduction_rate']= df_cleaned['reproduction_rate'].fillna(0)
df_cleaned.loc[:, 'icu_patients_per_million']= df_cleaned['icu_patients_per_million'].fillna(0)
df_cleaned.loc[:, 'hosp_patients_per_million']= df_cleaned['hosp_patients_per_million'].fillna(0)
df_cleaned.loc[:, 'new_tests_per_thousand']= df_cleaned['new_tests_per_thousand'].fillna(0)
df_cleaned.loc[:, 'stringency_index']= df_cleaned['new_tests_per_thousand'].fillna(0)

#sort by 'location' and 'date'
df_cleaned = df_cleaned.sort_values(by=['location', 'date'])

# Use groupby on 'location' and apply forward fill
df_cleaned['people_vaccinated_per_hundred'] = df_cleaned.groupby('location')['people_vaccinated_per_hundred'].ffill()
df_cleaned['people_fully_vaccinated_per_hundred'] = df_cleaned.groupby('location')['people_fully_vaccinated_per_hundred'].ffill()
df_cleaned['total_boosters_per_hundred'] = df_cleaned.groupby('location')['total_boosters_per_hundred'].ffill()

#fill the rest of values with 0 (those are values before the first value, therefore they are 0)
df_cleaned.loc[:, 'people_vaccinated_per_hundred']= df_cleaned['people_vaccinated_per_hundred'].fillna(0)
df_cleaned.loc[:, 'people_fully_vaccinated_per_hundred']= df_cleaned['people_fully_vaccinated_per_hundred'].fillna(0)
df_cleaned.loc[:, 'total_boosters_per_hundred']= df_cleaned['total_boosters_per_hundred'].fillna(0)

#df_cleaned[df_cleaned['location'] == 'Austria']
#the locations with missing values for population_density are small islands, so we can drop them
#df_cleaned[df_cleaned['population_density'].isna()]['location'].unique()
df_cleaned.dropna(subset=['population_density'],inplace=True)
#same for median_age the locations, that have missing values are small islands, so we can drop them
df_cleaned.dropna(subset=['median_age'],inplace=True)
#missing values for 70 or older are assumed to be null
df_cleaned['aged_70_older'].fillna(0, inplace=True)
#locations with missing gdp per capita are countries with limited information like, North Korea, Cuba or Somalia, so we drop them
df_cleaned.dropna(subset=['gdp_per_capita'],inplace=True)
#same for cardiovasc_death_rate
df_cleaned.dropna(subset=['cardiovasc_death_rate'],inplace=True)
#smokers are filled with the median value (they are also mainly small 3rd world countries, so maybe should be dropped)
# Calculate the median of 'female_smokers' and 'male_smokers'
female_smokers_median = df_cleaned['female_smokers'].median()
male_smokers_median = df_cleaned['male_smokers'].median()

# Fill the missing values with the calculated median
df_cleaned['female_smokers'].fillna(female_smokers_median, inplace=True)
df_cleaned['male_smokers'].fillna(male_smokers_median, inplace=True)

#hospital beds are also small countries or poor countries, so we assume 0 
df_cleaned['hospital_beds_per_thousand'].fillna(0, inplace=True)
#only puerto rico has no human development index, so we drop it
df_cleaned.dropna(subset=['human_development_index'],inplace=True)

#df_cleaned.to_csv('resources/covid_data_cleaned.csv')
df_cleaned

Unnamed: 0,continent,location,date,new_cases_per_million,new_deaths_per_million,reproduction_rate,icu_patients_per_million,hosp_patients_per_million,new_tests_per_thousand,people_vaccinated_per_hundred,...,aged_65_older,aged_70_older,gdp_per_capita,cardiovasc_death_rate,diabetes_prevalence,female_smokers,male_smokers,hospital_beds_per_thousand,life_expectancy,human_development_index
0,Asia,Afghanistan,2020-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,2.581,1.337,1803.987,597.029,9.59,6.2,31.4,0.5,64.83,0.511
1,Asia,Afghanistan,2020-01-06,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,2.581,1.337,1803.987,597.029,9.59,6.2,31.4,0.5,64.83,0.511
2,Asia,Afghanistan,2020-01-07,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,2.581,1.337,1803.987,597.029,9.59,6.2,31.4,0.5,64.83,0.511
3,Asia,Afghanistan,2020-01-08,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,2.581,1.337,1803.987,597.029,9.59,6.2,31.4,0.5,64.83,0.511
4,Asia,Afghanistan,2020-01-09,0.0,0.0,0.0,0.0,0.0,0.0,0.00,...,2.581,1.337,1803.987,597.029,9.59,6.2,31.4,0.5,64.83,0.511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397672,Africa,Zimbabwe,2024-04-24,0.0,0.0,0.0,0.0,0.0,0.0,39.45,...,2.822,1.882,1899.775,307.846,1.82,1.6,30.7,1.7,61.49,0.571
397673,Africa,Zimbabwe,2024-04-25,0.0,0.0,0.0,0.0,0.0,0.0,39.45,...,2.822,1.882,1899.775,307.846,1.82,1.6,30.7,1.7,61.49,0.571
397674,Africa,Zimbabwe,2024-04-26,0.0,0.0,0.0,0.0,0.0,0.0,39.45,...,2.822,1.882,1899.775,307.846,1.82,1.6,30.7,1.7,61.49,0.571
397675,Africa,Zimbabwe,2024-04-27,0.0,0.0,0.0,0.0,0.0,0.0,39.45,...,2.822,1.882,1899.775,307.846,1.82,1.6,30.7,1.7,61.49,0.571


In [10]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 282307 entries, 0 to 397676
Data columns (total 25 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   continent                            282307 non-null  object 
 1   location                             282307 non-null  object 
 2   date                                 282307 non-null  object 
 3   new_cases_per_million                282307 non-null  float64
 4   new_deaths_per_million               282307 non-null  float64
 5   reproduction_rate                    282307 non-null  float64
 6   icu_patients_per_million             282307 non-null  float64
 7   hosp_patients_per_million            282307 non-null  float64
 8   new_tests_per_thousand               282307 non-null  float64
 9   people_vaccinated_per_hundred        282307 non-null  float64
 10  people_fully_vaccinated_per_hundred  282307 non-null  float64
 11  total_boosters_per

In [12]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestRegressor
from doubleml import DoubleMLData, DoubleMLPLR

# Load your data
data = df_cleaned

# Define outcome, treatment, and base covariates
outcome = 'new_cases_per_million'
treatments = ['people_fully_vaccinated_per_hundred', 'stringency_index']
base_covariates = ['population_density', 'median_age', 'gdp_per_capita', 'life_expectancy']

# One-hot encode the continent column
data = pd.get_dummies(data, columns=['continent'], drop_first=True)

# Update covariates to include the one-hot encoded continent columns
covariates = base_covariates + [col for col in data.columns if col.startswith('continent_')]

# Create interaction terms with continent
for treatment in treatments:
    for continent_col in [col for col in data.columns if col.startswith('continent_')]:
        data[f'{treatment}_x_{continent_col}'] = data[treatment] * data[continent_col]

# Prepare data for DoubleML
dml_data = DoubleMLData(data, y_col=outcome, d_cols=treatments, x_cols=covariates)

# Specify the machine learning models for the nuisance parameters
ml_g = RandomForestRegressor()
ml_m = RandomForestRegressor()

# Initialize the DoubleMLPLR model
dml_plr = DoubleMLPLR(dml_data, ml_g, ml_m)

# Fit the model
dml_plr.fit()

# Print the results
print(dml_plr.summary)

TypeError: 'DataFrame' object is not callable

In [14]:
dml_plr.summary

Unnamed: 0,coef,std err,t,P>|t|,2.5 %,97.5 %
people_fully_vaccinated_per_hundred,1.869575,0.104332,17.919461,8.313033e-72,1.665088,2.074062
stringency_index,-30.180387,6.895338,-4.376926,1.203646e-05,-43.695001,-16.665773
icu_patients_per_million,22.173431,1.185349,18.706241,4.403644999999999e-78,19.850189,24.496674
