In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import math

In [64]:
# Notebook display settings: render up to 500 rows of a DataFrame before truncating
pd.options.display.max_rows = 500

In [65]:
# Load the raw data from 'owid-covid-data.csv' (path is relative to the working directory)
dataset = pd.read_csv(filepath_or_buffer='owid-covid-data.csv')

Separate the data into two datasets: the rows for the `World` aggregate and the rows for all other locations

In [66]:
# Split the rows on the 'location' column: the 'World' aggregate goes into
# its own frame and every other location stays in the countries frame.
# .copy() makes both results independent frames rather than filtered
# selections of the loaded data, so later column assignments/drops on them
# do not trigger SettingWithCopyWarning.
# NOTE(review): only 'World' is filtered out; if the file contains other
# aggregate locations (e.g. continents) they remain in `countries_dataset`
# -- confirm against the data.
is_world = dataset['location'] == 'World'
countries_dataset = dataset[~is_world].copy()
world_dataset = dataset[is_world].copy()

# Persist both subsets. The row index is written as the first (unnamed)
# column because index=False is not passed.
countries_dataset.to_csv('countries-covid-data.csv')
world_dataset.to_csv('world-covid-data.csv')

# From here on the notebook works on the non-'World' rows only.
# NOTE(review): this rebinds `dataset`, so re-running this cell without
# re-loading the CSV first yields an empty `world_dataset` and overwrites
# 'world-covid-data.csv' with it.
dataset = countries_dataset


In [67]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataset.describe()

Unnamed: 0,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,total_cases_per_million,new_cases_per_million,new_cases_smoothed_per_million,total_deaths_per_million,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
count,120124.0,120120.0,119108.0,109128.0,109281.0,119108.0,119490.0,119486.0,118479.0,108507.0,...,87227.0,85955.0,56113.0,101866.0,120559.0,112239.0,4580.0,4580.0,4580.0,4580.0
mean,1064025.0,4443.356477,4458.129641,26592.06,98.623558,90.1392,19282.105114,85.898716,85.960025,397.157848,...,10.622174,32.732056,50.767923,3.032268,73.262043,0.726217,30724.468472,8.455262,15.6589,755.471243
std,5234394.0,21975.616262,21596.55115,113662.1,428.976497,397.723157,32180.002429,198.380924,167.812853,658.903174,...,10.536183,13.560467,31.991141,2.463929,7.553243,0.150484,86337.136808,15.822375,31.093173,1139.369805
min,1.0,-74347.0,-6223.0,1.0,-1918.0,-232.143,0.001,-3125.829,-272.971,0.001,...,0.1,7.7,1.188,0.1,53.28,0.394,-31959.4,-27.35,-95.92,-1749.128494
25%,2157.0,3.0,9.571,73.0,0.0,0.0,389.553,0.279,1.585,11.324,...,1.9,21.6,19.351,1.3,67.92,0.602,-186.075,-1.09,-0.99,-46.8954
50%,22972.5,91.0,115.429,636.0,2.0,1.714,3018.5855,10.781,15.171,75.451,...,6.3,31.4,49.542,2.397,74.79,0.744,1992.15,5.03,6.275,345.233709
75%,226465.8,914.0,966.03575,5448.0,18.0,15.429,24285.9015,83.88675,93.985,506.708,...,19.3,41.3,83.241,4.0,78.8,0.848,19256.7,13.44,21.7425,1304.780877
max,79459950.0,525129.0,497521.143,1309961.0,8524.0,5721.857,230382.564,8620.69,3385.473,6003.582,...,44.0,78.1,100.0,13.8,86.75,0.957,754457.3,106.83,373.48,6142.92247


In [68]:
# List the name of every column (variable) available in the dataset
dataset.columns.tolist()

['iso_code',
 'continent',
 'location',
 'date',
 'total_cases',
 'new_cases',
 'new_cases_smoothed',
 'total_deaths',
 'new_deaths',
 'new_deaths_smoothed',
 'total_cases_per_million',
 'new_cases_per_million',
 'new_cases_smoothed_per_million',
 'total_deaths_per_million',
 'new_deaths_per_million',
 'new_deaths_smoothed_per_million',
 'reproduction_rate',
 'icu_patients',
 'icu_patients_per_million',
 'hosp_patients',
 'hosp_patients_per_million',
 'weekly_icu_admissions',
 'weekly_icu_admissions_per_million',
 'weekly_hosp_admissions',
 'weekly_hosp_admissions_per_million',
 'new_tests',
 'total_tests',
 'total_tests_per_thousand',
 'new_tests_per_thousand',
 'new_tests_smoothed',
 'new_tests_smoothed_per_thousand',
 'positive_rate',
 'tests_per_case',
 'tests_units',
 'total_vaccinations',
 'people_vaccinated',
 'people_fully_vaccinated',
 'total_boosters',
 'new_vaccinations',
 'new_vaccinations_smoothed',
 'total_vaccinations_per_hundred',
 'people_vaccinated_per_hundred',
 'peo

In [72]:
# Shape of the dataset as a (number of rows, number of columns) tuple
dataset.shape

(127167, 65)

In [70]:
# Data type (dtype) of each column
dataset.dtypes


iso_code                                    object
continent                                   object
location                                    object
date                                        object
total_cases                                float64
new_cases                                  float64
new_cases_smoothed                         float64
total_deaths                               float64
new_deaths                                 float64
new_deaths_smoothed                        float64
total_cases_per_million                    float64
new_cases_per_million                      float64
new_cases_smoothed_per_million             float64
total_deaths_per_million                   float64
new_deaths_per_million                     float64
new_deaths_smoothed_per_million            float64
reproduction_rate                          float64
icu_patients                               float64
icu_patients_per_million                   float64
hosp_patients                  

In [71]:
# Convert the 'date' column (loaded as object dtype) to datetime values.
# assign() returns a new frame instead of writing into `dataset` in place:
# `dataset` was produced by filtering another frame, and in-place column
# assignment on such a frame can raise SettingWithCopyWarning.
dataset = dataset.assign(date=pd.to_datetime(dataset['date']))

### Variables description

| Variable                             | Description                                                                                                    |
|:-------------------------------------|:---------------------------------------------------------------------------------------------------------------|
| `icu_patients`                       | Number of COVID-19 patients in intensive care units (ICUs) on a given day                                      |
| `icu_patients_per_million`           | Number of COVID-19 patients in intensive care units (ICUs) on a given day per 1,000,000 people                 |
| `hosp_patients`                      | Number of COVID-19 patients in hospital on a given day                                                         |
| `hosp_patients_per_million`          | Number of COVID-19 patients in hospital on a given day per 1,000,000 people                                    |
| `weekly_icu_admissions`              | Number of COVID-19 patients newly admitted to intensive care units (ICUs) in a given week                      |
| `weekly_icu_admissions_per_million`  | Number of COVID-19 patients newly admitted to intensive care units (ICUs) in a given week per 1,000,000 people |
| `weekly_hosp_admissions`             | Number of COVID-19 patients newly admitted to hospitals in a given week                                        |
| `weekly_hosp_admissions_per_million` | Number of COVID-19 patients newly admitted to hospitals in a given week per 1,000,000 people                   |

### Policy responses
| Variable           | Description                                                                                                                                                                                                         |
|:-------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `stringency_index` | Government Response Stringency Index: composite measure based on 9 response indicators including school closures, workplace closures, and travel bans, rescaled to a value from 0 to 100 (100 = strictest response) |

### Reproduction rate
| Variable            | Description                                                                                                                                   |
|:--------------------|:----------------------------------------------------------------------------------------------------------------------------------------------|
| `reproduction_rate` | Real-time estimate of the effective reproduction rate (R) of COVID-19. See https://github.com/crondonm/TrackingR/tree/main/Estimates-Database |

### Tests & positivity
| Variable                          | Description                                                                                                                                                                                                                                                                                                          |
|:----------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| `total_tests`                     | Total tests for COVID-19                                                                                                                                                                                                                                                                                             |
| `new_tests`                       | New tests for COVID-19 (only calculated for consecutive days)                                                                                                                                                                                                                                                        |
| `total_tests_per_thousand`        | Total tests for COVID-19 per 1,000 people                                                                                                                                                                                                                                                                            |
| `new_tests_per_thousand`          | New tests for COVID-19 per 1,000 people                                                                                                                                                                                                                                                                              |
| `new_tests_smoothed`              | New tests for COVID-19 (7-day smoothed). For countries that don't report testing data on a daily basis, we assume that testing changed equally on a daily basis over any periods in which no data was reported. This produces a complete series of daily figures, which is then averaged over a rolling 7-day window |
| `new_tests_smoothed_per_thousand` | New tests for COVID-19 (7-day smoothed) per 1,000 people                                                                                                                                                                                                                                                             |
| `positive_rate`                   | The share of COVID-19 tests that are positive, given as a rolling 7-day average (this is the inverse of tests_per_case)                                                                                                                                                                                              |
| `tests_per_case`                  | Tests conducted per new confirmed case of COVID-19, given as a rolling 7-day average (this is the inverse of positive_rate)                                                                                                                                                                                          |
| `tests_units`                     | Units used by the location to report its testing data |

Looking at these variables, one can observe that some of them have paired variables ending in 'per_thousand' or 'per_million'. These paired variables express the same measurement normalized by population size. We can exclude them because they are expected to behave like the raw measurements. For example, icu_patients_per_million carries essentially the same information as icu_patients and should show the same behaviour. new_tests_smoothed is excluded as well, since it is a 7-day smoothed version of new_tests. The variables to be excluded are: 

* icu_patients_per_million
* hosp_patients_per_million
* weekly_icu_admissions_per_million
* weekly_hosp_admissions_per_million
* total_tests_per_thousand
* new_tests_per_thousand
* new_tests_smoothed
* new_tests_smoothed_per_thousand.

In [73]:
# Per-capita and smoothed columns that duplicate the information of a raw
# measurement kept in the dataset.
redundant_columns = [
    'icu_patients_per_million',
    'hosp_patients_per_million',
    'weekly_icu_admissions_per_million',
    'weekly_hosp_admissions_per_million',
    'total_tests_per_thousand',
    'new_tests_per_thousand',
    'new_tests_smoothed',
    'new_tests_smoothed_per_thousand',
]

# Use a non-in-place drop and rebind the name: `inplace=True` on a frame
# that was produced by filtering another frame can raise
# SettingWithCopyWarning, and it offers no performance benefit.
dataset = dataset.drop(columns=redundant_columns)

dataset.shape

(127167, 57)

Keep only the variables covered in this part of the analysis; the remaining variables belong to the other team members and are excluded

In [79]:
# Columns analysed in this notebook; every other column is discarded.
# NOTE(review): 'location' and 'date' are not in this list, so after this
# step rows can no longer be attributed to a location or a day -- confirm
# this is intended.
selected_columns = [
    'icu_patients',
    'hosp_patients',
    'weekly_icu_admissions',
    'weekly_hosp_admissions',
    'stringency_index',
    'reproduction_rate',
    'total_tests',
    'new_tests',
    'positive_rate',
    'tests_per_case',
    'tests_units',
]

# .copy() detaches the selection from the wider frame so that later
# assignments on `dataset` do not raise SettingWithCopyWarning.
dataset = dataset[selected_columns].copy()

dataset.shape

(127167, 11)