In [1]:
import io
import requests
import numpy as np
import pandas as pd
from collections import namedtuple

# Datasets

## Government Measurement Dataset

In [4]:
# Oxford Covid-19 Government Response Tracker (OxCGRT)
oxcgrt_url = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
oxcgrt_data = requests.get(oxcgrt_url).content
oxcgrt_data = pd.read_csv(io.StringIO(oxcgrt_data.decode('utf-8')))

## Testing Cases Dataset

In [6]:
test_url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/testing/covid-testing-all-observations.csv'
test_data = requests.get(test_url).content
test_data = pd.read_csv(io.StringIO(test_data.decode('utf-8')))

## Confirmed Cases Dataset

In [8]:
confirmed_url = 'https://datahub.io/core/covid-19/r/time-series-19-covid-combined.csv'
confirmed_data = requests.get(confirmed_url).content
confirmed_data = pd.read_csv(io.StringIO(confirmed_data.decode('utf-8')))

# Data Cleaning

#### Due to different date format adopted by different dataset, we need to unify date format across different datasets.

In [11]:
oxcgrt_data['Date'] = oxcgrt_data['Date'].apply(lambda x: pd.to_datetime(x, format='%Y%m%d'))
test_data['Date'] = test_data['Date'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))
confirmed_data['Date'] = confirmed_data['Date'].apply(lambda x: pd.to_datetime(x, format='%Y-%m-%d'))

#### Map from measurement name to its abbreviation and vice versa for government measure dataset.

In [13]:
a2m = {'S1': 'S1_School closing', 'S2': 'S2_Workplace closing', 'S3': 'S3_Cancel public events',
       'S4': 'S4_Close public transport', 'S5': 'S5_Public information campaigns',
       'S6': 'S6_Restrictions on internal movement', 'S7': 'S7_International travel controls'}
m2a = {v: k for k, v in a2m.items()}

#### Government measuers is divided into 13 indicators (S1 ~ S13), and we focus on S1 to S7,
#### because S8 ~ S11 are fiscal and monetary policies and S12 ~ S13 are testing and contact tracing policies.
#### A "Stringency Index" is also provided to measure the strigency of government measures.
#### More descriptions can be found: https://www.bsg.ox.ac.uk/sites/default/files/2020-04/BSG-WP-2020-031-v4.0_0.pdf,
#### and https://www.bsg.ox.ac.uk/sites/default/files/Calculation%20and%20presentation%20of%20the%20Stringency%20Index.pdf

In [14]:
# select S1 to S7 and Stringency Index
dc_measure_data = dict()
Measure = namedtuple('Measure', 'S1 S2, S3, S4, S5, S6, S7, stringency')
for index, row in oxcgrt_data.iterrows():
    value = []
    for abbr, measure in a2m.items():
        if np.isnan(row[measure]):
            value.append(None)
        else:
            value.append(row[measure])
    if np.isnan(row['StringencyIndexForDisplay']):
        value.append(None)
    else:
        value.append(row['StringencyIndexForDisplay'])

    key = (row['Date'], row['CountryName'])
    value = Measure(*value)
    dc_measure_data[key] = value

#### Due to incomplete reports by different governments, there are some invalid (NAN) values in the govrenment measurement dataset.
#### We refill invalid values by the following steps:
1. sort dictionary by keys which are tuples: first by country, second by date
2. refill None values using the value of the before/after days, as measures won't change significantly
3. delete a data sample if we cannot find a valid value in maximum 7 before/after days

In [16]:
dc_measure_data = {elem[0]: elem[1] for elem in sorted(dc_measure_data.items(), key=lambda x: (x[0][1], x[0][0]))}
delete_key = []
for key, value in dc_measure_data.items():
    for s, v in value._asdict().items():
        if v is None:
            refill = False
            # forward pass
            cnt = 1
            prev_date = key[0] + pd.DateOffset(-1)
            prev_key = (prev_date, key[1])
            while cnt < 7:
                if dc_measure_data.get(prev_key, None) is not None:
                    prev_v = getattr(dc_measure_data.get(prev_key, None), s)
                    if prev_v is not None:
                        dc_measure_data[key] = dc_measure_data[key]._replace(**{s: prev_v})
                        refill = True
                        break
                    else:
                        prev_date = prev_date + pd.DateOffset(-1)
                        prev_key = (prev_date, key[1])
                        cnt += 1
                else:
                    prev_date = prev_date + pd.DateOffset(-1)
                    prev_key = (prev_date, key[1])
                    cnt += 1
            if not refill:
                cnt = 1
                after_date = key[0] + pd.DateOffset(1)
                after_key = (after_date, key[1])
                while cnt < 7:
                    if dc_measure_data.get(after_key, None) is not None:
                        after_v = getattr(dc_measure_data.get(after_key, None), s)
                        if after_v is not None:
                            dc_measure_data[key] = dc_measure_data[key]._replace(**{s: after_v})
                            refill = True
                            break
                        else:
                            after_date = after_date + pd.DateOffset(-1)
                            after_key = (after_date, key[1])
                            cnt += 1
                    else:
                        after_date = after_date + pd.DateOffset(-1)
                        after_key = (after_date, key[1])
                        cnt += 1
                if not refill:
                    delete_key.append(key)
                    break

dc_measure_data = {k: v for k, v in dc_measure_data.items() if k not in delete_key}

#### In the test cases dataset, different countries report test numbers under one or more standards, including:
#### 'tests performed', 'cases tested', 'people tested', ..., and 'unit unclear'.
#### More information can be found: https://ourworldindata.org/covid-testing#our-checklist-for-covid-19-testing-data.
#### To keep consistency, we only keep the test numbers under one standard which has the maximum available data samples.
#### Note that as long as the standard in any single country is the same along the timeline, the analysis is valid.
#### Steps are as follows:

Step 1: get the number of available data for each country under each standard

In [22]:
sc_tested_dict = dict()     # 'sc' means standard and country
for index, row in test_data.iterrows():
    entity = row['Entity']
    country, standard = entity.split('-')
    country = country.strip()
    standard = standard.strip()
    if country not in sc_tested_dict.keys():
        sc_tested_dict[country] = {standard: 1}
    else:
        if standard not in sc_tested_dict[country]:
            sc_tested_dict[country][standard] = 1
        else:
            sc_tested_dict[country][standard] += 1

Step 2: find the standard with the maximum number of availabel data for each country

In [21]:
ssc_tested_dict = dict()    # 'ssc' means single standard and country
for country, value in sc_tested_dict.items():
    sorted_value = {k: v for k, v in sorted(value.items(), key=lambda item: -item[1])}
    standard = list(sorted_value)[0]
    ssc_tested_dict[country] = standard

Step 3: a). remove data that is not consistent with the selected standard; b). reformat Entity name

In [23]:
for index, row in test_data.iterrows():
    entity = row['Entity']
    country, standard = entity.split('-')
    country = country.strip()
    standard = standard.strip()
    if standard != ssc_tested_dict[country]:
        test_data.drop(index, inplace=True)
    else:
        test_data.loc[index, 'Entity'] = country

#### Record the number of daily test cases using the number of cumulative cases.

In [24]:
dc_tested_dict = dict()
Tested = namedtuple('Tested', 'cumulative daily')
for index, row in test_data.iterrows():
    key = (row['Date'], row['Entity'])
    if not np.isnan(row['Daily change in cumulative total']):
        dc_tested_dict[key] = Tested(int(row['Cumulative total']), int(row['Daily change in cumulative total']))
    else:
        dc_tested_dict[key] = Tested(int(row['Cumulative total']), None)

#### In the confirmed cases dataset, the number of cummulative confirmed cases in some countries are inconsistent, and we need to:
#### 1. Remove invalid (NaN) number of confirmed cases.
#### 2. For some countries, combine all province/state statistics into a single country-level statistic.

In [25]:
dc_confirmed_dict = dict()  # 'dc' means date and country
for index, row in confirmed_data.iterrows():
    key = (row['Date'], row['Country/Region'])
    if not np.isnan(row['Confirmed']):
        if key in dc_confirmed_dict.keys():
            dc_confirmed_dict[key] = dc_confirmed_dict[key] + int(row['Confirmed'])
        else:
            dc_confirmed_dict[key] = int(row['Confirmed'])

#### compute daily confirmed cases using the number of cumulative cases.

In [26]:
Confirmed = namedtuple('Confirmed', 'cumulative daily')
for key, value in dc_confirmed_dict.items():
    prev_date_key = (key[0] + pd.DateOffset(-1), key[1])
    if prev_date_key in dc_confirmed_dict.keys():
        if not isinstance(dc_confirmed_dict[prev_date_key], Confirmed):
            dc_confirmed_dict[key] = Confirmed(value, value - dc_confirmed_dict[prev_date_key])
        else:
            dc_confirmed_dict[key] = Confirmed(value, value - dc_confirmed_dict[prev_date_key].cumulative)
    else:
        dc_confirmed_dict[key] = Confirmed(value, None)

# Date Integration

#### Combine data from three datasets together using dictionary structure: key - (date, country), value - (Measure, Confirmed, Tested).
#### Because the three datasets are collected by different organizations/groups, we need to filter out incomplete data samples.
#### Each item represent a sample for a country in a day which appears in all three datasets.

In [29]:
dc_combined_data = dict()   # 'dc' means date and country
Combined = namedtuple('Combined', 'measure confirmed tested')
for key, value in dc_measure_data.items():
    if key in dc_confirmed_dict and key in dc_tested_dict:
        dc_combined_data[key] = Combined(value, dc_confirmed_dict[key], dc_tested_dict[key])

# Have a look at the final combined dataset

In [31]:
print('Total number of samples: {}'.format(len(dc_combined_data)))

Total number of samples: 2547


In [40]:
import random

key = random.choice(list(dc_combined_data.keys()))
print('A random sample in the integrated dataset:')
print(key)
print(dc_combined_data[key])

A random sample in the integrated dataset:
(Timestamp('2020-02-20 00:00:00'), 'South Africa')
Combined(measure=Measure(S1=0.0, S2=0.0, S3=0.0, S4=0.0, S5=0.0, S6=0.0, S7=1.0, stringency=4.76), confirmed=Confirmed(cumulative=0, daily=0), tested=Tested(cumulative=106, daily=11))
