In [1]:
import numpy as np
import pandas as pd
from datetime import date
from functools import reduce

## Datasets Used

In the `data` folder, we include a `data_sources.tsv` file to keep track of the datasets we find useful. For each dataset, we record the granularity, the URL, and the date we retrieved the data. If a dataset was published on GitHub, we also record the URL of the specific version.


We kept the original names of the datasets and made only the following necessary changes:

- All datasets from the World Bank have metadata at the top (lines 1-4). We removed those lines.
- `kff/raw_data.csv` includes some extra information from the spreadsheet (lines 1-2 and 56-71), which causes parsing errors. We removed those lines from the table.

In [2]:
# load the catalogue of data sources and show where and when each one was retrieved
used_data = pd.read_csv('./data/data_sources.tsv', delimiter='\t')
used_data.loc[:, ['name', 'url', 'date_retrieved']]

Unnamed: 0,name,url,date_retrieved
0,covid-tracking-states-current,https://github.com/COVID19Tracking/covid-track...,2020-04-23
1,covid-tracking-states-daily,https://github.com/COVID19Tracking/covid-track...,2020-04-23
2,covid-tracking-us-current,https://github.com/COVID19Tracking/covid-track...,2020-04-23
3,covid-tracking-us-daily,https://github.com/COVID19Tracking/covid-track...,2020-04-23
4,khn-icu-beds-by-county,https://khn.org/news/as-coronavirus-spreads-wi...,2020-04-21
5,khn-hospital-by-county,https://khn.org/wp-content/uploads/sites/2/202...,2020-04-21
6,kff-state-actions,https://www.kff.org/health-costs/issue-brief/s...,2020-04-24
7,world-bank-hospital-beds,https://data.worldbank.org/indicator/SH.MED.BE...,2020-04-10
8,world-bank-physicians,https://data.worldbank.org/indicator/SH.MED.PH...,2020-04-24
9,world-bank-nurses,https://data.worldbank.org/indicator/SH.MED.NU...,2020-04-24


## Processing Country Level Data

At the country level, we are creating two tables `country_indicators.csv` and `country_responses.csv`. For the first table, we will collect the following information for each country:

| name  | meaning  | source  |
| :---- | :------- | :------ |
| `country_name` | the name of the country/region | `world-bank-hospital-beds` |
| `country_code` | the ISO 3 country code | `world-bank-hospital-beds` |
| `hospital_beds_per_1000` | number of hospital beds per 1000 people          |`world-bank-hospital-beds`     |
| `physicians_per_1000`   | number of physicians per 1000 people            |`world-bank-physicians`       |
| `nurses_per_1000`      | number of nurses and midwives per 1000 people      |`world-bank-nurses`          |
| `percentage_65up`      | population ages 65 and above (% of total population) |`world-bank-elderly-population`  |

In [3]:
# extract indicator SH.MED.BEDS.ZS (hospital beds per 1000 people)
hospital_beds_df = pd.read_csv('./data/world-bank/API_SH.MED.BEDS.ZS_DS2_en_csv_v2_988924.csv')

# The table has one column per year. For every country keep the most recent
# non-null value between 1960 and 2019: forward-filling along the year axis
# carries the latest reported number into the 2019 column, which is then read
# off. Countries without any value in that range stay NaN.
year_columns = [str(year) for year in range(1960, 2020)]
hospital_beds_df['hospital_beds_per_1000'] = (
    hospital_beds_df[year_columns].ffill(axis=1).iloc[:, -1]
)
hospital_beds = hospital_beds_df[['Country Name', 'hospital_beds_per_1000']]

hospital_beds.head(3)

Unnamed: 0,Country Name,hospital_beds_per_1000
0,Aruba,
1,Afghanistan,0.5
2,Angola,0.8


In [4]:
# extract indicator SH.MED.PHYS.ZS (physicians per 1000 people)
physicians_df = pd.read_csv('./data/world-bank/API_SH.MED.PHYS.ZS_DS2_en_csv_v2_993645.csv')

# The table has one column per year. For every country keep the most recent
# non-null value between 1960 and 2019: forward-filling along the year axis
# carries the latest reported number into the 2019 column, which is then read
# off. Countries without any value in that range stay NaN.
year_columns = [str(year) for year in range(1960, 2020)]
physicians_df['physicians_per_1000'] = (
    physicians_df[year_columns].ffill(axis=1).iloc[:, -1]
)
physicians = physicians_df[['Country Name', 'physicians_per_1000']]

physicians.head(3)

Unnamed: 0,Country Name,physicians_per_1000
0,Aruba,1.12
1,Afghanistan,0.284
2,Angola,0.2149


In [5]:
# extract indicator SH.MED.NUMW.P3 (nurses and midwives per 1000 people)
nurses_df = pd.read_csv('./data/world-bank/API_SH.MED.NUMW.P3_DS2_en_csv_v2_993722.csv')

# The table has one column per year. For every country keep the most recent
# non-null value between 1960 and 2019: forward-filling along the year axis
# carries the latest reported number into the 2019 column, which is then read
# off. Countries without any value in that range stay NaN.
year_columns = [str(year) for year in range(1960, 2020)]
nurses_df['nurses_per_1000'] = (
    nurses_df[year_columns].ffill(axis=1).iloc[:, -1]
)
nurses = nurses_df[['Country Name', 'nurses_per_1000']]

nurses.head(3)

Unnamed: 0,Country Name,nurses_per_1000
0,Aruba,
1,Afghanistan,0.32
2,Angola,1.3123


In [6]:
# extract indicator SP.POP.65UP.TO.ZS (population ages 65 and above, % of total)
elders_df = pd.read_csv('./data/world-bank/API_SP.POP.65UP.TO.ZS_DS2_en_csv_v2_988979.csv')

# The table has one column per year. For every country keep the most recent
# non-null value between 1960 and 2019: forward-filling along the year axis
# carries the latest reported number into the 2019 column, which is then read
# off. Countries without any value in that range stay NaN.
year_columns = [str(year) for year in range(1960, 2020)]
elders_df['percentage_65up'] = (
    elders_df[year_columns].ffill(axis=1).iloc[:, -1]
)
elders = elders_df[['Country Name', 'percentage_65up']]

elders.head(3)

Unnamed: 0,Country Name,percentage_65up
0,Aruba,13.550947
1,Afghanistan,2.584927
2,Angola,2.216374


In [7]:
# country name -> ISO 3 code lookup, taken from the hospital beds table
ISO3 = hospital_beds_df.loc[:, ['Country Name', 'Country Code']]
ISO3.head(3)

Unnamed: 0,Country Name,Country Code
0,Aruba,ABW
1,Afghanistan,AFG
2,Angola,AGO


After collecting all the pieces, we can now merge them and output to the output directory (`processed_data`).

In [8]:

# join every indicator table onto the ISO 3 lookup, one at a time, by country name
data_frames = [ISO3, hospital_beds, physicians, nurses, elders]
df_merged = data_frames[0]
for indicator_table in data_frames[1:]:
    df_merged = df_merged.merge(indicator_table, on=['Country Name'])

# switch the World Bank headers to the snake_case names used in the output table
snake_case_names = {'Country Name': 'country_name', 'Country Code': 'country_code'}
df_merged = df_merged.rename(columns=snake_case_names)

df_merged.head(3)

Unnamed: 0,country_name,country_code,hospital_beds_per_1000,physicians_per_1000,nurses_per_1000,percentage_65up
0,Aruba,ABW,,1.12,,13.550947
1,Afghanistan,AFG,0.5,0.284,0.32,2.584927
2,Angola,AGO,0.8,0.2149,1.3123,2.216374


In [9]:
# save the merged country indicators into the processed_data folder (row index omitted)
country_indicators_csv = './processed_data/country_indicators.csv'
df_merged.to_csv(country_indicators_csv, index=False)

For the second table `country_responses.csv`, we will collect the timeseries data, each containing the following attributes:

| name | meaning | source |
| :--- | :------ | :----- |
| `date` | date of the data collected | `oxford-government-responses` |
| `country_name` | name of the country/region | `oxford-government-responses` |
| `country_code` | ISO 3 country code | `oxford-government-responses` |
| `s1_school_closing` | closing of schools/universities | `oxford-government-responses` |
| `s1_is_general` | whether `s1` is general or targeted | `oxford-government-responses` |
| `s2_workplace_closing` | closing of workplaces | `oxford-government-responses` |
| `s2_is_general` | whether `s2` is general or targeted | `oxford-government-responses` |
| `s3_cancel_public_events` | cancelling public events | `oxford-government-responses` |
| `s3_is_general` | whether `s3` is general or targeted | `oxford-government-responses` |
| `s4_close_public_transport` | closing public transport | `oxford-government-responses` |
| `s4_is_general` | whether `s4` is general or targeted | `oxford-government-responses` |
| `s5_public_information_campaigns` | public info campaign | `oxford-government-responses` |
| `s5_is_general` | whether `s5` is general or targeted | `oxford-government-responses` |
| `s6_restrictions_on_internal_movement` | restricting domestic travels | `oxford-government-responses` |
| `s6_is_general` | whether `s6` is general or targeted | `oxford-government-responses` |
| `s7_international_travel_controls` | restricting international travel | `oxford-government-responses` |
| `s12_testing_framework` | who can get tested | `oxford-government-responses` |
| `s13_contact_tracing` | tracing people who were in close contact with confirmed cases | `oxford-government-responses` |
| `stringency_index` | the sum of policy scores, measuring the strictness of the government policies | `oxford-government-responses` |
| `confirmed` | the number of confirmed cases | `oxford-government-responses` |
| `recovered` | the number of recovered cases | JHU CSSE `time_series_covid19_recovered_global.csv` |
| `deaths` | death toll | `oxford-government-responses` |


This table uses data from [Oxford COVID-19 Government Responses Tracker](https://www.bsg.ox.ac.uk/research/publications/variation-government-responses-covid-19). The authors use a novel index to measure the stringency of government responses. A total of 13 indicators (S1-S13) are used. Nine of them (the ones we chose) are non-financial indicators. A detailed explanation can be found [here](https://www.bsg.ox.ac.uk/sites/default/files/2020-04/BSG-WP-2020-031-v4.0_0.pdf). 

In [10]:
gov_responses_df = pd.read_csv('data/oxford/OxCGRT_Download_240420_164803_Full.csv')

# columns that identify a row
key_columns = ['Date', 'CountryName', 'CountryCode']

# the policy indicators we keep; S1-S6 are each followed by their IsGeneral flag
policy_columns = [
    'S1_School closing', 'S1_IsGeneral',
    'S2_Workplace closing', 'S2_IsGeneral',
    'S3_Cancel public events', 'S3_IsGeneral',
    'S4_Close public transport', 'S4_IsGeneral',
    'S5_Public information campaigns', 'S5_IsGeneral',
    'S6_Restrictions on internal movement', 'S6_IsGeneral',
    'S7_International travel controls',
    'S12_Testing framework',
    'S13_Contact tracing',
]

# the stringency index and the case/death counts
outcome_columns = ['StringencyIndex', 'ConfirmedCases', 'ConfirmedDeaths']

interested_columns = key_columns + policy_columns + outcome_columns

# keep only the columns listed above
gov_responses = gov_responses_df.loc[:, interested_columns]

gov_responses.head(3)

Unnamed: 0,Date,CountryName,CountryCode,S1_School closing,S1_IsGeneral,S2_Workplace closing,S2_IsGeneral,S3_Cancel public events,S3_IsGeneral,S4_Close public transport,...,S5_Public information campaigns,S5_IsGeneral,S6_Restrictions on internal movement,S6_IsGeneral,S7_International travel controls,S12_Testing framework,S13_Contact tracing,StringencyIndex,ConfirmedCases,ConfirmedDeaths
0,20200101,Aruba,ABW,0.0,,0.0,,0.0,,0.0,...,0.0,,0.0,,0.0,0.0,0.0,0.0,,
1,20200102,Aruba,ABW,0.0,,0.0,,0.0,,0.0,...,0.0,,0.0,,0.0,0.0,0.0,0.0,,
2,20200103,Aruba,ABW,0.0,,0.0,,0.0,,0.0,...,0.0,,0.0,,0.0,0.0,0.0,0.0,,


In [11]:
recovered_df = pd.read_csv('./data/jhu-csse/time_series_covid19_recovered_global.csv')

# ignore Province/State, Lat, and Long (the first four columns are
# Province/State, Country/Region, Lat, Long; the rest are one column per date)
date_columns = list(recovered_df.columns[4:])
recovered = recovered_df[['Country/Region'] + date_columns]

# Some countries are listed once per province/territory. Collapse them to a
# single row per country so that the later join on (date, country name) does not
# produce several rows, each with a partial count, for the same country and day.
# NOTE(review): the sum includes overseas territories listed under their parent
# country - confirm this matches the intended country definition.
recovered = recovered.groupby('Country/Region', as_index=False).sum()

# convert to a long table by melting: one row per (country, date)
recovered = recovered.melt(id_vars='Country/Region', var_name='date', value_name='recovered')

recovered.head(3)

Unnamed: 0,Country/Region,date,recovered
0,Afghanistan,1/22/20,0
1,Albania,1/22/20,0
2,Algeria,1/22/20,0


The JHU datasets do not include ISO 3 country codes, which means we need to use the country/region name to do the join. Different datasets may use different names for the same country/region, so we first compare the names used by the two sources.

In [12]:
# country names in Oxford dataset (without dups)
countries1 = set(gov_responses['CountryName'])

# country names in JHU datasets (without dups)
countries2 = set(recovered['Country/Region'])

# names common to both datasets
common = countries1 & countries2

# Each difference holds the names that appear in one dataset only, so it is
# labelled with the dataset the names come from and the one that lacks them.
print(f'In gov_responses but not in recovered: {countries1 - common}\n')
print(f'In recovered but not in gov_responses: {countries2 - common}\n')

Not in gov_responses: {'Lesotho', 'Macao', 'Kyrgyz Republic', 'Slovak Republic', 'Guam', 'South Korea', 'Cape Verde', 'Hong Kong', 'Puerto Rico', 'Taiwan', 'Bermuda', 'Democratic Republic of Congo', 'Aruba', 'Myanmar', 'Czech Republic', 'Palestine', 'Greenland', 'United States'}

Not in recovered: {'Cabo Verde', 'Burma', 'Haiti', 'US', 'Korea, South', 'Senegal', 'Central African Republic', 'Slovakia', 'Congo (Kinshasa)', 'Sao Tome and Principe', 'Cambodia', 'Maldives', 'Eritrea', 'Yemen', 'Montenegro', 'Armenia', 'Bhutan', 'Equatorial Guinea', 'Saint Lucia', 'Monaco', 'Benin', 'Georgia', 'Taiwan*', 'Czechia', 'Togo', 'Liechtenstein', 'Saint Kitts and Nevis', 'Saint Vincent and the Grenadines', 'Bahamas', 'Liberia', 'Grenada', 'Guinea', 'Western Sahara', 'Guinea-Bissau', 'Belarus', 'Latvia', 'Holy See', 'North Macedonia', 'Lithuania', 'Antigua and Barbuda', 'Kyrgyzstan', 'West Bank and Gaza', 'Diamond Princess', 'Fiji', 'Nepal', 'Somalia', 'Malta', 'Timor-Leste', 'MS Zaandam', "Cote d'I

After inspecting the output, we need to perform some translations.

- "Slovakia" to "Slovak Republic"
- "Korea, South" to "South Korea"
- "Kyrgyzstan" to "Kyrgyz Republic"
- "Taiwan*" to "Taiwan"
- "Congo (Kinshasa)" to "Democratic Republic of Congo"
- "US" to "United States"
- "Czechia" to "Czech Republic"

In [13]:
def translate_country(countryName):
    """Translate a JHU country/region name into the name used by the Oxford dataset.

    Parameters
    ----------
    countryName : str
        Country/region name as spelled in the JHU dataset.

    Returns
    -------
    str
        The Oxford spelling when a translation is known, otherwise
        `countryName` unchanged.
    """
    # JHU spelling -> Oxford spelling
    # NOTE(review): the name comparison output also shows Burma/Myanmar and
    # Cabo Verde/Cape Verde, which are not translated here - confirm whether
    # they should be added.
    jhu_to_oxford = {
        "Slovakia": "Slovak Republic",
        "Korea, South": "South Korea",
        "Kyrgyzstan": "Kyrgyz Republic",
        "Taiwan*": "Taiwan",
        "Congo (Kinshasa)": "Democratic Republic of Congo",
        "US": "United States",
        "Czechia": "Czech Republic",
    }
    return jhu_to_oxford.get(countryName, countryName)

# bring the JHU country/region names in line with the Oxford naming
recovered['Country/Region'] = recovered['Country/Region'].map(translate_country)

Similarly, the two datasets represent dates in different formats, so we need to translate those too.

- `M/D/YY` to `YYYYMMDD`

In [14]:
def translate_date(datestr):
    """Convert a JHU date string (`M/D/YY`) into a `YYYYMMDD` integer.

    Parameters
    ----------
    datestr : str
        Date with month, day and year separated by `/`, e.g. `'1/22/20'`.

    Returns
    -------
    int
        The same date encoded as `YYYYMMDD`, e.g. `20200122`.

    Raises
    ------
    ValueError
        If `datestr` does not consist of exactly three integer fields.
    """
    month, day, year = (int(field) for field in datestr.split('/'))
    # the year is read from the string rather than assumed to be 2020;
    # two-digit years are taken to be in the 2000s
    if year < 100:
        year += 2000
    return year * 10000 + month * 100 + day

# translate the dates in the JHU dataset
recovered['date'] = recovered['date'].map(translate_date)

Now we can combine two dataframes by inner join and write the result to a CSV file.

In [15]:
# give the JHU table the same key column names as the Oxford table
join_key_names = {'Country/Region': 'CountryName', 'date': 'Date'}
recovered = recovered.rename(columns=join_key_names)

# snake_case names used in the output table
output_column_names = {
    'Date': 'date',
    'CountryName': 'country_name',
    'CountryCode': 'country_code',
    'S1_School closing': 's1_school_closing',
    'S1_IsGeneral': 's1_is_general',
    'S2_Workplace closing': 's2_workplace_closing',
    'S2_IsGeneral': 's2_is_general',
    'S3_Cancel public events': 's3_cancel_public_events',
    'S3_IsGeneral': 's3_is_general',
    'S4_Close public transport': 's4_close_public_transport',
    'S4_IsGeneral': 's4_is_general',
    'S5_Public information campaigns': 's5_public_information_campaigns',
    'S5_IsGeneral': 's5_is_general',
    'S6_Restrictions on internal movement': 's6_restrictions_on_internal_movement',
    'S6_IsGeneral': 's6_is_general',
    'S7_International travel controls': 's7_international_travel_controls',
    'S12_Testing framework': 's12_testing_framework',
    'S13_Contact tracing': 's13_contact_tracing',
    'StringencyIndex': 'stringency_index',
    'ConfirmedCases': 'confirmed',
    'ConfirmedDeaths': 'deaths',
}

# inner join on (date, country name), then apply the output names
df_merged = (
    gov_responses
    .merge(recovered, on=['Date', 'CountryName'])
    .rename(columns=output_column_names)
)

df_merged.head(3)

Unnamed: 0,date,country_name,country_code,s1_school_closing,s1_is_general,s2_workplace_closing,s2_is_general,s3_cancel_public_events,s3_is_general,s4_close_public_transport,...,s5_is_general,s6_restrictions_on_internal_movement,s6_is_general,s7_international_travel_controls,s12_testing_framework,s13_contact_tracing,stringency_index,confirmed,deaths,recovered
0,20200122,Afghanistan,AFG,0.0,,0.0,,0.0,,0.0,...,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0
1,20200123,Afghanistan,AFG,0.0,,0.0,,0.0,,0.0,...,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0
2,20200124,Afghanistan,AFG,0.0,,0.0,,0.0,,0.0,...,,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0


In [16]:
# save the merged country responses into the processed_data folder (row index omitted)
country_responses_csv = './processed_data/country_responses.csv'
df_merged.to_csv(country_responses_csv, index=False)

## Processing State Level Data

At the state level, we are creating two tables `state_indicators.csv` and `state_responses.csv`. For the first table, we will collect the following information for each state: 

| name  | meaning  | source  |
| :---- | :------- | :------ |
| `state_name` | the name of the state | `khn-icu-beds-by-county` |
| `state_code` | the code of the state | `khn-hospital-by-county` |
| `hospitals_per_1000`      | the number of hospitals per 1000 people |`khn-hospital-by-county`  |
| `icu_beds_per_1000` | number of icu beds per 1000 people | `khn-icu-beds-by-county` |
| `percentage_60up`      | population ages 60 and above (% of total population) |`khn-icu-beds-by-county`  |

In [27]:
# collect the number of icu beds per 1000 people
icu_beds_df = pd.read_csv('./data/khn/data-FPBfZ.csv')
icu_beds_df = icu_beds_df[['State', 'ICU Beds', 'Total Population']]

# add up beds and population within each state before taking the ratio
icu_state_totals = icu_beds_df.groupby(['State'], as_index=False).sum()

icu_beds = pd.DataFrame({
    'state_name': icu_state_totals['State'],
    'icu_beds_per_1000': 1000 * icu_state_totals['ICU Beds'] / icu_state_totals['Total Population'],
})

icu_beds.head(3)

Unnamed: 0,state_name,icu_beds_per_1000
0,Alabama,0.316032
1,Alaska,0.161123
2,Arizona,0.22893


In [25]:
# collect the percentage of people ages 60 and above
elders_df = pd.read_csv('./data/khn/data-FPBfZ.csv')
elders_df = elders_df[['State', 'Population Aged 60+', 'Total Population']]

# add up the 60+ population and the total population within each state
elders_state_totals = elders_df.groupby(['State'], as_index=False).sum()

elders = pd.DataFrame({
    'state_name': elders_state_totals['State'],
    'percentage_60up': 100 * elders_state_totals['Population Aged 60+'] / elders_state_totals['Total Population'],
})

elders.head(3)

Unnamed: 0,state_name,percentage_60up
0,Alabama,21.968157
1,Alaska,15.847894
2,Arizona,22.066078


In [18]:
# collect the number of hospitals (from cost reports) per 1000 people
hospitals_in_df = pd.read_csv('./data/khn/KHN_ICU_bed_county_analysis_2.csv')

hospitals_in_cost_df = hospitals_in_df[['state', 'st', 'hospitals_in_cost_reports', 'Total_pop']]

# add up hospitals and population within each (state name, state code) pair
hospital_state_totals = hospitals_in_cost_df.groupby(['state', 'st'], as_index=False).sum()

hospitals_in_cost = pd.DataFrame({
    'state_name': hospital_state_totals['state'],
    'state_code': hospital_state_totals['st'],
    'hospitals_per_1000': 1000 * hospital_state_totals['hospitals_in_cost_reports'] / hospital_state_totals['Total_pop'],
})

hospitals_in_cost.head(3)

Unnamed: 0,state_name,state_code,hospitals_per_1000
0,Alabama,AL,0.017523
1,Alaska,AK,0.029787
2,Arizona,AZ,0.01072


In [26]:
# join the state-level tables one at a time, by state name
data_frames = [hospitals_in_cost, icu_beds, elders]
df_merged = data_frames[0]
for state_table in data_frames[1:]:
    df_merged = df_merged.merge(state_table, on=['state_name'])

df_merged.head(3)

Unnamed: 0,state_name,state_code,hospitals_per_1000,icu_beds_per_1000,percentage_60up
0,Alabama,AL,0.017523,0.316032,21.968157
1,Alaska,AK,0.029787,0.161123,15.847894
2,Arizona,AZ,0.01072,0.22893,22.066078


In [29]:
# save the merged state indicators into the processed_data folder (row index omitted)
state_indicators_csv = './processed_data/state_indicators.csv'
df_merged.to_csv(state_indicators_csv, index=False)