In [114]:
import pickle
import numpy as np
import pandas as pd
import os
import urllib.request
import glob

In [115]:
# Main source for the training data (Oxford OxCGRT "latest" CSV on GitHub)
DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'

# Path the downloaded copy is written to, relative to the working directory
DATA_FILE = 'OxCGRT_latest.csv'

# Fetch the CSV and store it at DATA_FILE; any existing file is overwritten.
# Left as the last expression so the cell still displays urlretrieve's result.
urllib.request.urlretrieve(url=DATA_URL, filename=DATA_FILE)

('OxCGRT_latest.csv', <http.client.HTTPMessage at 0x2c103bdb048>)

In [116]:
# Load the downloaded Oxford CSV. Malformed rows are skipped (with a warning)
# instead of aborting the whole load.
_READ_CSV_KWARGS = dict(
    parse_dates=['Date'],
    encoding="ISO-8859-1",
    dtype={"RegionName": str,
           "RegionCode": str},
)
try:
    # `on_bad_lines` is the supported spelling since pandas 1.3;
    # `error_bad_lines` was deprecated then and removed in pandas 2.0.
    oxford_data = pd.read_csv(DATA_FILE, on_bad_lines="warn", **_READ_CSV_KWARGS)
except TypeError:
    # Older pandas does not know `on_bad_lines`; fall back to the legacy keyword.
    oxford_data = pd.read_csv(DATA_FILE, error_bad_lines=False, **_READ_CSV_KWARGS)

# One identifier per (country, region) pair; used as the grouping key below.
oxford_data['GeoID'] = oxford_data['CountryName'] + '__' + oxford_data['RegionName'].astype(str)

# Daily new cases = day-over-day difference of the cumulative count within each
# GeoID. Rows where the difference is undefined (first row of a group, gaps in
# ConfirmedCases) are set to 0.
oxford_data['NewCases'] = oxford_data.groupby('GeoID').ConfirmedCases.diff().fillna(0)

# NOTE: an earlier version followed this with a per-group `interpolate()` pushed
# back through `DataFrame.update`. Because NewCases has already been filled with
# 0 above, there was nothing left to interpolate and the step could not change
# any value, so it has been dropped.

# 7-row rolling mean of NewCases within each GeoID. The first six rows of every
# group are NaN because the window is not yet full.
oxford_data['NewCasesSmoothed'] = oxford_data.groupby('GeoID')['NewCases'].transform(lambda s: s.rolling(7).mean())

In [117]:
def replace_implemented_plan_with_real_infections(file_dir, oxford_data=None):
    """Append an 'Implemented plan real' copy of the implemented plan to a CSV.

    The CSV at ``file_dir`` is read, its rows with
    ``PrescriptionIndex == 'Implemented plan'`` are copied, the copy's
    ``Infections`` / ``InfectionsRolling`` columns are replaced with the
    ``NewCases`` / ``NewCasesSmoothed`` values from ``oxford_data`` for the same
    country, region and date, the copy is relabelled ``'Implemented plan real'``
    and appended, and the file is overwritten in place.

    Parameters
    ----------
    file_dir : str
        Path of the CSV file to process. It must contain the columns
        ``PrescriptionIndex``, ``Date``, ``CountryName`` and ``RegionName``.
    oxford_data : pandas.DataFrame, optional
        Frame with ``CountryName``, ``RegionName``, ``Date``, ``NewCases`` and
        ``NewCasesSmoothed`` columns. When omitted, the module-level
        ``oxford_data`` is looked up at call time, so re-running the loading
        cell is picked up without re-defining this function.

    Returns
    -------
    None
        The function works by side effect (rewrites the file, prints a status
        line). Files that already contain 'Implemented plan real' rows, or that
        have no 'Implemented plan' rows, are left untouched.
    """
    if oxford_data is None:
        try:
            oxford_data = globals()['oxford_data']
        except KeyError:
            raise NameError(
                "oxford_data is not defined; load it first or pass it explicitly."
            ) from None

    df = pd.read_csv(file_dir, index_col = 0)

    if 'Implemented plan real' in df.PrescriptionIndex.unique():
        # it means that this file has already been processed
        print(f'Skipping file {file_dir} because it has already been processed.')
        return

    implemented_plan = df[df['PrescriptionIndex'] == 'Implemented plan'].reset_index(drop=True)

    if implemented_plan.empty:
        # Nothing to copy; avoid an IndexError on the .iloc[0] lookups below.
        print(f'Skipping file {file_dir} because it has no "Implemented plan" rows.')
        return

    # Parsed copy of the plan dates, used only for matching; the 'Date' column
    # itself is written back unchanged.
    plan_dates = pd.DatetimeIndex(pd.to_datetime(implemented_plan['Date']))
    country_name = implemented_plan['CountryName'].iloc[0]
    region_name = str(implemented_plan['RegionName'].iloc[0])

    same_geo = ((oxford_data['CountryName'] == country_name)
                & (oxford_data['RegionName'].astype(str) == region_name))
    observed = oxford_data.loc[same_geo, ['Date', 'NewCases', 'NewCasesSmoothed']].copy()
    observed['Date'] = pd.to_datetime(observed['Date'])
    # reindex() below needs a unique index; keep the first row for each date.
    observed = observed.drop_duplicates(subset='Date').set_index('Date')

    # Align by date rather than by position, so Oxford rows that are out of
    # order or missing cannot shift values onto the wrong day.
    aligned = observed.reindex(plan_dates)
    missing_dates = ~plan_dates.isin(observed.index)
    if missing_dates.any():
        print(f'Warning: {int(missing_dates.sum())} date(s) in {file_dir} have no '
              'matching Oxford data; their real infections are left empty.')

    implemented_plan['Infections'] = aligned['NewCases'].to_numpy()
    implemented_plan['InfectionsRolling'] = aligned['NewCasesSmoothed'].to_numpy()
    # Relabel only the plan identifier (a frame-wide replace would also rewrite
    # any other column that happened to hold the same string).
    implemented_plan['PrescriptionIndex'] = 'Implemented plan real'

    df = pd.concat([df, implemented_plan], axis=0)
    df = df.reset_index(drop=True)

    # NOTE(review): the file is read with index_col=0 but written without an
    # index, so the first column of the input is not written back. Kept as-is;
    # confirm this is the intended output format.
    df.to_csv(file_dir, index = False)

    print(f'File {file_dir} has been processed.')

    return

In [119]:
# Directory holding the prescription CSV files to be processed
CSV_DIR = 'data'

# glob.glob returns matches in arbitrary, filesystem-dependent order; sort them
# so the files are processed (and logged) in the same order on every run.
file_list = sorted(glob.glob(os.path.join(CSV_DIR, '*.csv')))

# Each file is rewritten in place; files that were already processed are
# detected and skipped by the function itself.
for file_dir in file_list:
    replace_implemented_plan_with_real_infections(file_dir)

File data\country-Argentina_category-0_weights-combined_start-20200925.csv has been processed.
File data\country-Argentina_category-0_weights-gdp_start-20200925.csv has been processed.
File data\country-Argentina_category-0_weights-social_start-20200925.csv has been processed.
File data\country-Argentina_category-1_weights-combined_start-20210110.csv has been processed.
File data\country-Argentina_category-1_weights-gdp_start-20210110.csv has been processed.
File data\country-Argentina_category-1_weights-social_start-20210110.csv has been processed.
File data\country-Argentina_category-2_weights-combined_start-20210417.csv has been processed.
File data\country-Argentina_category-2_weights-gdp_start-20210417.csv has been processed.
File data\country-Argentina_category-2_weights-social_start-20210417.csv has been processed.
File data\country-Argentina_category-m1_weights-combined_start-20201110.csv has been processed.
File data\country-Argentina_category-m1_weights-gdp_start-20201110.csv