In [9]:
import urllib
import json
import pandas as pd
import numpy as np
import warnings
import pickle
from datetime import datetime
from datetime import timedelta
from urllib.error import URLError
from functools import wraps
from keys import client_id, client_secret, app_id
warnings.filterwarnings('ignore')

In [10]:
# Pickled input frames that the notebook loads with pd.read_pickle.
DEATH_PATH = '../augmented_datasets/pickles/hopkins_death_withgr_augmented0605.pkl'
CONF_PATH = '../augmented_datasets/pickles/hopkins_conf_withgr_augmented0605_withsocietal.pkl'

# Columns that are inserted into the death frame and later filled from the
# confirmed frame.
COLS = [
    'GDP',
    'Urbanization',
    'Median Age',
    'Democracy',
    'Gini Index',
    'State Population',
    'Total Tests',
    'Tests \\ Pop',
    'avg_mobility',
    'Tests per 1M',
]

In [11]:
def add_index(df, row_name, gap=5):
    """
    Adds a row on level 1 of a df

    Walks the index of ``df`` in order and, every time the level-0 label
    changes, inserts a new ``(label, row_name)`` entry into the index. The
    frame is then reindexed on the extended index, so the new rows hold NaN
    in every column.

    The insert position starts at ``gap`` and advances by ``gap + 1`` for each
    new label, so the new row lands directly after its group only when every
    level-0 group consists of exactly ``gap`` consecutive rows.

    Parameters
    ----------
    df : DataFrame whose index labels are (level-0, level-1) tuples.
    row_name : level-1 label of the row added for each level-0 label.
    gap : number of existing rows per level-0 group.

    Returns
    -------
    A new DataFrame; ``df`` itself is left untouched.
    """
    idx = df.index
    # A fresh object never compares equal to a real label. The previous
    # sentinel, (0, 0), is a legitimate coordinate: a frame whose first group
    # was (0, 0) got no row for that group and misplaced the next insert.
    previous_coor = object()
    i = gap
    # Only the labels are needed, so iterate the index instead of iterrows().
    for label in df.index:
        coor = label[0]
        if coor != previous_coor:
            idx = idx.insert(i, (coor, row_name))
            i += gap + 1
            previous_coor = coor
    # reindex already returns a new frame, so no explicit copy is required.
    return df.reindex(idx)

In [12]:
# Load the death frame and the confirmed-case frame from their pickles.
# NOTE(review): unpickling executes code embedded in the file, so only load
# pickles that come from a trusted source.
hopkins_death, hopkins_conf = (pd.read_pickle(path) for path in (DEATH_PATH, CONF_PATH))

In [13]:
# Add a 'd_mob_change' row under every coordinate of the death frame.
hopkins_death = add_index(hopkins_death, 'd_mob_change')

# NaN placeholder column on the confirmed frame; it is filled further down.
hopkins_conf.insert(11, 'avg_mobility', np.nan)

# Every insert at position 2 pushes the earlier ones to the right, so walking
# COLS backwards leaves the new columns in COLS order starting at position 2.
for name in reversed(COLS):
    hopkins_death.insert(2, name, np.nan)

###### Merge with data from confirmed

In [14]:
# Level-0 label (the coordinate) of every row. Reading it from the index
# avoids iterrows(), which builds a Series per row only to throw it away.
conf_indexs = hopkins_conf.index.get_level_values(0).tolist()
death_indexs = hopkins_death.index.get_level_values(0).tolist()

conf_ind = set(conf_indexs)
death_ind = set(death_indexs)

# Number of distinct coordinates in each frame.
len(conf_ind)
len(death_ind)

# Coordinates that appear in the death frame but not in the confirmed frame.
to_drop = [ind for ind in death_ind if ind not in conf_ind]
len(to_drop)
# hopkins_death.drop(index=to_drop, level=0, inplace=True)

# Distinct coordinates of the death frame. Recomputed from the frame (rather
# than reusing death_ind) so it stays correct if the drop above is re-enabled.
coords = list(set(hopkins_death.index.get_level_values(0).tolist()))


1837

1254

0

In [15]:
# Add two NaN placeholder columns to both frames. Both are inserted at
# position 12, so 'avg_interval_RH' ends up at 12 and 'avg_interval_tmp' at 13.
for frame in (hopkins_death, hopkins_conf):
    for placeholder in ('avg_interval_tmp', 'avg_interval_RH'):
        frame.insert(12, placeholder, np.nan)

In [16]:
# Every confirmed-frame column from position 34 onwards.
# NOTE(review): presumably these are the per-day columns — confirm the offset.
days = hopkins_conf.columns[34:].tolist()

# Strip thousands separators from the text columns and convert them to float.
for numeric_text_col in ('Tests per 1M', 'Democracy'):
    without_commas = hopkins_conf[numeric_text_col].str.replace(',', '')
    hopkins_conf[numeric_text_col] = without_commas.astype(float)

In [17]:
# Replaces the death frame loaded at the top of the notebook: every change
# made to hopkins_death before this line (the added 'd_mob_change' rows and
# the inserted placeholder columns) is discarded.
# NOTE(review): the cells below write into the COLS, 'avg_interval_*' and
# 'd_mob_change' cells of this frame, so the pickle presumably already
# contains them — confirm.
DEATH_GR_FIX_PATH = '../augmented_datasets/pickles/death_data_gr_fix_0605.pkl'
hopkins_death = pd.read_pickle(DEATH_GR_FIX_PATH)

###### This is a patch to get a new df with gf for death cases > 2 rather than 20

In [18]:
# Copy values from the confirmed frame into the death frame, cell by cell:
#   * the COLS columns of each coordinate's 'data' row
#   * the per-day columns of each coordinate's 'd_mob_change' row
#
# The previous form, `frame.loc[coord, col]['data'] = value`, is chained
# assignment: it writes into the intermediate Series returned by .loc, which
# is not guaranteed to be a view of the frame. Rows and columns are resolved
# to positions here and written with a single .iloc call instead.
#
# get_loc raises KeyError for a (coordinate, row) pair that is missing from
# either frame, so a missing row is reported instead of being skipped or
# appended.
copied_columns = [*COLS, *days]
death_col_pos = {name: hopkins_death.columns.get_loc(name) for name in copied_columns}
conf_col_pos = {name: hopkins_conf.columns.get_loc(name) for name in copied_columns}

for coord in coords:
    for row_name, names in (('data', COLS), ('d_mob_change', days)):
        death_row = hopkins_death.index.get_loc((coord, row_name))
        conf_row = hopkins_conf.index.get_loc((coord, row_name))
        for name in names:
            hopkins_death.iloc[death_row, death_col_pos[name]] = \
                hopkins_conf.iloc[conf_row, conf_col_pos[name]]

###### Calculate avg interval temp and mobility for both datasets

In [19]:
def _fill_interval_averages(frame, strict):
    """
    Fill the interval averages on the 'data' row of every coordinate.

    For each level-0 label of ``frame`` the columns from the one named by the
    'data' row's '5%_Date' value up to, but not including, the one named by
    its 'last relevant date' value are selected. The means of the
    'avg_d_tmp', 'avg_d_RH' and 'd_mob_change' rows over that span are
    written to the 'avg_interval_tmp', 'avg_interval_RH' and 'avg_mobility'
    columns of the 'data' row.

    Parameters
    ----------
    frame : DataFrame indexed by (coordinate, row name); modified in place.
    strict : when True a KeyError is re-raised; when False it is reported
        with print and the coordinate is skipped.
    """
    # (target column on the 'data' row, source row averaged over the span)
    targets = (
        ('avg_interval_tmp', 'avg_d_tmp'),
        ('avg_interval_RH', 'avg_d_RH'),
        ('avg_mobility', 'd_mob_change'),
    )
    for coord in set(frame.index.get_level_values(0).tolist()):
        try:
            rows = frame.loc[coord]
            last_rel_date = rows['last relevant date']['data']
            five_prcnt_date = rows['5%_Date']['data']

            first = frame.columns.get_loc(five_prcnt_date)
            stop = frame.columns.get_loc(last_rel_date)
            # Slice only this coordinate's rows; selecting the span from the
            # whole frame for every coordinate is needlessly quadratic.
            interval = rows.iloc[:, first:stop]

            # Write through one positional indexer. The chained form
            # `frame.loc[coord, col]['data'] = value` may update a temporary
            # copy instead of the frame.
            data_row = frame.index.get_loc((coord, 'data'))
            for target, source in targets:
                frame.iloc[data_row, frame.columns.get_loc(target)] = interval.loc[source].mean()
        except KeyError:
            if strict:
                raise
            print('key error, {0}'.format(coord))


# Confirmed frame: coordinates with a missing key are reported and skipped.
_fill_interval_averages(hopkins_conf, strict=False)
# Death frame: a missing key is an error.
_fill_interval_averages(hopkins_death, strict=True)

# Keep the module-level name bound to the death-frame coordinates, as before.
coords = set(hopkins_death.index.get_level_values(0).tolist())

###### Fill places with no mobility data with the mean

In [87]:
# amount of locations with no Google movement data (every 7th row)
int(hopkins_conf['avg_mobility'][::7].isna().sum())

# Assign the filled column back to the frame. Calling fillna(inplace=True) on
# `frame[col]` acts on the Series returned by the lookup and is not guaranteed
# to reach the frame.
# NOTE(review): as before, this fills every row of the column, not only the
# strided rows counted above — confirm that is intended. The second, strided
# fillna that followed was always a no-op after this one and has been dropped.
hopkins_conf['avg_mobility'] = hopkins_conf['avg_mobility'].fillna(hopkins_conf['avg_mobility'].mean())

int(hopkins_conf['avg_mobility'][::7].isna().sum())

0

0

In [21]:
# Independent snapshots of both frames as they are at this point.
conf_bkp, death_bkp = hopkins_conf.copy(), hopkins_death.copy()

In [94]:
# 'Democracy' is converted to float in an earlier cell, and the .str accessor
# raises AttributeError on a numeric column. Convert only while it is still text.
if not pd.api.types.is_numeric_dtype(hopkins_conf['Democracy']):
    hopkins_conf['Democracy'] = hopkins_conf['Democracy'].str.replace(',', '').astype(float)

# Fill missing avg_mobility on the death frame's 'data' rows with their mean.
# The rows are selected by label: the previous `[::7]` stride disagrees with
# the `[::6]` stride used for the narrow death frame, and the chained
# fillna(inplace=True) was not guaranteed to write into the frame.
death_is_data_row = hopkins_death.index.get_level_values(1) == 'data'
death_mobility = hopkins_death.loc[death_is_data_row, 'avg_mobility']
hopkins_death.loc[death_is_data_row, 'avg_mobility'] = \
    death_mobility.fillna(death_mobility.mean()).to_numpy()

###### Save the full dataset - along with daily data

In [95]:
# Write both full frames (all rows) to the final_data directory.
for frame, target in (
    (hopkins_conf, '../augmented_datasets/pickles/final_data/hopkins_conf_full_0605.pkl'),
    (hopkins_death, '../augmented_datasets/pickles/final_data/hopkins_death_full_0605.pkl'),
):
    frame.to_pickle(target)

###### Create narrow dataset and handle nans
We remove the first_7 nans and some other fields, and fill means in the others

In [96]:
# A missing province/state becomes an empty string in both frames.
for frame in (hopkins_conf, hopkins_death):
    frame['Province_State'] = frame['Province_State'].fillna('')

# Narrow frames: every 7th row of the confirmed frame, every 6th of the death frame.
# NOTE(review): presumably these strides pick each coordinate's 'data' row — confirm.
narrow_conf = hopkins_conf.iloc[::7]
narrow_death = hopkins_death.iloc[::6]

Conf data

In [99]:
# Report every narrow_conf column that still has NaN on its 'data' rows,
# together with the number of NaNs. Both the test and the count now read
# narrow_conf; the count used to come from hopkins_conf, which can disagree
# with the frame being tested.
for column in narrow_conf.columns:
    missing = narrow_conf[column].loc[:, 'data'].isna()
    any_nan = missing.any()
    if any_nan:
        size = int(missing.sum())
        print('{0}: {1}, {2}'.format(column, any_nan, size))

Democracy: True, 13
State Population: True, 225
Total Tests: True, 225
Tests \ Pop: True, 225


In [98]:
# Fill the mean into the columns where a missing value is tolerable. fillna
# returns a new frame; calling fillna(inplace=True) on `frame[col]` is not
# guaranteed to reach the frame.
narrow_conf = narrow_conf.fillna({
    'Gini Index': narrow_conf['Gini Index'].mean(),
    'Tests per 1M': narrow_conf['Tests per 1M'].mean(),
})

# Drop the rows that lack any of these fields. `frame[col].dropna(inplace=True)`
# only shortens the Series returned by the lookup and never removes rows from
# the frame, so the drop has to be done on the frame itself.
# NOTE(review): this can reduce the number of rows that get saved — confirm
# that downstream consumers expect that.
narrow_conf = narrow_conf.dropna(subset=['first_7', 'GDP', 'Urbanization'])

Death data

In [105]:
# Report every narrow_death column that still has NaN on its 'data' rows,
# together with the number of NaNs.
for column in narrow_death.columns:
    missing = narrow_death[column].loc[:, 'data'].isna()
    any_nan = missing.any()
    if not any_nan:
        continue
    size = int(missing.sum())
    print('{0}: {1}, {2}'.format(column, any_nan, size))

State Population: True, 192
Total Tests: True, 192
Tests \ Pop: True, 192


In [101]:
# Fill the mean into the columns where a missing value is tolerable. fillna
# returns a new frame; calling fillna(inplace=True) on `frame[col]` is not
# guaranteed to reach the frame.
# NOTE(review): the fill values are the means of narrow_conf, not of
# narrow_death. Kept as written — confirm this is intentional.
narrow_death = narrow_death.fillna({
    'Gini Index': narrow_conf['Gini Index'].mean(),
    'Tests per 1M': narrow_conf['Tests per 1M'].mean(),
})

# Drop the rows that lack any of these fields. `frame[col].dropna(inplace=True)`
# only shortens the Series returned by the lookup and never removes rows from
# the frame, so the drop has to be done on the frame itself.
# NOTE(review): this can reduce the number of rows that get saved — confirm
# that downstream consumers expect that.
narrow_death = narrow_death.dropna(subset=[
    'first_7',
    'avg_interval_RH',
    'avg_interval_tmp',
    'avg_mobility',
    'GDP',
    'Urbanization',
])

###### Sanity checks

In [103]:
# avg_interval_tmp of the rows whose Country_Region is 'Israel'.
israel_rows = narrow_conf['Country_Region'] == 'Israel'
narrow_conf.loc[israel_rows, 'avg_interval_tmp']
# Number of rows in the narrow death frame.
len(narrow_death)

coordinate    information
(31.0, 35.0)  data           18.928125
Name: avg_interval_tmp, dtype: float64

1254

###### Save augmented data
1. Multi index does not save well in csv, so we also save it as a pickle

In [104]:
# Write both narrow frames to the final_data directory.
for frame, target in (
    (narrow_conf, '../augmented_datasets/pickles/final_data/hopkins_conf_narrow_0605.pkl'),
    (narrow_death, '../augmented_datasets/pickles/final_data/hopkins_death_narrow_0605.pkl'),
):
    frame.to_pickle(target)