In [1]:
import urllib
import json
import pandas as pd
import numpy as np
import warnings
import pickle
from datetime import datetime
from datetime import timedelta
from urllib.error import URLError
from functools import wraps
from keys import client_id, client_secret, app_id
warnings.filterwarnings('ignore')

In [3]:
DIAMOND_PRINCESS_COORD = (35.4437, 139.638)
BARBADOS_BELIZE_COORD = (13.1939, -59.5432)
CONGO_BRAZZAVILLE_KINSHASA_COORD = (-4.0383, 21.7587)
MASSACHUSETTS_COORD = (41.40674725, -70.68763497)
RADIUS = 350 # Miles
TIMEOUT = 10 # Seconds
LOG_PATH = 'weather_logs/'
GLOBAL_DEATH_PATH = '../original_datasets/remote_repo/csse_covid_19_data/csse_covid_19_time_series/'\
              'time_series_covid19_deaths_global.csv'
US_DEATH_PATH = '../original_datasets/remote_repo/csse_covid_19_data/csse_covid_19_time_series/'\
          'time_series_covid19_deaths_US.csv'
CONF_PICKLE_PATH = '../augmented_datasets/pickles/hopkins_conf_gf0904_GDP_urban_weather.pkl'

###### Load datasets
Notes on data: 
1. Notice that some of the dates on the hopkins dataset appear in different format in ecxel, they are infect all in the same format: %-m/%-d/20
2. Column names are modifyed to %-m/%-d/20 to fit weather API queries
3. All three rows reffering to 'Diamnond princess' have been removed
4. The location (0,0) has been removed
5. Two places with the same coordinate have been slightly modified to accomedate indexing
4. source: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

In [3]:
global_df = pd.read_csv(GLOBAL_DEATH_PATH)
us_df = pd.read_csv(US_DEATH_PATH)
us_df.drop(columns=['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Combined_Key'], inplace=True)
us_df.rename(columns={'Long_': 'Long'}, inplace=True)
global_df.rename(columns={'Country/Region': 'Country_Region',\
                                                  'Province/State': 'Province_State'}, inplace=True)
hopkins_death = pd.concat([global_df, us_df])
hopkins_death.drop(hopkins_death[hopkins_death['Lat']\
                                             == DIAMOND_PRINCESS_COORD[0]].index, inplace=True)

hopkins_death.drop(hopkins_death[hopkins_death['Lat']\
                                             == MASSACHUSETTS_COORD[0]].index, inplace=True)

hopkins_death.loc[hopkins_death['Country_Region']=='Barbados', 'Lat']\
                        = BARBADOS_BELIZE_COORD[0] + 0.00001

hopkins_death.loc[hopkins_death['Country_Region']=='Congo (Brazzaville)', 'Lat']\
                        = CONGO_BRAZZAVILLE_KINSHASA_COORD[0] + 0.00001

hopkins_death.drop(hopkins_death.loc[hopkins_death['Lat'] == 0].index, inplace=True)

In [7]:
GLOBAL_CONF_PATH = '../original_datasets/remote_repo/csse_covid_19_data/csse_covid_19_time_series/'\
              'time_series_covid19_confirmed_global.csv'
US_CONF_PATH = '../original_datasets/remote_repo/csse_covid_19_data/csse_covid_19_time_series/'\
          'time_series_covid19_confirmed_US.csv'

global_df = pd.read_csv(GLOBAL_CONF_PATH)
us_df = pd.read_csv(US_CONF_PATH)
us_df.drop(columns=['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Combined_Key'], inplace=True)
us_df.rename(columns={'Long_': 'Long'}, inplace=True)
global_df.rename(columns={'Country/Region': 'Country_Region',\
                                                  'Province/State': 'Province_State'}, inplace=True)
hopkins_conf = pd.concat([global_df, us_df])
hopkins_conf.drop(hopkins_conf[hopkins_conf['Lat']\
                                             == DIAMOND_PRINCESS_COORD[0]].index, inplace=True)

hopkins_conf.drop(hopkins_conf[hopkins_conf['Lat']\
                                             == MASSACHUSETTS_COORD[0]].index, inplace=True)

hopkins_conf.loc[hopkins_conf['Country_Region']=='Barbados', 'Lat']\
                        = BARBADOS_BELIZE_COORD[0] + 0.00001

hopkins_conf.loc[hopkins_conf['Country_Region']=='Congo (Brazzaville)', 'Lat']\
                        = CONGO_BRAZZAVILLE_KINSHASA_COORD[0] + 0.00001

hopkins_conf.drop(hopkins_conf.loc[hopkins_conf['Lat'] == 0].index, inplace=True)

This is a very ugly fix to remove duplicates from Utah that spoil the multiindex These indexes should be updated every time a new df is loaded

In [None]:
idx = [2805, 2809, 2782, 2783, 2784, 2785, 2786, 2788, 2789, 2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2800, 2801, 2802, 3255, 3256, 3257, 3258, 3259, 3260]
for ind in idx:
    hopkins_confirmed.drop(index=ind, inplace=True)

###### Setup multi-index

In [4]:
hopkins_death.drop(columns=['Population'], inplace=True)
# hopkins_death = hopkins_death.iloc[0:20]

In [5]:
coords = [x for x in zip(hopkins_death.pop('Lat'), hopkins_death.pop('Long'))]
hopkins_death.index = coords

for param in ['avg_m_wind', 'avg_m_precip', 'avg_m_RH', 'avg_m_tmp',]:
    hopkins_death.insert(2, param, np.nan)
hopkins_death.insert(6, 'weather', '')

columns = len(hopkins_death.columns)

for index in hopkins_death.index:
    for param in ['avg_d_tmp', 'avg_d_RH', 'avg_d_wind', 'avg_d_precip']:
        hopkins_death = hopkins_death.append\
                (pd.Series([np.nan]*6+[param]+[np.nan]*(columns-7), index=hopkins_death.columns, name=index))

hopkins_death.set_index('weather', append=True, inplace=True)
hopkins_death = hopkins_death.sort_index()
dates = {date: date + '20' for date in hopkins_death.columns[6:]}
hopkins_death = hopkins_death.rename(columns=dates)

In [6]:
hopkins_death.rename(index={'': 'data'}, inplace=True)
hopkins_death.rename_axis(['coordinate', 'information'],inplace=True)
for param in ['Max_Cases', 'GF_Q1', 'GF_Q2', 'GF_Q3'][::-1]:
    hopkins_death.insert(6, param, np.nan)

In [7]:
hopkins_death

Unnamed: 0_level_0,Unnamed: 1_level_0,Province_State,Country_Region,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,GF_Q1,GF_Q2,GF_Q3,...,3/29/2020,3/30/2020,3/31/2020,4/1/2020,4/2/2020,4/3/2020,4/4/2020,4/5/2020,4/6/2020,4/7/2020
coordinate,information,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
"(-51.7963, -59.5236)",data,Falkland Islands (Malvinas),United Kingdom,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(-51.7963, -59.5236)",avg_d_RH,,,,,,,,,,,...,,,,,,,,,,
"(-51.7963, -59.5236)",avg_d_precip,,,,,,,,,,,...,,,,,,,,,,
"(-51.7963, -59.5236)",avg_d_tmp,,,,,,,,,,,...,,,,,,,,,,
"(-51.7963, -59.5236)",avg_d_wind,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(71.7069, -42.6043)",data,Greenland,Denmark,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(71.7069, -42.6043)",avg_d_RH,,,,,,,,,,,...,,,,,,,,,,
"(71.7069, -42.6043)",avg_d_precip,,,,,,,,,,,...,,,,,,,,,,
"(71.7069, -42.6043)",avg_d_tmp,,,,,,,,,,,...,,,,,,,,,,


###### Verify integrity, handle NaN and backup dataframe
1. Some coordinates are more then 350 miles away from any weather station, resulting in NaN values
2. Some stations don't save data as far back, resulting in NaN values
3. NaNs are not removed, rather when applying aggragate functions we discard then in the calculations

In [8]:
backup1 = hopkins_death.copy()
# data[['Province_State']] = data[['Province_State']].fillna(0)
# confirmed_time_data = confirmed_time_data.dropna()
# confirmed_time_data.isna().sum()
# confirmed_time_data['1/22/2020']
hopkins_death[['2/9/2020',
       '2/10/2020', '2/11/2020', '2/12/2020', '2/13/2020', '2/14/2020',
       '2/15/2020', '2/16/2020', '2/17/2020', '2/18/2020', '2/19/2020',
       '2/20/2020', '2/21/2020', '2/22/2020', '2/23/2020', '2/24/2020',
       '2/25/2020', '2/26/2020', '2/27/2020', '2/28/2020', '2/29/2020',
       '3/1/2020', '3/2/2020', '3/3/2020', '3/4/2020', '3/5/2020', '3/6/2020',
       '3/7/2020', '3/8/2020']].isnull().sum()

2/9/2020     13608
2/10/2020    13608
2/11/2020    13608
2/12/2020    13608
2/13/2020    13608
2/14/2020    13608
2/15/2020    13608
2/16/2020    13608
2/17/2020    13608
2/18/2020    13608
2/19/2020    13608
2/20/2020    13608
2/21/2020    13608
2/22/2020    13608
2/23/2020    13608
2/24/2020    13608
2/25/2020    13608
2/26/2020    13608
2/27/2020    13608
2/28/2020    13608
2/29/2020    13608
3/1/2020     13608
3/2/2020     13608
3/3/2020     13608
3/4/2020     13608
3/5/2020     13608
3/6/2020     13608
3/7/2020     13608
3/8/2020     13608
dtype: int64

###### Compute max cases and max date

In [10]:
hopkins_death.insert(7, '5%_Date', '')
hopkins_death.insert(7, 'Max_Date', '')

In [29]:
for coord in coords:
    try:
        max_cases = hopkins_death.iloc[:,12:].loc[coord].loc['data'].max()
        max_date_index = hopkins_death.iloc[:,12:].loc[coord].loc['data'].argmax()
        max_date = hopkins_death.iloc[:,12:].loc[coord].loc['data'].index[max_date_index]
        hopkins_death.loc[coord, 'Max_Cases'].loc['data'] = max_cases
        hopkins_death.loc[coord, 'Max_Date'].loc['data'] = max_date
    except Exception as e:
        print(e)
print('Lybia and malta were removed as they had NaN values')

Lybia and malta were removed as they had NaN values


In [31]:
backup2 = hopkins_death.copy()

###### Merge with data from confirmed

In [4]:
hopkins_conf = pd.read_pickle(CONF_PICKLE_PATH)

In [117]:
conf_indexs = []
death_indexs = []
for row in hopkins_conf.iterrows():
    conf_indexs.append(row[0][0])
for row in hopkins_death.iterrows():
    death_indexs.append(row[0][0])

len(set(conf_indexs))
len(set(death_indexs))

conf_ind =  set(conf_indexs)
death_ind = set(death_indexs)
to_drop = [ind for ind in death_ind if ind not in conf_ind]
len(to_drop)
hopkins_death.drop(index=to_drop, level=0, inplace=True)

coords = []
for row in hopkins_death.iterrows():
    coords.append(row[0][0])
coords = list(set(coords))

hopkins_death.insert(2, 'Urbanization', np.nan)
hopkins_death.insert(2, 'GDP', np.nan)

1070

3402

In [165]:
hopkins_conf.columns

Index(['Province_State', 'Country_Region', 'GDP', 'Urbanization', 'avg_m_tmp',
       'avg_m_RH', 'avg_m_precip', 'avg_m_wind', 'Max_Cases', 'first_7',
       'last relevant date', 'Max_Date', '5%_Date', 'avg_interval_tmp',
       'avg_interval_RH', 'GF_Q1', 'GF_Q2', 'GF_Q3', '1/22/2020', '1/23/2020',
       '1/24/2020', '1/25/2020', '1/26/2020', '1/27/2020', '1/28/2020',
       '1/29/2020', '1/30/2020', '1/31/2020', '2/1/2020', '2/2/2020',
       '2/3/2020', '2/4/2020', '2/5/2020', '2/6/2020', '2/7/2020', '2/8/2020',
       '2/9/2020', '2/10/2020', '2/11/2020', '2/12/2020', '2/13/2020',
       '2/14/2020', '2/15/2020', '2/16/2020', '2/17/2020', '2/18/2020',
       '2/19/2020', '2/20/2020', '2/21/2020', '2/22/2020', '2/23/2020',
       '2/24/2020', '2/25/2020', '2/26/2020', '2/27/2020', '2/28/2020',
       '2/29/2020', '3/1/2020', '3/2/2020', '3/3/2020', '3/4/2020', '3/5/2020',
       '3/6/2020', '3/7/2020', '3/8/2020', '3/9/2020', '3/10/2020',
       '3/11/2020', '3/12/2020', '3/13/20

In [166]:
for coord in coords:
    hopkins_death.loc[coord,'GDP']['data'] = hopkins_conf.loc[coord,'GDP']['data']
    hopkins_death.loc[coord,'Urbanization']['data'] = hopkins_conf.loc[coord,'Urbanization']['data']
    hopkins_death.loc[coord,'avg_m_tmp']['data'] = hopkins_conf.loc[coord,'avg_m_tmp']['data']
    hopkins_death.loc[coord,'avg_m_RH']['data'] = hopkins_conf.loc[coord,'avg_m_RH']['data']
    hopkins_death.loc[coord,'avg_m_precip']['data'] = hopkins_conf.loc[coord,'avg_m_precip']['data']
    hopkins_death.loc[coord,'avg_m_wind']['data'] = hopkins_conf.loc[coord,'avg_m_wind']['data']
    hopkins_death.loc[coord].loc['avg_d_RH'] = hopkins_conf.loc[coord].loc['avg_d_RH']
    hopkins_death.loc[coord].loc['avg_d_precip'] = hopkins_conf.loc[coord].loc['avg_d_precip']
    hopkins_death.loc[coord].loc['avg_d_tmp'] = hopkins_conf.loc[coord].loc['avg_d_tmp']
    hopkins_death.loc[coord].loc['avg_d_wind'] = hopkins_conf.loc[coord].loc['avg_d_wind']


In [167]:
backup3 = hopkins_death.copy()

###### Sanity checks
Manual heuristic comparison of selected samples from the dataset
We compared 5 randomly selected data entries as follows:
1. lat and lan on google map
2. Daily information with Aeris API
3. Daily information with a third party climate source - https://www.worldweatheronline.com/
4. monthly information with a third party data source (This will only be an approximation) https://www.timeanddate.com/weather/israel/tel-aviv/climate
Note that coordinates are in decimal representation

We conclude that the data is correct heuristcially, except for precipitation that shows 0 when it is infact more in many cases
Also we see that except for US states coordinates are the same between Tableau and Hopkins datasets

In [168]:
hopkins_death.loc[(31,35)]
hopkins_death.loc[(31,35)]['3/1/2020']
hopkins_death.loc[(33,65)]
hopkins_death.loc[(33,65)]['3/21/2020']
hopkins_death.loc[(-28.0167,153.4)]
hopkins_death.loc[(-28.0167,153.4)]['3/2/2020']
hopkins_death.loc[(-17.7134, 178.065)]
hopkins_death.loc[(-17.7134, 178.065)]['3/6/2020']

Unnamed: 0_level_0,Province_State,Country_Region,GDP,Urbanization,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,Max_Date,...,3/29/2020,3/30/2020,3/31/2020,4/1/2020,4/2/2020,4/3/2020,4/4/2020,4/5/2020,4/6/2020,4/7/2020
information,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data,,Israel,41715.02928,92.418,15.530645,71.129032,0.0,12.467742,65.0,4/7/2020,...,15.0,16.0,20.0,26.0,36.0,40.0,44.0,49.0,57.0,65.0
avg_d_RH,,,,,,,,,,,...,59.0,45.0,40.0,74.0,72.0,50.0,40.0,43.0,73.0,68.0
avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
avg_d_tmp,,,,,,,,,,,...,17.1,20.0,22.4,17.0,16.5,18.5,23.0,24.5,18.3,17.4
avg_d_wind,,,,,,,,,,,...,16.6,8.8,15.0,21.3,10.1,9.4,8.7,18.3,10.2,11.0


information
data             0.0
avg_d_RH        68.0
avg_d_precip     0.0
avg_d_tmp       15.3
avg_d_wind      23.0
Name: 3/1/2020, dtype: float64

Unnamed: 0_level_0,Province_State,Country_Region,GDP,Urbanization,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,Max_Date,...,3/29/2020,3/30/2020,3/31/2020,4/1/2020,4/2/2020,4/3/2020,4/4/2020,4/5/2020,4/6/2020,4/7/2020
information,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data,,Afghanistan,520.896603,25.495,12.268831,51.779221,1.395455,8.254545,14.0,4/7/2020,...,4.0,4.0,4.0,4.0,6.0,6.0,7.0,7.0,11.0,14.0
avg_d_RH,,,,,,,,,,,...,61.0,81.0,68.0,49.0,44.0,33.0,53.0,50.0,38.0,42.0
avg_d_precip,,,,,,,,,,,...,1.02,4.32,1.52,0.0,0.0,0.0,3.56,0.0,0.51,0.0
avg_d_tmp,,,,,,,,,,,...,15.2,15.0,16.5,17.4,16.2,21.0,17.7,18.3,19.1,16.4
avg_d_wind,,,,,,,,,,,...,4.2,10.1,2.9,7.3,1.5,2.9,10.2,3.4,17.4,0.8


information
data             0.0
avg_d_RH        41.0
avg_d_precip     0.0
avg_d_tmp       18.7
avg_d_wind       2.4
Name: 3/21/2020, dtype: float64

Unnamed: 0_level_0,Province_State,Country_Region,GDP,Urbanization,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,Max_Date,...,3/29/2020,3/30/2020,3/31/2020,4/1/2020,4/2/2020,4/3/2020,4/4/2020,4/5/2020,4/6/2020,4/7/2020
information,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data,Queensland,Australia,57373.68668,86.012,23.897403,78.363636,0.0,15.571429,4.0,4/2/2020,...,2.0,2.0,2.0,2.0,4.0,4.0,4.0,4.0,4.0,4.0
avg_d_RH,,,,,,,,,,,...,81.0,79.0,74.0,71.0,79.0,80.0,73.0,72.0,75.0,71.0
avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
avg_d_tmp,,,,,,,,,,,...,22.0,22.7,24.2,24.0,23.0,24.0,25.2,21.4,20.9,21.5
avg_d_wind,,,,,,,,,,,...,11.7,11.4,13.7,15.7,12.4,15.8,15.8,12.3,10.7,17.9


information
data             0.0
avg_d_RH        78.0
avg_d_precip     0.0
avg_d_tmp       25.6
avg_d_wind      12.8
Name: 3/2/2020, dtype: float64

KeyError: -17.7134

In [169]:
# Israel 31\35
# All 4 check except precipitation
hopkins_conf.loc[(31,35)]
hopkins_conf.loc[(31,35)]['3/1/2020']

# # Afghanistan
# # All check
hopkins_conf.loc[(33,65)]
hopkins_conf.loc[(33,65)]['3/21/2020']

# # Queensland Australia
# # All check except precipitation
hopkins_conf.loc[(-28.0167,153.4)]
hopkins_conf.loc[(-28.0167,153.4)]['3/2/2020']

# Fiji
# All check except precipitation
hopkins_conf.loc[(-17.7134, 178.065)]
hopkins_conf.loc[(-17.7134, 178.065)]['3/6/2020']

Unnamed: 0_level_0,Province_State,Country_Region,GDP,Urbanization,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,first_7,...,3/29/2020,3/30/2020,3/31/2020,4/1/2020,4/2/2020,4/3/2020,4/4/2020,4/5/2020,4/6/2020,4/7/2020
information,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data,,Israel,41715.02928,92.418,15.530645,71.129032,0.0,12.467742,9248.0,4.090737,...,4247.0,4695.0,5358.0,6092.0,6857.0,7428.0,7851.0,8430.0,8904.0,9248.0
avg_d_RH,,,,,,,,,,,...,59.0,45.0,40.0,74.0,72.0,50.0,40.0,43.0,73.0,68.0
avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
avg_d_tmp,,,,,,,,,,,...,17.1,20.0,22.4,17.0,16.5,18.5,23.0,24.5,18.3,17.4
avg_d_wind,,,,,,,,,,,...,16.6,8.8,15.0,21.3,10.1,9.4,8.7,18.3,10.2,11.0


information
data            10.0
avg_d_RH        68.0
avg_d_precip     0.0
avg_d_tmp       15.3
avg_d_wind      23.0
Name: 3/1/2020, dtype: float64

Unnamed: 0_level_0,Province_State,Country_Region,GDP,Urbanization,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,first_7,...,3/29/2020,3/30/2020,3/31/2020,4/1/2020,4/2/2020,4/3/2020,4/4/2020,4/5/2020,4/6/2020,4/7/2020
information,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data,,Afghanistan,520.896603,25.495,12.268831,51.779221,1.395455,8.254545,423.0,0.818182,...,120.0,170.0,174.0,237.0,273.0,281.0,299.0,349.0,367.0,423.0
avg_d_RH,,,,,,,,,,,...,61.0,81.0,68.0,49.0,44.0,33.0,53.0,50.0,38.0,42.0
avg_d_precip,,,,,,,,,,,...,1.02,4.32,1.52,0.0,0.0,0.0,3.56,0.0,0.51,0.0
avg_d_tmp,,,,,,,,,,,...,15.2,15.0,16.5,17.4,16.2,21.0,17.7,18.3,19.1,16.4
avg_d_wind,,,,,,,,,,,...,4.2,10.1,2.9,7.3,1.5,2.9,10.2,3.4,17.4,0.8


information
data            24.0
avg_d_RH        41.0
avg_d_precip     0.0
avg_d_tmp       18.7
avg_d_wind       2.4
Name: 3/21/2020, dtype: float64

Unnamed: 0_level_0,Province_State,Country_Region,GDP,Urbanization,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,first_7,...,3/29/2020,3/30/2020,3/31/2020,4/1/2020,4/2/2020,4/3/2020,4/4/2020,4/5/2020,4/6/2020,4/7/2020
information,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data,Queensland,Australia,57373.68668,86.012,23.897403,78.363636,0.0,15.571429,934.0,2.622951,...,656.0,689.0,743.0,781.0,835.0,873.0,900.0,907.0,921.0,934.0
avg_d_RH,,,,,,,,,,,...,81.0,79.0,74.0,71.0,79.0,80.0,73.0,72.0,75.0,71.0
avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
avg_d_tmp,,,,,,,,,,,...,22.0,22.7,24.2,24.0,23.0,24.0,25.2,21.4,20.9,21.5
avg_d_wind,,,,,,,,,,,...,11.7,11.4,13.7,15.7,12.4,15.8,15.8,12.3,10.7,17.9


information
data             9.0
avg_d_RH        78.0
avg_d_precip     0.0
avg_d_tmp       25.6
avg_d_wind      12.8
Name: 3/2/2020, dtype: float64

KeyError: -17.7134


- Israel, 03/01/20, 31	35	15.3	68	23	0	14.99310345	74.5862069	13.11034483	0
    * Google maps
    * Check API
    * Third party daily
    * Third party monthly
- Afghanistan, 03/21/20 33	65	18.7	41	2.4	0	13.71315789	47.07894737	5.684210526	0.855526316
    * Google maps
    * Check API
    * Third party daily
    * Third party monthly

- Queensland Australia, 02/03/20 -28.0167	153.4	25.6	78	12.8	0	24.05	75.55263158	16.66842105	0
    * Google maps
    * Check API
    * Third party daily
    * Third party monthly

- Fiji, 03/06/20 -17.7134	178.065	26.6	90	3.2	0	26.84473684	85.47368421	5.871052632	0
    * Google maps
        - The the northen third of Fiji
    * Check API
        - Checks good for Nausori (60km south east)
    * Third party daily
        - Checks good exepct wind that seems week and precipitation that should be a little
    * Third party monthly
        - seems good exepct for precipitation
- North Dakota, US 03/12/20 47.5289	-99.784	-0.2	73	31.5	0	-4.947368421	80.52631579	18.72368421	0
    * Google maps
        - In the fields around the middle of the state
    * Check API
         - Checks good for harvey (10km north west)
    * Third party daily
        - 
    * Third party monthly
        - checks good except precipitation

After comparing the data we conclude it is relaibale except for precpitation that will be hence forth ignored.

###### Save augmented data
1. Multi index does not save well in csv, so we also save it as a pickle

In [170]:
pickle_path = '../augmented_datasets/pickles/hopkins_death_augmented0904.pkl' 
# with open(pickle_path, 'wb') as file:
#     pickle.dump(hopkins_confirmed, file)
# hopkins_confirmed.to_csv('../augmented_datasets/hopkins_conf_augmented{0}.csv'.format(datetime.now().strftime('%d%m')))
hopkins_death.to_pickle(pickle_path)
