In [1]:
import urllib
import json
import pandas as pd
import numpy as np
import warnings
import pickle
from datetime import datetime
from datetime import timedelta
from urllib.error import URLError
from functools import wraps
from keys import client_id, client_secret, app_id
warnings.filterwarnings('ignore')

In [110]:
UTAH_COORD = (37.85447192, -111.4418764)
DIAMOND_PRINCESS_COORD = (35.4437, 139.638)
BARBADOS_BELIZE_COORD = (13.1939, -59.5432)
CONGO_BRAZZAVILLE_KINSHASA_COORD = (-4.0383, 21.7587)
MASSACHUSETTS_COORD = (41.40674725, -70.68763497)
UTAH_DUPS_LATS = [39.46704621, 38.01974191, 38.70036159, 40.27367263, 38.42424062, 38.84315422,\
                  38.46231092, 39.27057157, 38.02974928, 38.9251277, 39.10314817, 37.9463652]
RADIUS = 350 # Miles
TIMEOUT = 10 # Seconds
LOG_PATH = 'weather_logs/'
GLOBAL_PATH = '../original_datasets/remote_repo/csse_covid_19_data/csse_covid_19_time_series/'\
              'time_series_covid19_confirmed_global.csv'
US_PATH = '../original_datasets/remote_repo/csse_covid_19_data/csse_covid_19_time_series/'\
          'time_series_covid19_confirmed_US.csv'
PREVIOUS_PICKLE = '../augmented_datasets/pickles/hopkins_conf_gf0904_GDP_urban_weather_dem_age_dtests_pop.pkl'

In [3]:
# Decorator function for timeout handling
def try_three_times(func):
    @wraps(func)
    def decoration_func(*args,**kwargs):
        for i in range(3):
            try:
                return func(*args,**kwargs)
            except Exception as e:
                if i == 2:
                    raise
    return decoration_func

@try_three_times
def request_url(url, timeout=10):
     return urllib.request.urlopen(url, timeout=timeout)

###### Load datasets
Notes on data: 
1. Notice that some of the dates on the hopkins dataset appear in different format in ecxel, they are infect all in the same format: %-m/%-d/20
2. Column names are modifyed to %-m/%-d/20 to fit weather API queries
3. All three rows reffering to 'Diamnond princess' have been removed
4. The location (0,0) has been removed
5. Two places with the same coordinate have been slightly modified to accomedate indexing
4. source: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

In [134]:
global_df = pd.read_csv(GLOBAL_PATH)
us_df = pd.read_csv(US_PATH)
old_df = pd.read_pickle(PREVIOUS_PICKLE)
us_df.drop(columns=['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Combined_Key'], inplace=True)
us_df.rename(columns={'Long_': 'Long'}, inplace=True)
global_df.rename(columns={'Country/Region': 'Country_Region',\
                                                  'Province/State': 'Province_State'}, inplace=True)
hopkins_confirmed = pd.concat([global_df, us_df])
hopkins_confirmed.drop(hopkins_confirmed[hopkins_confirmed['Lat']\
                                             == DIAMOND_PRINCESS_COORD[0]].index, inplace=True)

hopkins_confirmed.drop(hopkins_confirmed[hopkins_confirmed['Lat']\
                                             == MASSACHUSETTS_COORD[0]].index, inplace=True)

for lat in UTAH_DUPS_LATS:
    hopkins_confirmed.drop(hopkins_confirmed[hopkins_confirmed['Lat']\
                                            == lat].index, inplace=True)

hopkins_confirmed.loc[hopkins_confirmed['Country_Region']=='Barbados', 'Lat']\
                        = BARBADOS_BELIZE_COORD[0] + 0.00001

hopkins_confirmed.loc[hopkins_confirmed['Country_Region']=='Congo (Brazzaville)', 'Lat']\
                        = CONGO_BRAZZAVILLE_KINSHASA_COORD[0] + 0.00001

hopkins_confirmed.drop(hopkins_confirmed.loc[hopkins_confirmed['Lat'] == 0].index, inplace=True)

This is a very ugly fix to remove duplicates from Utah that spoil the multiindex
These indexes should be updated every time a new df is loaded

In [135]:
idx = [2805, 2809, 2782, 2783, 2784, 2785, 2786, 2788, 2789, 2790, 2791, 2792, 2793, 2794, 2795, 2796, 2797, 2798, 2800, 2801, 2802, 3255, 3256, 3257, 3258, 3259, 3260]
for ind in idx:
    hopkins_confirmed.drop(index=ind, inplace=True)

###### Setup multi-index

In [137]:
coords = [x for x in zip(hopkins_confirmed.pop('Lat'), hopkins_confirmed.pop('Long'))]
hopkins_confirmed.index = coords

for param in ['avg_m_wind', 'avg_m_precip', 'avg_m_RH', 'avg_m_tmp',]:
    hopkins_confirmed.insert(2, param, np.nan)
hopkins_confirmed.insert(6, 'weather', '')

columns = len(hopkins_confirmed.columns)

for index in hopkins_confirmed.index:
    for param in ['avg_d_tmp', 'avg_d_RH', 'avg_d_wind', 'avg_d_precip']:
        hopkins_confirmed = hopkins_confirmed.append\
                (pd.Series([np.nan]*6+[param]+[np.nan]*(columns-7), index=hopkins_confirmed.columns, name=index))

hopkins_confirmed.set_index('weather', append=True, inplace=True)
hopkins_confirmed = hopkins_confirmed.sort_index()
dates = {date: date + '20' for date in hopkins_confirmed.columns[6:]}
hopkins_confirmed = hopkins_confirmed.rename(columns=dates)

In [143]:
hopkins_confirmed.rename(index={'': 'data'}, inplace=True)
hopkins_confirmed.rename_axis(['coordinate', 'information'],inplace=True)
for param in ['Max_Cases', 'GF_Q1', 'GF_Q2', 'GF_Q3'][::-1]:
    hopkins_confirmed.insert(6, param, np.nan)

ValueError: cannot insert GF_Q3, already exists

In [147]:
backup = hopkins_confirmed.copy()

In [146]:
old_df.columns[-1]
last_date = hopkins_confirmed.columns.get_loc(old_df.columns[-1])
merge_columns = hopkins_confirmed.columns[last_date + 1:]
to_merge = hopkins_confirmed[merge_columns]
hopkins_confirmed = old_df.merge(to_merge, how='inner', on=['coordinate', 'information'])

'4/7/2020'

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

###### Query remote database

In [194]:
coords =  hopkins_confirmed.index.levels[0]
days = to_merge.columns.tolist()
q = 0
log = open('{0}log{1}.txt'.format(LOG_PATH, datetime.now().strftime('%d%m%Y')), 'w')

# Patching tools:
# coords = list(set(hopkins_confirmed.index.get_level_values(0).tolist()))
# days = days[:12]
# coords = coords[2851:]
# q = 2851

while days:
    start_time = days[0]
    end_time = days[:12][-1]
    days = days[12:]
    for coord in coords:
        q += 1
        lat = coord[0]
        long = coord[1]
        country = hopkins_confirmed.loc[coord]['Country_Region'].values[0]
        province = hopkins_confirmed.loc[coord]['Province_State'].values[0]
        url = 'https://api.aerisapi.com/observations/summary/closest?p={0},{1}&from={2}&to={3}&radius={4}miles&plimit=31&limit=1&'\
              'client_id={5}&client_secret={6}'.format(lat, long, start_time, end_time, RADIUS, client_id, client_secret)
        msg = 'collecting for {0} location {1}, {2} from {3} to {4}\n{5}'\
        .format(q, province, country, start_time, end_time, url)
        _ = log.write('\n' + msg)
        print(msg)
        try:
            request = request_url(url, TIMEOUT)
            response = request.read()
            json_ = json.loads(response)
        except URLError as e:
            msg = 'Tried three times, moving forward\n'
            print(msg)
            _ = log.write(msg)
            continue
        if json_['success']: 
            _ = log.write('\nsuccess')
            print('success')
            try:
                for day in json_['response'][0]['periods']:
                    date = datetime.strptime(str(day['summary']['ymd']), '%Y%m%d').strftime("%-m/%-d/%Y")
                    hopkins_confirmed.loc[coord].at['avg_d_tmp', date] = day['summary']['temp']['avgC']
                    hopkins_confirmed.loc[coord].at['avg_d_RH', date] = day['summary']['rh']['avg']
                    hopkins_confirmed.loc[coord].at['avg_d_wind', date] = day['summary']['wind']['avgKPH']
                    hopkins_confirmed.loc[coord].at['avg_d_precip', date] = day['summary']['precip']['totalMM']
            except IndexError:
                if json_['error']:
                    msg = 'No data found for {0}, {1} at {2}\n{3}\r\n'.format(country, province, date, json_['error'])
                    _ = log.write(msg)
                    print(msg)
            except Exception:
                msg = 'Unknowen error\n{0}\r\n'.format(json_['error'])
                _ = log.write(msg)
                print(msg)
            finally:
                request.close()
        else:
            print('Failed')
            _ = log.write('Failed')
log.close()

collecting for 1 location Tasmania, Australia from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/observations/summary/closest?p=-41.4545,145.9707&from=4/8/2020&to=4/19/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
Failed
collecting for 2 location nan, New Zealand from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/observations/summary/closest?p=-40.9006,174.886&from=4/8/2020&to=4/19/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
Failed
collecting for 3 location nan, Argentina from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/observations/summary/closest?p=-38.4161,-63.6167&from=4/8/2020&to=4/19/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
Failed
collecting for 4 location Victoria, Australia from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/obser

Failed
collecting for 29 location nan, Gabon from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/observations/summary/closest?p=-0.8037,11.6094&from=4/8/2020&to=4/19/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
Failed
collecting for 30 location nan, Indonesia from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/observations/summary/closest?p=-0.7893,113.9213&from=4/8/2020&to=4/19/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
Failed
collecting for 31 location nan, Kenya from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/observations/summary/closest?p=-0.0236,37.9062&from=4/8/2020&to=4/19/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
Failed
collecting for 32 location nan, Uganda from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/observations/summary/

Failed
collecting for 57 location nan, Philippines from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/observations/summary/closest?p=13.0,122.0&from=4/8/2020&to=4/19/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
Failed
collecting for 58 location nan, Barbados from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/observations/summary/closest?p=13.193909999999999,-59.5432&from=4/8/2020&to=4/19/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
Failed
collecting for 59 location Guam, US from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/observations/summary/closest?p=13.4443,144.7937&from=4/8/2020&to=4/19/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
Failed
collecting for 60 location nan, El Salvador from 4/8/2020 to 4/19/2020
https://api.aerisapi.com/observ

KeyboardInterrupt: 

###### Verify integrity, handle NaN and backup dataframe
1. Some coordinates are more then 350 miles away from any weather station, resulting in NaN values
2. Some stations don't save data as far back, resulting in NaN values
3. NaNs are not removed, rather when applying aggragate functions we discard then in the calculations

In [115]:
backup1 = hopkins_confirmed.copy()
# data[['Province_State']] = data[['Province_State']].fillna(0)
# confirmed_time_data = confirmed_time_data.dropna()
# confirmed_time_data.isna().sum()
# confirmed_time_data['1/22/2020']
hopkins_confirmed[['2/9/2020',
       '2/10/2020', '2/11/2020', '2/12/2020', '2/13/2020', '2/14/2020',
       '2/15/2020', '2/16/2020', '2/17/2020', '2/18/2020', '2/19/2020',
       '2/20/2020', '2/21/2020', '2/22/2020', '2/23/2020', '2/24/2020',
       '2/25/2020', '2/26/2020', '2/27/2020', '2/28/2020', '2/29/2020',
       '3/1/2020', '3/2/2020', '3/3/2020', '3/4/2020', '3/5/2020', '3/6/2020',
       '3/7/2020', '3/8/2020']].isnull().sum()

2/9/2020     260
2/10/2020    266
2/11/2020    301
2/12/2020    330
2/13/2020    273
2/14/2020    297
2/15/2020    234
2/16/2020    242
2/17/2020    227
2/18/2020    180
2/19/2020    206
2/20/2020    132
2/21/2020    136
2/22/2020    148
2/23/2020    192
2/24/2020    172
2/25/2020    141
2/26/2020    119
2/27/2020    143
2/28/2020    136
2/29/2020    168
3/1/2020     162
3/2/2020     149
3/3/2020     136
3/4/2020     134
3/5/2020     170
3/6/2020     128
3/7/2020     142
3/8/2020     162
dtype: int64

###### COMPUTE AVG PARAMETERS FOR CONFIRMED CASES

In [264]:
means = hopkins_confirmed.mean(axis=1, skipna=True)
i=0
for coord in coords:
    hopkins_confirmed.loc[coord]['avg_m_tmp'].loc['data'] = means.loc[coord]['avg_d_tmp']
    hopkins_confirmed.loc[coord]['avg_m_RH'].loc['data'] = means.loc[coord]['avg_d_RH']
    hopkins_confirmed.loc[coord]['avg_m_wind'].loc['data'] = means.loc[coord]['avg_d_wind']
    hopkins_confirmed.loc[coord]['avg_m_precip'].loc['data'] = means.loc[coord]['avg_d_precip']
    print(i)
    i += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062


###### Data validation, NaN handling

In [276]:
hopkins_confirmed['avg_m_precip'][::5].index[3399][0]

(71.7069, -42.6043)

In [281]:
hopkins_confirmed['avg_m_tmp'][::5].isna().sum()
hopkins_confirmed['avg_m_RH'][::5].isna().sum()
hopkins_confirmed['avg_m_wind'][::5].isna().sum()
hopkins_confirmed['avg_m_precip'][::5].isna().sum()

# We run this until there are 0 in all nulls
nulls = hopkins_confirmed['avg_m_wind'][::5].isna().tolist()
for i in range(len(nulls)):
    if nulls[i]:
        nc = hopkins_confirmed['avg_m_precip'][::5].index[i][0]
        print('Removing null at coor {0}, {1}'.format(nc, hopkins_confirmed.loc[nc, 'Country_Region']))
        hopkins_confirmed.drop(nc, level=0, inplace=True)
        for i in range(len(coords)):
            if coords[i] == nc:
                coords.pop(i)
                i += 1

0

0

0

0

###### Compute max cases and max date

In [282]:
backup2 = hopkins_confirmed.copy()
hopkins_confirmed.insert(7, '5%_Date', '')
hopkins_confirmed.insert(7, 'Max_Date', '')

In [283]:
for coord in coords:
    try:
        max_cases = hopkins_confirmed.iloc[:,12:].loc[coord].loc['data'].max()
        max_date = hopkins_confirmed.iloc[:,12:].loc[coord].loc['data'].argmax() 
#         max_date_index = hopkins_death.iloc[:,12:].loc[coord].loc['data'].argmax()
#         max_date = hopkins_death.iloc[:,12:].loc[coord].loc['data'].index[max_date_index]
        hopkins_confirmed.loc[coord, 'Max_Cases'].loc['data'] = max_cases
        hopkins_confirmed.loc[coord, 'Max_Date'].loc['data'] = max_date
    except Exception as e:
        print(e)
print('Lybia and malta were removed as they had NaN values')

'the label [data] is not in the [index]'
Lybia and malta were removed as they had NaN values


###### Sanity checks
Manual heuristic comparison of selected samples from the dataset
We compared 5 randomly selected data entries as follows:
1. lat and lan on google map
2. Daily information with Aeris API
3. Daily information with a third party climate source - https://www.worldweatheronline.com/
4. monthly information with a third party data source (This will only be an approximation) https://www.timeanddate.com/weather/israel/tel-aviv/climate
Note that coordinates are in decimal representation

We conclude that the data is correct heuristcially, except for precipitation that shows 0 when it is infact more in many cases
Also we see that except for US states coordinates are the same between Tableau and Hopkins datasets

In [646]:
# Israel 31\35
# All 4 check except precipitation
hopkins_confirmed.loc[(31,35)]
hopkins_confirmed.loc[(31,35)]['3/1/2020']

# # Afghanistan
# # All check
hopkins_confirmed.loc[(33,65)]
hopkins_confirmed.loc[(33,65)]['3/21/2020']

# # Queensland Australia
# # All check except precipitation
hopkins_confirmed.loc[(-28.0167,153.4)]
hopkins_confirmed.loc[(-28.0167,153.4)]['3/2/2020']

# Fiji
# All check except precipitation
hopkins_confirmed.loc[(-17.7134, 178.065)]
hopkins_confirmed.loc[(-17.7134, 178.065)]['3/6/2020']

Unnamed: 0_level_0,Province_State,Country_Region,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,Max_Date,5%_Date,GF_Q1,...,3/18/2020,3/19/2020,3/20/2020,3/21/2020,3/22/2020,3/23/2020,3/24/2020,3/25/2020,3/26/2020,3/27/2020
information,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data,,Israel,14.721569,73.941176,0.0,12.321569,3035.0,3/27/2020,,,...,304.0,427.0,529.0,712.0,883.0,1071.0,1238.0,2369.0,2693.0,3035.0
avg_d_RH,,,,,,,,,,,...,64.0,70.0,81.0,76.0,69.0,52.0,64.0,73.0,63.0,60.0
avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
avg_d_tmp,,,,,,,,,,,...,14.3,13.6,11.0,11.6,13.6,18.2,18.2,16.4,18.1,20.7
avg_d_wind,,,,,,,,,,,...,22.4,24.7,17.0,11.7,9.5,6.1,14.9,9.5,8.1,13.0


information
data            10.0
avg_d_RH        68.0
avg_d_precip     0.0
avg_d_tmp       15.3
avg_d_wind      23.0
Name: 3/1/2020, dtype: float64

Unnamed: 0_level_0,Province_State,Country_Region,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,Max_Date,5%_Date,GF_Q1,...,3/18/2020,3/19/2020,3/20/2020,3/21/2020,3/22/2020,3/23/2020,3/24/2020,3/25/2020,3/26/2020,3/27/2020
information,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data,,Afghanistan,11.457576,51.924242,1.462424,8.6,110.0,3/27/2020,,,...,22.0,22.0,24.0,24.0,40.0,40.0,74.0,84.0,94.0,110.0
avg_d_RH,,,,,,,,,,,...,45.0,40.0,39.0,41.0,58.0,79.0,64.0,57.0,51.0,52.0
avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,12.7,7.62,0.0,0.0,0.0,0.0
avg_d_tmp,,,,,,,,,,,...,18.0,19.3,19.6,18.7,17.8,16.1,14.4,15.2,16.7,16.1
avg_d_wind,,,,,,,,,,,...,1.9,2.7,4.8,2.4,19.2,8.5,8.0,3.0,1.2,11.3


information
data            24.0
avg_d_RH        41.0
avg_d_precip     0.0
avg_d_tmp       18.7
avg_d_wind       2.4
Name: 3/21/2020, dtype: float64

Unnamed: 0_level_0,Province_State,Country_Region,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,Max_Date,5%_Date,GF_Q1,...,3/18/2020,3/19/2020,3/20/2020,3/21/2020,3/22/2020,3/23/2020,3/24/2020,3/25/2020,3/26/2020,3/27/2020
information,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data,Queensland,Australia,24.078788,78.863636,0.0,15.828788,555.0,3/27/2020,,,...,94.0,144.0,184.0,221.0,259.0,319.0,397.0,443.0,493.0,555.0
avg_d_RH,,,,,,,,,,,...,62.0,69.0,74.0,70.0,73.0,81.0,84.0,76.0,76.0,86.0
avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
avg_d_tmp,,,,,,,,,,,...,22.0,21.7,22.4,24.5,25.0,23.2,20.9,21.9,22.1,20.0
avg_d_wind,,,,,,,,,,,...,20.1,10.0,13.9,16.6,13.7,17.3,11.8,10.4,11.0,16.0


information
data             9.0
avg_d_RH        78.0
avg_d_precip     0.0
avg_d_tmp       25.6
avg_d_wind      12.8
Name: 3/2/2020, dtype: float64

Unnamed: 0_level_0,Province_State,Country_Region,avg_m_tmp,avg_m_RH,avg_m_precip,avg_m_wind,Max_Cases,Max_Date,5%_Date,GF_Q1,...,3/18/2020,3/19/2020,3/20/2020,3/21/2020,3/22/2020,3/23/2020,3/24/2020,3/25/2020,3/26/2020,3/27/2020
information,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
data,,Fiji,27.039394,84.954545,0.0,6.854545,5.0,3/25/2020,,,...,0.0,1.0,1.0,1.0,2.0,3.0,4.0,5.0,5.0,5.0
avg_d_RH,,,,,,,,,,,...,92.0,97.0,94.0,91.0,89.0,91.0,87.0,86.0,85.0,87.0
avg_d_precip,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
avg_d_tmp,,,,,,,,,,,...,26.2,25.5,25.8,25.6,25.8,25.8,26.5,26.7,26.9,26.8
avg_d_wind,,,,,,,,,,,...,0.8,7.0,4.7,5.1,1.7,4.7,4.9,5.5,5.0,4.8


information
data             0.0
avg_d_RH        90.0
avg_d_precip     0.0
avg_d_tmp       26.6
avg_d_wind       3.2
Name: 3/6/2020, dtype: float64

In [None]:
data = pd.read_csv('../augmented_datasets/tableau_conf_data.csv')
RADIUS = 350

coords = [(31, 35), (33, 65), (-28.0167, 153.4), (-17.7134, 178.065), (47.5289, -99.784)] 
dates = ['03/01/2020', '03/21/2020', '03/02/2020', '03/06/2020', '03/12/2020']

for i in range(5):
    lat = coords[i][0]
    long = coords[i][1]
    start_time = dates[i]
    end_time = dates[i]
    location = data.loc[data['Lat'] == lat]\
                    .loc[data['Long'] == long]\
                     [['Country_Region','Province_State']]
    country = location.iloc[0][0]
    province = location.iloc[0][1]
    url = 'https://api.aerisapi.com/observations/summary/closest?p={0},{1}&from={2}&to={3}&radius={4}miles&plimit=31&limit=1&'\
          'client_id={5}&client_secret={6}'.format(lat, long, start_time, end_time, RADIUS, client_id, client_secret)
    print('collecting for {0} location {1}, {2}\n{3}'.format(i, province, country, url))
    request = urllib.request.urlopen(url)
    response = request.read()
    json_ = json.loads(response)
    if json_['success']: 
        print('success')
        try:
            for day in json_['response'][0]['periods']:
                print('Location: {0}, {1}'.format(json_['response'][0]['place']['name'], json_['response'][0]['place']['city']))
                print('avg tmp: {0}'.format(day['summary']['temp']['avgC']))
                print('avg RH: {0}'.format(day['summary']['rh']['avg']))
                print('avg wind: {0}'.format(day['summary']['wind']['avgKPH']))
                print('avg precip: {0}\n'.format(day['summary']['precip']['totalMM']))
        except IndexError:
            if json_['error']:
                msg = 'No data found for {0}, {1} at {2}\n{3}\r\n'.format(country, province, date, json_['error'])
                print(msg)
        except Exception as e:
            raise e
            msg = 'Unknowen error\n{0}\r\n'.format(json_['error'])
            print(msg)
        finally:
            request.close()



- Israel, 03/01/20, 31	35	15.3	68	23	0	14.99310345	74.5862069	13.11034483	0
    * Google maps
    * Check API
    * Third party daily
    * Third party monthly
- Afghanistan, 03/21/20 33	65	18.7	41	2.4	0	13.71315789	47.07894737	5.684210526	0.855526316
    * Google maps
    * Check API
    * Third party daily
    * Third party monthly

- Queensland Australia, 02/03/20 -28.0167	153.4	25.6	78	12.8	0	24.05	75.55263158	16.66842105	0
    * Google maps
    * Check API
    * Third party daily
    * Third party monthly

- Fiji, 03/06/20 -17.7134	178.065	26.6	90	3.2	0	26.84473684	85.47368421	5.871052632	0
    * Google maps
        - The the northen third of Fiji
    * Check API
        - Checks good for Nausori (60km south east)
    * Third party daily
        - Checks good exepct wind that seems week and precipitation that should be a little
    * Third party monthly
        - seems good exepct for precipitation
- North Dakota, US 03/12/20 47.5289	-99.784	-0.2	73	31.5	0	-4.947368421	80.52631579	18.72368421	0
    * Google maps
        - In the fields around the middle of the state
    * Check API
         - Checks good for harvey (10km north west)
    * Third party daily
        - 
    * Third party monthly
        - checks good except precipitation

After comparing the data we conclude it is relaibale except for precpitation that will be hence forth ignored.

###### Add 5% interval and averages + reloading a saved pickle

In [51]:
# DONT RUN THIS BOX IF YOU RAN THE SCRIPT FROM THE START
hopkins_confirmed = pd.DataFrame()
PICKLE_PATH = '../augmented_datasets/hopkins_conf_augmented2903.pkl'
with open(PICKLE_PATH, 'rb') as file:
    hopkins_confirmed = pickle.load(file)
coords = set(hopkins_confirmed.index.get_level_values(0).tolist())

In [285]:
hopkins_confirmed.insert(9, 'avg_interval_RH', np.nan)
hopkins_confirmed.insert(9, 'avg_interval_tmp', np.nan)

In [144]:
for coord in coords:
    max_cases = int(hopkins_confirmed.loc[coord]['Max_Cases']['data'])
    max_date = hopkins_confirmed.loc[coord]['Max_Date']
    five_prcnt = int(0.05 * max_cases)
    mask = hopkins_confirmed.loc[coord].loc['data'][14:] >= five_prcnt
    five_prcnt_date = mask.index.min()
    hopkins_confirmed.loc[coord, '5%_Date']['data'] = five_prcnt_date
    
    five_prct_column = hopkins_confirmed.columns.get_loc(five_prcnt_date)
    max_column = hopkins_confirmed.columns.get_loc(max_date['data'])
    interval = hopkins_confirmed[hopkins_confirmed.columns[five_prct_column:max_column]]
    
    hopkins_confirmed.loc[coord, 'avg_interval_tmp']['data'] = interval.loc[coord].loc['avg_d_tmp'].mean()
    hopkins_confirmed.loc[coord, 'avg_interval_RH']['data'] = interval.loc[coord].loc['avg_d_RH'].mean()

###### Save augmented data
1. Multi index does not save well in csv, so we also save it as a pickle

In [297]:
pickle_path = '../augmented_datasets/pickles/hopkins_conf_augmented{0}.pkl'.format(datetime.now().strftime('%d%m'))
with open(pickle_path, 'wb') as file:
    pickle.dump(hopkins_confirmed, file)
hopkins_confirmed.to_csv('../augmented_datasets/hopkins_conf_augmented{0}.csv'.format(datetime.now().strftime('%d%m')))