In [625]:
import urllib
import json
import pandas as pd
import numpy as np
import warnings
import pickle
from datetime import datetime
from datetime import timedelta
from keys import client_id, client_secret, app_id
warnings.filterwarnings('ignore')

In [633]:
DIAMOND_PRINCESS_COORD = (35.4437, 139.638)
BARBADOS_BELIZE_COORD = (13.1939, -59.5432)
CONGO_BRAZZAVILLE_KINSHASA_COORD = (-4.0383, 21.7587)
RADIUS = 350
LOG_PATH = 'weather_logs/'
AUGMENTED_PATH = '../augmented_datasets/'

###### Load datasets
Notes on data: 
1. Notice that some of the dates on the hopkins dataset appear in different format in ecxel, they are infect all in the same format: %-m/%-d/20
2. Column names are modifyed to %-m/%-d/20 to fit weather API queries
3. All three rows reffering to 'Diamnond princess' have been removed
4. The location (0,0) has been removed
5. Two places with the same coordinate have been slightly modified to accomedate indexing
4. source: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

In [380]:
confirmed_time_data = pd.read_csv('../original_datasets/hopkins_confirmed_time_series0327.csv')

confirmed_time_data = confirmed_time_data.rename({'Country/Region': 'Country_Region',\
                                                  'Province/State': 'Province_State'}, axis=1)

confirmed_time_data.drop(confirmed_time_data[confirmed_time_data['Lat']\
                                             == DIAMOND_PRINCESS_COORD[0]].index, inplace=True)

confirmed_time_data.loc[confirmed_time_data['Country_Region']=='Barbados', 'Lat']\
                        = BARBADOS_BELIZE_COORD[0] + 0.00001

confirmed_time_data.loc[confirmed_time_data['Country_Region']=='Congo (Brazzaville)', 'Lat']\
                        = CONGO_BRAZZAVILLE_KINSHASA_COORD[0] + 0.00001

confirmed_time_data.drop(confirmed_time_data.loc[confirmed_time_data['Lat'] == 0].index, inplace=True)

# death_time_data = pd.read_csv('../original_datasets/hopkins_death_time_series0323.csv')
# death_time_data = confirmed_time_data.rename({'Country/Region': 'Country_Region',\
#                                               'Province/State': 'Province_State'}, axis=1)
# line_data = pd.read_csv('COVID19_open_line_list.csv')
# line_data = line_data.rename({'latitude': 'Lat', 'longitude': 'Long',\
#                               'country': 'Country_Region', 'province': 'Province_State'}, axis=1)
# confirmed_time_data.drop(NOISE_COORD, level=0, inplace=True)

###### Setup multi-index

In [381]:
coords = [x for x in zip(confirmed_time_data.pop('Lat'), confirmed_time_data.pop('Long'))]
confirmed_time_data.index = coords

for param in ['avg_m_wind', 'avg_m_precip', 'avg_m_RH', 'avg_m_tmp',]:
    confirmed_time_data.insert(2, param, np.nan)
confirmed_time_data.insert(6, 'weather', '')

columns = len(confirmed_time_data.columns)

for index in confirmed_time_data.index:
    for param in ['avg_d_tmp', 'avg_d_RH', 'avg_d_wind', 'avg_d_precip']:
        confirmed_time_data = confirmed_time_data.append\
                (pd.Series([np.nan]*6+[param]+[np.nan]*(columns-7), index=confirmed_time_data.columns, name=index))

confirmed_time_data.set_index('weather', append=True, inplace=True)
confirmed_time_data = confirmed_time_data.sort_index()
dates = {date: date + '20' for date in confirmed_time_data.columns[6:]}
confirmed_time_data = confirmed_time_data.rename(columns=dates)

###### Query remote database

In [382]:
days = confirmed_time_data.columns[6:].tolist()
q = 0
log = open('{0}log{1}.txt'.format(LOG_PATH, datetime.now().strftime('%d%m%Y')), 'w')

while days:
    start_time = days[0]
    end_time = days[:30][-1]
    days = days[30:]
    for coord in coords:
        q += 1
        lat = coord[0]
        long = coord[1]
        country = confirmed_time_data.loc[coord]['Country_Region'].values[0]
        province = confirmed_time_data.loc[coord]['Province_State'].values[0]
        url = 'https://api.aerisapi.com/observations/summary/closest?p={0},{1}&from={2}&to={3}&radius={4}miles&plimit=31&limit=1&'\
              'client_id={5}&client_secret={6}'.format(lat, long, start_time, end_time, RADIUS, client_id, client_secret)
        msg = 'collecting for {0} location {1}, {2} from {3} to {4}\n{5}'\
        .format(q, province, country, start_time, end_time, url)
        _ = log.write('\n' + msg)
        print(msg)
        request = urllib.request.urlopen(url)
        response = request.read()
        json_ = json.loads(response)
        if json_['success']: 
            _ = log.write('\nsuccess')
            print('success')
            try:
                for day in json_['response'][0]['periods']:
                    date = datetime.strptime(str(day['summary']['ymd']), '%Y%m%d').strftime("%-m/%-d/%Y")
                    confirmed_time_data.loc[coord].at['avg_d_tmp', date] = day['summary']['temp']['avgC']
                    confirmed_time_data.loc[coord].at['avg_d_RH', date] = day['summary']['rh']['avg']
                    confirmed_time_data.loc[coord].at['avg_d_wind', date] = day['summary']['wind']['avgKPH']
                    confirmed_time_data.loc[coord].at['avg_d_precip', date] = day['summary']['precip']['totalMM']
            except IndexError:
                if json_['error']:
                    msg = 'No data found for {0}, {1} at {2}\n{3}\r\n'.format(country, province, date, json_['error'])
                    _ = log.write(msg)
                    print(msg)
            except Exception:
                msg = 'Unknowen error\n{0}\r\n'.format(json_['error'])
                _ = log.write(msg)
                print(msg)
            finally:
                request.close()
log.close()

collecting for 1 location nan, Afghanistan from 1/22/2020 to 2/20/2020
https://api.aerisapi.com/observations/summary/closest?p=33.0,65.0&from=1/22/2020&to=2/20/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
success
collecting for 2 location nan, Albania from 1/22/2020 to 2/20/2020
https://api.aerisapi.com/observations/summary/closest?p=41.1533,20.1683&from=1/22/2020&to=2/20/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
success
collecting for 3 location nan, Algeria from 1/22/2020 to 2/20/2020
https://api.aerisapi.com/observations/summary/closest?p=28.0339,1.6596&from=1/22/2020&to=2/20/2020&radius=350miles&plimit=31&limit=1&client_id=tXyXXyb3Yf492X1asICz5&client_secret=WJi1enKSDGlhO8Nt4Ze00C1F6LEdMpl2O54JIwaM
success
collecting for 4 location nan, Andorra from 1/22/2020 to 2/20/2020
https://api.aerisapi.com/observations/summary/cl

###### Verify integrity, handle NaN and backup dataframe
1. Some coordinates are more then 350 miles away from any weather station, resulting in NaN values
2. Some stations don't save data as far back, resulting in NaN values
3. NaNs are not removed, rather when applying aggragate functions we discard then in the calculations

In [383]:
backup = confirmed_time_data.copy()
# data[['Province_State']] = data[['Province_State']].fillna(0)
# confirmed_time_data = confirmed_time_data.dropna()
# confirmed_time_data.isna().sum()
# confirmed_time_data['1/22/2020']
confirmed_time_data.isnull().sum()
# confirmed_time_data['2/1/2020'][confirmed_time_data['2/1/2020'].isnull()]

Province_State    1156
Country_Region     984
avg_m_tmp         1230
avg_m_RH          1230
avg_m_precip      1230
avg_m_wind        1230
1/22/2020           62
1/23/2020           62
1/24/2020           58
1/25/2020           58
1/26/2020           54
1/27/2020           50
1/28/2020           42
1/29/2020           46
1/30/2020           50
1/31/2020           58
2/1/2020            58
2/2/2020            62
2/3/2020            58
2/4/2020            54
2/5/2020            62
2/6/2020            66
2/7/2020            62
2/8/2020            78
2/9/2020            74
2/10/2020           86
2/11/2020           86
2/12/2020           90
2/13/2020           66
2/14/2020           70
                  ... 
2/27/2020           40
2/28/2020           44
2/29/2020           56
3/1/2020            52
3/2/2020            60
3/3/2020            60
3/4/2020            56
3/5/2020            64
3/6/2020            56
3/7/2020            52
3/8/2020            52
3/9/2020            48
3/10/2020  

###### COMPUTE AVG PARAMETERS FOR CONFIRMED CASES

In [384]:
for coord in coords:
    lat = coord[0]
    long = coord[1]
    avg_tmp = confirmed_time_data.loc[coord].loc['avg_d_tmp'].mean(skipna=True)
    avg_rh = confirmed_time_data.loc[coord].loc['avg_d_RH'].mean(skipna=True)
    avg_wind = confirmed_time_data.loc[coord].loc['avg_d_wind'].mean(skipna=True)
    avg_precip = confirmed_time_data.loc[coord].loc['avg_d_precip'].mean(skipna=True)

    confirmed_time_data.loc[coord]['avg_m_tmp'][0] = avg_tmp
    confirmed_time_data.loc[coord]['avg_m_RH'][0] = avg_rh
    confirmed_time_data.loc[coord]['avg_m_wind'][0] = avg_wind
    confirmed_time_data.loc[coord]['avg_m_precip'][0] = avg_precip

###### Data validation, NaN handling

In [593]:
confirmed_time_data['avg_m_tmp'][::5].isna().sum()
confirmed_time_data['avg_m_RH'][::5].isna().sum()
confirmed_time_data['avg_m_wind'][::5].isna().sum()
confirmed_time_data['avg_m_precip'][::5].isna().sum()

nulls = confirmed_time_data['avg_m_precip'][::5].isna().tolist()
for i in range(len(nulls)):
    if nulls[i]:
        nc = confirmed_time_data['avg_m_precip'][::5].index[i][0]
        print('Removing null at coor {0}, {1}'.format(nc, confirmed_time_data.loc[nc, 'Country_Region']))
        confirmed_time_data.drop(nc, level=0, inplace=True)
        for i in range(len(coords)):
            if coords[i] == nc:
                coords.pop(i)
                i += 1

0

0

0

0

###### Compute basic growth-factor, max cases and max date

In [516]:
backup = confirmed_time_data.copy()
confirmed_time_data.rename(index={'': 'data'}, inplace=True)
confirmed_time_data.rename_axis(['coordinate', 'information'],inplace=True)
for param in ['Max_Cases', 'GF_Q1', 'GF_Q2', 'GF_Q3'][::-1]:
    confirmed_time_data.insert(6, param, np.nan)
confirmed_time_data.insert(7, '5%_Date', '')
confirmed_time_data.insert(7, 'Max_Date', '')

ValueError: cannot insert GF_Q3, already exists

In [597]:
for coord in coords:
    try:
        max_cases = confirmed_time_data.iloc[:,12:].loc[coord].loc['data'].max()
        max_date = confirmed_time_data.iloc[:,12:].loc[coord].loc['data'].argmax()
        confirmed_time_data.loc[coord, 'Max_Cases'].loc['data'] = max_cases
        confirmed_time_data.loc[coord, 'Max_Date'].loc['data'] = max_date
    except Exception as e:
        print(e)
print('Lybia and malta were removed as they had NaN values')

'the label [35.9375] is not in the [index]'
'the label [26.3351] is not in the [index]'
Lybia and malta were removed as they had NaN values


###### Save augmented data
1. Multi index does not save well in csv, so we also save it as a pickle

In [629]:
pickle_path = AUGMENTED_PATH + 'hopkins_conf_augmented{0}.pkl'.format(datetime.now().strftime('%d%m'))
with open(pickle_path, 'wb') as file:
    pickle.dump(confirmed_time_data, file)
confirmed_time_data.to_csv('../augmented_datasets/hopkins_conf_augmented{0}.csv'.format(datetime.now().strftime('%d%m')))