In this notebook I add weather informations, such as temperature and precipitations, to the training set of the [COVID-19 forecasting competition](https://www.kaggle.com/c/covid19-global-forecasting-week-1/discussion), in order to determine whether there is any correlation with the growth of confirmed cases. Weather data is imported from the [NOAA GSOD dataset](https://www.kaggle.com/noaa/gsod), continuously updated to include recent measurments.

[Data for this and previous weeks is available in dataset form here.](https://www.kaggle.com/davidbnn92/weather-data-for-covid19-data-analysis)

Edit: now missing values are denoted with usual `NaN`s, and not with `9999`s.

Edit 2: information concerning humidity was added, following [brennanmurphy](https://www.kaggle.com/brennanmurphy)'s advice. More specifically, dewpoint temperature was added from the NOAA GSOD dataset, then absolute and relative humidity were computed.

In [10]:
import numpy as np
import pandas as pd

import os
import json
from pathlib import Path

import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.spatial.distance import cdist

for dirname, _, filenames in os.walk('/kaggle/input'):
    print(dirname)

from google.cloud import bigquery




Here is the weather data:
* `temp`: Mean temperature for the day in degrees Fahrenheit to tenths.
* `max`: Maximum temperature reported during the day in Fahrenheit to tenths--time of max temp report varies by country and region, so this will sometimes not be the max for the calendar day.
* `min`: Minimum temperature reported during the day in Fahrenheit to tenths--time of min temp report varies by country and region, so this will sometimes not be the min for the calendar day.
* `stp`: Mean station pressure for the day in millibars to tenths.
* `slp`: Mean sea level pressure for the day in millibars to tenths.
* `dewp`: Mean dew point for the day in degrees Fahrenheit to tenths. 
* `wdsp`: Mean wind speed for the day in knots to tenths.
* `prcp`: Total precipitation (rain and/or melted snow) reported during the day in inches and hundredths; will usually not end with the midnight observation--i.e., may include latter part of previous day. .00 indicates no measurable precipitation (includes a trace).
* `fog`: Indicators (1 = yes, 0 = no/not reported) for the occurrence during the day

In [11]:
def implicit():
    from google.cloud import storage

    # If you don't specify credentials when constructing the client, the
    # client library will look for credentials in the environment.
    storage_client = storage.Client()

    # Make an authenticated API request
    buckets = list(storage_client.list_buckets())
    print(buckets)

In [17]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=r"C:\Users\Jost\Desktop\covid-304dab3f9983.json"

In [18]:
implicit()

[]


In [19]:
client = bigquery.Client()
dataset_ref = client.dataset("noaa_gsod", project="bigquery-public-data")
dataset = client.get_dataset(dataset_ref)

tables = list(client.list_tables(dataset))

table_ref = dataset_ref.table("stations")
table = client.get_table(table_ref)
stations_df = client.list_rows(table).to_dataframe()

table_ref = dataset_ref.table("gsod2020")
table = client.get_table(table_ref)
twenty_twenty_df = client.list_rows(table).to_dataframe()

stations_df['STN'] = stations_df['usaf'] + '-' + stations_df['wban']
twenty_twenty_df['STN'] = twenty_twenty_df['stn'] + '-' + twenty_twenty_df['wban']

cols_1 = ['STN', 'mo', 'da', 'temp', 'min', 'max', 'stp', 'slp', 'dewp', 'wdsp', 'prcp', 'fog']
cols_2 = ['STN', 'country', 'state', 'call', 'lat', 'lon', 'elev']
weather_df = twenty_twenty_df[cols_1].join(stations_df[cols_2].set_index('STN'), on='STN')

weather_df['temp'] = weather_df['temp'].apply(lambda x: np.nan if x==9999.9 else x)
weather_df['max'] = weather_df['max'].apply(lambda x: np.nan if x==9999.9 else x)
weather_df['min'] = weather_df['min'].apply(lambda x: np.nan if x==9999.9 else x)
weather_df['stp'] = weather_df['stp'].apply(lambda x: np.nan if x==9999.9 else x)
weather_df['slp'] = weather_df['slp'].apply(lambda x: np.nan if x==9999.9 else x)
weather_df['dewp'] = weather_df['dewp'].apply(lambda x: np.nan if x==9999.9 else x)
weather_df['wdsp'] = weather_df['wdsp'].apply(lambda x: np.nan if x==999.9 else x)
weather_df['prcp'] = weather_df['prcp'].apply(lambda x: np.nan if x==99.9 else x)

display(weather_df.tail(10))
weather_df.info(verbose=True)

  if __name__ == '__main__':
  del sys.path[0]


Unnamed: 0,STN,mo,da,temp,min,max,stp,slp,dewp,wdsp,prcp,fog,country,state,call,lat,lon,elev
2013514,999999-00425,4,2,35.8,32.9,38.7,991.2,,31.2,9.5,99.99,1,US,VT,,43.985,-73.095,149.1
2013515,999999-00425,4,11,36.1,30.7,45.7,988.0,,24.3,5.1,99.99,1,US,VT,,43.985,-73.095,149.1
2013516,999999-00425,5,9,33.0,28.4,37.8,986.9,,24.9,7.6,99.99,1,US,VT,,43.985,-73.095,149.1
2013517,999999-00425,5,11,44.1,32.2,53.2,992.7,,36.7,4.2,99.99,1,US,VT,,43.985,-73.095,149.1
2013518,999999-00440,2,7,35.8,30.9,41.9,972.3,,30.1,5.6,99.99,1,US,AL,,34.269,-85.858,295.7
2013519,999999-00319,1,10,31.2,14.0,39.2,886.8,,22.4,7.2,99.99,1,,,,,,
2013520,999999-00319,1,28,34.7,28.4,44.6,889.5,,30.4,7.2,99.99,1,,,,,,
2013521,999999-00319,2,24,34.8,21.2,48.2,884.1,,24.1,8.7,99.99,1,,,,,,
2013522,999999-00319,3,28,39.6,33.8,50.0,878.4,,35.1,15.4,99.99,1,,,,,,
2013523,999999-00319,4,16,36.0,30.2,51.8,889.3,,30.4,13.8,99.99,1,,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2013524 entries, 0 to 2013523
Data columns (total 18 columns):
STN        object
mo         object
da         object
temp       float64
min        float64
max        float64
stp        float64
slp        float64
dewp       float64
wdsp       object
prcp       float64
fog        object
country    object
state      object
call       object
lat        float64
lon        float64
elev       object
dtypes: float64(9), object(9)
memory usage: 276.5+ MB


Now let's compute absolute and relative humidity from temperature and dew point:

In [20]:
# convert everything into celsius
temp = (weather_df['temp'] - 32) / 1.8
dewp = (weather_df['dewp'] - 32) / 1.8
    
# compute relative humidity as ratio between actual vapour pressure (computed from dewpoint temperature)
# and saturation vapour pressure (computed from temperature) (the constant 6.1121 cancels out)
weather_df['rh'] = (np.exp((18.678*dewp)/(257.14+dewp))/np.exp((18.678*temp)/(257.14+temp)))

# calculate actual vapour pressure (in pascals)
# then use it to compute absolute humidity from the gas law of vapour 
# (ah = mass / volume = pressure / (constant * temperature))
weather_df['ah'] = ((np.exp((18.678*dewp)/(257.14+dewp))) * 6.1121 * 100) / (461.5 * temp)

In [69]:
train=pd.read_csv("covid.csv")
train=train.rename(columns={"countriesAndTerritories": "country"})
train.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,country,geoId,countryterritoryCode,popData2018,continentExp
0,14/06/2020,14,6,2020,556,5,Afghanistan,AF,AFG,37172386.0,Asia
1,13/06/2020,13,6,2020,656,20,Afghanistan,AF,AFG,37172386.0,Asia
2,12/06/2020,12,6,2020,747,21,Afghanistan,AF,AFG,37172386.0,Asia
3,11/06/2020,11,6,2020,684,21,Afghanistan,AF,AFG,37172386.0,Asia
4,10/06/2020,10,6,2020,542,15,Afghanistan,AF,AFG,37172386.0,Asia


In [70]:
temp=pd.read_csv("train.csv")
temp.head()


Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0
1,2,,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0
2,3,,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0
3,4,,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0
4,5,,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0


In [75]:
temp=pd.read_csv("train.csv")
list(temp.columns)
#temp=temp[['Country/Region','Lat','Long','Date']]
temp=temp.drop_duplicates()
temp=temp.rename(columns={"Country/Region": "country"})

temp.head()


Unnamed: 0,Id,Province/State,country,Lat,Long,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0
1,2,,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0
2,3,,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0
3,4,,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0
4,5,,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0


In [76]:
train=pd.merge(train,temp)
train.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,country,geoId,countryterritoryCode,popData2018,continentExp,Lat,Long,Date,day_from_jan_first,Id,Province/State,ConfirmedCases,Fatalities
0,14/06/2020,14,6,2020,556,5,Afghanistan,AF,AFG,37172386.0,Asia,33.0,65.0,2020-01-22,22,1,,0.0,0.0
1,13/06/2020,13,6,2020,656,20,Afghanistan,AF,AFG,37172386.0,Asia,33.0,65.0,2020-01-22,22,1,,0.0,0.0
2,12/06/2020,12,6,2020,747,21,Afghanistan,AF,AFG,37172386.0,Asia,33.0,65.0,2020-01-22,22,1,,0.0,0.0
3,11/06/2020,11,6,2020,684,21,Afghanistan,AF,AFG,37172386.0,Asia,33.0,65.0,2020-01-22,22,1,,0.0,0.0
4,10/06/2020,10,6,2020,542,15,Afghanistan,AF,AFG,37172386.0,Asia,33.0,65.0,2020-01-22,22,1,,0.0,0.0


In [77]:
weather_df.head()

Unnamed: 0,STN,mo,da,temp,min,max,stp,slp,dewp,wdsp,...,fog,country,state,call,lat,lon,elev,rh,ah,day_from_jan_first
0,222130-99999,2,13,31.6,30.4,32.4,975.2,991.7,30.5,4.4,...,1,RS,,,67.55,33.35,134.0,0.956406,-5.608638,44
1,224130-99999,4,23,28.9,24.8,35.6,1.6,1011.7,18.4,2.9,...,1,RS,,,65.783,33.933,80.0,0.644366,-0.436884,114
2,246060-99999,6,3,58.1,42.1,68.4,978.2,1002.8,48.1,999.9,...,1,RS,,,63.583,103.967,211.0,0.691315,0.171131,155
3,220280-99999,1,28,11.3,5.2,19.9,10.1,1014.1,8.6,9.2,...,1,RS,,,69.2,35.117,30.0,0.886805,-0.042598,28
4,225290-99999,4,25,31.6,27.1,37.6,989.3,990.0,25.4,7.3,...,1,RS,,,64.233,35.883,3.0,0.775665,-4.54872,116


# Week 1:

In [78]:
weather_df['day_from_jan_first'] = (weather_df['da'].apply(int)
                                   + 31*(weather_df['mo']=='02') 
                                   + 60*(weather_df['mo']=='03')
                                   + 91*(weather_df['mo']=='04')
                                   + 121*(weather_df['mo']=='05')  
                                   + 152*(weather_df['mo']=='06')  

                                   )

mo = train['Date'].apply(lambda x: x[5:7])
da = train['Date'].apply(lambda x: x[8:10])
train['day_from_jan_first'] = (da.apply(int)
                               + 31*(mo=='02') 
                               + 60*(mo=='03')
                               + 91*(mo=='04') 
                               + 121*(mo=='05')
                               + 152*(mo=='06')  
                              )

C = []
for j in train.index:
    df = train.iloc[j:(j+1)]
    mat = cdist(df[['Lat','Long', 'day_from_jan_first']],
                weather_df[['lat','lon', 'day_from_jan_first']], 
                metric='euclidean')
    new_df = pd.DataFrame(mat, index=df.Id, columns=weather_df.index)
    arr = new_df.values
    new_close = np.where(arr == np.nanmin(arr, axis=1)[:,None],new_df.columns,False)
    L = [i[i.astype(bool)].tolist()[0] for i in new_close]
    C.append(L[0])
    
train['closest_station'] = C

train = train.set_index('closest_station').join(weather_df[['temp', 'min', 'max', 'stp', 'slp', 'dewp', 'rh', 'ah', 'wdsp', 'prcp', 'fog']], ).reset_index().drop(['index'], axis=1)
train.sort_values(by=['Id'], inplace=True)
train.index = train['Id'].apply(lambda x: x-1)
train.head()

KeyboardInterrupt: 

In [7]:
train.to_csv('data.csv', index=False)

# Week 2:

See commit no. 16.

# Week 3:

See commit no. 21.

# Week 4:

In [8]:
display(train_2.head())
#display(test_2.head())

# Create cross feature country + province
train['country+province'] = train['Country/Region'].fillna('') + '-' + train['Province/State'].fillna('')
train_2['country+province'] = train_2['Country_Region'].fillna('') + '-' + train_2['Province_State'].fillna('')

# Conversion table to add coordinates for regions that weren't in week 1's dataset
df = train.groupby('country+province')[['Lat', 'Long']].mean()
df.loc['United Kingdom-'] = df.loc['United Kingdom-United Kingdom']
df.loc['Diamond Princess-'] = df.loc['Cruise Ship-Diamond Princess']
df.loc['Denmark-'] = df.loc['Denmark-Denmark']
df.loc['France-'] = df.loc['France-France']
df.loc['Gambia-'] = df.loc['Gambia, The-']
df.loc['Netherlands-'] = df.loc['Netherlands-Netherlands']
df.loc['Dominica-'] = (15.3, -61.383333)
df.loc['Angola-'] = (-8.830833, 13.245)
df.loc['Bahamas-'] = (25.066667, -77.333333)
df.loc['Belize-'] = (17.498611, -88.188611)
df.loc['Cabo Verde-'] = (14.916667, -23.516667)
df.loc['Chad-'] = (12.134722, 15.055833)
df.loc['Denmark-Greenland'] = (64.181389, -51.694167)
df.loc['El Salvador-'] = (13.698889, -89.191389)
df.loc['Eritrea-'] = (15.322778, 38.925)
df.loc['Fiji-'] = (-18.166667, 178.45)
df.loc['France-Martinique'] = (14.666667, -61)
df.loc['France-New Caledonia'] = (-22.2758, 166.458)
df.loc['Grenada-'] = (12.05, -61.75)
df.loc['Guinea-Bissau-'] = (11.85, -15.566667)
df.loc['Haiti-'] = (18.533333, -72.333333)
df.loc['Laos-'] = (17.966667, 102.6)
df.loc['Libya-'] = (32.887222, 13.191389)
df.loc['Madagascar-'] = (-18.933333, 47.516667)
df.loc['Mali-'] = (12.639167, -8.002778)
df.loc['Mozambique-'] = (-25.966667, 32.583333)
df.loc['Netherlands-Sint Maarten'] = (18.052778, -63.0425)
df.loc['Nicaragua-'] = (12.136389, -86.251389)
df.loc['Niger-'] = (13.511667, 2.125278)
df.loc['Papua New Guinea-'] = (-9.478889, 147.149444)
df.loc['Saint Kitts and Nevis-'] = (17.3, -62.733333)
df.loc['Syria-'] = (33.513056, 36.291944)
df.loc['Timor-Leste-'] = (-8.566667, 125.566667)
df.loc['Uganda-'] = (0.313611, 32.581111)
df.loc['Zimbabwe-'] = (-17.829167, 31.052222)
df.loc['United Kingdom-Bermuda'] = (32.293, -64.782)
df.loc['United Kingdom-Isle of Man'] = (54.145, -4.482)

df.loc['Botswana-'] = (-24.658056, 25.912222)
df.loc['Burma-'] = (16.85, 96.183333)
df.loc['Burundi-'] = (-3.383333, 29.366667)
df.loc['Canada-Northwest Territories'] = (62.442222, -114.394722)
df.loc['Canada-Yukon'] = (60.716667, -135.05)
df.loc['Kosovo-'] = (42.666667, 21.166667)
df.loc['MS Zaandam-'] = (26.086111, -80.115278)
df.loc['Sierra Leone-'] = (8.484444, -13.234444)
df.loc['United Kingdom-Anguilla'] = (18.220833, -63.051667)
df.loc['United Kingdom-British Virgin Islands'] = (18.431389, -64.623056)
df.loc['United Kingdom-Turks and Caicos Islands'] = (21.783333, -72.283333)
df.loc['West Bank and Gaza-'] = (31.703056, 35.195556)

df.loc['France-Saint Pierre and Miquelon'] = (46.7778, -56.1778)
df.loc['Malawi-'] = (-13.983333, 33.783333)
df.loc['Netherlands-Bonaire, Sint Eustatius and Saba'] = (12.144444, -68.265556)
df.loc['Sao Tome and Principe-'] = (0.336111, 6.730556)
df.loc['South Sudan-'] = (4.85, 31.6)
df.loc['United Kingdom-Falkland Islands (Malvinas)'] = (-51.694444, -57.852778)
df.loc['Western Sahara-'] = (27.153611, -13.203333)
 
# add latitudes and longitudes to new dataframe
train_2['Lat'] = train_2['country+province'].apply(lambda x: df.loc[x, 'Lat'])
train_2['Long'] = train_2['country+province'].apply(lambda x: df.loc[x, 'Long'])

# compute closest weather station, as done for week 1
mo = train_2['Date'].apply(lambda x: x[5:7])
da = train_2['Date'].apply(lambda x: x[8:10])
train_2['day_from_jan_first'] = (da.apply(int)
                               + 31*(mo=='02') 
                               + 60*(mo=='03')
                               + 91*(mo=='04')  
                              )

C = []
for j in train_2.index:
    df = train_2.iloc[j:(j+1)]
    mat = cdist(df[['Lat','Long', 'day_from_jan_first']],
                weather_df[['lat','lon', 'day_from_jan_first']], 
                metric='euclidean')
    new_df = pd.DataFrame(mat, index=df.Id, columns=weather_df.index)
    arr = new_df.values
    new_close = np.where(arr == np.nanmin(arr, axis=1)[:,None],new_df.columns,False)
    L = [i[i.astype(bool)].tolist()[0] for i in new_close]
    C.append(L[0])
    
train_2['closest_station'] = C

# add weather observations from closest station
train_2= train_2.set_index('closest_station').join(weather_df[['temp', 'min', 'max', 'stp', 'slp', 'dewp', 'rh', 'ah', 'wdsp', 'prcp', 'fog']], ).reset_index().drop(['index'], axis=1)
train_2.sort_values(by=['Id'], inplace=True)
train_2.index = train_2['Id'].apply(lambda x: x-1)
display(train_2.head())

# output
train_2.to_csv('training_data_with_weather_info_week_4.csv', index=False)

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities
0,1,,Afghanistan,2020-01-22,0.0,0.0
1,2,,Afghanistan,2020-01-23,0.0,0.0
2,3,,Afghanistan,2020-01-24,0.0,0.0
3,4,,Afghanistan,2020-01-25,0.0,0.0
4,5,,Afghanistan,2020-01-26,0.0,0.0


Unnamed: 0_level_0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,country+province,Lat,Long,day_from_jan_first,...,min,max,stp,slp,dewp,rh,ah,wdsp,prcp,fog
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,,Afghanistan,2020-01-22,0.0,0.0,Afghanistan-,33.0,65.0,22,...,33.6,54.9,999.9,1024.3,27.4,0.545709,0.186448,9.4,0.0,0
1,2,,Afghanistan,2020-01-23,0.0,0.0,Afghanistan-,33.0,65.0,23,...,32.7,55.9,999.9,1020.8,22.8,0.461259,0.163225,14.9,99.99,1
2,3,,Afghanistan,2020-01-24,0.0,0.0,Afghanistan-,33.0,65.0,24,...,36.9,43.2,999.9,1018.6,34.5,0.801794,0.325375,10.4,0.17,1
3,4,,Afghanistan,2020-01-25,0.0,0.0,Afghanistan-,33.0,65.0,25,...,37.9,56.3,999.9,1018.0,37.8,0.728175,0.214562,6.1,0.57,1
4,5,,Afghanistan,2020-01-26,0.0,0.0,Afghanistan-,33.0,65.0,26,...,36.1,53.1,999.9,1014.8,33.2,0.685513,0.231656,10.8,0.0,1
