In this notebook I add weather informations, such as temperature and precipitations, to the training set of the [COVID-19 forecasting competition](https://www.kaggle.com/c/covid19-global-forecasting-week-1/discussion), in order to determine whether there is any correlation with the growth of confirmed cases. Weather data is imported from the [NOAA GSOD dataset](https://www.kaggle.com/noaa/gsod), continuously updated to include recent measurments.

[Data for this and previous weeks is available in dataset form here.](https://www.kaggle.com/davidbnn92/weather-data-for-covid19-data-analysis)

Edit: now missing values are denoted with usual `NaN`s, and not with `9999`s.

Edit 2: information concerning humidity was added, following [brennanmurphy](https://www.kaggle.com/brennanmurphy)'s advice. More specifically, dewpoint temperature was added from the NOAA GSOD dataset, then absolute and relative humidity were computed.

In [1]:
import numpy as np
import pandas as pd

import os
import json
from pathlib import Path

import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.spatial.distance import cdist

for dirname, _, filenames in os.walk('/kaggle/input'):
    print(dirname)

from google.cloud import bigquery




Here is the weather data:
* `temp`: Mean temperature for the day in degrees Fahrenheit to tenths.
* `max`: Maximum temperature reported during the day in Fahrenheit to tenths--time of max temp report varies by country and region, so this will sometimes not be the max for the calendar day.
* `min`: Minimum temperature reported during the day in Fahrenheit to tenths--time of min temp report varies by country and region, so this will sometimes not be the min for the calendar day.
* `stp`: Mean station pressure for the day in millibars to tenths.
* `slp`: Mean sea level pressure for the day in millibars to tenths.
* `dewp`: Mean dew point for the day in degrees Fahrenheit to tenths. 
* `wdsp`: Mean wind speed for the day in knots to tenths.
* `prcp`: Total precipitation (rain and/or melted snow) reported during the day in inches and hundredths; will usually not end with the midnight observation--i.e., may include latter part of previous day. .00 indicates no measurable precipitation (includes a trace).
* `fog`: Indicators (1 = yes, 0 = no/not reported) for the occurrence during the day

In [3]:
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]=r"C:\Users\Jost\Desktop\covid-304dab3f9983.json"

Import weather data

In [15]:
data=pd.read_csv("train_weather.csv")
data=data.rename(columns={"Country/Region": "country"})

data.head()

Unnamed: 0,Id,Province/State,country,Lat,Long,Date,ConfirmedCases,Fatalities,day_from_jan_first,temp,min,max,stp,slp,dewp,rh,ah,wdsp,prcp,fog
0,1,,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0,22,42.6,33.6,54.9,999.9,1024.3,27.4,0.545709,0.186448,9.4,0.0,0
1,2,,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0,23,42.0,32.7,55.9,999.9,1020.8,22.8,0.461259,0.163225,14.9,99.99,1
2,3,,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0,24,40.1,36.9,43.2,999.9,1018.6,34.5,0.801794,0.325375,10.4,0.17,1
3,4,,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0,25,46.0,37.9,56.3,999.9,1018.0,37.8,0.728175,0.214562,6.1,0.57,1
4,5,,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0,26,42.8,36.1,53.1,999.9,1014.8,33.2,0.685513,0.231656,10.8,0.0,1


In [21]:
temp=pd.read_csv("covid.csv")
temp=temp.rename(columns={"countriesAndTerritories": "country"})
temp=temp[["country","continentExp","popData2018"]]
temp=temp.drop_duplicates()
temp.head()

Unnamed: 0,country,continentExp,popData2018
0,Afghanistan,Asia,37172386.0
157,Albania,Europe,2866376.0
255,Algeria,Africa,42228429.0
417,Andorra,Europe,77006.0
510,Angola,Africa,30809762.0


In [33]:
data=pd.merge(data,temp)
data[data.country=="Germany"].head()

Unnamed: 0,Id,Province/State,country,Lat,Long,Date,ConfirmedCases,Fatalities,day_from_jan_first,temp,...,stp,slp,dewp,rh,ah,wdsp,prcp,fog,continentExp,popData2018
1512,10510,,Germany,51.0,9.0,2020-01-22,0.0,0.0,22,27.1,...,999.9,,26.3,0.967504,-0.385437,0.8,99.99,1,Europe,82927922.0
1513,10511,,Germany,51.0,9.0,2020-01-23,0.0,0.0,23,32.1,...,999.9,,30.1,0.922172,22.072696,1.6,0.0,1,Europe,82927922.0
1514,10512,,Germany,51.0,9.0,2020-01-24,0.0,0.0,24,30.5,...,999.9,,27.9,0.899242,-1.34494,6.5,0.0,0,Europe,82927922.0
1515,10513,,Germany,51.0,9.0,2020-01-25,0.0,0.0,25,31.0,...,999.9,,28.5,0.903143,-2.067685,5.4,0.0,0,Europe,82927922.0
1516,10514,,Germany,51.0,9.0,2020-01-26,0.0,0.0,26,33.7,...,999.9,,32.6,0.956791,1.436627,5.5,99.99,1,Europe,82927922.0


In [37]:
data=data[data.continentExp=="Europe"]
data


Unnamed: 0,Id,Province/State,country,Lat,Long,Date,ConfirmedCases,Fatalities,day_from_jan_first,temp,...,stp,slp,dewp,rh,ah,wdsp,prcp,fog,continentExp,popData2018
0,94,,Albania,41.1533,20.1683,2020-01-22,0.0,0.0,22,42.3,...,18.9,1029.3,32.8,0.687721,0.239029,0.2,0.00,0,Europe,2866376.0
1,95,,Albania,41.1533,20.1683,2020-01-23,0.0,0.0,23,44.1,...,13.9,1024.2,36.1,0.732094,0.232129,0.2,0.00,0,Europe,2866376.0
2,96,,Albania,41.1533,20.1683,2020-01-24,0.0,0.0,24,46.7,...,15.4,1025.7,34.8,0.629623,0.181447,0.8,0.00,0,Europe,2866376.0
3,97,,Albania,41.1533,20.1683,2020-01-25,0.0,0.0,25,51.5,...,13.8,1023.9,45.3,0.791856,0.205985,0.7,0.00,1,Europe,2866376.0
4,98,,Albania,41.1533,20.1683,2020-01-26,0.0,0.0,26,52.5,...,8.8,1018.8,51.8,0.974393,0.250212,0.5,0.06,1,Europe,2866376.0
5,99,,Albania,41.1533,20.1683,2020-01-27,0.0,0.0,27,50.9,...,4.4,1014.5,50.7,0.992575,0.260514,2.5,1.38,1,Europe,2866376.0
6,100,,Albania,41.1533,20.1683,2020-01-28,0.0,0.0,28,52.7,...,4.1,1014.2,47.7,0.829612,0.212542,1.6,0.08,1,Europe,2866376.0
7,101,,Albania,41.1533,20.1683,2020-01-29,0.0,0.0,29,54.0,...,999.0,1008.9,51.2,0.901614,0.228012,1.7,1.10,1,Europe,2866376.0
8,102,,Albania,41.1533,20.1683,2020-01-30,0.0,0.0,30,49.8,...,5.1,1015.2,41.0,0.715021,0.191246,1.7,0.39,1,Europe,2866376.0
9,103,,Albania,41.1533,20.1683,2020-01-31,0.0,0.0,31,49.5,...,8.7,1018.8,37.5,0.630572,0.169634,1.1,0.00,0,Europe,2866376.0


In [56]:
#data['outbreak'] = np.where(data['ConfirmedCases']>0, 1, 0)
data=data[data.ConfirmedCases>20]
data["newCases"]=data.ConfirmedCases - data.ConfirmedCases.shift(1)
data["newCases"]= data.groupby(['country'])['ConfirmedCases'].shift(1)
data["newCasespMio"]=round(data["newCases"]*1000000/data["popData2018"])

data[data.country=="Germany"]

Unnamed: 0,Id,Province/State,country,Lat,Long,Date,ConfirmedCases,Fatalities,day_from_jan_first,temp,...,rh,ah,wdsp,prcp,fog,continentExp,popData2018,outbreak,newCases,newCasespMio
1547,10545,,Germany,51.0,9.0,2020-02-26,27.0,0.0,57,37.4,...,0.796512,0.436151,9.1,99.99,1,Europe,82927922.0,1,,
1548,10546,,Germany,51.0,9.0,2020-02-27,46.0,0.0,58,36.3,...,0.828547,0.545515,7.3,99.99,1,Europe,82927922.0,1,27.0,0.0
1549,10547,,Germany,51.0,9.0,2020-02-28,48.0,0.0,59,37.4,...,0.764902,0.418842,8.4,99.99,1,Europe,82927922.0,1,46.0,1.0
1550,10548,,Germany,51.0,9.0,2020-02-29,79.0,0.0,60,45.3,...,0.753752,0.227639,10.0,99.99,1,Europe,82927922.0,1,48.0,1.0
1551,10549,,Germany,51.0,9.0,2020-03-01,130.0,0.0,61,43.8,...,0.697672,0.224244,9.7,99.99,1,Europe,82927922.0,1,79.0,1.0
1552,10550,,Germany,51.0,9.0,2020-03-02,159.0,0.0,62,43.2,...,0.745901,0.246833,6.2,99.99,1,Europe,82927922.0,1,130.0,2.0
1553,10551,,Germany,51.0,9.0,2020-03-03,196.0,0.0,63,39.8,...,0.730824,0.304399,4.1,99.99,1,Europe,82927922.0,1,159.0,2.0
1554,10552,,Germany,51.0,9.0,2020-03-04,262.0,0.0,64,37.5,...,0.790178,0.426493,5.2,99.99,1,Europe,82927922.0,1,196.0,2.0
1555,10553,,Germany,51.0,9.0,2020-03-05,482.0,0.0,65,40.0,...,0.827702,0.338765,4.8,99.99,1,Europe,82927922.0,1,262.0,3.0
1556,10554,,Germany,51.0,9.0,2020-03-06,670.0,0.0,66,41.1,...,0.855064,0.321112,6.1,99.99,1,Europe,82927922.0,1,482.0,6.0
