# NOAA Weather Data

This notebook contains code to scrape weather data from NOAA's web services: https://www.ncdc.noaa.gov/cdo-web/webservices/v2. 

In [None]:
import requests 
import noaa_token

In [291]:
# Max temperature; min temp; precipitation; average wind speed; fastest 2-min wind speed
data_types = ['TMAX', 'TMIN', 'TAVG', 'PRCP', 'AWND', 'WSF2']


# these are labeled "incomplete" since the data coverage <100% - these variables are binary and any missing dates
# in the api response mean the specified weather event did not occur.
# Fog; smog/haze 
incomplete_data_types = ['WT01', 'WT08']

In [286]:
params = {
    'datasetid': 'GHCND',
    'datatypeid': 'TMAX',
    'stationid':'GHCND:USW00023234',
    'startdate':'2021-02-01',
    'enddate':'2021-02-28',
    'units':'standard',
    'limit':31
}
auth_header = {'token': noaa_token._token}
r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data', params=params, headers=auth_header)

In [287]:
feb_results = r.json()['results']
print(len(feb_results))
feb_results[0]

28


{'date': '2021-02-01T00:00:00',
 'datatype': 'TMAX',
 'station': 'GHCND:USW00023234',
 'attributes': ',,W,2400',
 'value': 65.0}

In [289]:
#feb = pd.DataFrame(feb_results)
feb.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2021-02-01T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",65.0
1,2021-02-02T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",61.0
2,2021-02-03T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",59.0
3,2021-02-04T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",62.0
4,2021-02-05T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",67.0


In [292]:
auth_header = {'token': noaa_token._token}

for i in data_types[1:]:
    params = {
        'datasetid': 'GHCND',
        'datatypeid': i,
        'stationid':'GHCND:USW00023234',
        'startdate':'2021-02-01',
        'enddate':'2021-02-28',
        'units':'standard',
        'limit':31
    }

    r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data', params=params, headers=auth_header)
    feb[i]  = [i['value'] for i in r.json()['results']]

In [293]:
feb

Unnamed: 0,date,datatype,station,attributes,value,TMIN,TAVG,PRCP,AWND,WSF2
0,2021-02-01T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",65.0,48.0,55.0,0.35,9.6,23.9
1,2021-02-02T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",61.0,51.0,57.0,0.18,6.5,14.1
2,2021-02-03T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",59.0,43.0,52.0,0.0,8.3,18.1
3,2021-02-04T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",62.0,45.0,53.0,0.0,6.3,16.1
4,2021-02-05T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",67.0,43.0,52.0,0.0,4.9,23.0
5,2021-02-06T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",67.0,44.0,53.0,0.0,5.6,21.0
6,2021-02-07T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",67.0,44.0,53.0,0.0,6.3,18.1
7,2021-02-08T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",58.0,43.0,52.0,0.0,6.7,18.1
8,2021-02-09T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",60.0,51.0,54.0,0.0,6.3,17.0
9,2021-02-10T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",62.0,49.0,54.0,0.0,8.5,21.9


In [311]:
params = {
    'datasetid': 'GHCND',
    'datatypeid': 'WT01',
    'stationid':'GHCND:USW00023234',
    'startdate':'2021-02-01',
    'enddate':'2021-02-28',
    'units':'standard',
    'limit':31
}

r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data', params=params, headers=auth_header)

In [312]:
s = pd.DataFrame(r.json()['results'])
_s = pd.Series(s['value'].values, pd.to_datetime(s['date']))
fog = _s.asfreq('d', fill_value=0)
new_date_range = pd.date_range(start="2021-02-01", end="2021-02-28", freq="D")
fog = fog.reindex(new_date_range, fill_value=0)
fog

2021-02-01    1
2021-02-02    1
2021-02-03    0
2021-02-04    0
2021-02-05    0
2021-02-06    1
2021-02-07    1
2021-02-08    1
2021-02-09    0
2021-02-10    0
2021-02-11    1
2021-02-12    1
2021-02-13    1
2021-02-14    1
2021-02-15    1
2021-02-16    1
2021-02-17    1
2021-02-18    0
2021-02-19    1
2021-02-20    1
2021-02-21    0
2021-02-22    0
2021-02-23    0
2021-02-24    0
2021-02-25    0
2021-02-26    0
2021-02-27    0
2021-02-28    0
Freq: D, dtype: int64

In [313]:
feb['FOG'] = fog.values

In [302]:
params = {
    'datasetid': 'GHCND',
    'datatypeid': 'WT08',
    'stationid':'GHCND:USW00023234',
    'startdate':'2021-02-01',
    'enddate':'2021-02-28',
    'units':'standard',
    'limit':31
}

r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data', params=params, headers=auth_header)

In [304]:
s = pd.DataFrame(r.json()['results'])
_s = pd.Series(s['value'].values, pd.to_datetime(s['date']))
haze = _s.asfreq('d', fill_value=0)
new_date_range = pd.date_range(start="2021-02-01", end="2021-02-28", freq="D")
haze = haze.reindex(new_date_range, fill_value=0)
haze

2021-02-01    1
2021-02-02    1
2021-02-03    1
2021-02-04    0
2021-02-05    1
2021-02-06    1
2021-02-07    1
2021-02-08    1
2021-02-09    1
2021-02-10    1
2021-02-11    1
2021-02-12    1
2021-02-13    1
2021-02-14    1
2021-02-15    1
2021-02-16    1
2021-02-17    1
2021-02-18    0
2021-02-19    1
2021-02-20    0
2021-02-21    1
2021-02-22    0
2021-02-23    1
2021-02-24    0
2021-02-25    1
2021-02-26    1
2021-02-27    1
2021-02-28    1
Freq: D, dtype: int64

In [307]:
feb['HAZE'] = haze.values

In [314]:
feb

Unnamed: 0,date,datatype,station,attributes,value,TMIN,TAVG,PRCP,AWND,WSF2,FOG,HAZE
0,2021-02-01T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",65.0,48.0,55.0,0.35,9.6,23.9,1,1
1,2021-02-02T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",61.0,51.0,57.0,0.18,6.5,14.1,1,1
2,2021-02-03T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",59.0,43.0,52.0,0.0,8.3,18.1,0,1
3,2021-02-04T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",62.0,45.0,53.0,0.0,6.3,16.1,0,0
4,2021-02-05T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",67.0,43.0,52.0,0.0,4.9,23.0,0,1
5,2021-02-06T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",67.0,44.0,53.0,0.0,5.6,21.0,1,1
6,2021-02-07T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",67.0,44.0,53.0,0.0,6.3,18.1,1,1
7,2021-02-08T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",58.0,43.0,52.0,0.0,6.7,18.1,1,1
8,2021-02-09T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",60.0,51.0,54.0,0.0,6.3,17.0,0,1
9,2021-02-10T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",62.0,49.0,54.0,0.0,8.5,21.9,0,1


In [316]:
feb.rename(columns={'value':'TMAX'}).drop(columns=['datatype', 'station', 'attributes']).to_csv('feb21_weather_data.csv', index=False)

------

In [192]:
jan_results = r.json()['results']
print(len(jan_results))
jan_results[0]

31


{'date': '2021-01-01T00:00:00',
 'datatype': 'AWND',
 'station': 'GHCND:USW00023234',
 'attributes': ',,W,',
 'value': 4.0}

In [193]:
#jan = pd.DataFrame(jan_tmax)
jan.head()

Unnamed: 0,date,datatype,station,attributes,value,TMIN,PRCP,TAVG,AWND,WSF2
0,2021-01-01T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",60.0,43.0,0.0,50.0,19.9,19.9
1,2021-01-02T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",54.0,50.0,0.19,51.0,13.0,13.0
2,2021-01-03T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",53.0,49.0,0.0,52.0,8.9,8.9
3,2021-01-04T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",62.0,51.0,0.3,55.0,29.1,29.1
4,2021-01-05T00:00:00,TMAX,GHCND:USW00023234,",,W,2400",58.0,45.0,0.0,52.0,12.1,12.1


In [194]:
jan['AWND']  = [i['value'] for i in r.json()['results']]

In [196]:
params = {
    'datasetid': 'GHCND',
    'datatypeid': 'WT01',
    'stationid':'GHCND:USW00023234',
    'startdate':'2021-01-01',
    'enddate':'2021-01-31',
    'units':'standard',
    'limit':31
}
auth_header = {'token': noaa_token._token}
r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data', params=params, headers=auth_header)

In [200]:
fog_jan = r.json()['results']
fog_jan[0]

{'date': '2021-01-01T00:00:00',
 'datatype': 'WT01',
 'station': 'GHCND:USW00023234',
 'attributes': ',,W,',
 'value': 1}

In [226]:
s = pd.DataFrame(fog_jan)
_s = pd.Series(s['value'].values, pd.to_datetime(s['date']))
fog = _s.asfreq('d', fill_value=0)
new_date_range = pd.date_range(start="2021-01-01", end="2021-01-31", freq="D")
fog = fog.reindex(new_date_range, fill_value=0)
fog

2021-01-01    1
2021-01-02    1
2021-01-03    1
2021-01-04    1
2021-01-05    1
2021-01-06    1
2021-01-07    1
2021-01-08    1
2021-01-09    1
2021-01-10    0
2021-01-11    1
2021-01-12    1
2021-01-13    1
2021-01-14    1
2021-01-15    1
2021-01-16    1
2021-01-17    0
2021-01-18    0
2021-01-19    0
2021-01-20    0
2021-01-21    0
2021-01-22    0
2021-01-23    0
2021-01-24    1
2021-01-25    0
2021-01-26    1
2021-01-27    1
2021-01-28    1
2021-01-29    1
2021-01-30    0
2021-01-31    0
Freq: D, dtype: int64

In [281]:
jan['FOG'] = fog.reset_index()[0]

In [270]:
params = {
    'datasetid': 'GHCND',
    'datatypeid': 'WT08',
    'stationid':'GHCND:USW00023234',
    'startdate':'2021-01-01',
    'enddate':'2021-01-31',
    'units':'standard',
    'limit':31
}
auth_header = {'token': noaa_token._token}
r = requests.get('https://www.ncdc.noaa.gov/cdo-web/api/v2/data', params=params, headers=auth_header)

In [273]:
s = pd.DataFrame(r.json()['results'])

In [274]:

_s = pd.Series(s['value'].values, pd.to_datetime(s['date']))
new_date_range = pd.date_range(start="2021-01-01", end="2021-01-31", freq="D")
haze = _s.reindex(new_date_range, fill_value=0)
haze

2021-01-01    1
2021-01-02    1
2021-01-03    1
2021-01-04    1
2021-01-05    1
2021-01-06    1
2021-01-07    1
2021-01-08    1
2021-01-09    1
2021-01-10    1
2021-01-11    1
2021-01-12    1
2021-01-13    1
2021-01-14    1
2021-01-15    1
2021-01-16    0
2021-01-17    1
2021-01-18    0
2021-01-19    1
2021-01-20    1
2021-01-21    1
2021-01-22    1
2021-01-23    1
2021-01-24    1
2021-01-25    1
2021-01-26    1
2021-01-27    1
2021-01-28    1
2021-01-29    1
2021-01-30    1
2021-01-31    1
Freq: D, dtype: int64

In [283]:
jan['HAZE'] = haze.values

In [317]:
jan.rename(columns={'value':'TMAX'}).drop(columns=['datatype', 'station', 'attributes']).to_csv('jan21_weather_data.csv', index=False)