## Query daily weather data using NOAA CDO Web Service

### Setup dependencies

In [1]:
# Set up Dependencies
import pandas as pd
import calendar
import json
import requests
import time

# NOAA CDO web service token
from api_keys import noaa_token

### Build API Request
* data of interest: temperature, precipitation and snowfall
* locations: Chicago, New York City and New Orleans
* API limits: at most 1,000 records per request and 5 requests per second

In [2]:
# Function to create API request
def get_daily_weather(url, token, dataset, datatype, station,
                      start_time, end_time,
                      unit="standard", limit=1000, timeout=30):
    """Request daily observations and return them as a data frame.

    Parameters
    ----------
    url : str
        Endpoint to query.
    token : str
        Access token, sent in the ``token`` request header.
    dataset : str
        Value for the ``datasetid`` query field.
    datatype : str or iterable of str
        One or more ``datatypeid`` values (repeated in the query string).
    station : str or iterable of str
        One or more ``stationid`` values (repeated in the query string).
    start_time, end_time
        Values for ``startdate`` / ``enddate``; converted with ``str``.
    unit : str, optional
        Value for the ``units`` query field.
    limit : int, optional
        Value for the ``limit`` query field; also the row count at which
        the truncation warning is printed.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    pandas.DataFrame or None
        One row per entry of the response's ``results`` list, with the
        ``date`` column converted to datetime. ``None`` is returned (after
        printing a message) when the request fails or the response holds
        no usable results.
    """
    # A bare string would otherwise be joined character by character below
    if isinstance(datatype, str):
        datatype = [datatype]
    if isinstance(station, str):
        station = [station]

    # parameter as string; repeated keys carry the multiple ids
    params = f"datasetid={str(dataset)}" + \
        f"&datatypeid={'&datatypeid='.join(datatype)}" + \
        f"&stationid={'&stationid='.join(station)}" + \
        f"&startdate={str(start_time)}&enddate={str(end_time)}" + \
        f"&units={str(unit)}" + \
        f"&limit={str(limit)}"

    # header
    header = {'token': token}

    # make request; network failures follow the same "print and return None"
    # convention as the data-handling failures further down
    try:
        r = requests.get(url, params=params, headers=header, timeout=timeout)
    except requests.exceptions.RequestException as err:
        print(f"Error importing data: request failed ({err!r})")
        return None
    print(f"Requesting {str(start_time)} to {str(end_time)} data. Status code: {str(r.status_code)}")

    # pull the records out of the json payload
    try:
        results = r.json()['results']
    except (ValueError, KeyError, TypeError) as err:
        # ValueError: body is not JSON; KeyError/TypeError: no 'results' entry
        print(f"Error importing data: no usable 'results' in response ({err!r})")
        return None
    if not results:
        print("Error importing data: response contained no results")
        return None

    # move data from json to data frame
    try:
        df = pd.DataFrame.from_dict(results)
        # Let pandas infer the layout instead of pinning '%Y-%m-%d', so
        # values that carry a time component still parse
        df['date'] = pd.to_datetime(df['date'])
    except (ValueError, KeyError, TypeError) as err:
        print(f"Error importing data: could not build data frame ({err!r})")
        return None

    if len(df) >= limit:
        print(f"CAUTION: Data count exceeds limit: {limit}. Consider selecting fewer data type or stations")
    else:
        print("Import completed")

    return df

In [3]:
# endpoint that serves the observations
base_url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data"

# data set and the measurements requested from it
dataset = "GHCND"
datatype = [
    'PRCP',
    'SNOW',
    'TAVG',
    'TMAX',
    'TMIN',
]

# dictionary to add city name after pulling weather data (station id -> city)
cities = {
    'GHCND:USW00094846': 'Chicago',      # Chicago ORD
    'GHCND:USW00094789': 'New York',     # NYC JFK
    'GHCND:USW00012916': 'New Orleans',  # New Orleans airport
}

# stations to query: the keys above, in the order they are written
locations = list(cities)

# time period (both years are requested)
start_year, end_year = 2010, 2019

### Request data and add to dataframe

In [4]:
# Collect one frame per month and combine them once after the loop.
# DataFrame.append was removed in pandas 2.0, and calling it inside the loop
# re-copied every previously fetched row on each iteration.
monthly_frames = []

# One API request per month
for year in range(start_year, end_year+1):
    for month in range(1,13):
        # beginning and end of month day string for API
        start_day = f"{year}-{month:02d}-01"
        end_day = f"{year}-{month:02d}-{calendar.monthrange(year,month)[1]}"
        
        # get daily weather data
        tmp_df = get_daily_weather(base_url, noaa_token, dataset, datatype, 
                                   locations, start_day, end_day)
        
        # get_daily_weather returns None when a month could not be imported;
        # skip it so the remaining months are still collected
        if tmp_df is not None:
            monthly_frames.append(tmp_df)
        
        # avoid exceeding 5 requests per second limit
        time.sleep(0.2)

# data frame holding every month that was imported (empty if none were)
if monthly_frames:
    df = pd.concat(monthly_frames, ignore_index=True)
else:
    df = pd.DataFrame()

Requesting 2010-01-01 to 2010-01-31 data. Status code: 200
Import completed
Requesting 2010-02-01 to 2010-02-28 data. Status code: 200
Import completed
Requesting 2010-03-01 to 2010-03-31 data. Status code: 200
Import completed
Requesting 2010-04-01 to 2010-04-30 data. Status code: 200
Import completed
Requesting 2010-05-01 to 2010-05-31 data. Status code: 200
Import completed
Requesting 2010-06-01 to 2010-06-30 data. Status code: 200
Import completed
Requesting 2010-07-01 to 2010-07-31 data. Status code: 200
Import completed
Requesting 2010-08-01 to 2010-08-31 data. Status code: 200
Import completed
Requesting 2010-09-01 to 2010-09-30 data. Status code: 200
Import completed
Requesting 2010-10-01 to 2010-10-31 data. Status code: 200
Import completed
Requesting 2010-11-01 to 2010-11-30 data. Status code: 200
Import completed
Requesting 2010-12-01 to 2010-12-31 data. Status code: 200
Import completed
Requesting 2011-01-01 to 2011-01-31 data. Status code: 200
Import completed
Requesting 2

In [5]:
# Preview data: row count, dtype of each column, then the first rows
print(
    f"Number of data points {len(df)}",
    "Data type for each column",
    df.dtypes,
    sep="\n",
)
df.head()

Number of data points 48320
Data type for each column
date          datetime64[ns]
datatype              object
station               object
attributes            object
value                float64
dtype: object


Unnamed: 0,date,datatype,station,attributes,value
0,2010-01-01,PRCP,GHCND:USW00012916,"T,,X,2400",0.0
1,2010-01-01,TMAX,GHCND:USW00012916,",,X,2400",54.0
2,2010-01-01,TMIN,GHCND:USW00012916,",,X,2400",42.0
3,2010-01-01,PRCP,GHCND:USW00094789,",,0,2400",0.04
4,2010-01-01,SNOW,GHCND:USW00094789,"T,,0,",0.0


### Rearrange data by date using pivot table

In [6]:
# pivot table: one row per station and date, one column per data type,
# with station and date turned back into ordinary columns
df_weather = (
    df
    .pivot_table(values='value', index=['station', 'date'], columns='datatype')
    .reset_index()
)

# add city name by looking up each row's station id in the cities dictionary
df_weather['city'] = df_weather['station'].map(cities)

# preview data
df_weather

datatype,station,date,PRCP,SNOW,TAVG,TMAX,TMIN,city
0,GHCND:USW00012916,2010-01-01,0.00,,,54.0,42.0,New Orleans
1,GHCND:USW00012916,2010-01-02,0.00,,,51.0,39.0,New Orleans
2,GHCND:USW00012916,2010-01-03,0.00,,,47.0,37.0,New Orleans
3,GHCND:USW00012916,2010-01-04,0.00,,,43.0,30.0,New Orleans
4,GHCND:USW00012916,2010-01-05,0.00,,,43.0,28.0,New Orleans
...,...,...,...,...,...,...,...,...
10951,GHCND:USW00094846,2019-12-27,0.00,0.0,37.0,37.0,28.0,Chicago
10952,GHCND:USW00094846,2019-12-28,0.28,0.0,34.0,48.0,27.0,Chicago
10953,GHCND:USW00094846,2019-12-29,0.93,0.0,52.0,57.0,47.0,Chicago
10954,GHCND:USW00094846,2019-12-30,0.09,0.7,38.0,47.0,27.0,Chicago


### Save data to csv

In [7]:
# write the pivoted weather table to disk, leaving out the row index
output_file = "daily_weather.csv"
df_weather.to_csv(output_file, index=False)