# Get the weather data

This script pulls daily weather observations from NOAA's API, formats them, and stores the result as a DataFrame in a pickle file for further analysis elsewhere.

In [8]:
#needed to make web requests
import requests
import pandas as pd
import json
import numpy as np
from datetime import datetime
import io

# Define a function to make scraping a bit easier

In [2]:
def get_noaa_data(year,station="GHCND:USW00094728",token=None):
    '''
    Fetch one calendar year of NOAA GHCND daily observations for a station.

    input: a year as an int or a string in YYYY format,
        a station id as a string (default to central park),
        and optionally an api token as a string (default to the token
        embedded below)
    output: a DataFrame indexed by DATE (datetime) with one column per
        observation datatype returned by the api, plus a STATION column
        holding the station id. If the api returns no records the frame
        is empty (no rows) but still has the DATE index and STATION column.
    raises: requests.HTTPError when the api answers with an error status
    '''

    #setup the request
    year = str(year)

    # NOTE(review): a credential is committed in source. Prefer passing
    # `token` explicitly and rotating this one.
    if token is None:
        token = 'ocUDAtCqvEcffGgKoCvSFLubTEJVWDMc'
    page_size = 1000
    base='https://www.ncdc.noaa.gov/cdo-web/api/v2/data?'
    datasetid='datasetid=GHCND'
    datatypeid='datatypeid=PRCP,SNWD,SNOW,TSUN,TAVG,AWND,WT09,WT14,WT07,WT01,WT17,WT06,WT05,WT02,WT11,WT22,WT04,WT13,WT16,WT08,WT18,WT03,WT19'
    limit='limit='+str(page_size)
    stationid='stationid='+station
    startdate='startdate='+year+'-01-01'
    enddate='enddate='+year+'-12-31'
    A='&'
    req = base+datasetid+A+datatypeid+A+limit+A+stationid+A+startdate+A+enddate

    #make the requests, one page at a time
    # A single response holds at most `page_size` records, and a year of
    # daily observations across several datatypes exceeds that, so keep
    # asking for the next page until a short (or empty) page comes back.
    # If the api counts offsets from 1 rather than 0, consecutive pages
    # overlap by one record; the pivot below keeps the first value per
    # (date, datatype), so such a repeat is harmless.
    records = []
    offset = 0
    while True:
        r = requests.get(req+A+'offset='+str(offset),headers={'token':token},timeout=60)
        # fail loudly on a bad token / rate limit / server error instead of
        # tripping over a missing 'results' key further down
        r.raise_for_status()
        #convert to json and keep only the relevant part
        page = r.json().get('results', [])
        records.extend(page)
        if len(page) < page_size:
            break
        offset += page_size

    # nothing reported for this station/year: hand back an empty frame
    # with the same index name and STATION column as the normal result
    if not records:
        empty = pd.DataFrame(index=pd.DatetimeIndex([], name='DATE'))
        empty['STATION'] = station
        return empty

    #read the data into a df
    _df = pd.DataFrame(records)
    #drop unneeded info
    _df = _df.drop(columns=['station', 'attributes'],errors='ignore')
    #set the index to date
    _df.set_index('date',inplace=True)
    _df.index.names = ['DATE']
    #format the date
    _df.index = pd.to_datetime(_df.index)

    #pivot the df: one row per date, one column per datatype
    df = _df.pivot_table(values='value', index=_df.index, columns='datatype', aggfunc='first')

    #add the station info
    df['STATION'] = station

    return df
    
    

In [3]:
#uncomment to test
# df = get_noaa_data(2019)
# df.head()

# Set up and make the API calls

In [4]:
# First and last year (inclusive) to download
start = 1950
stop = 2019

# Collect one frame per year and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and it copied the whole
# accumulated frame on every iteration.
frames = [get_noaa_data(start)]
for year in range(start+1,stop+1):
    print("getting data for",year)
    _df = get_noaa_data(year)
    frames.append(_df)

# sort=False keeps columns in first-seen order; years that lack a
# datatype get NaN in that column
df = pd.concat(frames, ignore_index=False, sort=False)

getting data for 1951
getting data for 1952
getting data for 1953
getting data for 1954
getting data for 1955
getting data for 1956
getting data for 1957
getting data for 1958
getting data for 1959
getting data for 1960
getting data for 1961
getting data for 1962
getting data for 1963
getting data for 1964
getting data for 1965
getting data for 1966
getting data for 1967
getting data for 1968
getting data for 1969
getting data for 1970
getting data for 1971
getting data for 1972
getting data for 1973
getting data for 1974
getting data for 1975
getting data for 1976
getting data for 1977
getting data for 1978
getting data for 1979
getting data for 1980
getting data for 1981
getting data for 1982
getting data for 1983
getting data for 1984
getting data for 1985
getting data for 1986
getting data for 1987
getting data for 1988
getting data for 1989
getting data for 1990
getting data for 1991
getting data for 1992
getting data for 1993
getting data for 1994
getting data for 1995
getting da

In [5]:
#df

Unnamed: 0_level_0,PRCP,SNOW,SNWD,STATION,WT03,WT16,WT18,WT04,WT06,TSUN,...,WT01,WT02,WT08,WT13,WT19,WT22,TAVG,WT11,WT07,WT09
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1950-01-01,0.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-02,5.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-03,13.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-04,0.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-05,0.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-06,5.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-07,58.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-08,0.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-09,0.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-10,117.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,


In [6]:
#uncomment to check the columns included
# cols = list(df.columns)
# cols

In [7]:
# Write the combined weather frame to a pickle file in the project's data/ folder
basedir = '/Users/bono/demo_projects/weather_stocks/'
file = ''.join([basedir, 'data/df_weather.pkl'])
df.to_pickle(file)

In [9]:
# Show the combined DataFrame as the cell output
df

Unnamed: 0_level_0,PRCP,SNOW,SNWD,STATION,WT03,WT16,WT18,WT04,WT06,TSUN,...,WT01,WT02,WT08,WT13,WT19,WT22,TAVG,WT11,WT07,WT09
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1950-01-01,0.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-02,5.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-03,13.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-04,0.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-05,0.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-06,5.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-07,58.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-08,0.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-09,0.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
1950-01-10,117.0,0.0,0.0,GHCND:USW00094728,,,,,,,...,,,,,,,,,,
