##### The data scraping process is time-intensive. The code is shown here for reference, but all of the scraped data is already saved as CSV files in the data folder, so the reader may skip straight to the data analysis notebook with those files in hand. To run the scraping code yourself, you must first request an API token here: https://www.ncdc.noaa.gov/cdo-web/token



In [1]:
import requests
import pandas as pd
import datetime
from IPython import display
import time

In [15]:
# API token sent in the `token` header of every request made by `make_request`.
# Replace the placeholder text with your own token before running the scraping cells.
token = "{Insert token here}"

In [3]:
def make_request(endpoint, payload = None):
    """
    Send a GET request to one endpoint of the NOAA CDO web API (v2).

    Parameters:
        endpoint: Path appended to the API base URL, e.g. 'stations' or 'data'.
        payload: Optional dictionary of query-string parameters.

    Returns:
        The `requests.Response` for the call. The status is not checked here;
        callers inspect `status_code` / `ok` themselves.
    """
    url = f'https://www.ncei.noaa.gov/cdo-web/api/v2/{endpoint}'
    # The module-level `token` is looked up at call time and sent as a header.
    auth_header = {'token': token}
    return requests.get(url, headers=auth_header, params=payload)

In [17]:
# State for the station-download loop: `results` accumulates the returned
# records, `i` counts the pages fetched so far and `offset` is the offset
# sent with the next request.
results = []
i = offset = 0

In [None]:
# Download the GHCND station list in 126 pages of up to 1000 records each.
# A page counts as done only after a 200 response; on any other status the
# same offset is requested again after a 60 second pause.
while i < 126:
    response = make_request(
        'stations',
        {'limit': 1000, 'offset': offset, 'datasetid': 'GHCND'},
    )
    if response.status_code != 200:
        print (f'Request failed with status code: {response.status_code}.')
        time.sleep(60)
        continue
    results.extend(response.json()['results'])
    i += 1
    offset = 1000 * i


In [None]:
# Turn the accumulated list of station records into a table.
df2 = pd.DataFrame(data=results)

In [None]:
# Persist the station table so the download does not have to be repeated.
df2.to_csv(path_or_buf='stations.csv')

##### Next we will scrape data from the Global Historical Climatology Network Daily (GHCND) dataset from 2004 through 2023 for Chicago.

In [4]:
# Collect GHCND observations for Chicago (CITY:US170006), one request per
# calendar day, appending every returned record to `results`.
# NOTE(review): the heading above describes 2004 through 2023, but `end`
# below limits this run to 2004-01-01 .. 2004-01-09 (`end` is exclusive).
# Widen `end` to reproduce the full scrape.
current = datetime.date(2004,1,1)
end = datetime.date(2004,1,10)
results = []

while current < end:
    display.clear_output(wait = True)
    display.display(f'Gathering data for {str(current)}')

    response = make_request('data',
                            {'datasetid': 'GHCND',
                             'locationid': 'CITY:US170006',
                             'startdate': current,
                             'enddate': current,
                             'units': 'metric',
                             'limit':1000
                             })
    if response.ok:
        # A successful response may carry no 'results' key (the DeKalb cell
        # further down guards against an empty JSON body) or may not be JSON
        # at all. Treat both as "no records for this day" rather than letting
        # a KeyError/ValueError abort the whole scrape.
        try:
            day_records = response.json().get('results', [])
        except ValueError:
            day_records = []
        results.extend(day_records)
    # Days whose request failed are skipped, not retried.
    current += datetime.timedelta(days = 1)

'Gathering data for 2004-01-09'

In [5]:
# Convert the scraped records to a table and add `date2`, a parsed datetime
# version of the `date` column, for date-based filtering.
results = pd.DataFrame(data=results)
results['date2'] = pd.to_datetime(results['date'])

In [None]:
# Split the table at the 2013/2014 boundary. Both queries are evaluated
# against the full table before either name is rebound: `results2` gets the
# rows after 2013-12-31 and `results` keeps the rows before 2014-01-01.
results2, results = (
    results.query('date2 > "2013-12-31"'),
    results.query('date2 < "2014-01-01"'),
)

In [None]:
# Write the two halves of the scraped data to separate CSV files.
results.to_csv(path_or_buf='results.csv')
results2.to_csv(path_or_buf='results2.csv')

##### Gather information about datatypes in the dataset so we can match their descriptions with the abbreviations in the GHCND data we scraped above.

In [None]:
# Download the datatype catalogue in 3 pages of up to 1000 records each.
# A page counts as done only after a 200 response; on any other status the
# same offset is requested again after a 60 second pause.
i, offset = 0, 0
datatypes = []
while i < 3:
    response = make_request('datatypes', {'limit': 1000, 'offset': offset})
    if response.status_code != 200:
        print (f'Request failed with status code: {response.status_code}.')
        time.sleep(60)
        continue
    datatypes.extend(response.json()['results'])
    i += 1
    offset = 1000 * i

In [5]:
# Tabulate the datatype records and save them next to the other CSV files.
datatypes = pd.DataFrame(data=datatypes)
datatypes.to_csv(path_or_buf='datatypes.csv')

##### Because many of the listed stations record only some types of measurements, we scrape data for all stations in Illinois (FIPS: 17) over a 10-day period to see which stations have the types of data we're looking for: temperature and pressure.

In [4]:
# Collect GHCND observations for every Illinois station (FIPS:17), one
# request per calendar day from 2008-01-01 up to, but not including, `end`.
# NOTE(review): each request asks for at most 1000 rows and no further pages
# are fetched, so a day with more rows than that would be cut short. Confirm
# whether a whole-state day can exceed the limit.
current = datetime.date(2008,1,1)
end = datetime.date(2008,1,10)
results = []

while current < end:
    display.clear_output(wait = True)
    display.display(f'Gathering data for {str(current)}')

    response = make_request('data',
                            {'datasetid': 'GHCND',
                             'locationid': 'FIPS:17', #Illinois
                             'startdate': current,
                             'enddate': current,
                             'units': 'metric',
                             'limit':1000
                             })
    if response.ok:
        # A successful response may carry no 'results' key (the DeKalb cell
        # further down guards against an empty JSON body) or may not be JSON
        # at all. Treat both as "no records for this day" rather than letting
        # a KeyError/ValueError abort the whole scrape.
        try:
            day_records = response.json().get('results', [])
        except ValueError:
            day_records = []
        results.extend(day_records)
    # Days whose request failed are skipped, not retried.
    current += datetime.timedelta(days = 1)

'Gathering data for 2008-01-09'

In [None]:
# Download the GHCND stations located in Illinois (FIPS:17) in 4 pages of up
# to 1000 records each. A page counts as done only after a 200 response; on
# any other status the same offset is requested again after a 60 second pause.
results2 = []
i, offset = 0, 0
while i < 4:
    response = make_request(
        'stations',
        {
            'limit': 1000,
            'offset': offset,
            'datasetid': 'GHCND',
            'locationid': 'FIPS:17',
        },
    )
    if response.status_code != 200:
        print (f'Request failed with status code: {response.status_code}.')
        time.sleep(60)
        continue
    results2.extend(response.json()['results'])
    i += 1
    offset = 1000 * i


In [16]:
# Attach station metadata to each observation: rows from `results` are
# matched on their `station` value against the `id` of the station records
# in `results2` (default inner join, so unmatched rows are dropped).
All_Illinois_stations = pd.merge(
    pd.DataFrame(results),
    pd.DataFrame(results2),
    left_on='station',
    right_on='id',
)

In [18]:
# Save the merged observation/station table.
All_Illinois_stations.to_csv(path_or_buf='All_Illinois_stations.csv')

In [None]:
#Scrape again for DeKalb results
# Collect GHCND observations for the single station GHCND:USC00112223, one
# request per calendar day from 2004-01-01 up to, but not including, `end`.
current = datetime.date(2004,1,1)
end = datetime.date(2024,1,1)
results = []

while current < end:
    display.clear_output(wait = True)
    display.display(f'Gathering data for {str(current)}')

    response = make_request('data',
                            {'datasetid': 'GHCND',
                             'stationid': 'GHCND:USC00112223',
                             'startdate': current,
                             'enddate': current,
                             'units': 'metric',
                             'limit':1000
                             })
    if response.ok:
        # Parse the body once. An unparseable body, an empty JSON object or
        # a body without a 'results' key all mean "no records for this day".
        try:
            day_records = response.json().get('results', [])
        except ValueError:
            day_records = []
        results.extend(day_records)
        time.sleep(0.5)  # brief pause between successful requests
    # Advance exactly one day per iteration. Previously the empty-response
    # and ValueError paths advanced the date a second time, so the day after
    # every empty day was never requested.
    current += datetime.timedelta(days = 1)

In [31]:
# Convert the list of DeKalb records into a table.
results = pd.DataFrame(data=results)

In [32]:
# Save the DeKalb station data.
results.to_csv(path_or_buf='DeKalb.csv')