# Análise de dados de temperatura coletados de uma API

Fonte dos dados: [https://www.ncdc.noaa.gov/cdo-web/token](https://www.ncdc.noaa.gov/cdo-web/token)

Bibliotecas importadas

In [2]:
import requests
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 

plt.style.use('seaborn')

In [3]:
def make_request(endpoint, payload=None):
    """
    Send a GET request to the NCDC (NOAA) climate data web API.

    Parameters:
        - endpoint: Path appended to the API base URL
          (for example 'datasets' or 'locations').
        - payload: Optional dictionary passed to `requests.get`
          as the query-string parameters.

    Returns:
        The response object returned by `requests.get`.
    """
    # The API token is read from a local file on every call.
    with open('../../TOKENS/ncdc_token.txt') as f:
        # Strip surrounding whitespace: a trailing newline left in the
        # token file would otherwise end up inside the HTTP header value.
        ncdc_tk = f.read().strip()
    return requests.get(
        f'https://www.ncdc.noaa.gov/cdo-web/api/v2/{endpoint}',
        headers={'token': ncdc_tk},
        params=payload
    )

In [4]:
response = make_request('datasets', {'startdate': '2018-10-01'})

In [5]:
response.status_code

200

In [6]:
response.ok

True

Uma vez obtida a resposta, o método `json` pode ser utilizado para obter o dicionário dos dados, aqui armazenado na variável `payload`:

In [7]:
payload = response.json()
payload.keys()

dict_keys(['metadata', 'results'])

In [8]:
payload['metadata']

{'resultset': {'offset': 1, 'count': 11, 'limit': 25}}

A chave `results` contém uma lista de dicionários. Por exemplo, o primeiro desses dicionários contém, por sua vez, as seguintes chaves:

In [9]:
payload['results'][0].keys()

dict_keys(['uid', 'mindate', 'maxdate', 'name', 'datacoverage', 'id'])

Do resultado acima, ficamos com os campos referentes ao ID e ao nome dos _datasets_ (conjuntos de dados):

In [10]:
[(data['id'], data['name']) for data in payload['results']]

[('GHCND', 'Daily Summaries'),
 ('GSOM', 'Global Summary of the Month'),
 ('GSOY', 'Global Summary of the Year'),
 ('NEXRAD2', 'Weather Radar (Level II)'),
 ('NEXRAD3', 'Weather Radar (Level III)'),
 ('NORMAL_ANN', 'Normals Annual/Seasonal'),
 ('NORMAL_DLY', 'Normals Daily'),
 ('NORMAL_HLY', 'Normals Hourly'),
 ('NORMAL_MLY', 'Normals Monthly'),
 ('PRECIP_15', 'Precipitation 15 Minute'),
 ('PRECIP_HLY', 'Precipitation Hourly')]

O primeiro elemento, referente aos resumos diários (**Daily Summaries**), contém os dados de interesse.

In [11]:
response = make_request(
    'datacategories', payload={'datasetid': 'GHCND'}
)

response.status_code

200

In [13]:
response.json()['results']

[{'name': 'Evaporation', 'id': 'EVAP'},
 {'name': 'Land', 'id': 'LAND'},
 {'name': 'Precipitation', 'id': 'PRCP'},
 {'name': 'Sky cover & clouds', 'id': 'SKY'},
 {'name': 'Sunshine', 'id': 'SUN'},
 {'name': 'Air Temperature', 'id': 'TEMP'},
 {'name': 'Water', 'id': 'WATER'},
 {'name': 'Wind', 'id': 'WIND'},
 {'name': 'Weather Type', 'id': 'WXTYPE'}]

In [15]:
response = make_request(
    'datatypes',
    payload={'datacategoryid': 'TEMP', 'limit': 100}
)
response.status_code

200

In [17]:
[(datatype['id'], datatype['name']) for datatype in response.json()['results']]

[('CDSD', 'Cooling Degree Days Season to Date'),
 ('DATN',
  'Number of days included in the multiday minimum temperature (MDTN)'),
 ('DATX',
  'Number of days included in the multiday maximum temperature (MDTX)'),
 ('DLY-DUTR-NORMAL', 'Long-term averages of daily diurnal temperature range'),
 ('DLY-DUTR-STDDEV',
  'Long-term standard deviations of daily diurnal temperature range'),
 ('DLY-TAVG-NORMAL', 'Long-term averages of daily average temperature'),
 ('DLY-TAVG-STDDEV',
  'Long-term standard deviations of daily average temperature'),
 ('DLY-TMAX-NORMAL', 'Long-term averages of daily maximum temperature'),
 ('DLY-TMAX-STDDEV',
  'Long-term standard deviations of daily maximum temperature'),
 ('DLY-TMIN-NORMAL', 'Long-term averages of daily minimum temperature'),
 ('DLY-TMIN-STDDEV',
  'Long-term standard deviations of daily minimum temperature'),
 ('EMNT', 'Extreme minimum temperature for the period.'),
 ('EMXT', 'Extreme maximum temperature for the period.'),
 ('HDSD', 'Heating De

In [23]:
response = make_request(
    'locationcategories', payload={'datasetid': 'GHCND'}
)

response.status_code

200

In [19]:
import pprint

In [24]:
pprint.pprint(response.json())

{'metadata': {'resultset': {'count': 12, 'limit': 25, 'offset': 1}},
 'results': [{'id': 'CITY', 'name': 'City'},
             {'id': 'CLIM_DIV', 'name': 'Climate Division'},
             {'id': 'CLIM_REG', 'name': 'Climate Region'},
             {'id': 'CNTRY', 'name': 'Country'},
             {'id': 'CNTY', 'name': 'County'},
             {'id': 'HYD_ACC', 'name': 'Hydrologic Accounting Unit'},
             {'id': 'HYD_CAT', 'name': 'Hydrologic Cataloging Unit'},
             {'id': 'HYD_REG', 'name': 'Hydrologic Region'},
             {'id': 'HYD_SUB', 'name': 'Hydrologic Subregion'},
             {'id': 'ST', 'name': 'State'},
             {'id': 'US_TERR', 'name': 'US Territory'},
             {'id': 'ZIP', 'name': 'Zip Code'}]}


In [28]:
def get_item(name, what, endpoint, start=1, end=None):
    """
    Look up an item by name using a binary search over the API results.

    Each step requests a single result (sorted by name) at the midpoint
    offset of the current window and compares its name with `name`.

    Parameters:
        - name: The item to look for (case-insensitive; a match is
          any result whose name contains `name`).
        - what: Dictionary of extra query filters specifying what
          kind of item `name` is.
        - endpoint: Where to look for the item.
        - start: The position to start at. Callers do not need to
          set this; the function updates it through recursion.
        - end: The last position of the items. Used to find the
          midpoint; like `start`, callers do not need to set it.

    Returns:
        Dictionary of the information for the item if found,
        otherwise an empty dictionary (also when the request fails
        or the response carries no results).
    """
    # Find the midpoint to cut the data in half each time. On the first
    # call `end` is still unknown and is treated as 1.
    mid = (start + (1 if end is None else end)) // 2
    if mid < 1:
        # The window moved left of the first position (the metadata
        # reports offsets starting at 1), so there is nothing left to
        # search. Without this guard an `end` of 0 would be mistaken for
        # "unknown" and the same request would repeat forever.
        return {}

    # Lowercase the name so the search is not case-sensitive
    name = name.lower()

    # Base query: one result at the midpoint offset, sorted by name
    query = {
        'datasetid': 'GHCND',
        'sortfield': 'name',
        'offset': mid,  # changes on every step
        'limit': 1  # we only want one value back
    }
    # Make the request, adding the additional filters from `what`
    response = make_request(endpoint, {**query, **what})

    if not response.ok:
        # Response wasn't ok; report the status code so the caller can
        # find out why
        print(f'Response not OK. status: {response.status_code}')
        return {}

    data = response.json()
    results = data.get('results')
    if not results:
        # Nothing came back for this query, so there is nothing to compare
        return {}

    if end is None:
        # First time through: take the number of items from the metadata
        end = data['metadata']['resultset']['count']

    candidate = results[0]
    current_name = candidate['name'].lower()

    # If what we are searching for is in the current name, we found it
    if name in current_name:
        return candidate
    if start >= end:
        # The window cannot shrink any further: not found
        return {}
    if name < current_name:
        # name comes before the current name in the alphabet
        # => search further to the left
        return get_item(name, what, endpoint, start, mid - 1)
    # name comes after the current name in the alphabet
    # => search further to the right
    return get_item(name, what, endpoint, mid + 1, end)

Download dos dados

In [29]:
nyc = get_item(
    'New York', {'locationcategoryid': 'CITY'}, 'locations'
)
nyc

{'mindate': '1869-01-01',
 'maxdate': '2022-01-26',
 'name': 'New York, NY US',
 'datacoverage': 1,
 'id': 'CITY:US360019'}

In [31]:
# Look up the Central Park station, restricting the search to stations in
# New York City. `nyc['id']` is a location id, so it is sent as
# `locationid` (the same filter key used for the TAVG check further down).
central_park = get_item(
    'NY City Central Park',
    {'locationid': nyc['id']}, 'stations'
)

central_park

{'elevation': 42.7,
 'mindate': '1869-01-01',
 'maxdate': '2022-01-25',
 'latitude': 40.77898,
 'name': 'NY CITY CENTRAL PARK, NY US',
 'datacoverage': 1,
 'id': 'GHCND:USW00094728',
 'elevationUnit': 'METERS',
 'longitude': -73.96925}

In [33]:
response = make_request(
    'data',
    {
        'datasetid': 'GHCND',
        'stationid': central_park['id'],
        'locationid': nyc['id'],
        'startdate': '2018-10-01',
        'enddate': '2018-10-31',
        'datatypeid': ['TAVG', 'TMAX', 'TMIN'],
        'units': 'metric',
        'limit': 1000
    }
)
response.status_code

200

In [34]:
df = pd.DataFrame(response.json()['results'])
df.head()

Unnamed: 0,date,datatype,station,attributes,value
0,2018-10-01T00:00:00,TMAX,GHCND:USW00094728,",,W,2400",24.4
1,2018-10-01T00:00:00,TMIN,GHCND:USW00094728,",,W,2400",17.2
2,2018-10-02T00:00:00,TMAX,GHCND:USW00094728,",,W,2400",25.0
3,2018-10-02T00:00:00,TMIN,GHCND:USW00094728,",,W,2400",18.3
4,2018-10-03T00:00:00,TMAX,GHCND:USW00094728,",,W,2400",23.3


In [35]:
df.datatype.unique()

array(['TMAX', 'TMIN'], dtype=object)

In [36]:
if get_item(
    'NY City Central Park',
    {'locationid': nyc['id'], 'datatypeid': 'TAVG'},
    'stations'
):
    print('Found!')

Found!
