# Crawl data from [NOAA](https://www.ncei.noaa.gov/support/access-data-service-api-user-documentation)


In [3]:
%pip install requests

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.



In [4]:
import os
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests

In [5]:
import operator

def get_stations_by_bounding_box(gps_n, gps_w, gps_s, gps_e, verbose, limit=10, offset=0, timeout=30):
    """Search the NOAA NCEI "daily-summaries" dataset for stations inside a bounding box.

    Args:
        gps_n, gps_w, gps_s, gps_e: Bounding-box values, sent to the API in this
            order as the ``bbox`` query parameter.
        verbose: When truthy, print the request URL and one line per station found.
        limit: Maximum number of search results requested. Defaults to 10, the
            value that used to be hard-coded.
        offset: Index of the first search result requested. Defaults to 0; raise
            it in steps of ``limit`` to page through larger result sets.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``[station_id, station_name, latitude, longitude, startDate,
        endDate]`` lists, or ``None`` when the request itself fails, the server
        answers with a non-200 status other than 429, or the response is empty.

    Raises:
        Exception: On HTTP 429, or when an expected key is missing from the
            response.
    """
    url = "https://www.ncei.noaa.gov/access/services/search/v1/data?dataset=" + "daily-summaries"
    url += "&bbox=" + ",".join(str(edge) for edge in (gps_n, gps_w, gps_s, gps_e))
    url += "&dataTypes=TMIN,TMAX,PRCP"
    url += "&limit=" + str(limit) + "&offset=" + str(offset)
    if verbose:
        print(url)

    try:
        # Without a timeout a stalled connection would block the notebook forever.
        req = requests.get(url, timeout=timeout)
    except Exception as e:
        print('ERROR: ' + repr(e), ' get_stations_by_bounding_box()', url)
        return None

    if req.status_code == 429:
        # Rate limiting is raised (not just printed) so the caller stops hammering the API.
        raise Exception('ERROR: HTTP 429 Too Many Requests! rate limiting.  resp.status_code = '
                        + str(req.status_code) + ' ' + str(req.text))
    if req.status_code != 200:
        print('\tERROR: resp.status_code = ', str(req.status_code), req.text)
        return None

    resp = req.json()
    if not resp:
        print("\tNo results from query ", url)
        return None

    if 'results' not in resp:
        raise Exception("ERROR: key 'results' not in response")

    # Keys every search result must carry, checked in this order.
    required_result_keys = ('endDate', 'startDate', 'centroid', 'name', 'location',
                            'id', 'dataTypesCount', 'boundingPoints', 'stations')

    station_data = []
    for result in resp['results']:
        for key in required_result_keys:
            if key not in result:
                raise Exception("ERROR: key '" + key + "' not in response (result)")
            if key == 'centroid' and 'point' not in result['centroid']:
                raise Exception("ERROR: key 'point' not in response (result['centroid'])")

        # The centroid point is read as [longitude, latitude].
        longitude = float(result['centroid']['point'][0])
        latitude = float(result['centroid']['point'][1])

        for station in result['stations']:
            for key in ('name', 'id'):
                if key not in station:
                    raise Exception("ERROR: key '" + key + "' not in response (stations)")
            if verbose:
                print(station['id'], station['name'], "\t", latitude, longitude, "\t",
                      result['startDate'], "\t", result['endDate'])
            station_data.append([station['id'], station['name'], latitude, longitude,
                                 result['startDate'], result['endDate']])

    return station_data

In [6]:
# Coordinates previously left here as a bare expression (it had no effect):
# 20.102365, 101.947632, 23.074678, 107.484741
stations = get_stations_by_bounding_box(23.440, 102.337, 19.465, 107.999, False)

# The search returns None on request/HTTP errors; stop here instead of failing
# later with an unrelated TypeError when the list is iterated.
if stations is None:
    raise RuntimeError("Station search returned no data; see the messages above and re-run this cell.")

In [7]:
stations

[['LAW00041062',
  'PHU CUM LS 50, LA',
  19.9,
  103.13333,
  '1967-08-25T00:00:00',
  '1968-12-30T23:59:59'],
 ['CHM00056985',
  'MENGZI, CH',
  23.383,
  103.383,
  '1945-03-01T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048806',
  'SON LA, VM',
  21.333,
  103.9,
  '1997-03-10T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048808',
  'CAO BANG, VM',
  22.667,
  106.25,
  '1959-05-09T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048825',
  'HA DONG, VM',
  20.967,
  105.767,
  '2012-09-04T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048820',
  'NOIBAI INTERNATIONAL, VM',
  21.221,
  105.807,
  '1958-12-18T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048826',
  'PHU LIEN, VM',
  20.8,
  106.633,
  '1974-09-24T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048830',
  'LANG SON, VM',
  21.833,
  106.767,
  '1974-09-26T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048840',
  'THANH HOA, VM',
  19.75,
  105.783,
  '1974-09-24T00:00:00',
  '2024-11-30T23:59:59'],
 ['CHM00059417',
  'LONGZHOU, 

Keep only the stations located in Viet Nam

In [8]:
print(type(stations))

# Keep the entries whose station id (first element) contains 'VMM';
# presumably this marks the Vietnamese stations - confirm against the id scheme.
filter_stations = list(filter(lambda entry: 'VMM' in entry[0], stations))
filter_stations

<class 'list'>


[['VMM00048806',
  'SON LA, VM',
  21.333,
  103.9,
  '1997-03-10T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048808',
  'CAO BANG, VM',
  22.667,
  106.25,
  '1959-05-09T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048825',
  'HA DONG, VM',
  20.967,
  105.767,
  '2012-09-04T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048820',
  'NOIBAI INTERNATIONAL, VM',
  21.221,
  105.807,
  '1958-12-18T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048826',
  'PHU LIEN, VM',
  20.8,
  106.633,
  '1974-09-24T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048830',
  'LANG SON, VM',
  21.833,
  106.767,
  '1974-09-26T00:00:00',
  '2024-11-30T23:59:59'],
 ['VMM00048840',
  'THANH HOA, VM',
  19.75,
  105.783,
  '1974-09-24T00:00:00',
  '2024-11-30T23:59:59']]

Use the API to download climate data for each station

In [9]:
import requests
from datetime import datetime
from dateutil.relativedelta import relativedelta
import calendar

def get_data_from_station(station_id, startdate, enddate, verbose, max_retries=3, backoff_seconds=2.0, timeout=60):
    """Download daily TAVG/TMIN/TMAX/PRCP records for one station as a DataFrame.

    Args:
        station_id: Station identifier. Anything up to and including the first
            ':' is dropped before the request is built.
        startdate, enddate: Start and end of the requested period. The part from
            the first 'T' onwards is dropped before the request is built.
        verbose: When truthy, print a line each time a request is retried.
        max_retries: How many times an HTTP 429 or 503 answer is retried before
            giving up.
        backoff_seconds: Wait before the first retry; doubled on each further retry.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with the columns DATE, STATION, TMAX, TMIN, TAVG and PRCP
        (rows carrying none of the four measurements are skipped), or ``None``
        when the request fails or the server keeps answering with an error status.

    Raises:
        Exception: When a returned record lacks the 'DATE' or 'STATION' key.
    """
    print(station_id)

    # Normalise the inputs: str() guards the URL concatenation below against
    # non-string arguments.
    station_id = str(station_id)
    if ':' in station_id:
        station_id = station_id.partition(':')[2]
    startdate = str(startdate).partition('T')[0]
    enddate = str(enddate).partition('T')[0]

    data_type = "daily-summaries"

    url = "https://www.ncei.noaa.gov/access/services/data/v1"
    url += "?dataset=" + data_type
    url += "&dataTypes=TAVG,TMIN,TMAX,PRCP"
    url += "&stations=" + station_id
    url += "&startDate=" + startdate
    url += "&endDate=" + enddate
    url += "&format=json&units=standard&includeAttributes=false"

    headers = {'Sec-Ch-Ua-Platform': 'Windows',
               'Sec-Ch-Ua-Platform-Version': '10.0.0',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
               'accept-language': 'en-US,en;q=0.9'}

    # Credentials must not live in the notebook: the token is taken from the
    # NOAA_API_TOKEN environment variable and left out when it is not set.
    api_token = os.environ.get("NOAA_API_TOKEN")
    if api_token:
        headers['token'] = api_token

    retries_done = 0
    while True:
        try:
            # Without a timeout a stalled connection would block the notebook forever.
            req = requests.get(url, headers=headers, timeout=timeout)
        except Exception as e:
            print('ERROR: ' + repr(e), ' get_data_from_station()', url)
            return None

        if req.status_code == 200:
            break

        # 429 and 503 are transient, so wait and try again before giving up.
        if req.status_code in (429, 503) and retries_done < max_retries:
            delay = backoff_seconds * (2 ** retries_done)
            retries_done += 1
            if verbose:
                print("\tHTTP", req.status_code, "- retry", retries_done, "of", max_retries, "in", delay, "s")
            time.sleep(delay)
            continue

        if req.status_code == 429:
            print("ERROR: HTTP 429 Too Many Requests! rate limiting.  resp.status_code = ", str(req.status_code), req.text, " get_data_from_station()")
        elif req.status_code == 503:
            print("ERROR: HTTP 503 Service Unavailable.  Server is temporarily unavailable due to maintenance or capacity issues.  get_data_from_station()")
        else:
            print('\tERROR: resp.status_code = ', str(req.status_code), req.text, "get_data_from_station()")
        return None

    resp = req.json()

    measurement_keys = ("TMAX", "TMIN", "TAVG", "PRCP")
    data = {"DATE": [], "STATION": [], "TMAX": [], "TMIN": [], "TAVG": [], "PRCP": []}

    for line in resp:
        if "DATE" not in line:
            raise Exception("ERROR: key 'DATE' expected but not found. " + str(line))
        if "STATION" not in line:
            raise Exception("ERROR: key 'STATION' expected but not found. " + str(line))

        # A record without any of the four measurements carries no information.
        if not any(key in line for key in measurement_keys):
            continue

        data['STATION'].append(line['STATION'])
        data['DATE'].append(line['DATE'])
        for key in measurement_keys:
            data[key].append(line.get(key, None))

    return pd.DataFrame(data)
    

    

In [10]:
# Show the id and the available date range of every station before downloading.
for station in filter_stations:
    print(station[0])
    print(station[4], station[5])

# Download station by station so that failures can be reported by id:
# pd.concat silently drops None entries, which would otherwise hide a station
# whose download failed.
station_frames = []
failed_station_ids = []
for station in filter_stations:
    station_frame = get_data_from_station(station[0], station[4], station[5], False)
    if station_frame is None:
        failed_station_ids.append(station[0])
    else:
        station_frames.append(station_frame)

if failed_station_ids:
    print("WARNING: no data downloaded for stations:", ", ".join(failed_station_ids))

df = pd.concat(station_frames)



VMM00048806
1997-03-10T00:00:00 2024-11-30T23:59:59
VMM00048808
1959-05-09T00:00:00 2024-11-30T23:59:59
VMM00048825
2012-09-04T00:00:00 2024-11-30T23:59:59
VMM00048820
1958-12-18T00:00:00 2024-11-30T23:59:59
VMM00048826
1974-09-24T00:00:00 2024-11-30T23:59:59
VMM00048830
1974-09-26T00:00:00 2024-11-30T23:59:59
VMM00048840
1974-09-24T00:00:00 2024-11-30T23:59:59
VMM00048806
VMM00048808
VMM00048825
VMM00048820
VMM00048826
ERROR: HTTP 503 Service Unavailable.  Server is temporarily unavailable due to maintenance or capacity issues.  get_data_from_station()
VMM00048830
VMM00048840


In [11]:
# Create the output folder first: to_csv fails when the target directory does not exist.
output_path = Path('data/crawled_data_northern_vietnam.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, encoding='utf-8')