# Crawl data from [NOAA](https://www.ncei.noaa.gov/support/access-data-service-api-user-documentation)


In [21]:
%pip install requests

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [22]:
import pandas as pd
import numpy as np

In [23]:
import requests

In [24]:
import math

def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Returns the distance in meters between the two GPS coordinates in decimal degrees.

    Usage:
        dist_m = haversine_distance(40.440363, -76.126746, 40.440406, -76.121293)
        print("dist_m: ", dist_m)
        # Result is 461.5 m.  The correct result is 461.5 m. 

    """
    # Radius of the Earth in meters
    earth_radius_m = 6371000.0
    
    # Convert latitude and longitude from degrees to radians
    lat1_rad = math.radians(lat1)
    lon1_rad = math.radians(lon1)
    lat2_rad = math.radians(lat2)
    lon2_rad = math.radians(lon2)
    
    # Differences between the latitudes and longitudes
    dlat = lat2_rad - lat1_rad
    dlon = lon2_rad - lon1_rad
    
    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(dlon / 2)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    
    # Calculate the distance
    distance = earth_radius_m * c
    
    return distance

In [25]:

def elevation_by_lat_lon(lat1, lon1):
    """
    Returns the elevation in meters for the specified latitude/longitude, or None
    if the service could not determine the elevation. 

    Uses Open-Elevation.

    Open-Elevation
    website: https://open-elevation.com/
    docs: https://github.com/Jorl17/open-elevation/blob/master/docs/api.md
    endpoint: curl 'https://api.open-elevation.com/api/v1/lookup?locations=10,10|20,20|41.161758,-8.583933'

    Usage:
        elevation_m = elevation_by_lat_lon(38.5288, -78.4383)
        print('Elevation for 38.5288, -78.4383: ', elevation_m, 'meters')
    """

    url = 'https://api.open-elevation.com/api/v1/lookup?locations=' + str(lat1) + ',' + str(lon1)

    try:
        resp = requests.get(url).json() 
    except Exception as e:
        print('ERROR: ' + repr(e), url)

    if not resp:
        return 0

    #print(json.dumps(data, sort_keys=True, indent=2))
    if not 'results' in resp.keys(): 
        print(json.dumps(resp, sort_keys=True, indent=2))
        return None
    results = resp['results'][0]
    if not 'elevation' in results.keys(): return None
    elevation = float(results['elevation'])
    return elevation


In [26]:
import operator

def get_stations_by_bounding_box(lat, lon, gps_n, gps_w, gps_s, gps_e, verbose):
    """
    Using the latest API endpoint

    Returns a list of stations along with their GPS coordinates and start/end dates,
    and the computed distance and elevation delta from the target location specified
    by lat,lon that include the dataTypes = TMIN,TMAX,PRCP.  

    Usage:

    stations = get_stations_by_bounding_box(40.44077631612291, -76.12267163754181, False)
    if stations is None:
        print("NO results")
    else:
        print("The best station choice is: ", stations[0][3], "from ", stations[0][6], " to ", stations[0][7])
        # All data in stations:
        print("sort, delta_elevation_m, delta_distance_m, ID, latitude, longitude, dateStart, dateEnd")
        for station in stations:
            print(station)
    """

    url = "https://www.ncei.noaa.gov/access/services/search/v1/data?dataset=" + "daily-summaries"
    url += "&bbox=" + str(gps_n) + "," + str(gps_w) + "," + str(gps_s) + "," + str(gps_e)
    url += "&dataTypes=TMIN,TMAX,PRCP"
    url += "&limit=10&offset=0"
    if verbose: print(url)

    try:
        req = requests.get(url, data=None, json=None, headers=None)
    except Exception as e:
        print('ERROR: ' + repr(e), ' fn()', url)
        return None
    
    if not req.status_code == 200: 
        if req.status_code == 429:
            raise Exception('ERROR: HTTP 429 Too Many Requests! rate limiting.  resp.status_code = ', str(req.status_code), req.text)
        else:
            print('\tERROR: resp.status_code = ', str(req.status_code), req.text)
            return None

    resp = req.json()
    if len(resp) == 0:
        print("\tNo results from query ", url)
        return None
    
    #print(json.dumps(resp, sort_keys=False, indent=2))
    station_data = []
    #if 'data' in resp.keys():
    if not 'results' in resp.keys(): raise Exception("ERROR: key 'results' not in response")
    results = resp['results']
    for result in results:
        if not 'endDate' in result.keys(): raise Exception("ERROR: key 'endDate' not in response (result)")
        if not 'startDate' in result.keys(): raise Exception("ERROR: key 'startDate' not in response (result)")
        if not 'centroid' in result.keys(): raise Exception("ERROR: key 'centroid' not in response (result)")
        if not 'point' in result['centroid'].keys(): raise Exception("ERROR: key 'point' not in response (result['centroid'])")
        centroid = result['centroid']
        longitude = float(centroid['point'][0])
        latitude = float(centroid['point'][1])
        if not 'name' in result.keys(): raise Exception("ERROR: key 'name' not in response (result)")
        if not 'location' in result.keys(): raise Exception("ERROR: key 'location' not in response (result)")
        if not 'id' in result.keys(): raise Exception("ERROR: key 'id' not in response (result)")
        if not 'dataTypesCount' in result.keys(): raise Exception("ERROR: key 'dataTypesCount' not in response (result)")       
        if not 'boundingPoints' in result.keys(): raise Exception("ERROR: key 'boundingPoints' not in response (result)")        
        if not 'stations' in result.keys(): raise Exception("ERROR: key 'stations' not in response (result)")
        stations = result['stations']
        for station in stations:
            if not 'name' in station.keys(): raise Exception("ERROR: key 'name' not in response (stations)")
            if not 'id' in station.keys(): raise Exception("ERROR: key 'id' not in response (stations)")
            delta_distance_m = haversine_distance(lat, lon, latitude, longitude)
            delta_elevation_m = abs(elevation_by_lat_lon(lat, lon) - elevation_by_lat_lon(latitude, longitude))
            sort = delta_distance_m * delta_elevation_m
            if verbose: print(station['id'], "\t", latitude, longitude, "\t", round(delta_distance_m,1), round(delta_elevation_m,1), result['startDate'], "\t", result['endDate'])
            station_data.append([round(sort,1), round(delta_elevation_m,1), round(delta_distance_m,1), station['id'], latitude, longitude, result['startDate'], result['endDate']])
    
    # Sort the data by delta_distance_m * delta_elevation
    station_data.sort(reverse=False, key=operator.itemgetter(0))
    return station_data

In [27]:
stations = get_stations_by_bounding_box(20.9642809, 105.7935119, 23.440, 103.337, 19.465, 107.337, False)
# stations = get_stations_by_bounding_box(40.44077631612291, -76.12267163754181, False)

In [28]:
stations

[[0.0,
  0.0,
  135026.3,
  'VMM00048840',
  19.75,
  105.783,
  '1974-09-24T00:00:00',
  '2024-11-11T23:59:59'],
 [11077.5,
  4.0,
  2769.4,
  'VMM00048825',
  20.967,
  105.767,
  '2012-09-04T00:00:00',
  '2024-11-11T23:59:59'],
 [200061.0,
  7.0,
  28580.1,
  'VMM00048820',
  21.221,
  105.807,
  '1958-12-18T00:00:00',
  '2024-11-11T23:59:59'],
 [356431.0,
  4.0,
  89107.7,
  'VMM00048826',
  20.8,
  106.633,
  '1974-09-24T00:00:00',
  '2024-11-11T23:59:59'],
 [23266527.0,
  126.0,
  184655.0,
  'CHM00059417',
  22.367,
  106.75,
  '1953-01-01T00:00:00',
  '2024-11-12T23:59:59'],
 [34760512.0,
  249.0,
  139600.4,
  'VMM00048830',
  21.833,
  106.767,
  '1974-09-26T00:00:00',
  '2024-11-11T23:59:59'],
 [43314292.9,
  222.0,
  195109.4,
  'VMM00048808',
  22.667,
  106.25,
  '1959-05-09T00:00:00',
  '2024-11-11T23:59:59'],
 [129989337.0,
  648.0,
  200600.8,
  'VMM00048806',
  21.333,
  103.9,
  '1997-03-10T00:00:00',
  '2024-11-11T23:59:59'],
 [469530386.6,
  1283.0,
  365962.9,
  '

Filter only stations from Viet Nam

In [29]:
print(type(stations))

filter_stations = [station for station in stations if 'VMM' in station[3]]
print(filter_stations)

<class 'list'>
[[0.0, 0.0, 135026.3, 'VMM00048840', 19.75, 105.783, '1974-09-24T00:00:00', '2024-11-11T23:59:59'], [11077.5, 4.0, 2769.4, 'VMM00048825', 20.967, 105.767, '2012-09-04T00:00:00', '2024-11-11T23:59:59'], [200061.0, 7.0, 28580.1, 'VMM00048820', 21.221, 105.807, '1958-12-18T00:00:00', '2024-11-11T23:59:59'], [356431.0, 4.0, 89107.7, 'VMM00048826', 20.8, 106.633, '1974-09-24T00:00:00', '2024-11-11T23:59:59'], [34760512.0, 249.0, 139600.4, 'VMM00048830', 21.833, 106.767, '1974-09-26T00:00:00', '2024-11-11T23:59:59'], [43314292.9, 222.0, 195109.4, 'VMM00048808', 22.667, 106.25, '1959-05-09T00:00:00', '2024-11-11T23:59:59'], [129989337.0, 648.0, 200600.8, 'VMM00048806', 21.333, 103.9, '1997-03-10T00:00:00', '2024-11-11T23:59:59']]


Use API to access climate data for each station

In [30]:
import requests
from datetime import datetime
from dateutil.relativedelta import relativedelta
import calendar

def get_data_from_station(station_id, startdate, enddate, verbose):
    print(station_id)
    # normalize variable

    if ':' in str(station_id):
        station_id = str(station_id).partition(':')[2]

    if 'T' in str(startdate):
        startdate = str(startdate).partition('T')[0]

    if 'T' in str(enddate):
        enddate = str(enddate).partition('T')[0]

    data_type = "daily-summaries"
    # data_type = "global-summary-of-the-month"

    url = "https://www.ncei.noaa.gov/access/services/data/v1" 
    url += "?dataset=" + data_type
    url += "&dataTypes=TMIN,TMAX,PRCP"
    url += "&stations=" + station_id 
    url += "&startDate=" + startdate 
    url += "&endDate=" + enddate 
    url += "&format=json&units=standard&includeAttributes=false";

    headers = {'Sec-Ch-Ua-Platform': 'Windows',
              'Sec-Ch-Ua-Platform-Version': '10.0.0',
              'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'}
    
    headers.update({"accept-language": "en-US,en;q=0.9", 'token': 'lHnoGeREULAfljjRfjBMtNWwpCCKkeFD'})
        
    try:
        req = requests.get(url, data=None, json=None, headers=headers)
    except Exception as e:
        print('ERROR: ' + repr(e), ' get_data_from_station()', url)
        return None
    
    if req.status_code == 200:
        pass
    elif req.status_code == 429:
        print("ERROR: HTTP 429 Too Many Requests! rate limiting.  resp.status_code = ", str(req.status_code), req.text, " get_data_from_station()")
        return None
    elif req.status_code == 503:
        print("ERROR: HTTP 503 Service Unavailable.  Server is temporarily unavailable due to maintenance or capacity issues.  get_data_from_station()")
        return None
    else:
        print('\tERROR: resp.status_code = ', str(req.status_code), req.text, "get_data_from_station()")
        return None

    resp = req.json()

    data = {"DATE": [], "STATION" : [], "TMAX" : [], "TMIN" : [], "PRCP" : []}

    for line in resp:
        if not "DATE" in line.keys(): raise Exception("ERROR: key 'DATE' expected but not found. ", line)
        if not "STATION" in line.keys(): raise Exception("ERROR: key 'STATION' expected but not found. ", line)
        
        if not "TMAX" in line.keys(): 
            continue
        if not "TMIN" in line.keys(): 
            continue
        if not "PRCP" in line.keys():
            continue

        data['STATION'].append(line['STATION'])
        data['DATE'].append(line['DATE'])
        data['TMAX'].append(line['TMAX'])
        data['TMIN'].append(line['TMIN'])
        data['PRCP'].append(line['PRCP'])
    
    return pd.DataFrame(data)
    

    

In [31]:
for station in filter_stations:
    print(station[3])
    print(station[6], station[7])
df = pd.concat([get_data_from_station(station[3], station[6], station[7], False) for station in filter_stations])



VMM00048840
1974-09-24T00:00:00 2024-11-11T23:59:59
VMM00048825
2012-09-04T00:00:00 2024-11-11T23:59:59
VMM00048820
1958-12-18T00:00:00 2024-11-11T23:59:59
VMM00048826
1974-09-24T00:00:00 2024-11-11T23:59:59
VMM00048830
1974-09-26T00:00:00 2024-11-11T23:59:59
VMM00048808
1959-05-09T00:00:00 2024-11-11T23:59:59
VMM00048806
1997-03-10T00:00:00 2024-11-11T23:59:59
VMM00048840
VMM00048825
VMM00048820
VMM00048826
VMM00048830
VMM00048808
ERROR: HTTP 503 Service Unavailable.  Server is temporarily unavailable due to maintenance or capacity issues.  get_data_from_station()
VMM00048806


In [33]:
df.to_csv('data/crawled_data_northern_vietnam.csv', encoding='utf-8')