Part 1 of 7  - Web Scraper - This is the data gathering part, no analysis

In [2]:
import requests
import geopandas as gpd
import shapely

import time
import functools
import requests

def exponential_backoff(
    max_retries=5,
    initial_delay=1,
    backoff_factor=2,
    retry_statuses=(429, 500, 502, 503, 504)
):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            retries = 0
            delay = initial_delay
            while True:
                try:
                    return func(*args, **kwargs)
                except requests.HTTPError as e:
                    status = e.response.status_code
                    if status in retry_statuses:
                        if retries >= max_retries:
                            print(f"❌ Max retries exceeded (status {status})")
                            raise
                        retry_after = e.response.headers.get("Retry-After")
                        sleep_time = int(retry_after) if retry_after else delay
                        print(f"⚠️ HTTP {status} — retrying in {sleep_time} seconds...")
                        time.sleep(sleep_time)
                        delay *= backoff_factor
                        retries += 1
                    else:
                        raise
        return wrapper
    return decorator

In [9]:
##### THERE HAS TO BE A PLACE TO QUERY W PYTHON - JUST NEED TO FIND IT

import requests

url = "https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items"
params = {
    "bbox": "-113.3,40.6,-111.6,42.0",
    "limit" : 10000
}
headers = {
    "Accept": "application/json"
}

response = requests.get(url, headers=headers, params=params)
data = response.json()


print("Status code:", response.status_code)
print("Content-Type:", response.headers.get("Content-Type", ""))
print("Response length:", len(response.text))
# print("Preview:", response.text)  # preview the content

# Safe JSON parsing
if 'application/json' in response.headers.get("Content-Type", ""):
    try:
        data = response.json()
        print("Parsed JSON keys:", list(data.keys()))
    except Exception as e:
        print("JSON parse error:", e)
else:
    print("Server did not return JSON.")
    
    
features = response.json()["features"]
print(len(features))

Status code: 200
Content-Type: application/json; charset=utf-8
Response length: 8172027
Parsed JSON keys: ['type', 'features', 'numberReturned', 'links', 'timeStamp']
10000


In [22]:
for feat in features:
    print(feat)

{'type': 'Feature', 'properties': {'time': '1973-08-17', 'unit_of_measure': 'ft^3/s', 'qualifier': None, 'parameter_code': '00060', 'statistic_id': '00003', 'monitoring_location_id': 'USGS-10125600', 'time_series_id': '0e181e9905f94fbc87e3068a6ae2a4a4', 'value': '40.0', 'approval_status': 'Approved', 'last_modified': '2025-07-05T00:41:07.798504+00:00'}, 'id': '00000a97-6364-4cf9-922c-720bb5856347', 'geometry': {'type': 'Point', 'coordinates': [-112.147732621578, 41.838537290374]}}


In [None]:
offset = 0
limit = 5000
gsl_data = []

base_url = "https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items"
base_params = {
    "bbox": "-113.3,40.6,-111.6,42.0",
    "limit" : limit
}
headers = {
    "Accept": "application/json"
}

gsl_2016_gdf = gpd.read_file(r"data\UTAH_GIS\GSLWaterLevel2016\GSLWaterLevel2016.shp")
gsl_2016_gdf = gsl_2016_gdf.to_crs("EPSG:4326")

while True:
    base_params["offset"] = offset
    response = requests.get(base_url, headers=headers, params=base_params)
    if response.status_code == 400:
        print("break 400")
        break
    
    features = response.json().get("features", [])
    
    if not features:
        print(f"No more features at offset {offset}.")
        break

    print(f"Retrieved {len(features)} features at offset {offset}")

    for feat in features:
        coords = feat["geometry"]["coordinates"]
        point_geom = shapely.geometry.Point(coords[0], coords[1])
        
        # if gsl_2016_gdf.contains(point_geom).any():
        feat_props = feat.get("properties", {})
        if feat_props["unit_of_measure"] == "ft":
            feat_props["geometry"] = point_geom
            gsl_data.append(feat_props)
    
    offset += limit
    
print(gsl_data)
result_gdf = gpd.GeoDataFrame(gsl_data, geometry="geometry", crs="EPSG:4326")
result_gdf.to_csv("USGS_daily_items_gsl.csv", index=False)
result_gdf.to_file('USGS_daily_items_gsl.shp')

Retrieved 5000 features at offset 0
Retrieved 5000 features at offset 5000
Retrieved 5000 features at offset 10000
Retrieved 5000 features at offset 15000
Retrieved 5000 features at offset 20000
Retrieved 5000 features at offset 25000
Retrieved 5000 features at offset 30000
Retrieved 5000 features at offset 35000
Retrieved 5000 features at offset 40000
break 400


In [None]:
from USGS_GSL_STATIONS import USGS_GSL_STATIONS
BASE = "https://api.waterdata.usgs.gov/ogcapi/v0"
base_filename = "data/usgs_history/GSL_WTR_LEVEL_HISTORY_{station}"
for monitoring_station in USGS_GSL_STATIONS:
    filename = base_filename.format(station=monitoring_station)
    station_data = []
    url = f"{BASE}/collections/time-series-metadata"
    params = {
        "monitoring_location_id": monitoring_station,
        "f": "json"
    }
    resp = requests.get(url, params=params)
    print(resp.url)
    resp.raise_for_status()
    print(resp.json())
    
    # result_gdf = gpd.GeoDataFrame(station_data, geometry="geometry", crs="EPSG:4326")
    # result_gdf.to_csv(f"{filename}.csv", index=False)
    # result_gdf.to_file(f"{filename}.shp")

https://api.waterdata.usgs.gov/ogcapi/v0/collections/time-series-metadata?monitoring_location_id=USGS-10010027&f=json
{'id': 'time-series-metadata', 'title': 'Time series metadata', 'description': 'Daily data and continuous measurements are grouped into time series, which represent a collection of observations of a single parameter, potentially aggregated using a standard statistic, at a single monitoring location. This endpoint provides metadata about those time series, including their operational thresholds, units of measurement, and when the earliest and most recent observations in a time series occurred.\n', 'keywords': ['current', 'single location', 'data'], 'links': [{'type': 'application/json', 'rel': 'root', 'title': 'The landing page of this server as JSON', 'href': 'https://api.waterdata.usgs.gov/ogcapi/v0?f=json'}, {'type': 'text/html', 'rel': 'root', 'title': 'The landing page of this server as HTML', 'href': 'https://api.waterdata.usgs.gov/ogcapi/v0?f=html'}, {'type': 'app

In [7]:
import requests
import shapely.geometry

@exponential_backoff(max_retries=5)
def fetch_all_daily_data_for_station(location_id: str, start_date: str = None, end_date: str = None, limit: int = 1000):
    base_url = "https://api.waterdata.usgs.gov/ogcapi/v0/collections/daily/items"
    
    all_records = []
    offset = 0

    print(f"Fetching full daily history for station {location_id}...")

    while True:
        params = {
            "monitoring_location_id": location_id,
            "limit": limit,
            "offset": offset,
            "f": "json",
            "api_key": "JqmRwodclL6exiCqERg0efqmmFaEyAK7175cdqT9"
        }
        if start_date and end_date:
            params["datetime"] = f"{start_date}/{end_date}"

        try:
            response = requests.get(base_url, params=params)
            if response.status_code != 200:
                raise requests.HTTPError(response=response)
            data = response.json()
        except requests.HTTPError as e:
            print(f"HTTP error {e.response.status_code} encountered. Stopping fetch.")
            break
        except Exception as e:
            print(f"Unexpected error: {e}. Stopping fetch.")
            break

        features = data.get("features", [])
        if not features:
            # No more data
            break
        if response.status_code != 200:
            raise requests.HTTPError(response=response)  # Let decorator catch this

        data = response.json()
        features = data.get("features", [])

        if not features:
            # No more data
            break

        for feature in features:
            props = feature.get("properties", {})
            geom = feature.get("geometry")
            if geom and geom.get("type") == "Point":
                coords = geom.get("coordinates")
                if coords and len(coords) == 2:
                    lon, lat = coords
                    props["longitude"] = lon
                    props["latitude"] = lat
                    props["geometry"] = shapely.geometry.Point(lon, lat)  # shapely Point for GeoPandas
            if props["unit_of_measure"] == "ft":
                all_records.append(props)

        offset += limit
        print(f"Fetched {len(all_records)} records so far...")

    print(f"✅ Retrieved total {len(all_records)} records for {location_id}")
    return all_records


In [8]:

import os
from USGS_GSL_STATIONS import USGS_GSL_STATIONS
base_filename = "data/usgs_history/GSL_WTR_LEVEL_HISTORY_{station}"
for monitoring_station in USGS_GSL_STATIONS:
    print(monitoring_station)
    filename = base_filename.format(station=monitoring_station)
    station_data = fetch_all_daily_data_for_station(monitoring_station)
    if os.path.exists(f"{filename}.csv") or os.path.exists(f"{filename}.shp"):
        print(f"Skipping {monitoring_station} because data file already exists.")
        continue
    
    result_gdf = gpd.GeoDataFrame(station_data, geometry="geometry", crs="EPSG:4326")
    print(result_gdf.shape)
    result_gdf.to_csv(f"{filename}.csv", index=False)
    result_gdf.to_file(f"{filename}.shp")


USGS-10010027
Fetching full daily history for station USGS-10010027...
Fetched 1000 records so far...
Fetched 2000 records so far...
Fetched 2174 records so far...
✅ Retrieved total 2174 records for USGS-10010027
(2174, 13)
USGS-10010024
Fetching full daily history for station USGS-10010024...


  result_gdf.to_file(f"{filename}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


Fetched 1000 records so far...
Fetched 2000 records so far...
Fetched 2416 records so far...
✅ Retrieved total 2416 records for USGS-10010024
(2416, 13)
USGS-10010100
Fetching full daily history for station USGS-10010100...


  result_gdf.to_file(f"{filename}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


Fetched 1000 records so far...
Fetched 2000 records so far...
Fetched 3000 records so far...
Fetched 4000 records so far...
Fetched 5000 records so far...
Fetched 6000 records so far...
Fetched 7000 records so far...
Fetched 8000 records so far...
Fetched 9000 records so far...
Fetched 10000 records so far...
Fetched 11000 records so far...
Fetched 12000 records so far...
Fetched 13000 records so far...
Fetched 13715 records so far...
✅ Retrieved total 13715 records for USGS-10010100
(13715, 13)


  result_gdf.to_file(f"{filename}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


USGS-10010050
Fetching full daily history for station USGS-10010050...
Fetched 1000 records so far...
Fetched 1412 records so far...
✅ Retrieved total 1412 records for USGS-10010050
(1412, 13)


  result_gdf.to_file(f"{filename}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


USGS-10141000
Fetching full daily history for station USGS-10141000...
Fetched 146 records so far...
Fetched 287 records so far...
Fetched 454 records so far...
Fetched 595 records so far...
Fetched 732 records so far...
Fetched 884 records so far...
Fetched 1024 records so far...
Fetched 1198 records so far...
Fetched 1349 records so far...
Fetched 1496 records so far...
Fetched 1640 records so far...
Fetched 1787 records so far...
Fetched 1951 records so far...
Fetched 2124 records so far...
Fetched 2272 records so far...
Fetched 2430 records so far...
Fetched 2561 records so far...
Fetched 2704 records so far...
Fetched 2844 records so far...
Fetched 3003 records so far...
Fetched 3156 records so far...
Fetched 3305 records so far...
Fetched 3439 records so far...
Fetched 3605 records so far...
Fetched 3763 records so far...
Fetched 3919 records so far...
Fetched 4080 records so far...
Fetched 4217 records so far...
Fetched 4359 records so far...
Fetched 4502 records so far...
Fetch

  result_gdf.to_file(f"{filename}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


Fetched 1000 records so far...
Fetched 2000 records so far...
Fetched 3000 records so far...
Fetched 4000 records so far...
Fetched 5000 records so far...
Fetched 6000 records so far...
Fetched 7000 records so far...
Fetched 8000 records so far...
Fetched 9000 records so far...
Fetched 10000 records so far...
Fetched 11000 records so far...
Fetched 12000 records so far...
Fetched 13000 records so far...
Fetched 14000 records so far...
Fetched 14849 records so far...
✅ Retrieved total 14849 records for USGS-10010000
(14849, 13)


  result_gdf.to_file(f"{filename}.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
