# This code is adapted from the SMHI official API, and uses their structure to
# organize a reusable function. It covers not only streamflow but other variables.
import requests
import pandas as pd
import io
from typing import Union, List, Optional
# Code using the base SMHI HydroObs API to fetch station metadata and time series data.
# Documentation (in Swedish) can be found here:
# https://opendata.smhi.se/hydroobs/api
# Root endpoint of SMHI's open-data HydroObs REST API; all request URLs below
# are built by appending resource paths to this base.
BASE_URL = "https://opendata-download-hydroobs.smhi.se/api/version/latest/"
# Mapping from a human-readable variable name to the numeric parameter ID the
# HydroObs API expects in its URLs (e.g. /parameter/1/... for daily discharge).
SUPPORTED_PARAMETERS = {
    "discharge_daily": 1,
    "discharge_15min": 2,
    "water_level": 3,
    "water_temperature": 4,
    "ice_formation": 5,
    "ice_breakup": 6,
    "ice_thickness": 7,
    "snow_density": 8,
    "water_content": 9,
    "discharge_monthly": 10,
}
# Human-readable reference for the parameter IDs in SUPPORTED_PARAMETERS.
# Bound to a named constant (instead of a dead anonymous string) so callers
# can print it; the stray closing `"""` that was left inside the original
# string has been removed.
PARAMETER_TABLE = """
| ID | Swedish Name              | English Name             | Unit   |
|----|---------------------------|--------------------------|--------|
| 1  | Vattenföring (Dygn)       | Discharge (Daily)        | m³/s   |
| 2  | Vattenföring (15 min)     | Discharge (15 min)       | m³/s   |
| 3  | Vattenstånd               | Water level              | cm     |
| 4  | Vattendragstemperatur     | Water temperature        | °C     |
| 5  | Isläggning                | Ice formation date       | —      |
| 6  | Islossning                | Ice breakup date         | —      |
| 7  | Istjocklek                | Ice thickness            | cm     |
| 8  | Snödensitet               | Snow density             | g/cm³  |
| 9  | Vatteninnehåll            | Water content (SWE)      | mm     |
| 10 | Vattenföring (Månad)      | Discharge (Monthly)      | m³/s   |
"""
# Time-series periods exposed by the HydroObs data endpoints; passed verbatim
# in the request URL (see get_data).
SUPPORTED_PERIODS = ["latest-day", "latest-months", "corrected-archive", "observations"]
def _get_parameters() -> pd.DataFrame:
    """Return the list of available HydroObs parameters as a DataFrame.

    Queries the API's ``parameter.json`` index and flattens the
    ``"parameter"`` array into one row per parameter.

    Returns:
        pd.DataFrame: one row per parameter as returned by the API.

    Raises:
        requests.HTTPError: if the API responds with an error status.
        requests.Timeout: if the API does not answer within 30 s.
    """
    url = f"{BASE_URL}parameter.json"
    # Timeout added: requests.get without a timeout can block indefinitely.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    return pd.DataFrame(r.json()["parameter"])
def _get_station_metadata(parameter_id: int) -> pd.DataFrame:
    """Get metadata for all stations measuring a given parameter.

    Args:
        parameter_id: numeric HydroObs parameter ID (see SUPPORTED_PARAMETERS).

    Returns:
        pd.DataFrame with columns ``id``, ``name``, ``latitude``, ``longitude``,
        ``from``, ``to`` (timestamps), ``years`` (record length) and ``active``
        (True if the record extends to today). Empty but correctly-typed frame
        when the API lists no stations.

    Raises:
        requests.HTTPError: if the API responds with an error status.
        requests.Timeout: if the API does not answer within 30 s.
    """
    url = f"{BASE_URL}parameter/{parameter_id}.json"
    # Timeout added: requests.get without a timeout can block indefinitely.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    data = r.json()["station"]
    columns = ["id", "name", "latitude", "longitude", "from", "to"]
    df = pd.DataFrame(
        [
            {
                "id": s["id"],
                "name": s["name"],
                "latitude": s["latitude"],
                "longitude": s["longitude"],
                # API timestamps are epoch milliseconds.
                "from": pd.to_datetime(s["from"], unit="ms"),
                "to": pd.to_datetime(s["to"], unit="ms"),
            }
            for s in data
        ],
        columns=columns,  # keeps the schema even when `data` is empty
    )
    # Record length in (fractional) years; 365.25 accounts for leap years.
    df["years"] = (df["to"] - df["from"]).dt.days / 365.25
    # A station is considered active if its record extends past today.
    df["active"] = df["to"] > pd.Timestamp.today()
    return df
def get_metadata(parameter: str = "discharge_daily") -> pd.DataFrame:
    """Return station metadata for one supported HydroObs parameter.

    Args:
        parameter: key of SUPPORTED_PARAMETERS (e.g. ``"discharge_daily"``,
            ``"water_temperature"``).

    Returns:
        pd.DataFrame of station metadata, annotated with ``parameter_id``
        and ``parameter_name`` columns.

    Raises:
        ValueError: if ``parameter`` is not a supported name.
    """
    if parameter not in SUPPORTED_PARAMETERS:
        raise ValueError(f"Unsupported parameter: {parameter}")
    pid = SUPPORTED_PARAMETERS[parameter]
    # .assign attaches the identifying columns in a single expression.
    return _get_station_metadata(pid).assign(
        parameter_id=pid,
        parameter_name=parameter,
    )
import re


def get_data(
    station_id: int,
    parameter: Union[str, int] = "discharge_daily",
    period: str = "corrected-archive",
    start: Optional[str] = None,
    end: Optional[str] = None,
    verbose: bool = True,
) -> pd.DataFrame:
    """Download time-series data for a specific SMHI HydroObs station.

    Handles SMHI's messy CSVs by reading only lines that start with a date
    (``YYYY-MM-DD``) and ignoring metadata, comments, or notes.

    Args:
        station_id: numeric SMHI station ID.
        parameter: a key of SUPPORTED_PARAMETERS, or a raw numeric parameter
            ID (passed to the API unvalidated).
            NOTE: the previous default ``"discharge"`` was not a supported
            key and always raised ValueError; fixed to ``"discharge_daily"``.
        period: one of SUPPORTED_PERIODS.
        start: optional inclusive lower bound, any ``pd.to_datetime`` string.
        end: optional inclusive upper bound, any ``pd.to_datetime`` string.
        verbose: print the URL and a row-count summary.

    Returns:
        pd.DataFrame with columns ``date`` (datetime64), ``value`` (float)
        and ``quality`` (str or None), sorted by date, index reset.

    Raises:
        ValueError: for an unsupported parameter name or period.
        requests.HTTPError: if the API responds with an error status.
        requests.Timeout: if the API does not answer within 60 s.
    """
    # --- parameter setup: names are validated, raw integer IDs pass through.
    if isinstance(parameter, str):
        if parameter not in SUPPORTED_PARAMETERS:
            raise ValueError(f"Unsupported parameter: {parameter}")
        parameter_id = SUPPORTED_PARAMETERS[parameter]
    else:
        parameter_id = parameter
    if period not in SUPPORTED_PERIODS:
        raise ValueError(f"Invalid period: must be one of {SUPPORTED_PERIODS}")

    url = f"{BASE_URL}parameter/{parameter_id}/station/{station_id}/period/{period}/data.csv"
    if verbose:
        print(f"Downloading data from {url}")
    # Timeout added: archive downloads can be large, but should never hang forever.
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    lines = r.text.splitlines()

    # --- keep only data rows, i.e. lines starting with a date like "1984-08-17"
    date_pattern = re.compile(r"^\d{4}-\d{2}-\d{2}")
    records = []
    for line in lines:
        if date_pattern.match(line):
            parts = line.split(";")
            date = parts[0].strip()
            value = parts[1].strip() if len(parts) > 1 else None
            quality = parts[2].strip() if len(parts) > 2 else None
            records.append((date, value, quality))
    df = pd.DataFrame(records, columns=["date", "value", "quality"])

    # --- clean and convert; unparseable dates/values become NaT/NaN and are dropped
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["value"] = pd.to_numeric(df["value"], errors="coerce")
    df = df.dropna(subset=["date", "value"]).sort_values("date")

    # --- optional inclusive date filtering
    if start:
        df = df[df["date"] >= pd.to_datetime(start)]
    if end:
        df = df[df["date"] <= pd.to_datetime(end)]

    if verbose:
        if not df.empty:
            print(
                f"Final dataset: {len(df)} rows from "
                f"{df['date'].min().date()} to {df['date'].max().date()}"
            )
        else:
            print("No valid numeric rows found after cleanup")
    return df.reset_index(drop=True)
def _demo() -> None:
    """Example usage; performs live network requests against the SMHI API."""
    # Discharge stations (default)
    meta_q = get_metadata()
    print(meta_q.head())
    # Water temperature stations
    meta_temp = get_metadata("water_temperature")
    print(meta_temp.head())
    # Full time series
    df = get_data(2357, "discharge_daily", period="corrected-archive")
    # Latest day (real-time data)
    df = get_data(2357, "discharge_daily", period="latest-day")
    # Given interval
    df = get_data(2357, "discharge_daily", start="2015-01-01", end="2024-12-31")


# Guarded so that importing this module does not trigger network calls;
# previously the examples ran unconditionally at import time.
if __name__ == "__main__":
    _demo()
# Sweden
# Related to issue #5.
# This code is adapted from the SMHI official API, and uses their structure to
# organize a reusable function. It covers not only streamflow but other variables.
# Code for the list of stations (and metadata, issue #1) and time series.
# The time series also offers the option of real-time data (issue #4, "latest-day").
# Example of usage: see the _demo section above.