## Package imports

In [None]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time

## Parameters for data download

In [None]:
# Station whose sensors are queried from the GIOŚ API.
station_id = 731  # Gdańsk, ul. Wyzwolenia

# Download window. Both bounds are naive datetimes at midnight.
# NOTE(review): the bounds are later formatted as "YYYY-MM-DD HH:MM", so the
# upper bound is sent as "2025-03-31 00:00" — presumably the last day is then
# covered only at midnight; confirm this is intended.
start_date = datetime(year=2025, month=1, day=1)
end_date = datetime(year=2025, month=3, day=31)

## Functions

In [None]:
# Downloading list of sensors for station
def get_sensors(station_id, timeout=30):
    """Download the list of sensors (measurement stands) for a station.

    Args:
        station_id: Station identifier inserted into the request URL.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the run indefinitely.

    Returns:
        The list of sensor dictionaries stored under the
        "Lista stanowisk pomiarowych dla podanej stacji" key of the JSON
        response, or an empty list when the payload does not have that shape.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.RequestException: On connection problems or timeout.
    """
    url = f"https://api.gios.gov.pl/pjp-api/v1/rest/station/sensors/{station_id}"
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    # A payload that is not a JSON object has no sensor list to extract;
    # report it the same way as any other unexpected shape instead of
    # failing on ``.get``.
    if not isinstance(data, dict):
        print("Incorrect format", data)
        return []

    sensors = data.get("Lista stanowisk pomiarowych dla podanej stacji", [])

    # Only a list made up entirely of dictionaries is usable by the callers.
    if isinstance(sensors, list) and all(isinstance(s, dict) for s in sensors):
        return sensors

    print("Incorrect format", sensors)
    return []

# Extracting archival data for a given sensor and date range
def get_archival_data(sensor_id, date_from_str, date_to_str, timeout=30,
                      max_rate_limit_retries=10):
    """Download all archival measurements of one sensor for a date range.

    Pages through the endpoint 100 records at a time until the page count
    reported by the API ("totalPages") is exhausted.

    Args:
        sensor_id: Sensor identifier inserted into the request URL.
        date_from_str: Lower bound, sent as the "dateFrom" query parameter.
        date_to_str: Upper bound, sent as the "dateTo" query parameter.
        timeout: Seconds to wait for each response before giving up, so a
            stalled connection cannot hang the download.
        max_rate_limit_retries: How many consecutive HTTP 429 responses are
            tolerated for a single page before the download is abandoned.

    Returns:
        A list with the measurement records collected so far. The function is
        best-effort: on any error (or when the rate limit never clears) it
        prints a message and returns whatever was already downloaded.
    """
    base_url = f"https://api.gios.gov.pl/pjp-api/v1/rest/archivalData/getDataBySensor/{sensor_id}"
    params = {
        "dateFrom": date_from_str,
        "dateTo": date_to_str,
        "page": 0,  # Start from the first page
        "size": 100,  # Number of records per page
    }

    all_data = []
    rate_limit_hits = 0  # consecutive 429 responses for the current page
    while True:  # Loop to handle pagination
        try:
            resp = requests.get(base_url, params=params, timeout=timeout)
            if resp.status_code == 429:  # Rate limit exceeded
                rate_limit_hits += 1
                # Give up eventually: retrying forever would hang the run if
                # the limit never clears.
                if rate_limit_hits > max_rate_limit_retries:
                    print(
                        f"Error with sensor: {sensor_id}: rate limit still "
                        f"exceeded after {max_rate_limit_retries} retries"
                    )
                    break
                print("Paused due to rate limit. Retrying in 2 seconds...")
                time.sleep(2)  # Modify the sleep time as needed
                continue
            rate_limit_hits = 0

            resp.raise_for_status()
            data = resp.json()
            # ``or []`` also covers an explicit null under the key, which
            # would otherwise make ``extend`` fail and discard the page.
            measurements = data.get("Lista archiwalnych wyników pomiarów") or []
            all_data.extend(measurements)

            total_pages = data.get("totalPages", 1)
            if params["page"] >= total_pages - 1:
                break

            params["page"] += 1
            time.sleep(1.2)  # pause between pages to stay under the rate limit
        except Exception as e:
            # Deliberate best-effort: report the problem and keep the pages
            # that were already downloaded.
            print(f"Error with sensor: {sensor_id}: {e}")
            break

    return all_data

# Function to process hourly data to daily averages
def process_hourly_to_daily(data):
    """Aggregate measurement records into per-day mean values.

    Args:
        data: Records convertible to a DataFrame, each carrying a "Data"
            timestamp and a "Wartość" reading.

    Returns:
        A DataFrame with one row per calendar day and the columns "date" and
        "value" (mean of the non-missing readings of that day). When ``data``
        yields an empty DataFrame, that empty frame is returned unchanged.
    """
    hourly = pd.DataFrame(data)
    if hourly.empty:
        return hourly

    # Parse every timestamp first, then discard rows without a reading.
    prepared = hourly.assign(Data=pd.to_datetime(hourly["Data"])).rename(
        columns={"Wartość": "value"}
    )
    valid = prepared[prepared["value"].notna()]

    # Group the readings by the calendar day of their timestamp.
    days = valid["Data"].dt.date.rename("date")
    return valid["value"].groupby(days).mean().reset_index()

## Extract

In [None]:
sensors = get_sensors(station_id)  # Download sensors for the station

# Indicators exported to the CSV; every other sensor of the station is skipped.
wanted_params = ("PM10", "PM2.5")
# Longest span requested in a single call; longer ranges are split into
# consecutive segments.
max_segment = timedelta(days=365)

result_dfs = []

for sensor in sensors:  # Iterate through sensors
    sensor_id = sensor.get("Identyfikator stanowiska")
    param_formula = sensor.get("Wskaźnik - wzór")

    if param_formula not in wanted_params:
        continue  # Skip if not PM10 or PM2.5

    print(f"🔍 Sensor: {param_formula} (ID: {sensor_id})")

    current_start = start_date
    all_records = []

    while current_start <= end_date:  # Iterate through date ranges
        segment_end = min(current_start + max_segment, end_date)
        date_from_str = current_start.strftime("%Y-%m-%d %H:%M")
        date_to_str = segment_end.strftime("%Y-%m-%d %H:%M")

        print(f"Timestamp: {date_from_str} → {date_to_str}")
        data = get_archival_data(sensor_id, date_from_str, date_to_str)
        all_records.extend(data)

        # Resume one hour after the previous upper bound. The bounds carry
        # hour precision and the data is hourly, so stepping a whole day here
        # would leave most of a day unrequested at every segment boundary.
        # NOTE(review): assumes "dateTo" is inclusive — confirm against the API.
        current_start = segment_end + timedelta(hours=1)
        time.sleep(1.5)  # pause between segments to stay under the rate limit

    daily_df = process_hourly_to_daily(all_records)  # Process data to daily averages

    if not daily_df.empty:  # Check if data is not empty
        # One column per indicator, indexed by day, so the frames can be
        # aligned side by side below.
        daily_df.rename(columns={"value": param_formula}, inplace=True)
        result_dfs.append(daily_df.set_index("date"))

# Data check and saving
if result_dfs:
    final_df = pd.concat(result_dfs, axis=1)
    final_df.sort_index(inplace=True)
    filename = "pomiar2025.csv"  # Output filename
    final_df.to_csv(filename)  # Save to CSV
    print(f"Data saved to {filename}")
else:
    print("No data available for the specified date range.")