# Data collection - Air pollution
The code in this file extracts historical PM10 and PM2.5 data for a chosen station from the GIOŚ API. The file is divided into the following parts:
- Importing dependencies
- Defining parameters
- Functions
- Data extracts
- Saving the dataset

## Importing dependencies

In [1]:
import requests # HTTP requests
import pandas as pd # Data manipulation
from datetime import datetime, timedelta # Date and time manipulation
import time

## Defining parameters

In [20]:
# Measurement station to query: Gdańsk, ul. Wyzwolenia
station_id = 731
# Collection window; both bounds are datetimes at 00:00 of the given day.
# NOTE(review): the request window is formatted down to the minute, so readings
# taken later on the end day are presumably not requested - confirm whether the
# last day is meant to be covered in full.
start_date = datetime(year=2025, month=1, day=1)
end_date = datetime(year=2025, month=6, day=30)

## Functions

In [3]:
def get_sensors(station_id, timeout=30):
    """
    Function to get the list of sensors for a given station ID from GIOŚ API.
    Args:
        station_id (int): The ID of the station to get sensors for.
        timeout (float): Seconds to wait for the server before giving up.
    Returns:
        list: A list of sensors (dictionaries) for the specified station, or an empty list if the format is incorrect.
    Raises:
        requests.RequestException: If the request fails, times out or returns an HTTP error status.
        ValueError: If the response body is not valid JSON.
    """
    url = f"https://api.gios.gov.pl/pjp-api/v1/rest/station/sensors/{station_id}" # API URL
    # Without a timeout, requests would wait indefinitely on an unresponsive server
    resp = requests.get(url, timeout=timeout) # GET request to the API
    resp.raise_for_status() # Raise an error for bad responses
    data = resp.json() # JSON response

    # A payload that is not a JSON object has no .get(); report it like any other unexpected format
    if not isinstance(data, dict):
        print("Incorrect format", data)
        return []

    sensors = data.get("Lista stanowisk pomiarowych dla podanej stacji", []) # Extracting the list of sensors

    # Accept the payload only when it is a list of dictionaries
    if isinstance(sensors, list) and all(isinstance(s, dict) for s in sensors):
        return sensors

    print("Incorrect format", sensors)
    return []

def get_archival_data(sensor_id, date_from_str, date_to_str, max_rate_limit_retries=10, timeout=30):
    """
    Function to get archival data for a specific sensor from GIOŚ API.

    All result pages are downloaded one after another. Collection is best-effort:
    when a request fails, the error is printed and the records gathered so far
    are returned.
    Args:
        sensor_id (int): The ID of the sensor to get data for.
        date_from_str (str): Start of the range, sent as the "dateFrom" query parameter
            (the caller in this file formats it as 'YYYY-MM-DD HH:MM').
        date_to_str (str): End of the range, sent as the "dateTo" query parameter
            (same format as date_from_str).
        max_rate_limit_retries (int): How many consecutive HTTP 429 responses to
            tolerate for one page before giving up.
        timeout (float): Seconds to wait for the server on each request.
    Returns:
        list: A list of measurements for the specified sensor and date range
            (possibly partial or empty if an error occurred).
    """
    base_url = f"https://api.gios.gov.pl/pjp-api/v1/rest/archivalData/getDataBySensor/{sensor_id}"
    params = {
        "dateFrom": date_from_str,
        "dateTo": date_to_str,
        "page": 0, # Start from the first page
        "size": 100 # Number of records per page
    }

    all_data = []
    rate_limit_hits = 0 # Consecutive 429 responses for the current page
    while True: # Loop to handle pagination
        try:
            # Without a timeout, requests would wait indefinitely on an unresponsive server
            resp = requests.get(base_url, params=params, timeout=timeout)
            if resp.status_code == 429: # Rate limit exceeded
                rate_limit_hits += 1
                if rate_limit_hits > max_rate_limit_retries:
                    # Give up instead of retrying forever; keep what was collected
                    print(f"Error with sensor: {sensor_id}: rate limit still exceeded after {max_rate_limit_retries} retries")
                    break
                print("Paused due to rate limit. Retrying in 2 seconds...")
                time.sleep(2) # Modify the sleep time as needed
                continue
            rate_limit_hits = 0 # The page went through, start counting afresh

            resp.raise_for_status()
            data = resp.json()
            measurements = data.get("Lista archiwalnych wyników pomiarów", [])
            all_data.extend(measurements)

            # Pages are numbered from 0, so the last one is totalPages - 1
            total_pages = data.get("totalPages", 1)
            if params["page"] >= total_pages - 1:
                break

            params["page"] += 1
            time.sleep(1.2) # Pause between pages to stay under the rate limit
        except (requests.RequestException, ValueError, TypeError, AttributeError) as e:
            # Network/HTTP failures, invalid JSON, or a payload of unexpected shape
            print(f"Error with sensor: {sensor_id}: {e}")
            break

    return all_data

def process_hourly_to_daily(data):
    """
    Function to aggregate hourly measurements into one mean value per calendar day.
    Args:
        data (list): Hourly measurement records with "Data" (timestamp) and "Wartość" (value) fields.
    Returns:
        pd.DataFrame: A DataFrame with "date" and "value" columns holding the daily means,
            or the empty DataFrame unchanged when there are no records.
    """
    hourly = pd.DataFrame(data)
    if hourly.empty:
        return hourly

    # Parse the timestamps so the calendar day can be extracted below
    hourly["Data"] = pd.to_datetime(hourly["Data"])

    # Records without a measured value must not take part in the average
    valid = hourly.rename(columns={"Wartość": "value"}).dropna(subset=["value"])
    valid = valid.assign(date=valid["Data"].dt.date)

    per_day = valid.groupby("date")["value"].mean()
    return per_day.reset_index()

## Data extracts

In [None]:
# Fetch every sensor registered for the configured station
sensors = get_sensors(station_id)

# Split the sensors by the value of their "Wskaźnik - wzór" field
pm25_sensors = [sensor for sensor in sensors if sensor.get("Wskaźnik - wzór") == "PM2.5"]
pm10_sensors = [sensor for sensor in sensors if sensor.get("Wskaźnik - wzór") == "PM10"]

# Empty list; not referenced by the rest of the visible code
result_dfs = []

def _drop_duplicate_timestamps(records):
    """Return records with repeated "Data" timestamps removed, keeping the first occurrence."""
    seen = set()
    unique = []
    for record in records:
        stamp = record.get("Data") if isinstance(record, dict) else None
        if isinstance(stamp, str):
            if stamp in seen:
                continue
            seen.add(stamp)
        # Records without a usable timestamp are passed through untouched
        unique.append(record)
    return unique


def process_sensor_group(sensor_group, param_formula):
    """
    Function to process a group of sensors for a specific parameter (e.g., PM2.5 or PM10).

    The collection window is taken from the module-level start_date and end_date.
    It is requested in segments of at most 365 days; consecutive segments share
    their boundary timestamp, and repeated timestamps are dropped afterwards, so
    no readings are skipped or counted twice at a segment boundary.
    Args:
        sensor_group (list): A list of sensors (dictionaries) to process.
        param_formula (str): The parameter formula, used as the name of the value column.
    Returns:
        pd.DataFrame: A DataFrame indexed by date with a single column named
            param_formula holding the daily average (the mean across sensors when
            several report on the same day), or None if no sensor returned data.
    """
    group_dfs = []

    for sensor in sensor_group:
        sensor_id = sensor.get("Identyfikator stanowiska")
        if sensor_id is None:
            # Without an ID the archival endpoint URL cannot be built
            print(f"Skipping {param_formula} sensor without an ID: {sensor}")
            continue
        print(f"🔍 Sensor: {param_formula} (ID: {sensor_id})")

        current_start = start_date
        all_records = []

        while current_start <= end_date:
            segment_end = min(current_start + timedelta(days=365), end_date)
            date_from_str = current_start.strftime("%Y-%m-%d %H:%M")
            date_to_str = segment_end.strftime("%Y-%m-%d %H:%M")

            print(f"Timestamp: {date_from_str} → {date_to_str}")
            data = get_archival_data(sensor_id, date_from_str, date_to_str)
            all_records.extend(data)

            time.sleep(1.5) # Pause between requests to stay under the rate limit

            if segment_end >= end_date:
                break
            # Continue exactly where this segment stopped; starting a day later
            # would leave out the readings in between
            current_start = segment_end

        daily_df = process_hourly_to_daily(_drop_duplicate_timestamps(all_records))

        if not daily_df.empty:
            daily_df = daily_df.rename(columns={"value": param_formula})
            group_dfs.append(daily_df.set_index("date"))

    if not group_dfs:
        return None

    # Align the sensors by date and average across them row by row; days
    # reported by only some sensors use the mean of the available values
    combined = pd.concat(group_dfs, axis=1)
    averaged = combined.mean(axis=1).to_frame(name=param_formula)
    return averaged.sort_index()

# Process PM2.5 and PM10
pm25_df = process_sensor_group(pm25_sensors, "PM2.5")
pm10_df = process_sensor_group(pm10_sensors, "PM10")

# Combine both into one DataFrame. process_sensor_group returns None for a group
# without data, and pd.concat raises ValueError when every input is None, so fall
# back to an empty DataFrame to let the "no data" branch of the saving step run.
available_dfs = [df for df in (pm25_df, pm10_df) if df is not None]
final_df = pd.concat(available_dfs, axis=1) if available_dfs else pd.DataFrame()

## Saving the dataset

In [None]:
# Write the combined table to a CSV file, or report that nothing was collected
if final_df.empty:
    print("No data available for the specified date range.")
else:
    final_df.sort_index(inplace=True)  # order the rows by their index before writing
    filename = "pomiar2025.csv"
    final_df.to_csv(filename)
    print(f"Data saved to {filename}")