In [None]:
import requests
import pandas as pd
import xml.etree.ElementTree as ET

def get_data(url, params=None, timeout=10):
    """
    Fetch data from the given API endpoint using an HTTP GET request.

    Args:
        url (str): The API endpoint URL.
        params (dict, optional): Query parameters for the request.
        timeout (int): Timeout passed to ``requests.get``.

    Returns:
        str: The response body as text.

    Raises:
        Exception: If the request fails, times out, or returns an HTTP
            error status. The underlying ``requests.RequestException`` is
            attached as ``__cause__``.
    """
    try:
        response = requests.get(url, params=params, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions instead of returning
        # an error page as if it were data.
        response.raise_for_status()
    except requests.RequestException as e:
        # Chain the original error so the traceback keeps the root cause.
        raise Exception(f"Failed to fetch data from {url}: {str(e)}") from e
    return response.text

def parse_xml(xml_string, element_path, attributes):
    """
    Extract selected attributes from every matching element of an XML document.

    Args:
        xml_string (str): The raw XML data as a string.
        element_path (str): The ElementTree path used to locate elements.
        attributes (list): Names of the attributes to read from each element.

    Returns:
        list[dict]: One dictionary per matching element, keyed by attribute
            name. An attribute missing from an element maps to ``None``.
    """
    tree_root = ET.fromstring(xml_string)

    records = []
    for node in tree_root.findall(element_path):
        record = {}
        for name in attributes:
            # .get() yields None when the element lacks the attribute
            record[name] = node.attrib.get(name)
        records.append(record)

    return records

def get_london_site_codes():
    """
    Fetch and extract site codes for air quality monitoring stations in London.

    Requests the ``GroupName=London`` endpoint, reads the ``SiteCode`` and
    ``SiteName`` attributes of every ``Site`` element, removes duplicate
    rows and sorts by ``SiteCode``.

    Returns:
        pd.DataFrame: A DataFrame with columns ``SiteCode`` and ``SiteName``.
            The columns are present even when the response contains no
            ``Site`` elements.

    Raises:
        Exception: Propagated from ``get_data`` when the request fails.
    """
    url = "https://api.erg.ic.ac.uk/AirQuality/Annual/MonitoringObjective/GroupName=London"
    columns = ['SiteCode', 'SiteName']

    xml_data = get_data(url)
    sites = parse_xml(xml_data, './/Site', columns)

    # Passing columns explicitly keeps sort_values from raising KeyError
    # when no Site elements were found (an empty list yields no columns).
    df = pd.DataFrame(sites, columns=columns).drop_duplicates().sort_values('SiteCode')
    return df

# Fetch the London site table once (this runs a network request when the
# cell executes) and keep the site codes as a plain Python list.
london_sites = get_london_site_codes()
site_codes_list = london_sites['SiteCode'].tolist()

def get_annual_air_quality_report(site_code, year):
    """
    Download the annual monitoring report XML for one site and one year.

    Failures are reported with ``print`` rather than raised, so callers can
    carry on with the remaining site/year combinations.

    Args:
        site_code (str): The monitoring site code.
        year (str): The year for the air quality report.

    Returns:
        str or None: The raw XML response text, or ``None`` when the request
            fails, times out, or returns an HTTP error status.
    """
    endpoint = f"https://api.erg.ic.ac.uk/AirQuality/Annual/MonitoringReport/SiteCode={site_code}/Year={year}"

    try:
        reply = requests.get(endpoint, timeout=10)
        reply.raise_for_status()  # 4xx/5xx responses count as failures
    except requests.RequestException as e:
        print(f"Failed to fetch data for SiteCode: {site_code}, Year: {year}. Error: {str(e)}")
        return None

    # Success path: hand back the XML body unchanged
    return reply.text

def parse_annual_air_quality_report(xml_string, site_code, year):
    """
    Turn an annual air quality report XML document into a pandas DataFrame.

    Every ``ReportItem`` element found anywhere in the document becomes one
    row. The supplied ``site_code`` and ``year`` are copied onto each row.

    Args:
        xml_string (str): The XML response as a string.
        site_code (str): Site code stored in the ``SiteCode`` column.
        year (str): Year stored in the ``Year`` column.

    Returns:
        pd.DataFrame: Columns ``SiteCode``, ``Year``, ``SpeciesCode``,
            ``ReportItem``, ``ReportItemName`` and ``Annual``. Attributes
            missing from an element are ``None``. The frame is empty, with
            no columns, when the document has no ``ReportItem`` elements.
    """
    # Attributes read from each ReportItem, in output column order
    item_fields = ('SpeciesCode', 'ReportItem', 'ReportItemName', 'Annual')

    items = ET.fromstring(xml_string).findall('.//ReportItem')

    rows = [
        {
            'SiteCode': site_code,
            'Year': year,
            **{field: item.get(field) for field in item_fields},
        }
        for item in items
    ]

    return pd.DataFrame(rows)

# Execution
# Download and parse the annual report for every London site and year,
# then combine everything into one DataFrame.
site_codes = site_codes_list
years = [str(y) for y in range(2014, 2025)]  # '2014' through '2024' inclusive

# Collect one DataFrame per successfully fetched (site, year) report
all_reports = []

# Loop through each site code and year
for site_code in site_codes:
    for year in years:
        print(f"Fetching data for SiteCode: {site_code}, Year: {year}")

        # Fetch the annual air quality report (None or empty on failure)
        xml_report = get_annual_air_quality_report(site_code, year)
        if not xml_report:
            continue

        # Parse the XML report and convert to a DataFrame
        df_report = parse_annual_air_quality_report(xml_report, site_code, year)
        all_reports.append(df_report)

# Concatenate all DataFrames into a single DataFrame
if all_reports:
    df_air_report_sites = pd.concat(all_reports, ignore_index=True)
    display(df_air_report_sites)
else:
    # Keep the name bound even when every fetch failed, so later cells that
    # reference df_air_report_sites do not raise NameError.
    df_air_report_sites = pd.DataFrame(
        columns=['SiteCode', 'Year', 'SpeciesCode', 'ReportItem', 'ReportItemName', 'Annual']
    )

In [46]:
# Bare expression: evaluates the combined report DataFrame built above
# (presumably so the notebook renders it as this cell's output).
df_air_report_sites

Unnamed: 0,SiteCode,Year,SpeciesCode,ReportItem,ReportItemName,Annual
0,BG1,2014,NO2,0,Nitrogen Dioxide,-999
1,BG1,2014,NO2,1,Data capture rate (%):,88
2,BG1,2014,NO2,2,Hourly max (ug/m3):,229.1
3,BG1,2014,NO2,3,Low days:,317
4,BG1,2014,NO2,4,Moderate days:,1
...,...,...,...,...,...,...
26067,WMD,2024,PM10,5,High days:,0
26068,WMD,2024,PM10,6,Very High days:,0
26069,WMD,2024,PM10,7,Mean: (AQS Objective < 40ug/m3),21
26070,WMD,2024,PM10,7,Days where daily mean >50ug/m3: (AQS Objective...,9
