In [None]:
import requests 
import pandas as pd 
import re 

In [65]:
def requests_url(url, timeout=30):
    """Download ``url`` with an HTTP GET and return the body as UTF-8 text.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``; number of seconds to wait for
        the server before giving up. Defaults to 30. ``requests`` applies no
        timeout on its own, so without this a stalled server would block the
        notebook indefinitely.

    Returns
    -------
    str
        ``response.text`` decoded with the encoding forced to UTF-8.

    Raises
    ------
    requests.HTTPError
        Raised by ``raise_for_status()`` when the response status is 4xx/5xx.
    """
    # Fetch the URL supplied by the caller. The earlier version read a global
    # ``API_URL`` here, which silently ignored the argument and failed with a
    # NameError on a fresh kernel where that global was never defined.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Force UTF-8 rather than relying on the encoding guessed from headers.
    response.encoding = 'utf-8'
    return response.text

def transform_data(data, output_path='cleaned_data.csv'):
    """Parse pollutant readings out of an XML string into a typed DataFrame.

    Every ``<PollutantConcentration>...</PollutantConcentration>`` element in
    ``data`` becomes one row. Within each element the first occurrence of
    each child tag listed below is extracted with a regular expression.

    Parameters
    ----------
    data : str
        XML text containing zero or more ``<PollutantConcentration>`` elements.
    output_path : str or path-like or None, optional
        Where to write the resulting frame as CSV (without the index).
        Defaults to ``'cleaned_data.csv'``; pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame
        Columns ``Station``, ``DateTime``, ``NO2``, ``O3``, ``SO2``, ``CO``,
        ``PM10`` and ``PM2.5``. ``DateTime`` is converted with
        ``pd.to_datetime``; the pollutant columns are converted with
        ``pd.to_numeric(errors='coerce')``, so non-numeric readings such as
        ``'-'`` and tags that are absent from a record become NaN.
    """
    # Output column -> XML child tag holding its value.
    field_tags = {
        'Station': 'StationName',
        'DateTime': 'DateTime',
        'NO2': 'NO2',
        'O3': 'O3',
        'SO2': 'SO2',
        'CO': 'CO',
        'PM10': 'PM10',
        'PM2.5': 'PM2.5',
    }
    # re.escape keeps the '.' in 'PM2.5' literal instead of "any character".
    field_patterns = {
        column: re.compile(r'<{0}>(.*?)</{0}>'.format(re.escape(tag)))
        for column, tag in field_tags.items()
    }

    # DOTALL lets a single record span several lines of the document.
    record_pattern = r'<PollutantConcentration>(.*?)</PollutantConcentration>'
    pollutant_data = re.findall(record_pattern, data, re.DOTALL)

    records = []
    for measurement in pollutant_data:
        record = {}
        for column, pattern in field_patterns.items():
            match = pattern.search(measurement)
            # A record lacking a tag yields None rather than an IndexError,
            # so one incomplete reading does not abort the whole parse.
            record[column] = match.group(1) if match else None
        records.append(record)

    # Passing ``columns`` keeps the schema stable even when no record matched.
    df = pd.DataFrame(records, columns=list(field_tags))

    # Convert DateTime to datetime type (missing values become NaT).
    df['DateTime'] = pd.to_datetime(df['DateTime'])

    # Convert pollutant columns to numbers; errors='coerce' already turns the
    # '-' placeholder and missing values into NaN, so no separate replace step
    # is needed.
    numeric_columns = ['NO2', 'O3', 'SO2', 'CO', 'PM10', 'PM2.5']
    for col in numeric_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    if output_path is not None:
        df.to_csv(output_path, index=False)
    return df

In [68]:
# Address of the XML document to download; handed to requests_url below.
url = "https://www.aqhi.gov.hk/epd/ddata/html/out/24pc_Eng.xml"
# Raw response body as text (requests_url returns response.text).
data = requests_url(url)

# transform_data writes its result to a CSV file as a side effect and also
# returns the DataFrame; the return value is bound to `_` here, i.e. it is
# not kept under a descriptive name for later cells.
_ = transform_data(data)

Unnamed: 0,Station,DateTime,NO2,O3,SO2,CO,PM10,PM2.5
0,Central/Western,2025-04-17 00:00:00+08:00,28.1,,2.2,,70.6,19.4
1,Central/Western,2025-04-17 01:00:00+08:00,,86.6,,,66.0,16.5
2,Central/Western,2025-04-17 02:00:00+08:00,19.8,97.9,3.0,,64.8,16.5
3,Central/Western,2025-04-17 03:00:00+08:00,19.2,94.8,3.0,,59.9,16.4
4,Central/Western,2025-04-17 04:00:00+08:00,,100.6,2.5,,60.6,17.4
...,...,...,...,...,...,...,...,...
427,Mong Kok,2025-04-17 19:00:00+08:00,91.9,21.5,3.5,551.2,54.3,18.7
428,Mong Kok,2025-04-17 20:00:00+08:00,77.1,31.1,2.8,424.9,54.4,19.5
429,Mong Kok,2025-04-17 21:00:00+08:00,81.7,28.0,2.2,583.5,53.2,18.7
430,Mong Kok,2025-04-17 22:00:00+08:00,66.2,38.6,1.9,434.7,62.6,23.8
