In [9]:
import json
import os
import time

import pandas as pd
import requests

# API Configuration
# The OpenAQ key is read from the OPENAQ_API_KEY environment variable so the
# secret is never stored in the notebook or its version history.
# NOTE(review): the key that was previously hard-coded here should be treated
# as leaked and rotated.
API_KEY = os.environ.get("OPENAQ_API_KEY", "")
if not API_KEY:
    print("Warning: OPENAQ_API_KEY is not set; requests to the OpenAQ API will likely be rejected.")
locations_url = "https://api.openaq.org/v3/locations/"
measurements_url = "https://api.openaq.org/v3/sensors/"
headers = {"X-API-Key": API_KEY}

# List of city location IDs (one OpenAQ location per city)
locations = [7570, 921005, 277971, 1275379, 8809, 8477, 1572, 754, 748, 958, 589, 1274948, 922, 236033, 982, 476,
             1285344, 230097, 1381, 8910, 972, 2895671, 509, 921002, 8717, 3010440, 2939447, 2064, 450, 528, 3009455,
             1415, 8735, 3036183, 8640, 456, 8567, 2272, 270714, 224177, 7975, 268736, 1138, 230091, 8755, 1275800,
             2873228, 8867, 1289474, 1275789, 1275797, 8652, 519, 236027, 744, 326608, 1185, 2037]

# Pollutants to keep (wildfire-related). Values are compared against the
# lower-cased sensor parameter display name returned by the API.
target_pollutants = ["pm2.5", "o₃"]

# Shared data: one dict per sensor-month, appended by fetch_measurements
all_data = []

# Request bookkeeping for the hourly limit enforced by the fetch functions
request_count = 0
hour_start_time = time.time()

# Progress tracking
locations_processed = 0
total_tasks = 0
tasks_completed = 0

def fetch_location_data(location_id):
    """Fetch one location's details and return measurement tasks for its sensors.

    Requests ``{locations_url}{location_id}`` and keeps every sensor whose
    lower-cased parameter display name appears in ``target_pollutants``.

    Args:
        location_id: Location identifier appended to ``locations_url``.

    Returns:
        list[tuple]: One ``(location_id, city_name, latitude, longitude,
        sensor_id, param_name, unit)`` tuple per matching sensor. The list is
        empty when the request fails, when the response carries no results, or
        when no sensor matches.

    Side effects:
        Updates the module-level ``request_count``, ``hour_start_time``,
        ``locations_processed`` and ``total_tasks`` counters, prints progress
        messages, and sleeps one second after every request attempt.
    """
    global request_count, hour_start_time, locations_processed, total_tasks
    url = f"{locations_url}{location_id}"
    tasks = []

    # Check hourly limit: start a fresh window after an hour, or wait out the
    # remainder of the current window once 2000 requests have been issued.
    current_time = time.time()
    if current_time - hour_start_time >= 3600:
        request_count = 0
        hour_start_time = current_time
    if request_count >= 2000:
        sleep_time = 3600 - (current_time - hour_start_time)
        print(f"Hourly limit reached. Sleeping for {sleep_time:.2f} seconds...")
        time.sleep(sleep_time)
        request_count = 0
        hour_start_time = time.time()
    request_count += 1

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        results = response.json().get("results") or []

        if results:
            location_result = results[0]
            city_name = location_result.get("name", "Unknown")
            # "coordinates" may be absent or null; fall back to missing values.
            coordinates = location_result.get("coordinates") or {}
            latitude = coordinates.get("latitude")
            longitude = coordinates.get("longitude")

            for sensor in location_result.get("sensors") or []:
                parameter = sensor.get("parameter") or {}
                sensor_id = sensor.get("id")
                display_name = parameter.get("displayName")
                if sensor_id is None or not display_name:
                    # A sensor without an id or a parameter name cannot be queried or matched.
                    continue
                param_name = display_name.lower()
                if param_name in target_pollutants:
                    tasks.append((location_id, city_name, latitude, longitude,
                                  sensor_id, param_name, parameter.get("units")))

            # Log the number of matching sensors
            num_sensors = len(tasks)
            print(f"Location {location_id} ({city_name}) has {num_sensors} sensors matching target pollutants")
            total_tasks += num_sensors
        else:
            # An empty result set used to raise IndexError and abort the whole run.
            print(f"Location {location_id} returned no results; skipping")

        locations_processed += 1
        # Log progress every 5 locations
        if locations_processed % 5 == 0 or locations_processed == len(locations):
            progress = (locations_processed / len(locations)) * 100
            print(f"Progress: {locations_processed}/{len(locations)} locations processed ({progress:.1f}%) "
                  f"({total_tasks} total tasks queued, {tasks_completed} completed)")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching location {location_id}: {e}")
    finally:
        # 1 request per second, also after a failed request so that errors do
        # not cause the next request to fire immediately.
        time.sleep(1)
    return tasks

def fetch_measurements(location_id, city_name, latitude, longitude, sensor_id, param_name, unit):
    """Fetch monthly measurements for one sensor and append them to ``all_data``.

    Requests ``{measurements_url}{sensor_id}/hours/monthly`` for a fixed date
    range. An HTTP 429 response is retried up to three times with a doubling
    delay (1, 2, 4 seconds); any other request error abandons the sensor.

    Args:
        location_id: Identifier of the owning location (not used in the request
            or the stored rows; kept for call compatibility).
        city_name: Stored in the "City" column.
        latitude: Stored in the "Latitude" column.
        longitude: Stored in the "Longitude" column.
        sensor_id: Sensor identifier inserted into the request URL.
        param_name: Stored in the "Sensor Parameter" column.
        unit: Stored in the "Unit" column.

    Returns:
        None. Rows are appended to the module-level ``all_data`` list.

    Side effects:
        Updates ``request_count``, ``hour_start_time`` and ``tasks_completed``,
        prints progress and error messages, and sleeps between requests.
    """
    global request_count, hour_start_time, tasks_completed
    meas_url = (f"{measurements_url}{sensor_id}/hours/monthly?"
                f"datetime_from=2014-01-01T00:00:00Z&datetime_to=2025-03-06T12:32:00Z&limit=200&page=1")
    max_retries = 3
    retry_delay = 1

    for _attempt in range(max_retries):
        try:
            # Check hourly limit (every attempt counts as a request)
            current_time = time.time()
            if current_time - hour_start_time >= 3600:
                request_count = 0
                hour_start_time = current_time
            if request_count >= 2000:
                sleep_time = 3600 - (current_time - hour_start_time)
                print(f"Hourly limit reached. Sleeping for {sleep_time:.2f} seconds...")
                time.sleep(sleep_time)
                request_count = 0
                hour_start_time = time.time()
            request_count += 1

            response = requests.get(meas_url, headers=headers, timeout=10)
            if response.status_code == 429:
                # Rate limited: back off and try again. This is the only place
                # 429 is handled; it never reaches raise_for_status().
                print(f"429 error for sensor {sensor_id}. Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_delay *= 2
                continue
            response.raise_for_status()
            meas_data = response.json().get("results") or []
        except requests.exceptions.RequestException as e:
            print(f"Error fetching measurements for sensor {sensor_id}: {e}")
            break

        for month_data in meas_data:
            try:
                period = month_data["period"]
                period_from = period["datetimeFrom"]["utc"]
                period_to = period["datetimeTo"]["utc"]
            except (KeyError, TypeError):
                # One malformed record must not abort the run and discard
                # everything collected so far.
                print(f"Skipping malformed monthly record for sensor {sensor_id}")
                continue
            summary = month_data.get("summary") or {}

            all_data.append({
                "City": city_name,
                "Latitude": latitude,
                "Longitude": longitude,
                "Sensor Parameter": param_name,
                "Unit": unit,
                "Month Start (UTC)": period_from,
                "Month End (UTC)": period_to,
                "Monthly Average": month_data.get("value", "N/A"),
                "Minimum Value": summary.get("min", "N/A"),
                "Maximum Value": summary.get("max", "N/A"),
                "Median Value": summary.get("median", "N/A"),
                "Standard Deviation": summary.get("sd", "N/A")
            })
        tasks_completed += 1
        # Log task completion progress
        if tasks_completed % 5 == 0 or tasks_completed == total_tasks:
            progress = (tasks_completed / total_tasks) * 100 if total_tasks > 0 else 0
            print(f"Task Progress: {tasks_completed}/{total_tasks} tasks completed ({progress:.1f}%)")
        time.sleep(1)  # 1 request per second
        break
    else:
        # Every attempt was answered with 429; report it instead of failing silently.
        print(f"Giving up on sensor {sensor_id} after {max_retries} rate-limited attempts")

def main():
    """Collect monthly measurements for every configured location and save them as CSV.

    Runs in three stages: query each ID in ``locations`` for matching sensors,
    fetch the monthly measurements of each sensor sequentially, then write the
    rows accumulated in ``all_data`` to ``air_quality_monthly_data.csv``.

    If no rows were collected, the export is skipped: an empty frame has no
    "Month Start (UTC)" column, so converting it would raise ``KeyError``.

    Returns:
        None.
    """
    # Fetch location data and collect sensor tasks
    sensor_tasks = []
    for location_id in locations:
        sensor_tasks.extend(fetch_location_data(location_id))

    # Fetch measurements sequentially; each task tuple matches the
    # positional parameters of fetch_measurements.
    for task in sensor_tasks:
        fetch_measurements(*task)

    if not all_data:
        print("No measurements were collected; skipping CSV export.")
        return

    # Convert to Pandas DataFrame
    df = pd.DataFrame(all_data)
    df['Month Start (UTC)'] = pd.to_datetime(df['Month Start (UTC)'])
    df['Month End (UTC)'] = pd.to_datetime(df['Month End (UTC)'])

    # Save to CSV
    df.to_csv("air_quality_monthly_data.csv", index=False)
    print("Monthly data collection complete. Results saved to air_quality_monthly_data.csv")

if __name__ == "__main__":
    # Run the full collection and report how long it took (wall-clock).
    started_at = time.time()
    main()
    elapsed = time.time() - started_at
    print(f"Execution time: {elapsed:.2f} seconds")

Location 7570 (Toronto Downtown) has 2 sensors matching target pollutants
Location 921005 (Downtown Vancouver) has 1 sensors matching target pollutants
Location 277971 (Edmonton Central Eas) has 2 sensors matching target pollutants
Location 1275379 (Ottawa Downtown) has 2 sensors matching target pollutants
Location 8809 (Calgary Central2) has 2 sensors matching target pollutants
Progress: 5/58 locations processed (8.6%) (9 total tasks queued, 0 completed)
Location 8477 (St-Dominique) has 1 sensors matching target pollutants
Location 1572 (Brandon) has 2 sensors matching target pollutants
Location 754 (FREDERICTON) has 2 sensors matching target pollutants
Location 748 (CHARLOTTETOWN) has 2 sensors matching target pollutants
Location 958 (Saskatoon) has 2 sensors matching target pollutants
Progress: 10/58 locations processed (17.2%) (18 total tasks queued, 0 completed)
Location 589 (Regina) has 2 sensors matching target pollutants
Location 1274948 (Thunder Bay) has 2 sensors matching tar

In [11]:
import pandas as pd

# Load data
df = pd.read_csv("air_quality_monthly_data.csv")
df['Month Start (UTC)'] = pd.to_datetime(df['Month Start (UTC)'])

# Filter wildfire seasons (May-Sep, 2018-2024) with a single boolean mask
month_start = df['Month Start (UTC)']
in_wildfire_season = (
    month_start.dt.month.isin([5, 6, 7, 8, 9])
    & month_start.dt.year.between(2018, 2024)
)
wildfire_df = df[in_wildfire_season]

# Pivot to pair PM2.5 and O₃: one row per (city, month), one column per
# pollutant; rows missing either pollutant are dropped.
pivot_df = wildfire_df.pivot_table(index=['City', 'Month Start (UTC)'],
                                   columns='Sensor Parameter',
                                   values='Monthly Average').dropna()

# Calculate overall correlation
correlations = pivot_df.corr(method='pearson').loc['pm2.5', 'o₃']

# Per city and year correlation
# Group by City and year, compute each group's 2x2 correlation matrix, then
# pick the (pm2.5, o₃) entry by label rather than by row/column position.
# The year level is named explicitly so it is not labelled "Month Start (UTC)".
# NOTE: a group holds at most five monthly points, and a group with a single
# point yields NaN, so individual values should be read with caution.
year = pivot_df.index.get_level_values('Month Start (UTC)').year.rename('Year')
city_year_corr = (
    pivot_df.groupby(['City', year])[['pm2.5', 'o₃']]
    .corr()
    .xs('pm2.5', level='Sensor Parameter')['o₃']
)

print("Overall Correlation between PM2.5 and O₃:", correlations)
print("\nCorrelation by City and Year:")
print(city_year_corr)

Overall Correlation between PM2.5 and O₃: 0.09194197860531834

Correlation by City and Year:
City             Month Start (UTC)  Sensor Parameter
Auclair          2021               pm2.5                    NaN
                 2022               pm2.5              -0.180393
                 2023               pm2.5              -0.464529
                 2024               pm2.5              -0.692740
BATHURST         2018               pm2.5              -0.450174
                                                          ...   
Winnipeg_Ellens  2019               pm2.5               0.212509
                 2020               pm2.5               0.399999
                 2021               pm2.5              -0.454287
                 2022               pm2.5              -0.514022
                 2023               pm2.5               0.243698
Name: o₃, Length: 166, dtype: float64
