In [39]:
import pandas as pd 
import numpy as np

In [40]:
import requests
import pandas as pd

from dotenv import load_dotenv
load_dotenv()


True

In [41]:
# OpenAQ v3 â€” Fetch Delhi PM2.5 data with proper datetime handling
import requests
import pandas as pd
import time
from datetime import datetime
import json
import os
from pathlib import Path

API_KEY = os.getenv("API_KEY")

HEADERS = {"X-API-Key": API_KEY, "Accept": "application/json"}

DATE_FROM = "2024-12-15T00:00:00Z"
DATE_TO   = "2025-02-14T23:59:59Z"

BASE = "https://api.openaq.org/v3"

def get_delhi_locations():
    """Get Delhi locations"""
    url = f"{BASE}/locations"
    r = requests.get(url, headers=HEADERS, params={"limit": 100}, timeout=30)
    r.raise_for_status()
    locations = r.json().get("results", [])
    
    delhi_locs = [
        loc for loc in locations 
        if "delhi" in str(loc.get("name", "")).lower()
    ]
    return delhi_locs

def get_location_sensors(location_id):
    """Get all sensors for a location"""
    url = f"{BASE}/locations/{location_id}/sensors"
    r = requests.get(url, headers=HEADERS, params={"limit": 100}, timeout=30)
    if r.status_code == 200:
        return r.json().get("results", [])
    return []

def fetch_sensor_hours(sensor_id, date_from, date_to):
    """Fetch hourly data for a sensor"""
    url = f"{BASE}/sensors/{sensor_id}/hours"
    all_data = []
    page = 1
    max_pages = 30
    
    while page <= max_pages:
        params = {
            "date_from": date_from,
            "date_to": date_to,
            "limit": 1000,
            "page": page
        }
        
        try:
            r = requests.get(url, headers=HEADERS, params=params, timeout=30)
            if r.status_code == 404:
                break
            r.raise_for_status()
            
            results = r.json().get("results", [])
            
            # Debug: print first result structure
            if page == 1 and results and len(all_data) == 0:
                print(f"      Sample result structure: {json.dumps(results[0], indent=2)[:300]}")
            
            if not results:
                break
            
            all_data.extend(results)
            
            if len(results) < 1000:
                break
            
            page += 1
            time.sleep(0.2)
            
        except Exception as e:
            print(f"      Error: {e}")
            break
    
    return all_data

# ------------------------------
# Main pipeline
# ------------------------------
print("="*70)
print(" OpenAQ v3: Fetching PM2.5 Data for Delhi")
print("="*70)

try:
    # Get Delhi locations
    print("\n1. Fetching Delhi locations...")
    locations = get_delhi_locations()
    print(f"   Found {len(locations)} Delhi locations\n")
    
    all_rows = []
    
    for idx, loc in enumerate(locations, 1):
        loc_id = loc.get("id")
        loc_name = loc.get("name")
        coords = loc.get("coordinates", {})
        lat = coords.get("latitude")
        lon = coords.get("longitude")
        
        print(f"{idx}. {loc_name} (ID: {loc_id})")
        
        # Get sensors for this location
        sensors = get_location_sensors(loc_id)
        
        if not sensors:
            print(f"   - No sensors found")
            continue
        
        # Filter for PM2.5 sensors
        pm25_sensors = [
            s for s in sensors 
            if s.get("parameter", {}).get("name", "").lower() == "pm25"
            or s.get("parameter", {}).get("id") == 2
        ]
        
        if not pm25_sensors:
            print(f"   - Has {len(sensors)} sensors but no PM2.5")
            continue
        
        print(f"   Found {len(pm25_sensors)} PM2.5 sensor(s)")
        
        for sensor in pm25_sensors:
            sensor_id = sensor.get("id")
            sensor_name = sensor.get("name")
            date_last = sensor.get("datetimeLast", {}).get("utc", "unknown")
            
            print(f"   - Sensor {sensor_id}: {sensor_name}")
            print(f"     Last data: {date_last}")
            
            # Check if sensor has data in our date range
            if date_last != "unknown":
                last_date = datetime.fromisoformat(date_last.replace("Z", "+00:00"))
                target_start = datetime.fromisoformat(DATE_FROM.replace("Z", "+00:00"))
                
                if last_date < target_start:
                    print(f"     âš  Sensor data too old, skipping")
                    continue
            
            # Fetch hourly data
            print(f"     Fetching hourly data...")
            hours = fetch_sensor_hours(sensor_id, DATE_FROM, DATE_TO)
            
            if hours:
                print(f"     âœ“ Got {len(hours)} hours")
                
                for h in hours:
                    # Extract datetime - try different field names
                    dt_utc = None
                    if "datetime" in h:
                        dt = h["datetime"]
                        if isinstance(dt, dict):
                            dt_utc = dt.get("utc")
                        else:
                            dt_utc = dt
                    elif "period" in h:
                        period = h["period"]
                        if isinstance(period, dict):
                            dt_utc = period.get("datetimeFrom", {}).get("utc") if isinstance(period.get("datetimeFrom"), dict) else period.get("datetimeFrom")
                    
                    all_rows.append({
                        "location_id": loc_id,
                        "location_name": loc_name,
                        "latitude": lat,
                        "longitude": lon,
                        "sensor_id": sensor_id,
                        "datetime_utc": dt_utc,
                        "pm25": h.get("value"),
                        "unit": "Âµg/mÂ³",
                        "coverage_percent": h.get("coverage", {}).get("percentComplete") if isinstance(h.get("coverage"), dict) else None
                    })
            else:
                print("     âœ— No data in date range")
        
        time.sleep(0.3)
    
    print("\n" + "="*70)
    
    if all_rows:
        df = pd.DataFrame(all_rows)
        
        # Parse datetime
        df["datetime_utc"] = pd.to_datetime(df["datetime_utc"], utc=True, errors="coerce")
        
        # Remove rows with missing data
        df = df.dropna(subset=["pm25"])
        
        # Sort
        df = df.sort_values(["location_id", "sensor_id", "datetime_utc"]).reset_index(drop=True)
        
        # Create data folder if it doesn't exist
        data_folder = Path("data")
        data_folder.mkdir(exist_ok=True)
        
        # Save
        out_file = data_folder / "delhi_pm25_openaq_v3_final.csv"
        df.to_csv(out_file, index=False)
        
        print(f"\nSUCCESS! Saved {len(df)} PM2.5 records to '{out_file}'")
        print("="*70)
        print(f"\nData Summary:")
        print(f"   Shape: {df.shape}")
        print(f"   Date range: {df['datetime_utc'].min()} to {df['datetime_utc'].max()}")
        print(f"   Unique locations: {df['location_id'].nunique()}")
        print(f"   Unique sensors: {df['sensor_id'].nunique()}")
        
        print(f"\nPM2.5 Statistics (Âµg/mÂ³):")
        stats = df['pm25'].describe()
        for stat_name, value in stats.items():
            print(f"   {stat_name:8s}: {value:,.2f}")
        
        print(f"\nSample Data (first 10 rows):")
        print(df.head(10)[['location_name', 'datetime_utc', 'pm25']].to_string())
        
        print(f"\nFull dataset saved to: {out_file}")
        
    else:
        print("\nNo PM2.5 data found for the specified date range")
        print(f"\nDate range requested: {DATE_FROM} to {DATE_TO}")
        
except Exception as e:
    print(f"\nError: {e}")
    import traceback
    traceback.print_exc()

 OpenAQ v3: Fetching PM2.5 Data for Delhi

1. Fetching Delhi locations...
   Found 4 Delhi locations

1. Delhi Technological University, Delhi - CPCB (ID: 13)
   Found 1 PM2.5 sensor(s)
   - Sensor 13864: pm25 Âµg/mÂ³
     Last data: 2018-02-22T04:00:00Z
     âš  Sensor data too old, skipping
2. R K Puram, Delhi - DPCC (ID: 17)
   Found 2 PM2.5 sensor(s)
   - Sensor 35: pm25 Âµg/mÂ³
     Last data: 2018-02-21T21:15:00Z
     âš  Sensor data too old, skipping
   - Sensor 12234787: pm25 Âµg/mÂ³
     Last data: 2026-02-14T08:30:00Z
     Fetching hourly data...
      Sample result structure: {
  "value": 110.0,
  "flagInfo": {
    "hasFlags": false
  },
  "parameter": {
    "id": 2,
    "name": "pm25",
    "units": "\u00b5g/m\u00b3",
    "displayName": null
  },
  "period": {
    "label": "1hour",
    "interval": "01:00:00",
    "datetimeFrom": {
      "utc": "2025-02-18T19:30:00Z",
   
     âœ“ Got 7762 hours
3. Punjabi Bagh, Delhi - DPCC (ID: 50)
   Found 2 PM2.5 sensor(s)
   - Sensor 396

In [42]:
pm25df = pd.read_csv("data/delhi_pm25_openaq_v3_final.csv")
pm25df.head(20)

Unnamed: 0,location_id,location_name,latitude,longitude,sensor_id,datetime_utc,pm25,unit,coverage_percent
0,17,"R K Puram, Delhi - DPCC",28.563262,77.186937,12234787,2025-02-18 19:30:00+00:00,110.0,Âµg/mÂ³,50.0
1,17,"R K Puram, Delhi - DPCC",28.563262,77.186937,12234787,2025-02-18 20:30:00+00:00,94.8,Âµg/mÂ³,100.0
2,17,"R K Puram, Delhi - DPCC",28.563262,77.186937,12234787,2025-02-18 21:30:00+00:00,94.5,Âµg/mÂ³,100.0
3,17,"R K Puram, Delhi - DPCC",28.563262,77.186937,12234787,2025-02-18 22:30:00+00:00,102.0,Âµg/mÂ³,100.0
4,17,"R K Puram, Delhi - DPCC",28.563262,77.186937,12234787,2025-02-18 23:30:00+00:00,74.0,Âµg/mÂ³,100.0
5,17,"R K Puram, Delhi - DPCC",28.563262,77.186937,12234787,2025-02-19 00:30:00+00:00,97.8,Âµg/mÂ³,100.0
6,17,"R K Puram, Delhi - DPCC",28.563262,77.186937,12234787,2025-02-19 01:30:00+00:00,103.0,Âµg/mÂ³,100.0
7,17,"R K Puram, Delhi - DPCC",28.563262,77.186937,12234787,2025-02-19 02:30:00+00:00,97.5,Âµg/mÂ³,100.0
8,17,"R K Puram, Delhi - DPCC",28.563262,77.186937,12234787,2025-02-19 03:30:00+00:00,99.8,Âµg/mÂ³,100.0
9,17,"R K Puram, Delhi - DPCC",28.563262,77.186937,12234787,2025-02-19 04:30:00+00:00,80.5,Âµg/mÂ³,100.0
