## Library Installation
Before running the notebook, ensure that the required libraries (pandas, requests, geopy, and overpy) are installed. You can install them by running the installation cell, which is the first code cell below.

# OSM Features Extraction for Air Quality Data
This notebook extracts OpenStreetMap features based on latitude and longitude for air quality data labeling.

## Import Required Libraries
Import pandas, requests, json, geopy's `Nominatim` geocoder, and overpy, then create the Overpass API client used by the feature-fetching function.

In [1]:
# Library Installation
!pip install pandas requests geopy overpy

Collecting pandas
  Downloading pandas-3.0.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (79 kB)
Collecting requests
  Using cached requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting geopy
  Using cached geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting overpy
  Downloading overpy-0.7-py3-none-any.whl.metadata (3.5 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.4.1-cp313-cp313-macosx_14_0_arm64.whl.metadata (6.6 kB)
Collecting charset_normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.4-cp313-cp313-macosx_10_13_universal2.whl.metadata (37 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.6.3-py3-none-any.whl.metadata (6.9 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2026.1.4-py3-none-any.whl.metadata (2.5 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Using cached geographiclib

In [2]:
# Import Required Libraries
import pandas as pd
import requests
import json
from geopy.geocoders import Nominatim
import overpy

# Initialize Overpass API
# Shared overpy client: fetch_osm_features() sends its queries through `api`.
api = overpy.Overpass()

## Load Station Locations from CSV
Load the station locations from the provided CSV file using pandas.

In [3]:
# Load Station Locations from CSV
df_stations = pd.read_csv('station_locations.csv')

# Fail fast if a column that later cells index by name is absent, instead of
# hitting a KeyError part-way through a series of network requests.
required_columns = {'station_id', 'station_name', 'latitude', 'longitude'}
missing_columns = required_columns - set(df_stations.columns)
if missing_columns:
    raise ValueError(
        f"station_locations.csv is missing required columns: {sorted(missing_columns)}"
    )

print(df_stations.head())

                                      station_name station_id   latitude  \
0                SIDCO Kurichi, Coimbatore - TNPCB  site_5094  10.942451   
1                    Urban, Chamarajanagar - KSPCB  site_5124  11.553580   
2                    MD University, Rohtak - HSPCB   site_147  28.521230   
3  IESD Banaras Hindu University, Varanasi - UPPCB  site_5468  25.262326   
4                           Sirifort, Delhi - CPCB   site_119  28.550425   

   longitude  city  state  
0  76.978996   NaN    NaN  
1  76.555210   NaN    NaN  
2  76.371380   NaN    NaN  
3  82.995408   NaN    NaN  
4  77.215938   NaN    NaN  


## Define Function to Fetch OSM Features
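Create a function that takes a latitude and longitude as input and uses the Overpass API to fetch the tags of OSM nodes within 500 m that carry a relevant feature key (for example `amenity`, `highway`, or `landuse`).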

In [None]:
# Define Function to Fetch OSM Features

def fetch_osm_features(lat, lon, radius=500, client=None):
    """Return the tags of nearby OSM nodes that carry a relevant feature key.

    Only nodes are queried (no ways or relations). A node is kept when its
    tags contain at least one of the keys listed in ``relevant_keys``.

    Args:
        lat: Latitude of the search centre; interpolated into the query as-is.
        lon: Longitude of the search centre; interpolated into the query as-is.
        radius: Radius for the Overpass ``around`` filter (metres). Defaults
            to 500, the value that used to be hard-coded.
        client: Object exposing ``query(str)`` whose result has a ``nodes``
            iterable of objects with a ``tags`` mapping. Defaults to the
            notebook-level ``api`` client.

    Returns:
        list: The ``tags`` mapping of every matching node, in result order.
    """
    relevant_keys = ('aerialway', 'amenity', 'building', 'highway',
                     'landuse', 'natural', 'shop')

    # Resolve the global lazily so an explicit client never needs `api` to exist.
    overpass = api if client is None else client

    query = f'[out:json];(node(around:{radius},{lat},{lon}););out;'
    result = overpass.query(query)

    return [
        node.tags
        for node in result.nodes
        if any(key in node.tags for key in relevant_keys)
    ]

## Map Latitude and Longitude to OSM Features
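Query the Overpass API for a small bounding box (roughly 500 m in each direction) around the first station and inspect the response to confirm the request works. Iterating through all station locations to map each one to its OSM features is done in the batch-processing section further down.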

In [13]:
import requests
import time

# Example: bounding box around the first station (small area, ~500m buffer)
lat = float(df_stations.iloc[0]['latitude'])
lon = float(df_stations.iloc[0]['longitude'])
delta = 0.005  # half-width of the box in degrees (~500m)

bbox = f"{lat-delta},{lon-delta},{lat+delta},{lon+delta}"

# Fetch every node, way and relation in the box ("out body"), then recurse
# down (">") so the member nodes of the ways/relations are included as well.
query = f"""
[bbox:{bbox}]
[out:json]
[timeout:90];
(
  node({lat-delta},{lon-delta},{lat+delta},{lon+delta});
  way({lat-delta},{lon-delta},{lat+delta},{lon+delta});
  relation({lat-delta},{lon-delta},{lat+delta},{lon+delta});
);
out body;
>;
out skel qt;
"""

url = "https://overpass-api.de/api/interpreter"

print(f"Querying Overpass API for station: {df_stations.iloc[0]['station_name']}")
print(f"Location: {lat}, {lon}")
print(f"Bounding box: {bbox}\n")

# Start from an empty result so a failed request cannot leave a stale `data`
# from an earlier run for the following cells to pick up.
data = {}

# Add error handling
try:
    # Client-side timeout is longer than the 90 s server-side query timeout.
    response = requests.post(url, data={'data': query}, timeout=120)

    print(f"Response status code: {response.status_code}")
    print(f"Response content type: {response.headers.get('Content-Type', 'Unknown')}")

    # Check if response is successful
    if response.status_code == 200:
        # Print first 500 chars of response to debug
        print(f"Response preview (first 500 chars):\n{response.text[:500]}\n")

        # Try to parse JSON
        data = response.json()

        # Print a summary of the result
        print("✓ Successfully received data!")
        print(f"Number of elements: {len(data.get('elements', []))}")

        if data.get('elements'):
            print(f"\nFirst 3 elements:")
            for i, elem in enumerate(data['elements'][:3]):
                print(f"\nElement {i+1}:")
                print(f"  Type: {elem.get('type')}")
                print(f"  ID: {elem.get('id')}")
                print(f"  Tags: {elem.get('tags', {})}")
    elif response.status_code == 429:
        print("⚠ Rate limited! Too many requests. Wait a moment and try again.")
    elif response.status_code == 504:
        print("⚠ Gateway timeout! The query took too long. Try reducing the search area.")
    else:
        print(f"⚠ Error: {response.status_code}")
        print(f"Response text:\n{response.text[:1000]}")

except requests.exceptions.Timeout:
    print("⚠ Request timed out! The server took too long to respond.")
except json.JSONDecodeError as e:
    # Must come before RequestException: in recent versions of requests the
    # error raised by response.json() is also a RequestException, so the
    # generic handler would otherwise swallow it and hide the response body.
    print(f"⚠ Failed to parse JSON response: {e}")
    print(f"Response text:\n{response.text[:1000]}")
except requests.exceptions.RequestException as e:
    print(f"⚠ Request failed: {e}")

Querying Overpass API for station: SIDCO Kurichi, Coimbatore - TNPCB
Location: 10.942451, 76.978996
Bounding box: 10.937451,76.973996,10.947451000000001,76.98399599999999

Response status code: 200
Response content type: application/json
Response preview (first 500 chars):
{
  "version": 0.6,
  "generator": "Overpass API 0.7.62.10 2d4cfc48",
  "osm3s": {
    "timestamp_osm_base": "2026-01-28T11:07:55Z",
    "copyright": "The data included in this document is from www.openstreetmap.org. The data is made available under ODbL."
  },
  "elements": [

{
  "type": "node",
  "id": 266585747,
  "lat": 10.9391913,
  "lon": 76.9810669
},
{
  "type": "node",
  "id": 266585748,
  "lat": 10.9392998,
  "lon": 76.9799146
},
{
  "type": "node",
  "id": 1423799063,
  "lat": 10.937

✓ Successfully received data!
Number of elements: 22953

First 3 elements:

Element 1:
  Type: node
  ID: 266585747
  Tags: {}

Element 2:
  Type: node
  ID: 266585748
  Tags: {}

Element 3:
  Type: node
  ID: 1423799063
  T

## Label Mapping for Air Quality Data
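Create a mapping of the fetched OSM features to labels relevant for air quality data: for each primary OSM feature key (for example `highway`, `landuse`, or `amenity`), collect the distinct tag values found around the station and join them into a comma-separated label string.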

In [22]:
# Extract OSM Feature Labels for ML Training
def extract_osm_feature_labels(osm_data):
    """
    Extract OSM features as LABELS (not counts).

    Every tag whose key is a primary OSM feature key contributes its value to
    that key's label set. Generic values ('yes', 'no', 'unknown', '') are
    ignored, and the characters ' ', ':', '-', ',', '/', '.' inside a value
    are replaced by '_' so values can be joined with commas unambiguously.

    Example labels:
    - highway: "residential,secondary,trunk"
    - landuse: "commercial,industrial"
    - amenity: "fuel,hospital,parking"

    Args:
        osm_data: Parsed Overpass JSON response, i.e. a dict whose 'elements'
            entry is a list of dicts that may each hold a 'tags' dict.
            ``None``, or a missing/empty 'elements' entry, is treated as
            "no elements"; an element whose 'tags' is missing or ``None``
            contributes nothing.

    Returns:
        tuple: ``(feature_labels, feature_values)`` where

        - feature_labels (dict): ``{key: "value1,value2"}`` with the values
          sorted, plus the metadata entries '_total_elements' (number of
          elements seen) and '_unique_feature_types' (number of feature keys
          that produced at least one label).
        - feature_values (defaultdict of set): the same labels per key as
          sets, without the metadata entries.
    """
    from collections import defaultdict

    # PRIMARY feature keys from OSM Map Features
    primary_feature_keys = {
        'aerialway', 'aeroway', 'highway', 'railway', 'public_transport',
        'landuse', 'natural', 'leisure', 'place', 'building',
        'amenity', 'shop', 'tourism', 'office',
        'man_made', 'power', 'craft', 'industrial',
        'emergency', 'healthcare',
        'waterway', 'water', 'geological',
        'barrier', 'boundary', 'historic', 'military', 'sport'
    }

    # Generic/non-informative values; built once rather than once per tag.
    skip_values = ('yes', 'no', 'unknown', '')

    # Single-pass replacement of every separator-like character with '_'.
    sanitize = str.maketrans({ch: '_' for ch in ' :-,/.'})

    elements = (osm_data or {}).get('elements') or []
    total_elements = len(elements)

    # Unique values per feature key (the set removes duplicates).
    feature_values = defaultdict(set)
    for element in elements:
        tags = element.get('tags') or {}
        for key, value in tags.items():
            if key not in primary_feature_keys or value in skip_values:
                continue
            feature_values[key].add(str(value).translate(sanitize))

    # Sorted so the label string is deterministic across runs.
    feature_labels = {
        key: ','.join(sorted(values))
        for key, values in feature_values.items()
    }

    # Add metadata
    feature_labels['_total_elements'] = total_elements
    feature_labels['_unique_feature_types'] = len(feature_values)

    return feature_labels, feature_values


# Run the label extraction on the response fetched for the first station,
# report what was found, and build a one-row DataFrame for that station.
if 'data' not in locals() or not data.get('elements'):
    # Nothing was fetched, or the response contained no elements.
    print("⚠ No OSM data available. Please run the previous cell to fetch data first.")
else:
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    print("Extracting OSM feature labels...\n")
    feature_labels, feature_values_dict = extract_osm_feature_labels(data)

    # Headline numbers
    print(heavy_rule)
    print("OSM FEATURE LABELS EXTRACTION RESULTS")
    print(heavy_rule)
    print(f"Total OSM elements found: {feature_labels.get('_total_elements', 0)}")
    print(f"Unique feature types: {feature_labels.get('_unique_feature_types', 0)}")

    print("\n" + light_rule)
    print("EXTRACTED FEATURE LABELS BY CATEGORY")
    print(light_rule)

    # One section per feature key, in alphabetical order
    for key in sorted(feature_values_dict):
        values = feature_labels[key]
        value_list = values.split(',')
        print(f"\n{key.upper()}:")
        print(f"  Labels: {values}")
        print(f"  Count of unique values: {len(value_list)}")

    # One-row frame: the station's identifying columns go first, in this order
    first_station = df_stations.iloc[0]
    feature_df = pd.DataFrame([feature_labels])
    for position, column in enumerate(['station_id', 'station_name', 'latitude', 'longitude']):
        feature_df.insert(position, column, first_station[column])

    print("\n" + heavy_rule)
    print("FEATURE DATAFRAME CREATED")
    print(heavy_rule)
    print(f"Shape: {feature_df.shape}")
    print(f"Columns: {len(feature_df.columns)}")
    print(f"\nColumns: {list(feature_df.columns)}")

    # Preview the row, leaving out the underscore-prefixed metadata columns
    print("\n" + light_rule)
    print("SAMPLE DATA (First Row)")
    print(light_rule)
    for col in [name for name in feature_df.columns if not name.startswith('_')]:
        print(f"{col:20s}: {feature_df[col].iloc[0]}")

    print("\n✓ Feature label extraction complete!")
    print("  Each column contains comma-separated labels for that feature type.")
    print("  This labeled data can be used for air quality prediction!")

Extracting OSM feature labels...

OSM FEATURE LABELS EXTRACTION RESULTS
Total OSM elements found: 22953
Unique feature types: 6

----------------------------------------------------------------------
EXTRACTED FEATURE LABELS BY CATEGORY
----------------------------------------------------------------------

AMENITY:
  Labels: college,fuel
  Count of unique values: 2

BARRIER:
  Labels: gate
  Count of unique values: 1

BOUNDARY:
  Labels: administrative
  Count of unique values: 1

HIGHWAY:
  Labels: residential,service,tertiary,trunk,unclassified
  Count of unique values: 5

LANDUSE:
  Labels: industrial
  Count of unique values: 1

RAILWAY:
  Labels: rail
  Count of unique values: 1

FEATURE DATAFRAME CREATED
Shape: (1, 12)
Columns: 12

Columns: ['station_id', 'station_name', 'latitude', 'longitude', 'barrier', 'highway', 'railway', 'landuse', 'amenity', 'boundary', '_total_elements', '_unique_feature_types']

----------------------------------------------------------------------
SAM

## Save Mapped Features to CSV
Save the mapped features and labels to a new CSV file for further analysis.

In [23]:
# Write the single-station feature frame to disk, if it has been built.
if 'feature_df' not in locals():
    print("⚠ No feature data to save. Please run the feature extraction cell first.")
else:
    output_file = 'station_osm_features.csv'
    feature_df.to_csv(output_file, index=False)

    # Confirmation plus the size of what was written
    print(
        f"✓ Feature data saved to '{output_file}'",
        f"  - Rows: {len(feature_df)}",
        f"  - Columns: {len(feature_df.columns)}",
        sep="\n",
    )
    print(f"\nThis CSV can be used for ML model training to predict air quality based on location features!")

✓ Feature data saved to 'station_osm_features.csv'
  - Rows: 1
  - Columns: 12

This CSV can be used for ML model training to predict air quality based on location features!


## Batch Process Multiple Stations
Process multiple stations with automatic retry and rate limiting (5 second delay between requests).

In [26]:
# Batch Process Multiple Stations
import time

# ========== CONFIGURATION ==========
NUM_STATIONS = 25  # Change this to process more stations
DELAY_SECONDS = 5  # Delay between requests to avoid rate limiting
RETRY_DELAY = 15  # Delay after a failed request before retrying
# ===================================

# Request settings that do not change from station to station.
delta = 0.005  # half-width of the bounding box, in degrees
url = "https://overpass-api.de/api/interpreter"
max_retries = 5

# Never process (or report progress against) more stations than the CSV holds.
n_to_process = min(NUM_STATIONS, len(df_stations))

print(f"Starting batch processing for {n_to_process} stations...")
print(f"Delay between requests: {DELAY_SECONDS} seconds")
print("=" * 70)

# Storage for all station features
all_features = []
failed_stations = []

for idx in range(n_to_process):
    station = df_stations.iloc[idx]
    station_id = station['station_id']
    station_name = station['station_name']
    lat = float(station['latitude'])
    lon = float(station['longitude'])

    print(f"\n[{idx+1}/{n_to_process}] Processing: {station_name}")
    print(f"  Location: ({lat}, {lon})")

    # Build Overpass query
    bbox = f"{lat-delta},{lon-delta},{lat+delta},{lon+delta}"

    query = f"""
    [bbox:{bbox}]
    [out:json]
    [timeout:90];
    (
      node({lat-delta},{lon-delta},{lat+delta},{lon+delta});
      way({lat-delta},{lon-delta},{lat+delta},{lon+delta});
      relation({lat-delta},{lon-delta},{lat+delta},{lon+delta});
    );
    out body;
    >;
    out skel qt;
    """

    # Try to fetch data with retry logic
    success = False
    for attempt in range(1, max_retries + 1):
        try:
            print(f"  Querying Overpass API... (attempt {attempt}/{max_retries})")
            response = requests.post(url, data={'data': query}, timeout=120)

            if response.status_code == 200:
                data = response.json()
                elements_count = len(data.get('elements', []))
                print(f"  ✓ Success! Received {elements_count} elements")

                # Extract feature labels
                feature_labels, _ = extract_osm_feature_labels(data)

                # Add station metadata
                feature_labels['station_id'] = station_id
                feature_labels['station_name'] = station_name
                feature_labels['latitude'] = lat
                feature_labels['longitude'] = lon

                all_features.append(feature_labels)
                success = True
                break

            if response.status_code == 429:
                problem = "Rate limited!"
            elif response.status_code == 504:
                problem = "Gateway timeout!"
            else:
                problem = f"Error {response.status_code}: {response.text[:200]}"

        except requests.exceptions.Timeout:
            problem = "Request timed out!"

        except Exception as e:
            # Deliberately broad (best effort): any other failure counts as a
            # failed attempt so one bad station cannot abort the whole batch.
            problem = f"Error: {e}"

        # Back off before every retry, whatever the failure was, so the server
        # is not hit again immediately; no pointless wait after the last try.
        if attempt < max_retries:
            print(f"  ⚠ {problem} Waiting {RETRY_DELAY} seconds...")
            time.sleep(RETRY_DELAY)
        else:
            print(f"  ⚠ {problem}")

    if not success:
        print(f"  ✗ Failed after {max_retries} attempts")
        failed_stations.append({'station_id': station_id, 'station_name': station_name})

    # Wait before next request (except for the last one actually processed)
    if idx < n_to_process - 1:
        print(f"  Waiting {DELAY_SECONDS} seconds before next request...")
        time.sleep(DELAY_SECONDS)

# Create final DataFrame
print("\n" + "=" * 70)
print("BATCH PROCESSING COMPLETE")
print("=" * 70)
print(f"Successfully processed: {len(all_features)} stations")
print(f"Failed: {len(failed_stations)} stations")

if all_features:
    # Create DataFrame with all features; a feature key absent for a station
    # becomes a missing value in that station's row.
    batch_df = pd.DataFrame(all_features)

    # Reorder columns: station info first, then features
    info_cols = ['station_id', 'station_name', 'latitude', 'longitude']
    feature_cols = [col for col in batch_df.columns if col not in info_cols]
    batch_df = batch_df[info_cols + sorted(feature_cols)]

    print(f"\nDataFrame shape: {batch_df.shape}")
    print(f"Columns: {len(batch_df.columns)}")

    # Save to CSV (file name carries the configured batch size)
    output_file = f'station_osm_features_batch_{NUM_STATIONS}.csv'
    batch_df.to_csv(output_file, index=False)
    print(f"\n✓ Saved to '{output_file}'")

    # Display summary
    print("\n" + "-" * 70)
    print("SUMMARY OF EXTRACTED FEATURES")
    print("-" * 70)
    print(batch_df.head())

else:
    print("\n⚠ No data was successfully extracted!")

if failed_stations:
    print("\n" + "-" * 70)
    print("FAILED STATIONS")
    print("-" * 70)
    for failed in failed_stations:
        print(f"  - {failed['station_name']} ({failed['station_id']})")

Starting batch processing for 25 stations...
Delay between requests: 5 seconds

[1/25] Processing: SIDCO Kurichi, Coimbatore - TNPCB
  Location: (10.942451, 76.978996)
  Querying Overpass API... (attempt 1/5)
  ⚠ Gateway timeout! Waiting 15 seconds...
  Querying Overpass API... (attempt 2/5)
  ✓ Success! Received 22953 elements
  Waiting 5 seconds before next request...

[2/25] Processing: Urban, Chamarajanagar - KSPCB
  Location: (11.55358, 76.55521)
  Querying Overpass API... (attempt 1/5)
  ✓ Success! Received 13203 elements
  Waiting 5 seconds before next request...

[3/25] Processing: MD University, Rohtak - HSPCB
  Location: (28.52123, 76.37138)
  Querying Overpass API... (attempt 1/5)
  ⚠ Gateway timeout! Waiting 15 seconds...
  Querying Overpass API... (attempt 2/5)
  ✓ Success! Received 905 elements
  Waiting 5 seconds before next request...

[4/25] Processing: IESD Banaras Hindu University, Varanasi - UPPCB
  Location: (25.262326, 82.995408)
  Querying Overpass API... (attemp