# Air Quality Dataset Aggregator

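This notebook combines hourly air quality data (`station_hour.csv`) with per-station OSM features (`station_osm_features_unified.csv`): for each station it filters the air quality rows by station ID, adds that station's OSM feature columns, and splits the output into multiple CSV files with a maximum of 10,000 rows each.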

## Section 1: Import Required Libraries

In [None]:
import pandas as pd
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

## Section 2: Load Unified OSM Features

No manual input is required: the station IDs and the values for the new columns are read automatically from `station_osm_features_unified.csv` in the current working directory.

In [None]:
# Read the unified OSM feature table from the current working directory;
# its rows supply the station list and the per-station feature values.
current_directory = os.getcwd()
unified_features_path = os.path.join(current_directory, "station_osm_features_unified.csv")

print("Loading unified OSM features...")
df_unified_features = pd.read_csv(unified_features_path)
loaded_station_count = len(df_unified_features)
print(f"✓ Loaded {loaded_station_count} stations from unified features")

# Columns describing the station itself; every other column counts as a feature.
metadata_cols = [
    'station_id',
    'original_station_id',
    'station_name',
    'latitude',
    'longitude',
    '_total_elements',
    '_unique_feature_types',
]

# Collect the feature columns in their original file order.
feature_columns = []
for column_name in df_unified_features.columns:
    if column_name in metadata_cols:
        continue
    feature_columns.append(column_name)

print(f"\nTotal feature columns: {len(feature_columns)}")
print(f"Feature columns: {feature_columns}")

# Preview: station identity plus the first five feature columns.
print("\nFirst 5 stations (station info + first few features):")
display_cols = ['station_id', 'station_name', *feature_columns[:5]]
station_preview = df_unified_features[display_cols].head()
print(station_preview)

print("\n✓ Ready to process all stations with ALL features automatically!")

## Section 3: Load Station Hour Data

In [None]:
# Read the hourly air quality table from the same directory as the unified features.
station_hour_path = os.path.join(current_directory, "station_hour.csv")
print(f"Loading station_hour.csv from: {station_hour_path}")

station_hour_df = pd.read_csv(station_hour_path)

# Report the size and column names of what was loaded.
hour_shape = station_hour_df.shape
hour_columns = list(station_hour_df.columns)
print(f"✓ Station Hour DataFrame shape: {hour_shape}")
print(f"  Columns: {hour_columns}")

## Section 4: Process All Stations Automatically

This section automatically processes each station from the unified features dataset.

In [None]:
# Work out which stations from the unified features also appear in the hourly data.
print("=" * 70)
print("AUTOMATED STATION PROCESSING")
print("=" * 70)

# The matching below indexes station_hour_df by 'StationId'. If the column is
# missing, report what is available and stop here with a clear error instead of
# printing a warning and then failing on the next statement with a bare KeyError.
if 'StationId' not in station_hour_df.columns:
    print("✗ StationId not found in station_hour.csv")
    print(f"Available columns: {list(station_hour_df.columns)}")
    raise KeyError(
        "station_hour.csv has no 'StationId' column; cannot match stations to air quality data"
    )

print("✓ StationId found in station_hour.csv")

# Get list of stations to process (only those in station_hour data)
available_stations = station_hour_df['StationId'].unique()
stations_to_process = df_unified_features[df_unified_features['station_id'].isin(available_stations)]

print(f"\nTotal stations in unified features: {len(df_unified_features)}")
print(f"Stations with air quality data: {len(stations_to_process)}")
print(f"Stations to process: {len(stations_to_process)}")

# An empty selection is not an error here, but it means the processing loop will do nothing.
if len(stations_to_process) == 0:
    print("\n⚠️ Warning: No stations to process!")
else:
    print(f"\n✓ Ready to process {len(stations_to_process)} stations")

In [None]:
# Distinct station IDs present in the hourly air quality data.
unique_station_ids = station_hour_df['StationId'].unique()

banner_line = "=" * 70
print(banner_line)
print("UNIQUE STATION IDs IN STATION_HOUR.CSV")
print(banner_line)

unique_station_total = len(unique_station_ids)
print(f"Total unique stations: {unique_station_total}")
print("\nStation IDs:")
print(sorted(unique_station_ids))

## Filter Unified Features for Stations with Air Quality Data

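Create a filtered version of the unified features containing only the stations that also have air quality data in `station_hour.csv` (expected to be 110 stations, which is why the output file is named `station_osm_features_filtered_110.csv`).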

In [None]:
# Keep only the unified-feature rows whose station also appears in the hourly data,
# then write that subset to CSV and print a short summary.
print("Filtering unified features for stations with air quality data...")
print("=" * 70)

# Boolean mask: True where the station ID is among the hourly data's station IDs.
has_air_quality = df_unified_features['station_id'].isin(unique_station_ids)
df_unified_filtered = df_unified_features[has_air_quality].copy()

stations_before = len(df_unified_features)
stations_after = len(df_unified_filtered)
print(f"Original unified features: {stations_before} stations")
print(f"Filtered unified features: {stations_after} stations")
print(f"Stations excluded: {stations_before - stations_after} stations")

# Write the subset to a CSV file (relative path, so it lands in the working directory).
output_file = 'station_osm_features_filtered_110.csv'
df_unified_filtered.to_csv(output_file, index=False)

print(f"\n✓ Saved filtered dataset to '{output_file}'")
print(f"  - Rows: {stations_after}")
print(f"  - Columns: {df_unified_filtered.shape[1]}")

# Summary of the filtered dataset.
thin_rule = "-" * 70
print("\n" + thin_rule)
print("FILTERED DATASET SUMMARY")
print(thin_rule)
print("\nFirst 5 stations in filtered dataset:")
display_cols = ['station_id', 'station_name', 'latitude', 'longitude']
filtered_preview = df_unified_filtered[display_cols].head()
print(filtered_preview)

print(f"\nAll {stations_after} filtered station IDs:")
filtered_station_ids = df_unified_filtered['station_id'].tolist()
print(sorted(filtered_station_ids))

## Section 5: Process Each Station with Features

This section loops through each station, filters the data, adds OSM features, and saves to chunks.

In [None]:
# For every station selected earlier: take its hourly rows, attach the station's
# feature values as constant columns, and write the result as CSV chunks of at most
# `chunk_size` rows under output/<station_id>/.
chunk_size = 10000
total_stations = len(stations_to_process)
processed_count = 0      # stations whose chunks were all written
failed_count = 0         # stations skipped (no rows) or aborted by an error
total_files_created = 0  # chunk files written across all stations

print(f"Starting automated processing of {total_stations} stations...")
print("=" * 70)

# `position` counts every station visited, so the progress label keeps advancing
# even when earlier stations were skipped or failed (processed_count would not).
for position, (_, station_row) in enumerate(stations_to_process.iterrows(), start=1):
    station_id = station_row['station_id']
    station_name = station_row['station_name']

    print(f"\n[{position}/{total_stations}] Processing: {station_id} - {station_name}")

    try:
        # Filter station_hour data for this station
        filtered_df = station_hour_df[station_hour_df['StationId'] == station_id].copy()
        total_rows = filtered_df.shape[0]

        if total_rows == 0:
            print(f"  ⚠️ No data found for {station_id}, skipping...")
            failed_count += 1
            continue

        print(f"  Found {total_rows} rows")

        # Add ALL feature columns from the unified dataset; each value is the same
        # for every row of this station.
        for feature_col in feature_columns:
            feature_value = station_row[feature_col]
            if pd.isna(feature_value):
                feature_value = ""  # Empty string for missing features
            filtered_df[feature_col] = feature_value

        print(f"  Added {len(feature_columns)} feature columns")

        # Create output folder for this station. str() keeps os.path.join working
        # when the station IDs were parsed as non-string values (e.g. integers).
        output_folder = os.path.join(current_directory, "output", str(station_id))
        os.makedirs(output_folder, exist_ok=True)

        # Split into consecutive row ranges of at most chunk_size rows and save each one.
        chunk_starts = range(0, total_rows, chunk_size)
        num_chunks = len(chunk_starts)

        for file_number, start_idx in enumerate(chunk_starts, start=1):
            # iloc slicing clips at the end of the frame, so the last chunk may be shorter.
            chunk_df = filtered_df.iloc[start_idx:start_idx + chunk_size]

            filename = f"{station_id}_chunk_{file_number}.csv"
            filepath = os.path.join(output_folder, filename)

            chunk_df.to_csv(filepath, index=False)
            total_files_created += 1

        print(f"  ✓ Saved {num_chunks} chunk file(s) to output/{station_id}/")
        processed_count += 1

    except Exception as e:
        # Best effort: report the failure and carry on with the remaining stations.
        print(f"  ✗ Error processing {station_id}: {e}")
        failed_count += 1

# Final summary
print("\n" + "=" * 70)
print("PROCESSING COMPLETE!")
print("=" * 70)
print(f"Total stations processed: {processed_count}/{total_stations}")
print(f"Failed stations: {failed_count}")
print(f"Total files created: {total_files_created}")
print(f"\n✓ All output files saved in 'output/' folder")

## Summary

This automated notebook:
1. ✓ Loads the unified OSM features dataset
2. ✓ Automatically identifies ALL feature columns (every non-metadata column, not just a fixed set of 6)
3. ✓ Loads station_hour.csv with air quality data
4. ✓ Processes each station automatically:
   - Filters air quality data by station ID
   - Adds ALL OSM feature columns from unified dataset
   - Splits data into chunks of 10,000 rows
   - Saves chunks to `output/[StationID]/` folder
5. ✓ Provides detailed progress and summary statistics

All output files are organized in the `output/` folder by station ID!