In [1]:
# Set up your environment according to the repo's environment.yml file or run the following...
# Comment these out, once installed or otherwise not needed.
# This creates an empty pip_requirements.txt file used to suppress 'already satisfied' output.
import os
with open('pip_requirements.txt', mode='a'): pass
%pip install boto3        -r pip_requirements.txt | grep -v 'already satisfied'
%pip install pandas       -r pip_requirements.txt | grep -v 'already satisfied'
%pip install numpy        -r pip_requirements.txt | grep -v 'already satisfied'
%pip install requests     -r pip_requirements.txt | grep -v 'already satisfied'
%pip install ipywidgets   -r pip_requirements.txt | grep -v 'already satisfied'
%pip install scikit-learn -r pip_requirements.txt | grep -v 'already satisfied'
%pip install autogluon    -r pip_requirements.txt | grep -v 'already satisfied'
%pip install matplotlib   -r pip_requirements.txt | grep -v 'already satisfied'
%pip install nbconvert    -r pip_requirements.txt | grep -v 'already satisfied'
%pip install python-dotenv    -r pip_requirements.txt | grep -v 'already satisfied'


Collecting charset-normalizer<4,>=2
  Using cached charset_normalizer-3.4.2-cp39-cp39-macosx_10_9_universal2.whl (201 kB)
Collecting idna<4,>=2.5
  Using cached idna-3.10-py3-none-any.whl (70 kB)
Installing collected packages: idna, charset-normalizer, certifi, requests
Successfully installed certifi-2025.4.26 charset-normalizer-3.4.2 idna-3.10 requests-2.32.3
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl (139 kB)
Collecting jupyterlab_widgets~=3.0.15
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl (216 kB)
Collecting widgetsnbextension~=4.0.14
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl (2.2 MB)
Installing collected packages: widgetsnbextension, jupyterlab-widgets, i

In [2]:
# Import statements for packages used...
import os, glob, shutil, sys, requests, json
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets

from botocore import UNSIGNED
from botocore.config import Config
from io import StringIO
from datetime import datetime
from types import SimpleNamespace
from IPython.display import clear_output
from dotenv import load_dotenv
import os

load_dotenv()  # This loads variables from .env into os.environ

# Now you can access them
api_key = os.getenv('OPENAQ_API_KEY')

# The following is required for matplotlib plots to display in some envs...
%matplotlib inline

In [3]:
# class AQParam => Used to define attributes for the (6) main OpenAQ parameters.
class AQParam:
    def __init__(self, id, name, unit, unhealthyThresholdDefault, desc):
        self.id                        = id
        self.name                      = name
        self.unit                      = unit
        self.unhealthyThresholdDefault = unhealthyThresholdDefault
        self.desc                      = desc
    
    def isValid(self):
        if(self is not None and self.id > 0 and self.unhealthyThresholdDefault > 0.0 and 
           len(self.name) > 0 and len(self.unit) > 0 and len(self.desc) > 0):
            return True
        else:
            return False
            
    def toJSON(self):
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=2)

# class AQScenario => Defines an ML scenario including a Location w/ NOAA Weather Station ID 
#                     and the target OpenAQ Param.
# Note: OpenAQ data mostly begins sometime in 2016, so using that as a default yearStart value.
class AQScenario:
    def __init__(self, location=None, noaaStationID=None, aqParamTarget=None, unhealthyThreshold=None, 
                 yearStart=2016, yearEnd=2024, aqRadiusMiles=10, featureColumnsToDrop=None):
        self.location           = location
        self.name               = location + "_" + aqParamTarget.name
        self.noaaStationID      = noaaStationID
        self.noaaStationLat     = 0.0
        self.noaaStationLng     = 0.0
        self.openAqSensorIDs       = []
        
        self.aqParamTarget      = aqParamTarget
        
        if unhealthyThreshold and unhealthyThreshold > 0.0:
            self.unhealthyThreshold = unhealthyThreshold
        else:
            self.unhealthyThreshold = self.aqParamTarget.unhealthyThresholdDefault
        
        self.yearStart          = yearStart
        self.yearEnd            = yearEnd
        self.aqRadiusMiles      = aqRadiusMiles
        self.aqRadiusMeters     = aqRadiusMiles * 1610 # Rough integer approximation is fine here.
        
        self.modelFolder        = "AutogluonModels"
            
    def getSummary(self):
        return f"Scenario: {self.name} => {self.aqParamTarget.desc} ({self.aqParamTarget.name}) with UnhealthyThreshold > {self.unhealthyThreshold} {self.aqParamTarget.unit}"
    
    def getModelPath(self):
        return f"{self.modelFolder}/aq_{self.name}_{self.yearStart}-{self.yearEnd}/"
    
    def updateNoaaStationLatLng(self, noaagsod_df_row):
        # Use a NOAA row to set Lat+Lng values used for the OpenAQ API requests...
        if(noaagsod_df_row is not None and noaagsod_df_row['LATITUDE'] and noaagsod_df_row['LONGITUDE']):
            self.noaaStationLat = noaagsod_df_row['LATITUDE']
            self.noaaStationLng = noaagsod_df_row['LONGITUDE']
            print(f"NOAA Station Lat,Lng Updated for Scenario: {self.name} => {self.noaaStationLat},{self.noaaStationLng}")
        else:
            print("NOAA Station Lat,Lng COULD NOT BE UPDATED.")
    
    def isValid(self):
        if(self is not None and self.aqParamTarget is not None and
           self.yearStart > 0 and self.yearEnd > 0 and self.yearEnd >= self.yearStart and 
           self.aqRadiusMiles > 0 and self.aqRadiusMeters > 0 and self.unhealthyThreshold > 0.0 and 
           len(self.name) > 0 and len(self.noaaStationID) > 0):
            return True
        else:
            return False
            
    def toJSON(self):
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=2)

# class AQbyWeatherApp => Main app class with settings, AQParams, AQScenarios, and data access methods...
class AQbyWeatherApp:
    def __init__(self, mlTargetLabel='isUnhealthy', mlEvalMetric='accuracy', mlTimeLimitSecs=None):
        self.mlTargetLabel   = mlTargetLabel
        self.mlEvalMetric    = mlEvalMetric
        self.mlTimeLimitSecs = mlTimeLimitSecs
        self.mlIgnoreColumns = ['DATE','NAME','LATITUDE','LONGITUDE','day','avg']
        
        self.defaultColumnsNOAA   = ['DATE','NAME','LATITUDE','LONGITUDE',
                                     'DEWP','WDSP','MAX','MIN','PRCP','MONTH'] # Default relevant NOAA columns
        # self.defaultColumnsOpenAQ = ['summary']       # Default relevant OpenAQ columns
        
        self.aqParams    = {} # A list to save AQParam objects
        self.aqScenarios = {} # A list to save AQScenario objects
        
        self.selectedScenario = None
    
    def addAQParam(self, aqParam):
        if aqParam and aqParam.isValid():
            self.aqParams[aqParam.name] = aqParam
            return True
        else:
            return False
    
    def addAQScenario(self, aqScenario):
        if aqScenario and aqScenario.isValid():
            self.aqScenarios[aqScenario.name] = aqScenario
            if(self.selectedScenario is None):
                self.selectedScenario = self.aqScenarios[next(iter(self.aqScenarios))] # Default selectedScenario to 1st item.
            return True
        else:
            return False
    
    def getFilenameNOAA(self):
        if self and self.selectedScenario and self.selectedScenario.isValid():
            return f"dataNOAA_{self.selectedScenario.name}_{self.selectedScenario.yearStart}-{self.selectedScenario.yearEnd}_{self.selectedScenario.noaaStationID}.csv"
        else:
            return ""
    
    def getFilenameOpenAQ(self):
        if self and self.selectedScenario and self.selectedScenario.isValid() and len(self.selectedScenario.openAqSensorIDs) > 0:
            idString = ""
            for i in range(0, len(self.selectedScenario.openAqSensorIDs)):
                idString = idString + str(self.selectedScenario.openAqSensorIDs[i]) + "-"
            idString = idString[:-1]
            return f"dataOpenAQ_{self.selectedScenario.name}_{self.selectedScenario.yearStart}-{self.selectedScenario.yearEnd}_{idString}.csv"
        else:
            return ""
    
    def getFilenameOther(self, prefix):
        if self and self.selectedScenario and self.selectedScenario.isValid():
            return f"{prefix}_{self.selectedScenario.name}_{self.selectedScenario.yearStart}-{self.selectedScenario.yearEnd}.csv"
    
    def getNoaaDataFrame(self):
        # ASDI Dataset Name: NOAA GSOD
        # ASDI Dataset URL : https://registry.opendata.aws/noaa-gsod/
        # NOAA GSOD README : https://www.ncei.noaa.gov/data/global-summary-of-the-day/doc/readme.txt
        # NOAA GSOD data in S3 is organized by year and Station ID values, so this is straight-forward
        # Example S3 path format => s3://noaa-gsod-pds/{yyyy}/{stationid}.csv
        # Let's start with a new DataFrame and load it from a local CSV or the NOAA data source...
        noaagsod_df = pd.DataFrame()
        filenameNOAA = self.getFilenameNOAA()

        if os.path.exists(filenameNOAA):
            # Use local data file already accessed + prepared...
            print('Loading NOAA GSOD data from local file: ', filenameNOAA)
            noaagsod_df = pd.read_csv(filenameNOAA)
        else:
            # Access + prepare data and save to a local data file...
            noaagsod_bucket = 'noaa-gsod-pds'
            print(f'Accessing and preparing data from ASDI-hosted NOAA GSOD dataset in Amazon S3 (bucket: {noaagsod_bucket})...')
            s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))

            for year in range(self.selectedScenario.yearStart, self.selectedScenario.yearEnd + 1):
                key = f'{year}/{self.selectedScenario.noaaStationID}.csv'                                    # Compute the key to get
                csv_obj = s3.get_object(Bucket=noaagsod_bucket, Key=key)                                     # Get the S3 object
                csv_string = csv_obj['Body'].read().decode('utf-8')                                          # Read object contents to a string
                noaagsod_df = pd.concat([noaagsod_df, pd.read_csv(StringIO(csv_string))], ignore_index=True) # Use the string to build the DataFrame

            # It may be true that Month affects air quality (ie: seasonal considerations; tends to have correlation for certain areas)
            # Extract date components for seasonality
            noaagsod_df['MONTH'] = pd.to_datetime(noaagsod_df['DATE']).dt.month
            noaagsod_df['DAYOFWEEK'] = pd.to_datetime(noaagsod_df['DATE']).dt.dayofweek
            noaagsod_df['SEASON'] = pd.to_datetime(noaagsod_df['DATE']).dt.month.map({1: 'Winter', 2: 'Winter', 3: 'Spring', 
                                                                                      4: 'Spring', 5: 'Spring', 6: 'Summer',
                                                                                      7: 'Summer', 8: 'Summer', 9: 'Fall', 
                                                                                      10: 'Fall', 11: 'Fall', 12: 'Winter'})
            
            # Calculate temperature differences and averages
            noaagsod_df['TEMP_RANGE'] = noaagsod_df['MAX'] - noaagsod_df['MIN']
            noaagsod_df['TEMP_AVG'] = (noaagsod_df['MAX'] + noaagsod_df['MIN']) / 2
            
            # Create interaction features
            noaagsod_df['TEMP_DEWP_DIFF'] = noaagsod_df['TEMP_AVG'] - noaagsod_df['DEWP']
            noaagsod_df['WDSP_TEMP'] = noaagsod_df['WDSP'] * noaagsod_df['TEMP_AVG']

            # Trim down to the desired key columns... (do this last in case engineered columns are to be removed)
            # noaagsod_df = noaagsod_df[self.defaultColumnsNOAA]
            
        return noaagsod_df
        
    def getOpenAqDataFrame(self):
        # ASDI Dataset Name: OpenAQ
        # ASDI Dataset URL : https://registry.opendata.aws/openaq/
        # OpenAQ API Docs  : https://docs.openaq.org/
        # OpenAQ S3 data is only organized by date folders, so each folder is large and contains all stations.
        # Because of this, it's better to query ASDI OpenAQ data using the CloudFront-hosted API.
        # Note that some days may not have values and will get filtered out via an INNER JOIN later.
        # Let's start with a new DataFrame and load it from a local CSV or the NOAA data source...
        aq_df = pd.DataFrame()
        aq_reqUrlBase = "https://api.openaq.org/v3" # OpenAQ ASDI API Endpoint URL Base
        print(f"API Key: {api_key}")
        headers = {
            'accept': 'application/json',
            'x-api-key': api_key
        }

        if self.selectedScenario.noaaStationLat == 0.0 or self.selectedScenario.noaaStationLng == 0.0:
            print("NOAA Station Lat/Lng NOT DEFINED. CANNOT PROCEED")
            return aq_df
        
        if len(self.selectedScenario.openAqSensorIDs) == 0:
            # Find OpenAQ sensors near the NOAA station location
            print('Finding OpenAQ sensors near NOAA station location...')
            
            # Query OpenAQ locations API with coordinates
            aq_reqParams = {
                'coordinates': f"{self.selectedScenario.noaaStationLat},{self.selectedScenario.noaaStationLng}",
                'radius': 25000, # 25km radius
                'parameter': self.selectedScenario.aqParamTarget.name,
                'limit': 100
            }
            
            aq_resp = requests.get(aq_reqUrlBase + "/locations", params=aq_reqParams, headers=headers)
            aq_data = aq_resp.json()
            
            if 'results' in aq_data:
                for location in aq_data['results']:
                    # Check each location's sensors for our target parameter
                    for sensor in location['sensors']:
                        if sensor['parameter']['name'] == self.selectedScenario.aqParamTarget.name:
                            self.selectedScenario.openAqSensorIDs.append(sensor['id'])
                            break # Only need one sensor per location
                            
            print(f'Found {len(self.selectedScenario.openAqSensorIDs)} OpenAQ locations with {self.selectedScenario.aqParamTarget.name} sensors')
        
        if len(self.selectedScenario.openAqSensorIDs) >= 1:
            filenameOpenAQ = self.getFilenameOpenAQ()

            if os.path.exists(filenameOpenAQ):
                # Use local data file already accessed + prepared...
                print('Loading OpenAQ data from local file: ', filenameOpenAQ)
                aq_df = pd.read_csv(filenameOpenAQ)
            else:
                # Access + prepare data (NOTE: calling OpenAQ API one year at a time to avoid timeouts)
                print('Accessing ASDI-hosted OpenAQ Measurements (HTTPS API)...')
                
                for year in range(self.selectedScenario.yearStart, self.selectedScenario.yearEnd + 1):
                    for sensor_id in self.selectedScenario.openAqSensorIDs:
                        # Get daily measurements for this sensor and year
                        aq_reqUrl = f"{aq_reqUrlBase}/sensors/{sensor_id}/days"
                        aq_reqParams = {
                            'date_from': f'{year}-01-01',
                            'date_to': f'{year}-12-31',
                            'limit': 366
                        }
                        
                        print(f'Fetching data for sensor {sensor_id} in {year}')
                        aq_resp = requests.get(aq_reqUrl, params=aq_reqParams, headers=headers)
                        aq_data = aq_resp.json()
                        
                        if 'results' in aq_data:
                            for measurement in aq_data['results']:
                                dt = datetime.strptime(measurement['period']['datetimeFrom']['utc'], '%Y-%m-%dT%H:%M:%SZ')
                                if measurement['value'] is not None:
                                    date_df = pd.DataFrame({'day': [dt.date()], 'avg': [measurement['value']]})
                                    aq_df = pd.concat([aq_df, date_df], ignore_index=True)

                # Group by day and calculate daily averages
                if not aq_df.empty:
                    aq_df = aq_df.groupby('day')['avg'].mean().reset_index()

                # Perform some Label Engineering to add our binary classification label => {0=OKAY, 1=UNHEALTHY}
                if not aq_df.empty:
                    aq_df[self.mlTargetLabel] = np.where(aq_df['avg'] <= self.selectedScenario.unhealthyThreshold, 0, 1)
        
        return aq_df
    
    def getMergedDataFrame(self, noaagsod_df, aq_df):
        if len(noaagsod_df) > 0 and len(aq_df) > 0:
            # Print shapes before merge to debug
            print(f"NOAA GSOD shape before merge: {noaagsod_df.shape}")
            print(f"AQ data shape before merge: {aq_df.shape}")
            print("\nNOAA GSOD sample:")
            print(noaagsod_df.head())
            print("\nAQ data sample:")
            print(aq_df.head())
            
            merged_df = pd.merge(noaagsod_df, aq_df, how="inner", left_on="DATE", right_on="day")
            
            # Print shape after merge to see if rows were lost
            print(f"\nMerged shape: {merged_df.shape}")
            
            if len(merged_df) == 0:
                print("\nMerge resulted in empty DataFrame. This means there are no matching dates between the two datasets.")
                print("Check that DATE and day columns have the same format (both should be datetime or string)")
                print(f"DATE dtype: {noaagsod_df['DATE'].dtype}")
                print(f"day dtype: {aq_df['day'].dtype}")
            
            display(merged_df)
            merged_df = merged_df.drop(columns=self.mlIgnoreColumns)
            return merged_df
        else:
            return pd.DataFrame()
    
    def getConfusionMatrixData(self, cm):
        cmData = SimpleNamespace()
        cmData.TN = cm[0][0]
        cmData.TP = cm[1][1]
        cmData.FN = cm[1][0]
        cmData.FP = cm[0][1]
        
        cmData.TN_Rate = cmData.TN/(cmData.TN+cmData.FP)
        cmData.TP_Rate = cmData.TP/(cmData.TP+cmData.FN)
        cmData.FN_Rate = cmData.FN/(cmData.FN+cmData.TP)
        cmData.FP_Rate = cmData.FP/(cmData.FP+cmData.TN)
        
        cmData.TN_Output = f"True Negatives  (TN): {cmData.TN} of {cmData.TN+cmData.FP} => {round(cmData.TN_Rate * 100, 2)}%"
        cmData.TP_Output = f"True Positives  (TP): {cmData.TP} of {cmData.TP+cmData.FN} => {round(cmData.TP_Rate * 100, 2)}%"
        cmData.FN_Output = f"False Negatives (FN): {cmData.FN} of {cmData.FN+cmData.TP} => {round(cmData.FN_Rate * 100, 2)}%"
        cmData.FP_Output = f"False Positives (FP): {cmData.FP} of {cmData.FP+cmData.TN} => {round(cmData.FP_Rate * 100, 2)}%"
        
        return cmData
            
print("Classes and Variables are ready.")

Classes and Variables are ready.


In [4]:
# CELL #4: Review the pre-defined AQParams and AQScenarios in this cell. You can edit these and/or use your own...
# AQParams are added with default thresholds, which can be overridden on a per-AQScenario basis.
# These AQParams are based on the OpenAQ /parameters API call where isCore=true (https://api.openaq.org/v2/parameters).
# Default thresholds where provided using data from EPA.gov (https://www.epa.gov/criteria-air-pollutants/naaqs-table).
# Confirm and adjust params or thresholds as needed for your needs... Not for scientific or health purposes.

# Instantiate main App class with explicit mlTargetLabel and mlEvalMetric provided...
AQbyWeather = AQbyWeatherApp(mlTargetLabel='isUnhealthy', mlEvalMetric='accuracy', mlTimeLimitSecs=120)

# Define and add new AQParams...
AQbyWeather.addAQParam(AQParam( 1, "pm10", "µg/m³", 150.0, "Particulate Matter < 10 micrometers"))
AQbyWeather.addAQParam(AQParam( 2, "pm25", "µg/m³",  12.0, "Particulate Matter < 2.5 micrometers"))
AQbyWeather.addAQParam(AQParam( 7, "no2",  "ppm",   100.0, "Nitrogen Dioxide"))
AQbyWeather.addAQParam(AQParam( 8, "co",   "ppm",     9.0, "Carbon Monoxide"))
AQbyWeather.addAQParam(AQParam( 9, "so2",  "ppm",    75.0, "Sulfur Dioxide"))
AQbyWeather.addAQParam(AQParam(10, "o3",   "ppm",   0.070, "Ground Level Ozone"))

# Define available AQ Scenarios for certain locations with their associated NOAA GSOD StationID values...
# NOAA GSOD Station Search: https://www.ncei.noaa.gov/access/search/data-search/global-summary-of-the-day
# TODO: Someday consider how to OPTIONALLY append more scenarios via an optional JSON file
#       (ie: without adding a dependecy outside the .ipynb file)
# NOTE: For Ozone Scenarios, we're generally using 0.035 ppm to override the default threshold.
AQbyWeather.addAQScenario(AQScenario("bakersfield", "72384023155", AQbyWeather.aqParams["pm25"], None))
AQbyWeather.addAQScenario(AQScenario("bakersfield", "72384023155", AQbyWeather.aqParams["pm10"], None)) # Attempt at pm10 prediction.
AQbyWeather.addAQScenario(AQScenario("bakersfield", "72384023155", AQbyWeather.aqParams["o3"],  0.035))
AQbyWeather.addAQScenario(AQScenario("fresno",      "72389093193", AQbyWeather.aqParams["pm25"], None))
AQbyWeather.addAQScenario(AQScenario("fresno",      "72389093193", AQbyWeather.aqParams["o3"],  0.035))
AQbyWeather.addAQScenario(AQScenario("visalia",     "72389693144", AQbyWeather.aqParams["pm25"], None))
AQbyWeather.addAQScenario(AQScenario("visalia",     "72389693144", AQbyWeather.aqParams["o3"],  0.035))
AQbyWeather.addAQScenario(AQScenario("san-jose",    "72494693232", AQbyWeather.aqParams["pm25"], None))
AQbyWeather.addAQScenario(AQScenario("san-jose",    "72494693232", AQbyWeather.aqParams["o3"],  0.035))
AQbyWeather.addAQScenario(AQScenario("los-angeles", "72287493134", AQbyWeather.aqParams["pm25"], None))
AQbyWeather.addAQScenario(AQScenario("los-angeles", "72287493134", AQbyWeather.aqParams["o3"],  0.035))
AQbyWeather.addAQScenario(AQScenario("phoenix",     "72278023183", AQbyWeather.aqParams["pm25"], None))
AQbyWeather.addAQScenario(AQScenario("phoenix",     "72278023183", AQbyWeather.aqParams["o3"],  0.035))
AQbyWeather.addAQScenario(AQScenario("fairbanks",   "70261026411", AQbyWeather.aqParams["pm25"], None))
AQbyWeather.addAQScenario(AQScenario("lahore-pk",   "41640099999", AQbyWeather.aqParams["pm25"], None))

print(f"AQbyWeather.aqParams: {str(len(AQbyWeather.aqParams))}")
print(f"AQbyWeather.aqScenarios: {str(len(AQbyWeather.aqScenarios))} (Default Selected: {AQbyWeather.selectedScenario.name})")

AQbyWeather.aqParams: 6
AQbyWeather.aqScenarios: 15 (Default Selected: bakersfield_pm25)


In [5]:
# CELL #5: Select a Scenario via DROP DOWN LIST to use throughout the Notebook. This will drive the ML process...
# A default "value" is set to avoid issues. Change this default to run the Notebook from start-to-finish for that Scenario.
print("*** CHOOSE YOUR OWN ADVENTURE HERE ***")
print("Please select a Scenario via the following drop-down-list...")
print("(NOTE: If you change Scenario, you must re-run remaining cells to see changes.)")
ddl = widgets.Dropdown(options=AQbyWeather.aqScenarios.keys(), 
                       value=AQbyWeather.aqScenarios["los-angeles_pm25"].name) # <-- DEFAULT / FULL-RUN VALUE
ddl

*** CHOOSE YOUR OWN ADVENTURE HERE ***
Please select a Scenario via the following drop-down-list...
(NOTE: If you change Scenario, you must re-run remaining cells to see changes.)


Dropdown(index=9, options=('bakersfield_pm25', 'bakersfield_pm10', 'bakersfield_o3', 'fresno_pm25', 'fresno_o3…

In [6]:
if ddl.value:
    AQbyWeather.selectedScenario = AQbyWeather.aqScenarios[ddl.value]
    print(AQbyWeather.selectedScenario.getSummary())
    print(AQbyWeather.selectedScenario.toJSON())
else:
    print("Please select a Scenario via the above drop-down-list.")

Scenario: los-angeles_pm25 => Particulate Matter < 2.5 micrometers (pm25) with UnhealthyThreshold > 12.0 µg/m³
{
  "aqParamTarget": {
    "desc": "Particulate Matter < 2.5 micrometers",
    "id": 2,
    "name": "pm25",
    "unhealthyThresholdDefault": 12.0,
    "unit": "\u00b5g/m\u00b3"
  },
  "aqRadiusMeters": 16100,
  "aqRadiusMiles": 10,
  "location": "los-angeles",
  "modelFolder": "AutogluonModels",
  "name": "los-angeles_pm25",
  "noaaStationID": "72287493134",
  "noaaStationLat": 0.0,
  "noaaStationLng": 0.0,
  "openAqSensorIDs": [],
  "unhealthyThreshold": 12.0,
  "yearEnd": 2024,
  "yearStart": 2016
}


In [7]:
# GET NOAA GSOD WEATHER DATA...
print(AQbyWeather.selectedScenario.getSummary())
noaagsod_df = AQbyWeather.getNoaaDataFrame()

if(len(noaagsod_df) >= 1):
    # Update NOAA Station Lat/Lng...
    AQbyWeather.selectedScenario.updateNoaaStationLatLng(noaagsod_df.iloc[0])
    
    # Save DataFrame to CSV...
    noaagsod_df.to_csv(AQbyWeather.getFilenameNOAA(), index=False)

    # Output DataFrame properties...
    print('noaagsod_df.shape =', noaagsod_df.shape)
    display(noaagsod_df)

Scenario: los-angeles_pm25 => Particulate Matter < 2.5 micrometers (pm25) with UnhealthyThreshold > 12.0 µg/m³
Accessing and preparing data from ASDI-hosted NOAA GSOD dataset in Amazon S3 (bucket: noaa-gsod-pds)...
NOAA Station Lat,Lng Updated for Scenario: los-angeles_pm25 => 34.0236,-118.2911
noaagsod_df.shape = (3061, 35)


Unnamed: 0,STATION,DATE,LATITUDE,LONGITUDE,ELEVATION,NAME,TEMP,TEMP_ATTRIBUTES,DEWP,DEWP_ATTRIBUTES,...,PRCP_ATTRIBUTES,SNDP,FRSHTT,MONTH,DAYOFWEEK,SEASON,TEMP_RANGE,TEMP_AVG,TEMP_DEWP_DIFF,WDSP_TEMP
0,72287493134,2016-01-01,34.0236,-118.2911,54.6,"LOS ANGELES DOWNTOWN USC, CA US",52.7,24,19.1,24,...,G,999.9,0,1,4,Winter,23.9,52.95,33.85,84.720
1,72287493134,2016-01-02,34.0236,-118.2911,54.6,"LOS ANGELES DOWNTOWN USC, CA US",54.8,24,20.1,24,...,G,999.9,0,1,5,Winter,21.9,53.95,33.85,113.295
2,72287493134,2016-01-03,34.0236,-118.2911,54.6,"LOS ANGELES DOWNTOWN USC, CA US",52.6,24,38.2,24,...,G,999.9,0,1,6,Winter,20.8,54.50,16.30,38.150
3,72287493134,2016-01-04,34.0236,-118.2911,54.6,"LOS ANGELES DOWNTOWN USC, CA US",59.3,24,44.1,24,...,G,999.9,10000,1,0,Winter,25.0,56.60,12.50,62.260
4,72287493134,2016-01-05,34.0236,-118.2911,54.6,"LOS ANGELES DOWNTOWN USC, CA US",57.4,24,51.9,24,...,G,999.9,110000,1,1,Winter,15.1,61.55,9.65,129.255
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3056,72287493134,2024-05-16,34.0236,-118.2911,54.6,"LOS ANGELES DOWNTOWN USC, CA US",62.9,24,53.2,24,...,G,999.9,10000,5,3,Spring,7.9,64.05,10.85,51.240
3057,72287493134,2024-05-17,34.0236,-118.2911,54.6,"LOS ANGELES DOWNTOWN USC, CA US",63.2,24,53.8,24,...,G,999.9,0,5,4,Spring,9.0,64.60,10.80,58.140
3058,72287493134,2024-05-18,34.0236,-118.2911,54.6,"LOS ANGELES DOWNTOWN USC, CA US",62.9,24,53.5,24,...,G,999.9,0,5,5,Spring,10.1,64.05,10.55,83.265
3059,72287493134,2024-05-19,34.0236,-118.2911,54.6,"LOS ANGELES DOWNTOWN USC, CA US",62.9,24,52.1,24,...,G,999.9,0,5,6,Spring,13.0,65.50,13.40,91.700


In [8]:
# GET OPENAQ AIR QUALITY DAILY AVERAGES DATA...
print(AQbyWeather.selectedScenario.getSummary())
aq_df = AQbyWeather.getOpenAqDataFrame() # Gets nearby Location IDs THEN gets associated daily averages.
print(aq_df)

if len(aq_df) > 0:
    # Output DataFrame properties...
    print('aq_df.shape =', aq_df.shape)
    display(aq_df)
    aq_df.to_csv(AQbyWeather.getFilenameOpenAQ(), index=False)

Scenario: los-angeles_pm25 => Particulate Matter < 2.5 micrometers (pm25) with UnhealthyThreshold > 12.0 µg/m³
API Key: 49c492bb9d9e24d5e78ae75b2034430080bc132bcd7c9dfa3f2b08fabda35a9d
Finding OpenAQ sensors near NOAA station location...
Found 90 OpenAQ locations with pm25 sensors
Accessing ASDI-hosted OpenAQ Measurements (HTTPS API)...
Fetching data for sensor 2775 in 2016
Fetching data for sensor 25551 in 2016
Fetching data for sensor 15731 in 2016
Fetching data for sensor 25196 in 2016
Fetching data for sensor 24000 in 2016
Fetching data for sensor 1654141 in 2016
Fetching data for sensor 1654143 in 2016
Fetching data for sensor 1654156 in 2016
Fetching data for sensor 1654168 in 2016
Fetching data for sensor 1654191 in 2016
Fetching data for sensor 2000869 in 2016
Fetching data for sensor 1654217 in 2016
Fetching data for sensor 2000834 in 2016
Fetching data for sensor 1654333 in 2016
Fetching data for sensor 1654360 in 2016
Fetching data for sensor 2000475 in 2016
Fetching data fo

Unnamed: 0,day,avg,isUnhealthy
0,2016-03-06,8.000000,0
1,2016-03-07,4.500000,0
2,2016-03-10,15.000000,1
3,2016-03-11,9.080000,0
4,2016-03-12,7.860000,0
...,...,...,...
1978,2024-12-27,15.900000,1
1979,2024-12-28,23.159091,1
1980,2024-12-29,24.290909,1
1981,2024-12-30,25.827273,1


OSError: [Errno 63] File name too long: 'dataOpenAQ_los-angeles_pm25_2016-2024_2775-25551-15731-25196-24000-1654141-1654143-1654156-1654168-1654191-2000869-1654217-2000834-1654333-1654360-2000475-2000731-2000674-1654499-1654545-1654577-1654556-1654629-2000900-1999896-2001073-2001289-2001343-2088604-2088548-2088589-2000821-2000905-2000768-2000525-2000753-2000561-2000676-2001076-2000935-2000945-2000750-2001039-2000499-2000798-2000553-2000565-2000718-2000977-2000505-2000853-2000522-2000723-2000819-2000729-2001006-2000981-2000667-2000840-2000567-2000963-2000537-2000471-2001038-2000956-2000694-2000953-2000629-2000671-2000967-2000999-2000660-2001013-2000613-2000483-2000550-2000832-2000796-2000934-2001040-2000838-2001062-2000933-2000443-2000993-2000942-2000852-2000445-2001033-2000984.csv'

In [None]:
# Merge the NOAA GSOD weather data with our OpenAQ data by DATE...
# Perform another column drop to remove columns we don't want as features/inputs.
# This column removal will NOT be necessary once we can use Autogluon ignore_columns param (TBD).
print(AQbyWeather.selectedScenario.getSummary())
# Debug merge operation
print("NOAA GSOD data shape:", noaagsod_df.shape)
print("\nNOAA GSOD sample dates:")
print(noaagsod_df['DATE'].head())
print(noaagsod_df.columns.tolist())   # list of all column names

print("\nOpenAQ data shape:", aq_df.shape) 
print("\nOpenAQ sample dates:")
print(aq_df['day'].head())

# Check for date format consistency
print("\nNOAA date type:", noaagsod_df['DATE'].dtype)
print("OpenAQ date type:", aq_df['day'].dtype)

# Attempt merge with debug info
merged_df = AQbyWeather.getMergedDataFrame(noaagsod_df, aq_df)
print("\nMerged data shape:", merged_df.shape)
if(len(merged_df) > 0):
    # Output DataFrame properties...
    print('merged_df.shape =', merged_df.shape)
    display(merged_df)
    merged_df.groupby([AQbyWeather.mlTargetLabel]).size().plot(kind="bar")
    merged_df.to_csv(AQbyWeather.getFilenameOther("dataMERGED"), index=False)

display(merged_df)

In [None]:
from autogluon.features.generators import PipelineFeatureGenerator, CategoryFeatureGenerator, IdentityFeatureGenerator
from autogluon.common.features.types import R_INT, R_FLOAT
from autogluon.features.generators import AutoMLPipelineFeatureGenerator

columns_to_drop=['MIN_ATTRIBUTES', 'MAX_ATTRIBUTES']
merged_df = merged_df.drop(columns=[col for col in columns_to_drop if col in merged_df.columns])

merged_df = merged_df.replace(r'^\s*$', np.nan, regex=True)

auto_ml_pipeline_feature_generator = AutoMLPipelineFeatureGenerator()
merged_df = auto_ml_pipeline_feature_generator.fit_transform(X=merged_df)

print("\nColumn dtypes:")
print(merged_df.dtypes)

# mypipeline = PipelineFeatureGenerator(
#     generators = [[        
#         CategoryFeatureGenerator(maximum_num_cat=10),  # Overridden from default.
#         IdentityFeatureGenerator(infer_features_in_args=dict(valid_raw_types=[R_INT, R_FLOAT])),
#     ]]
# )
# mypipeline.fit_transform(X=merged_df)


In [None]:
# Visualize correlations in our merged dataframe...
print(AQbyWeather.selectedScenario.getSummary())
correlations = merged_df.corr()
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0, len(merged_df.columns), 1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(merged_df.columns)
ax.set_yticklabels(merged_df.columns)
plt.show()

In [None]:
# Merge the NOAA GSOD weather data with our OpenAQ data by DATE...
# Perform another column drop to remove columns we don't want as features/inputs.
# This column removal will NOT be necessary once we can use Autogluon ignore_columns param (TBD).
print(AQbyWeather.selectedScenario.getSummary())
# Debug merge operation
print("NOAA GSOD data shape:", noaagsod_df.shape)
print("\nNOAA GSOD sample dates:")
print(noaagsod_df['DATE'].head())

print("\nOpenAQ data shape:", aq_df.shape) 
print("\nOpenAQ sample dates:")
print(aq_df['day'].head())

# Check for date format consistency
print("\nNOAA date type:", noaagsod_df['DATE'].dtype)
print("OpenAQ date type:", aq_df['day'].dtype)

# Attempt merge with debug info
merged_df = AQbyWeather.getMergedDataFrame(noaagsod_df, aq_df)
print("\nMerged data shape:", merged_df.shape)
if(len(merged_df) > 0):
    merged_df.groupby([AQbyWeather.mlTargetLabel]).size().plot(kind="bar")
    merged_df.to_csv(AQbyWeather.getFilenameOther("dataMERGED"), index=False)

display(merged_df)

In [None]:
# Additional import statements for autogluon+sklearn and split out train_df + validate_df data...
print(AQbyWeather.selectedScenario.getSummary())
from autogluon.tabular import TabularPredictor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
train_df, validate_df = train_test_split(merged_df, test_size=0.25, random_state=1)
print('Number of training samples:', len(train_df))
print('Number of validation samples:', len(validate_df))

In [None]:
# Create the test_df data and remove the target label column...
print(AQbyWeather.selectedScenario.getSummary())
test_df=validate_df.drop([AQbyWeather.mlTargetLabel], axis=1)
display(test_df)

In [38]:
from autogluon.common import space

nn_options = {  # specifies non-default hyperparameter values for neural network models
    'num_epochs': 50,  # number of training epochs (controls training time of NN models)
    'learning_rate': space.Real(1e-4, 1e-2, default=1e-3, log=True),  # learning rate used in training (real-valued hyperparameter searched on log-scale)
    'activation': space.Categorical('relu', 'softrelu', 'tanh'),  # activation function used in NN (categorical hyperparameter, default = first entry)
    'dropout_prob': space.Real(0.0, 0.5, default=0.1),  # dropout probability (real-valued hyperparameter)
}

gbm_options = {  # specifies non-default hyperparameter values for lightGBM gradient boosted trees
    'num_boost_round': 100,  # number of boosting rounds (controls training time of GBM models)
    'num_leaves': space.Int(lower=26, upper=66, default=36),  # number of leaves in trees (integer hyperparameter)
}

hyperparameters = {  # hyperparameters of each model type
                   'GBM': gbm_options,
                   'NN_TORCH': nn_options,  # NOTE: comment this line out if you get errors on Mac OSX
                  }  # When these keys are missing from hyperparameters dict, no models of that type are trained

num_trials = 10  # try at most 5 different hyperparameter configurations for each type of model
search_strategy = 'auto'  # to tune hyperparameters using random search routine with a local scheduler
time_limit = 60 * 60
hyperparameter_tune_kwargs = {  # HPO is not performed unless hyperparameter_tune_kwargs is specified
    'num_trials': num_trials,
    'scheduler' : 'local',
    'searcher': search_strategy,
}  # Refer to TabularPredictor.fit docstring for all valid values


In [None]:
# Use AutoGluon TabularPredictor to fit a model for our training data...
display(AQbyWeather.selectedScenario.getSummary()) #Using display for consistent/sequential output order.
predictor = TabularPredictor(label=AQbyWeather.mlTargetLabel, 
                             eval_metric=AQbyWeather.mlEvalMetric, 
                             path=AQbyWeather.selectedScenario.getModelPath())
predictor.fit(train_data=train_df, auto_stack=True, time_limit=time_limit, hyperparameters=hyperparameters,
    hyperparameter_tune_kwargs=hyperparameter_tune_kwargs, verbosity=3, presets='best_quality')

In [None]:
# Get dataframes for feature importance + model leaderboard AND get+display model evaluation...
display(AQbyWeather.selectedScenario.getSummary()) #Using display for consistent/sequential output order.
featureimp_df   = predictor.feature_importance(validate_df)
leaderboard_df  = predictor.leaderboard(validate_df, silent=True)
modelEvaluation = predictor.evaluate(validate_df, auxiliary_metrics=True)

In [None]:
# View Autogluon Individual Model Leaderboard...
print(AQbyWeather.selectedScenario.getSummary())
display(leaderboard_df)

In [None]:
# View and Plot Feature Importance... (this various from Scenario to Scenario)
print(AQbyWeather.selectedScenario.getSummary())
display(featureimp_df)
featureimp_df.drop(columns=["n"]).plot(kind="bar", figsize=(12, 4), xlabel="feature")

In [None]:
# Load + Use Our Model (this line is unnecessary, but shows how to load a built model)...
predictor = TabularPredictor.load(AQbyWeather.selectedScenario.getModelPath())

# Make Predictions, which are saved to an array: y_pred
print(AQbyWeather.selectedScenario.getSummary())
y_pred = predictor.predict(test_df)
display(y_pred)

In [None]:
# Get true label values as an array: y_true
print(AQbyWeather.selectedScenario.getSummary())
y_true = validate_df[AQbyWeather.mlTargetLabel]
display(y_true)

In [None]:
# Get Confusion Matrix (CM), Get Additional CM Data, View CM...
# Learn More: https://towardsdatascience.com/confusion-matrix-what-is-it-e859e1bbecdc
#             https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
print(AQbyWeather.selectedScenario.getSummary())
cm = confusion_matrix(y_true, y_pred)

# Print Confusion Matrix data...
cmData = AQbyWeather.getConfusionMatrixData(cm)
print(cmData.TN_Output)
print(cmData.TP_Output)
print(cmData.FN_Output)
print(cmData.FP_Output)

# Plot Confusion Matrix...
cmd = ConfusionMatrixDisplay(confusion_matrix=cm)
fig, ax = plt.subplots(figsize=(8, 6))
cmd.plot(ax=ax)

In [None]:
# Create and save final results...
print(AQbyWeather.selectedScenario.getSummary())
resultsFile = AQbyWeather.getFilenameOther("dataRESULTS")
results_df = pd.DataFrame()
results_df['PREDICTION'] = pd.DataFrame(y_pred)
results_df = pd.concat([validate_df, results_df], axis=1)
results_df.to_csv(resultsFile, index=False)
print(f"Results saved to {resultsFile}. DONE.")
display(results_df)