In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:

#
#    These are standard python modules
#
#import json, time, urllib.parse
import json, time
import numpy as np
import pandas as pd
#
#    The 'requests' module is a distribution module for making web requests.
#
import requests
from tqdm import tqdm

In [3]:
#########
#
#    CONSTANTS
#

#
#    This is the root of all AQS API URLs
#
API_REQUEST_URL = 'https://aqs.epa.gov/data/api'

#
#    These are 'actions' we can ask the API to take or requests that we can make of the API
#
#    Sign-up request - generally only performed once - unless you lose your key
API_ACTION_SIGNUP = '/signup?email={email}'
#
#    List actions provide information on API parameter values that are required by some other actions/requests
API_ACTION_LIST_CLASSES = '/list/classes?email={email}&key={key}'
API_ACTION_LIST_PARAMS = '/list/parametersByClass?email={email}&key={key}&pc={pclass}'
API_ACTION_LIST_SITES = '/list/sitesByCounty?email={email}&key={key}&state={state}&county={county}'
#
#    Monitor actions are requests for monitoring stations that meet specific criteria
API_ACTION_MONITORS_COUNTY = '/monitors/byCounty?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&state={state}&county={county}'
API_ACTION_MONITORS_BOX = '/monitors/byBox?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&minlat={minlat}&maxlat={maxlat}&minlon={minlon}&maxlon={maxlon}'
#
#    Summary actions are requests for summary data. These are for daily summaries
API_ACTION_DAILY_SUMMARY_COUNTY = '/dailyData/byCounty?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&state={state}&county={county}'
API_ACTION_DAILY_SUMMARY_BOX = '/dailyData/byBox?email={email}&key={key}&param={param}&bdate={begin_date}&edate={end_date}&minlat={minlat}&maxlat={maxlat}&minlon={minlon}&maxlon={maxlon}'
#
#    It is always nice to be respectful of a free data resource.
#    We're going to observe a 100 requests per minute limit - which is fairly nice
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED
#
#
#    This is a template that covers most of the parameters for the actions we might take, from the set of actions
#    above. In the examples below, most of the time parameters can either be supplied as individual values to a
#    function - or they can be set in a copy of the template and passed in with the template.
#
AQS_REQUEST_TEMPLATE = {
    "email":      "",
    "key":        "",
    "state":      "",     # the two digit state FIPS # as a string
    "county":     "",     # the three digit county FIPS # as a string
    "begin_date": "",     # the start of a time window in YYYYMMDD format
    "end_date":   "",     # the end of a time window in YYYYMMDD format, begin_date and end_date must be in the same year
    "minlat":    0.0,
    "maxlat":    0.0,
    "minlon":    0.0,
    "maxlon":    0.0,
    "param":     "",     # a list of comma separated 5 digit codes, max 5 codes requested
    "pclass":    ""      # parameter class is only used by the List calls
}

In [4]:
def request_signup(email_address = None,
                   endpoint_url = API_REQUEST_URL,
                   endpoint_action = API_ACTION_SIGNUP,
                   request_template = AQS_REQUEST_TEMPLATE,
                   headers = None):

    # Make sure we have a string - if you don't have access to this email addres, things might go badly for you
    if email_address:
        request_template['email'] = email_address
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_signup()'")

    # Compose the signup url - create a request URL by combining the endpoint_url with the parameters for the request
    request_url = endpoint_url+endpoint_action.format(**request_template)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [5]:
print("Requesting SIGNUP ...")
response = request_signup("hmuppa@uw.edu")
print(json.dumps(response,indent=4))

Requesting SIGNUP ...
{
    "Header": [
        {
            "status": "Success",
            "request_time": "2023-11-09T14:20:56-05:00",
            "url": "https://aqs.epa.gov/data/api/signup?email=hmuppa@uw.edu"
        }
    ],
    "Data": [
        "You should receive an email containing your new key shortly."
    ]
}


In [15]:
APIKEY = "taupegazelle31"
USERNAME = "hmuppa@uw.edu"
AQI_PARAM_CLASS = "AQI POLLUTANTS"

In [7]:
#   Gaseous AQI pollutants CO, SO2, NO2, and O2
AQI_PARAMS_GASEOUS = "42101,42401,42602,44201"
#
#   Particulate AQI pollutants PM10, PM2.5, and Acceptable PM2.5
AQI_PARAMS_PARTICULATES = "81102,88101,88502"

In [8]:

CITY_LOCATIONS = {
    'Williston' :       {'city'   : 'Williston',
                  'county' : 'Williams',
                  'state'  : 'North Dakota',
                  'fips'   : '38105',
                  'latlon' : [48.1470, -103.6180] }
}

In [16]:
#
#    This implements the list request. There are several versions of the list request that only require email and key.
#    This code sets the default action/requests to list the groups or parameter class descriptors. Having those descriptors
#    allows one to request the individual (proprietary) 5 digit codes for individual air quality measures by using the
#    param request. Some code in later cells will illustrate those requests.
#
def request_list_info(email_address = None, key = None,
                      endpoint_url = API_REQUEST_URL,
                      endpoint_action = API_ACTION_LIST_CLASSES,
                      request_template = AQS_REQUEST_TEMPLATE,
                      headers = None):

    #  Make sure we have email and key - at least
    #  This prioritizes the info from the call parameters - not what's already in the template
    if email_address:
        request_template['email'] = email_address
    if key:
        request_template['key'] = key

    # For the basic request we need an email address and a key
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_list_info()'")
    if not request_template['key']:
        raise Exception("Must supply a key to call 'request_list_info()'")

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_template)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [17]:
#
#  This list request should give us a list of all the monitoring stations in the county specified by the
#  given city selected from the CITY_LOCATIONS dictionary
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['state'] = CITY_LOCATIONS['Williston']['fips'][:2]   # the first two digits (characters) of FIPS is the state code
request_data['county'] = CITY_LOCATIONS['Williston']['fips'][2:]  # the last three digits (characters) of FIPS is the county code

response = request_list_info(request_template=request_data, endpoint_action=API_ACTION_LIST_SITES)

if response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))

[
    {
        "code": "0001",
        "value_represented": null
    },
    {
        "code": "0002",
        "value_represented": "LOCATED NORTH OF MERCY HOSPITAL ALONG HIGHWAYS 2/85 BYPASS"
    },
    {
        "code": "0003",
        "value_represented": "Williston"
    },
    {
        "code": "0101",
        "value_represented": null
    },
    {
        "code": "0102",
        "value_represented": null
    },
    {
        "code": "0103",
        "value_represented": null
    },
    {
        "code": "0104",
        "value_represented": null
    },
    {
        "code": "0105",
        "value_represented": null
    },
    {
        "code": "0106",
        "value_represented": null
    }
]


In [18]:
#
#    This implements the daily summary request. Daily summary provides a daily summary value for each sensor being requested
#    from the start date to the end date.
#
#    Like the two other functions, this can be called with a mixture of a defined parameter dictionary, or with function
#    parameters. If function parameters are provided, those take precedence over any parameters from the request template.
#
def request_daily_summary(email_address = None, key = None, param=None,
                          begin_date = None, end_date = None, fips = None,
                          endpoint_url = API_REQUEST_URL,
                          endpoint_action = API_ACTION_DAILY_SUMMARY_COUNTY,
                          request_template = AQS_REQUEST_TEMPLATE,
                          headers = None):

    #  This prioritizes the info from the call parameters - not what's already in the template
    if email_address:
        request_template['email'] = email_address
    if key:
        request_template['key'] = key
    if param:
        request_template['param'] = param
    if begin_date:
        request_template['begin_date'] = begin_date
    if end_date:
        request_template['end_date'] = end_date
    if fips and len(fips)==5:
        request_template['state'] = fips[:2]
        request_template['county'] = fips[2:]

    # Make sure there are values that allow us to make a call - these are always required
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_daily_summary()'")
    if not request_template['key']:
        raise Exception("Must supply a key to call 'request_daily_summary()'")
    if not request_template['param']:
        raise Exception("Must supply param values to call 'request_daily_summary()'")
    if not request_template['begin_date']:
        raise Exception("Must supply a begin_date to call 'request_daily_summary()'")
    if not request_template['end_date']:
        raise Exception("Must supply an end_date to call 'request_daily_summary()'")
    # Note we're not validating FIPS fields because not all of the daily summary actions require the FIPS numbers

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_template)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [19]:
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_GASEOUS
request_data['state'] = CITY_LOCATIONS['Williston']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['Williston']['fips'][2:]

# request daily summary data for 2020
gaseous_aqi = request_daily_summary(request_template=request_data, begin_date="20200101", end_date="20201231")
print("Response for the gaseous pollutants ...")
#
if gaseous_aqi["Header"][0]['status'] == "Success":
    print(json.dumps(gaseous_aqi['Data'],indent=4))
elif gaseous_aqi["Header"][0]['status'].startswith("No data "):
    print("Looks like the response generated no data. You might take a closer look at your request and the response data.")
else:
    print(json.dumps(gaseous_aqi,indent=4))

request_data['param'] = AQI_PARAMS_PARTICULATES
# request daily summary data for 2020
particulate_aqi = request_daily_summary(request_template=request_data, begin_date="20200101", end_date="20201231")
print("Response for the particulate pollutants ...")
#
if particulate_aqi["Header"][0]['status'] == "Success":
    print(json.dumps(particulate_aqi['Data'],indent=4))
elif particulate_aqi["Header"][0]['status'].startswith("No data "):
    print("Looks like the response generated no data. You might take a closer look at your request and the response data.")
else:
    print(json.dumps(particulate_aqi,indent=4))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Response for the particulate pollutants ...
Looks like the response generated no data. You might take a closer look at your request and the response data.


In [20]:
def request_monitors(email_address = None, key = None, param=None,
                          begin_date = None, end_date = None, fips = None,
                          endpoint_url = API_REQUEST_URL,
                          endpoint_action = API_ACTION_MONITORS_COUNTY,
                          request_template = AQS_REQUEST_TEMPLATE,
                          headers = None):

    #  This prioritizes the info from the call parameters - not what's already in the template
    if email_address:
        request_template['email'] = email_address
    if key:
        request_template['key'] = key
    if param:
        request_template['param'] = param
    if begin_date:
        request_template['begin_date'] = begin_date
    if end_date:
        request_template['end_date'] = end_date
    if fips and len(fips)==5:
        request_template['state'] = fips[:2]
        request_template['county'] = fips[2:]

    # Make sure there are values that allow us to make a call - these are always required
    if not request_template['email']:
        raise Exception("Must supply an email address to call 'request_monitors()'")
    if not request_template['key']:
        raise Exception("Must supply a key to call 'request_monitors()'")
    if not request_template['param']:
        raise Exception("Must supply param values to call 'request_monitors()'")
    if not request_template['begin_date']:
        raise Exception("Must supply a begin_date to call 'request_monitors()'")
    if not request_template['end_date']:
        raise Exception("Must supply an end_date to call 'request_monitors()'")
    # Note we're not validating FIPS fields because not all of the monitors actions require the FIPS numbers

    # compose the request
    request_url = endpoint_url+endpoint_action.format(**request_template)

    # make the request
    try:
        # Wait first, to make sure we don't exceed a rate limit in the situation where an exception occurs
        # during the request processing - throttling is always a good practice with a free data source
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

In [21]:

#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_PARTICULATES     # remember we have both gaseous and particulates
#
#   We got the monitoring stations for Bend OR above (Deschutes county) - let's work with that one again
request_data['state'] = CITY_LOCATIONS['Williston']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['Williston']['fips'][2:]
#
# the first example uses the default - request monitors by county, we'll just use a recent date for now
response = request_monitors(request_template=request_data, begin_date="20210701", end_date="20210731")
#
# the response should be similar to the 'list' request above - but in this case we should only get monitors that
# monitor the AQI_PARAMS_PARTICULATES set of params.
#
if response["Header"][0]['status'] == "Success":
    print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))

{
    "Header": [
        {
            "status": "No data matched your selection",
            "request_time": "2023-11-09T14:25:00-05:00",
            "url": "https://aqs.epa.gov/data/api/monitors/byCounty?email=hmuppa@uw.edu&key=taupegazelle31&param=81102,88101,88502&bdate=20210701&edate=20210731&state=38&county=105",
            "rows": 0
        }
    ],
    "Data": []
}



In the provided code, scaling is employed to define a geographical bounding box for the retrieval of air quality data. The original range, determined by latitude and longitude values, is adjusted using scaling techniques to ensure that the specified area of interest around the city of Williston is well-defined, even in the presence of null or missing latitude and longitude values. Scaling allows for a consistent and effective way to establish the geographic boundaries without encountering issues arising from null values. By applying scaling, the code ensures that the minimum and maximum latitude and longitude coordinates are correctly calculated and creates a precise bounding box for the region. This approach not only handles the presence of null values within the original range but also guarantees that the requested air quality data is accurately gathered within the defined geographical area, facilitating reliable and meaningful analysis.

In [31]:
LAT_25MILES = 25.0 * (1.0/69.0)    # This is about 25 miles of latitude in decimal degrees
LON_25MILES = 25.0 * (1.0/54.6)

minlat = CITY_LOCATIONS['Williston']['latlon'][0] - float(5) * LAT_25MILES
maxlat = CITY_LOCATIONS['Williston']['latlon'][0] + float(5) * LAT_25MILES
minlon = CITY_LOCATIONS['Williston']['latlon'][1] - float(5) * LON_25MILES
maxlon = CITY_LOCATIONS['Williston']['latlon'][1] + float(5) * LON_25MILES

#
#    Create a copy of the AQS_REQUEST_TEMPLATE
#
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_PARTICULATES     # same particulate request as the one abover

request_data['minlat'] = minlat
request_data['maxlat'] = maxlat
request_data['minlon'] = minlon
request_data['maxlon'] = maxlon

#
#   we need to change the action for the API from the default to the bounding box - same recent date for now
response = request_monitors(request_template=request_data, begin_date="20020602", end_date="20020603",
                            endpoint_action = API_ACTION_MONITORS_BOX)
#
#
#
if response["Header"][0]['status'] == "Success":
    for station in response['Data']:
      print("state_code: ",station["state_code"])
      print("county_code: ",station["county_code"])
      print("open_date: ",station["open_date"])
      print("close_date: ",station["close_date"])
      print(" ")
    #print(json.dumps(response['Data'],indent=4))
else:
    print(json.dumps(response,indent=4))

state_code:  30
county_code:  085
open_date:  1995-06-14
close_date:  2002-12-31
 
state_code:  CC
county_code:  011
open_date:  2000-11-23
close_date:  2006-06-30
 
state_code:  38
county_code:  053
open_date:  2002-01-01
close_date:  2006-12-31
 
state_code:  38
county_code:  055
open_date:  1992-10-28
close_date:  2008-03-31
 
state_code:  38
county_code:  057
open_date:  1999-01-01
close_date:  2016-04-01
 
state_code:  38
county_code:  057
open_date:  1999-01-01
close_date:  2003-07-14
 
state_code:  38
county_code:  053
open_date:  1987-01-01
close_date:  2004-06-26
 
state_code:  30
county_code:  091
open_date:  1999-12-15
close_date:  None
 
state_code:  38
county_code:  013
open_date:  1999-12-15
close_date:  None
 
state_code:  38
county_code:  007
open_date:  2000-07-01
close_date:  2016-04-01
 
state_code:  38
county_code:  013
open_date:  2000-01-01
close_date:  2005-12-30
 
state_code:  38
county_code:  013
open_date:  1999-09-03
close_date:  2005-12-30
 
state_code:  38


The provided code defines a function, extract_summary_from_response, that is responsible for extracting and structuring summary data from a response object, presumably related to air quality monitoring. The function processes the response data and creates a structured summary based on monitoring sites, parameters, and dates. It organizes the data into a hierarchical dictionary, where each site is associated with location information and pollutant types, and each pollutant type is linked to specific data collected on various dates. This structured summary provides a comprehensive view of air quality monitoring, allowing for easy access to essential information such as pollutant names, units of measure, and data records. The function is designed to handle potential missing values and ensure that requested fields are consistently included in the summary, making it a robust tool for organizing air quality data for further analysis and reporting.

In [34]:
EXTRACTION_FIELDS = ['sample_duration','observation_count','arithmetic_mean','aqi']

#
#    The function creates a summary record
def extract_summary_from_response(r=None, fields=EXTRACTION_FIELDS):
    ## the result will be structured around monitoring site, parameter, and then date
    result = dict()
    data = r["Data"]
    for record in data:
        # make sure the record is set up
        site = record['site_number']
        param = record['parameter_code']
        #date = record['date_local']    # this version keeps the respnse value YYYY-
        date = record['date_local'].replace('-','') # this puts it in YYYYMMDD format
        if site not in result:
            result[site] = dict()
            result[site]['local_site_name'] = record['local_site_name']
            result[site]['site_address'] = record['site_address']
            result[site]['state'] = record['state']
            result[site]['county'] = record['county']
            result[site]['city'] = record['city']
            result[site]['pollutant_type'] = dict()
        if param not in result[site]['pollutant_type']:
            result[site]['pollutant_type'][param] = dict()
            result[site]['pollutant_type'][param]['parameter_name'] = record['parameter']
            result[site]['pollutant_type'][param]['units_of_measure'] = record['units_of_measure']
            result[site]['pollutant_type'][param]['method'] = record['method']
            result[site]['pollutant_type'][param]['data'] = dict()
        if date not in result[site]['pollutant_type'][param]['data']:
            result[site]['pollutant_type'][param]['data'][date] = list()

        # now extract the specified fields
        extract = dict()
        for k in fields:
            if str(k) in record:
                extract[str(k)] = record[k]
            else:
                # this makes sure we always have the requested fields, even if
                # we have a missing value for a given day/month
                extract[str(k)] = None

        # add this extraction to the list for the day
        result[site]['pollutant_type'][param]['data'][date].append(extract)

    return result

The provided code segment is part of a data retrieval process that requests daily air quality index (AQI) summary data for gaseous and particulate pollutants over a specified range of years, typically from 1963 to 2023. It uses a defined request template ('AQS_REQUEST_TEMPLATE') to specify the parameters, email, and API key for the data request. The code loops through each year within this range, with the start year set to 1963 and the end year set to 2023, and constructs begin and end dates for each year accordingly.

For each year, the code sends separate requests for gaseous and particulate AQI data, checking the response status. If the response indicates success, the code proceeds to extract and organize the data using the 'extract_summary_from_response' function, creating structured summaries for gaseous pollutants. These summaries are then appended to the 'all_gaseous_data' list for later analysis.

In cases where no data is available or an error occurs during data retrieval, the code provides informative messages. This process allows for the collection of historical air quality data, particularly for gaseous pollutants, over the specified years, making it a valuable resource for environmental analysis and research. Similarly, it is done for particulate matter too.

In [61]:
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_GASEOUS
request_data['state'] = CITY_LOCATIONS['Williston']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['Williston']['fips'][2:]

# Initialize an empty list to store the results
all_gaseous_data = []

# Define the request template and base URL
request_template = "YOUR_REQUEST_TEMPLATE_HERE"
base_url = "YOUR_BASE_URL_HERE"

# Define the years you want to retrieve data for
start_year = 1963
end_year = 2023

# Loop through the years and request daily summary data
for year in range(start_year, end_year + 1):
    begin_date = f"{year}0101"
    end_date = f"{year}1231"

    # Make the request for the current year
    gaseous_aqi = request_daily_summary(request_template=request_data, begin_date=begin_date, end_date=end_date)
    particulate_aqi = request_daily_summary(request_template=request_data, begin_date=begin_date, end_date=end_date)
    # Check the response and append data to the list
    if gaseous_aqi["Header"][0]['status'] == "Success":
        #all_gaseous_data.extend(gaseous_aqi['Data'])
        extract_gaseous = extract_summary_from_response(gaseous_aqi)
        #print("Summary of particulate extraction ...")
        #print(json.dumps(extract_gaseous,indent=4))
        all_gaseous_data.append(extract_gaseous)

    elif gaseous_aqi["Header"][0]['status'].startswith("No data "):
        print(f"No data available for {year}.")

    else:
        print(f"Error in retrieving data for {year}.")

No data available for 1963.
No data available for 1964.
No data available for 1965.
No data available for 1966.
No data available for 1967.
No data available for 1968.
No data available for 1969.
No data available for 1970.
No data available for 1971.
No data available for 1972.
No data available for 1973.
No data available for 1974.
No data available for 1975.
No data available for 1976.
No data available for 1977.
No data available for 1978.
No data available for 1979.
No data available for 1980.


In [44]:
data = open('/content/drive/MyDrive/gaseous.json', "w")
json.dump(all_gaseous_data, data, indent = 4)
data.close()

In [62]:
request_data = AQS_REQUEST_TEMPLATE.copy()
request_data['email'] = USERNAME
request_data['key'] = APIKEY
request_data['param'] = AQI_PARAMS_PARTICULATES
request_data['state'] = CITY_LOCATIONS['Williston']['fips'][:2]
request_data['county'] = CITY_LOCATIONS['Williston']['fips'][2:]

# Initialize an empty list to store the results
all_particulate_data = []

# Define the request template and base URL
request_template = "YOUR_REQUEST_TEMPLATE_HERE"
base_url = "YOUR_BASE_URL_HERE"

# Define the years you want to retrieve data for
start_year = 1963
end_year = 2023

# Loop through the years and request daily summary data
for year in range(start_year, end_year + 1):
    begin_date = f"{year}0101"
    end_date = f"{year}1231"

    # Make the request for the current year
    particulate_aqi = request_daily_summary(request_template=request_data, begin_date=begin_date, end_date=end_date)

    # Check the response and append data to the list
    if particulate_aqi["Header"][0]['status'] == "Success":
        #all_gaseous_data.extend(gaseous_aqi['Data'])
        extract_particulate = extract_summary_from_response(particulate_aqi)
        #print("Summary of particulate extraction ...")
        #print(json.dumps(extract_gaseous,indent=4))
        all_particulate_data.append(extract_particulate)

    elif particulate_aqi["Header"][0]['status'].startswith("No data "):
        print(f"No data available for {year}.")

    else:
        print(f"Error in retrieving data for {year}.")

No data available for 1963.
No data available for 1964.
No data available for 1965.
No data available for 1966.
No data available for 1967.
No data available for 1968.
No data available for 1969.
No data available for 1970.
No data available for 1971.
No data available for 1972.
No data available for 1973.
No data available for 1974.
No data available for 1975.
No data available for 1976.
No data available for 1977.
No data available for 1978.
No data available for 1979.
No data available for 1980.
No data available for 1981.
No data available for 1982.
No data available for 1983.
No data available for 1984.
No data available for 1999.
No data available for 2000.
No data available for 2001.
No data available for 2002.
No data available for 2003.
No data available for 2004.
No data available for 2005.
No data available for 2006.
No data available for 2007.
No data available for 2008.
No data available for 2009.
No data available for 2010.
No data available for 2011.
No data available fo

In [63]:
data = open('/content/drive/MyDrive/particulate.json', "w")
json.dump(all_particulate_data, data, indent = 4)
data.close()

The provided code defines a recursive function, 'extract_aqi,' that is designed to parse and extract specific data elements from a nested dictionary structure, particularly focusing on extracting air quality index (AQI) values and related information. The function takes 'data' as input and allows for the inclusion of 'dynamic_keys,' which can be used to track the hierarchical structure of the data being processed. The code iterates through the dictionary, and for each key-value pair, it checks if the 'value' is a dictionary or a list. If it's a dictionary, the function recursively calls itself, extending the 'dynamic_keys' with the current key and exploring the nested structure further. When it encounters a list of dictionaries, it extracts 'aqi' and 'sample_duration' values and stores them along with the associated keys in the 'extracted_data' list. This function is useful for navigating complex nested data structures and isolating specific information, such as AQI values, making it easier to work with and analyze such data in a structured and organized manner.

In [65]:
def extract_aqi(data, dynamic_keys=None):
    if dynamic_keys is None:
        dynamic_keys = []

    extracted_data = []

    for key, value in data.items():
        current_keys = dynamic_keys + [key]

        if isinstance(value, dict):
            extracted_data.extend(extract_aqi(value, current_keys))
        elif isinstance(value, list):
            for item in value:
                if isinstance(item, dict):
                    aqi = item.get("aqi")
                    sample_duration = item.get("sample_duration")
                    if aqi is not None:
                        extracted_data.append({"keys": current_keys,"sample_duration": sample_duration, "aqi": aqi})

    return extracted_data

In [66]:
# Initialize a list to store the extracted data
extracted_data_gaseous = []

extracted_data_particulate = []


# Iterate through the list of datasets
for dataset in all_gaseous_data:
    extracted_data_gaseous.extend(extract_aqi(dataset))


for dataset in all_particulate_data:
    extracted_data_particulate.extend(extract_aqi(dataset))

# Print or further process the extracted data
for entry in extracted_data_gaseous:
    keys = " > ".join(entry["keys"])
    aqi = entry["aqi"]
    sample = entry['sample_duration']

for entry in extracted_data_particulate:
    keys = " > ".join(entry["keys"])
    aqi = entry["aqi"]
    sample = entry['sample_duration']

In [67]:
formatted_data1 = []

for entry in extracted_data_particulate:
    keys = entry['keys']
    date = keys[-1]
    aqi = entry['aqi']
    sample_duration = entry['sample_duration']
    pollutant_type = keys[-3]

    formatted_data1.append({'Date': date, 'AQI': aqi, 'Sample_Duration': sample_duration, 'Pollutant_Type': pollutant_type})

# Create a DataFrame from the formatted data
particulate_df = pd.DataFrame(formatted_data1)

# Convert the 'Date' column to a datetime format
particulate_df['Date'] = pd.to_datetime(particulate_df['Date'], format='%Y%m%d')

formatted_data2 = []

for entry in extracted_data_gaseous:
    keys = entry['keys']
    date = keys[-1]
    aqi = entry['aqi']
    sample_duration = entry['sample_duration']
    pollutant_type = keys[-3]

    formatted_data2.append({'Date': date, 'AQI': aqi, 'Sample_Duration': sample_duration, 'Pollutant_Type': pollutant_type})

# Create a DataFrame from the formatted data
gaseous_df = pd.DataFrame(formatted_data2)

# Convert the 'Date' column to a datetime format
gaseous_df['Date'] = pd.to_datetime(gaseous_df['Date'], format='%Y%m%d')

In [68]:
gaseous_df.head()

Unnamed: 0,Date,AQI,Sample_Duration,Pollutant_Type
0,1981-08-20,0,1 HOUR,42401
1,1981-08-20,0,1 HOUR,42401
2,1981-08-21,0,1 HOUR,42401
3,1981-08-21,0,1 HOUR,42401
4,1981-08-22,0,1 HOUR,42401


In [69]:
particulate_df.head()

Unnamed: 0,Date,AQI,Sample_Duration,Pollutant_Type
0,1985-06-14,29,24 HOUR,81102
1,1985-06-16,54,24 HOUR,81102
2,1985-06-18,29,24 HOUR,81102
3,1985-06-20,26,24 HOUR,81102
4,1985-06-22,39,24 HOUR,81102


In [70]:
gaseous_df.to_csv("/content/drive/MyDrive/gaseous_aqi_data_processed.csv")
particulate_df.to_csv("/content/drive/MyDrive/particulate_aqi_data_processed.csv")

In this code, two original datasets, 'particulate_df' and 'gaseous_df,' likely containing air quality data for particulate and gaseous pollutants, undergo a series of data transformations. These transformations include dropping unnecessary columns, grouping the data by date, calculating the average air quality index (AQI) values for each pollutant type, and merging the results into a single DataFrame, 'combined_df.' The maximum AQI value between gaseous and particulate pollutants is chosen for each date, and missing values are filled using available data. The code also extracts the 'Year' from the date and calculates the mean AQI for each year, resulting in the 'aqi_df' DataFrame, which provides a concise and structured summary of annual AQI trends for further analysis.

In [71]:
particulate_df1 = particulate_df.copy()
gaseous_df1 = gaseous_df.copy()
particulate_df1 = particulate_df1.drop(['Sample_Duration', 'Pollutant_Type'], axis=1)
gaseous_df1 = gaseous_df1.drop(['Sample_Duration', 'Pollutant_Type'], axis=1)
# Group by 'Date' and calculate the mean for each group
particulate_avg_df = particulate_df1.groupby('Date', as_index=False).mean()
gaseous_avg_df = gaseous_df1.groupby('Date', as_index=False).mean()
# Merge the DataFrames on 'Date' and take the maximum AQI or fill with values from either DataFrame
combined_df = gaseous_avg_df.merge(particulate_avg_df, on='Date', how='outer', suffixes=('_gaseous', '_particulate'))
combined_df['AQI'] = combined_df[['AQI_gaseous', 'AQI_particulate']].max(axis=1)
combined_df['AQI'].fillna(combined_df['AQI_gaseous'].combine_first(combined_df['AQI_particulate']), inplace=True)
combined_df.drop(columns=['AQI_gaseous', 'AQI_particulate'], inplace=True)
# Extract the year from the 'Date' column
combined_df['Year'] = combined_df['Date'].dt.year
# Group by 'Year' and calculate the mean AQI for each year
aqi_df = combined_df.groupby('Year')['AQI'].mean().reset_index()
# Displaying what aqi_df looks like
aqi_df.head()

Unnamed: 0,Year,AQI
0,1981,0.19469
1,1982,7.76431
2,1983,13.492582
3,1984,14.084699
4,1985,16.915966


In [72]:
aqi_df.to_csv("/content/drive/MyDrive/aqi.csv")