# Setting environment

In [1]:
from pathlib import Path
import os
# get current file directory
cfd = Path.cwd()
print(f"Current file directory: {cfd}")
# Step up to the project root only while the kernel is still inside `code/`.
# The kernel keeps its working directory between runs, so without this guard
# re-running the cell would climb one level higher each time and break every
# relative path (data/raw, data/geographical, code/keys.yaml) used below.
new_cwd = cfd.parent if cfd.name == "code" else cfd
os.chdir(new_cwd)
print(f"Current working directory changed to: {Path.cwd()}")

Current file directory: /mnt/c/Users/Usuario/PycharmProjects/data-life-cycle-project-2025/code
Current working directory changed to: /mnt/c/Users/Usuario/PycharmProjects/data-life-cycle-project-2025


In [2]:
# list the contents of the current working directory (shell escape)
!ls

LICENSE  README.md  code  data	input  tex


# Explore asthma dataset

## Inspect metadata

In [4]:
# load metadata
import json
raw_data_dir = Path("data/raw")
# JSON is UTF-8 by specification and this file contains non-ASCII characters
# (typographic quotes in the description), so decode it explicitly instead of
# relying on the platform's default text encoding.
with open(raw_data_dir / "metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)
# pretty print data
import pprint
pprint.pprint(metadata)

{'@type': 'dcat:Dataset',
 'accessLevel': 'public',
 'contactPoint': {'@type': 'vcard:Contact',
                  'fn': 'California Department of Public Health, California '
                        'Breathing Asthma Program',
                  'hasEmail': 'mailto:California.Breathing@cdph.ca.gov'},
 'description': 'This dataset contains the estimated percentage of '
                'Californians with asthma (asthma prevalence). Two types of '
                'asthma prevalence are included: 1) lifetime asthma prevalence '
                'describes the percentage of people who have ever been '
                'diagnosed with asthma by a health care provider, 2) current '
                'asthma prevalence describes the percentage of people who have '
                'ever been diagnosed with asthma by a health care provider AND '
                'report they still have asthma and/or had an asthma episode or '
                'attack within the past 12 months. The tables “Lifetime Asthm

## Download and extract geographical data

In [7]:
import requests
# download county shapefiles from US Census Bureau
shapes_url = 'https://www2.census.gov/geo/tiger/TIGER2025/COUNTY/tl_2025_us_county.zip'
geo_dir = Path("data/geographical")
geo_dir.mkdir(parents=True, exist_ok=True)
shapes_path = geo_dir / "tl_2025_us_county.zip"
if not shapes_path.exists():
    print(f"Downloading county shapefiles from {shapes_url}...")
    # Download to a temporary name and rename only on success: the existence
    # of `shapes_path` is what marks the download as done, so a failed or
    # interrupted transfer must never leave a file under that name.
    partial_path = shapes_path.with_name(shapes_path.name + ".part")
    # stream=True avoids holding the whole archive in memory; the timeout
    # keeps a stalled connection from hanging the notebook forever.
    with requests.get(shapes_url, stream=True, timeout=60) as r:
        # fail loudly on HTTP errors instead of saving an error page as a zip
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    partial_path.replace(shapes_path)
    print(f"Downloaded to {shapes_path}")
else:
    print(f"Shapefiles already exist at {shapes_path}")

Downloading county shapefiles from https://www2.census.gov/geo/tiger/TIGER2025/COUNTY/tl_2025_us_county.zip...
Downloaded to data/geographical/tl_2025_us_county.zip


In [8]:
# unpack the downloaded archive into its own subdirectory
import zipfile
extract_dir = geo_dir / "california counties"
with zipfile.ZipFile(shapes_path, mode='r') as archive:
    archive.extractall(path=extract_dir)
print(f"Extracted shapefiles to {extract_dir}")

Extracted shapefiles to data/geographical/california counties


## Inspect geographical data

In [13]:
# read the extracted county shapefile into a GeoDataFrame
import geopandas as gpd
shapefile_path = geo_dir.joinpath("california counties", "tl_2025_us_county.shp")
counties_gdf = gpd.read_file(shapefile_path)
# preview the first rows only
counties_gdf.head(5)

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,GEOIDFQ,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,40,75,1101825,40075,0500000US40075,Kiowa,Kiowa County,6,H1,G4020,,,,A,2629039892,40296743,34.9214893,-98.9816168,"POLYGON ((-98.95506 35.11643, -98.94903 35.116..."
1,46,79,1265776,46079,0500000US46079,Lake,Lake County,6,H1,G4020,,,,A,1457916151,31746795,44.0284497,-97.1232229,"POLYGON ((-96.88886 43.9353, -96.88886 43.9351..."
2,37,33,1008542,37033,0500000US37033,Caswell,Caswell County,6,H1,G4020,,,,A,1102042927,8293623,36.3943252,-79.3396193,"POLYGON ((-79.14343 36.4422, -79.14345 36.4418..."
3,48,377,1383974,48377,0500000US48377,Presidio,Presidio County,6,H1,G4020,,,,A,9985057447,1773188,30.0058912,-104.2616192,"POLYGON ((-104.98078 30.62552, -104.98073 30.6..."
4,39,57,1074041,39057,0500000US39057,Greene,Greene County,6,H1,G4020,212.0,19430.0,,A,1071302625,6798109,39.6874785,-83.8948943,"POLYGON ((-84.10668 39.68891, -84.10662 39.689..."


In [14]:
# California's state FIPS code is 06; the comparison is against the string '06'
is_california = counties_gdf['STATEFP'] == '06'
california_counties = counties_gdf.loc[is_california]
california_counties

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,GEOIDFQ,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
31,6,77,277303,6077,0500000US06077,San Joaquin,San Joaquin County,6,H1,G4020,488.0,44700.0,,A,3606041616,88864241,37.9349815,-121.272244,"POLYGON ((-121.17845 37.70553, -121.17886 37.7..."
74,6,25,277277,6025,0500000US06025,Imperial,Imperial County,6,H1,G4020,,20940.0,,A,10814374223,790135128,33.0408143,-115.3554001,"POLYGON ((-114.72652 32.71827, -114.72706 32.7..."
272,6,89,1682610,6089,0500000US06089,Shasta,Shasta County,6,H1,G4020,454.0,39820.0,,A,9778891282,185818274,40.7605142,-122.0435558,"POLYGON ((-121.32288 40.58492, -121.32288 40.5..."
357,6,23,1681908,6023,0500000US06023,Humboldt,Humboldt County,6,H1,G4020,,21700.0,,A,9241141620,1254149638,40.7066554,-123.9261757,"POLYGON ((-124.482 40.44032, -124.47916 40.452..."
362,6,59,277294,6059,0500000US06059,Orange,Orange County,6,H1,G4020,348.0,31080.0,11244.0,A,2054504217,405282838,33.6756872,-117.7772068,"POLYGON ((-118.09706 33.77438, -118.09706 33.7..."
383,6,47,277288,6047,0500000US06047,Merced,Merced County,6,H1,G4020,488.0,32900.0,,A,5019448521,105236254,37.1948063,-120.7228019,"POLYGON ((-120.5417 37.0445, -120.54179 37.044..."
442,6,105,277317,6105,0500000US06105,Trinity,Trinity County,6,H1,G4020,,,,A,8234265082,73407950,40.6478582,-123.114666,"POLYGON ((-122.99825 40.41821, -122.99868 40.4..."
452,6,53,277291,6053,0500000US06053,Monterey,Monterey County,6,H1,G4020,,41500.0,,A,8499610242,1267806310,36.2401044,-121.3155781,"POLYGON ((-122.05188 36.8196, -122.04172 36.82..."
462,6,5,1675841,6005,0500000US06005,Amador,Amador County,6,H1,G4020,,,,A,1539967079,29437117,38.4435493,-120.653858,"POLYGON ((-121.02771 38.50011, -121.0277 38.50..."
817,6,71,277300,6071,0500000US06071,San Bernardino,San Bernardino County,6,H1,G4020,348.0,40140.0,,A,51976967449,96404497,34.8566615,-116.1815707,"POLYGON ((-114.82952 34.0796, -114.83076 34.07..."


## Query data from OpenAQ API

In [23]:
import yaml

# Credentials are read from code/keys.yaml (expected layout: api_keys -> OpenAQ).
with open(Path.cwd() / "code" / "keys.yaml", 'r') as f:
    keys = yaml.safe_load(f)
API_KEY = keys['api_keys']['OpenAQ']
# Never leave the key as the cell's last expression: cell outputs are stored
# inside the notebook file and would leak the secret to anyone who can read it.
print("OpenAQ API key loaded")

'<redacted>'

In [26]:
# define bounding box for California
x_range = (-125, -113)
y_range = (32, 42.5)

import requests

BASE = "https://api.openaq.org/v3"
headers = {"X-API-Key": API_KEY}

all_results = []
page = 1
limit = 1000   # OpenAQ v3 maximum

# bbox is "min_lon,min_lat,max_lon,max_lat"; it does not change between pages,
# so build it once outside the loop
bbox = f"{x_range[0]},{y_range[0]},{x_range[1]},{y_range[1]}"

# Walk the pages until the API returns an empty page.
while True:
    r = requests.get(
        f"{BASE}/locations",
        headers=headers,
        params={
            "bbox": bbox,
            "limit": limit,
            "page": page,
            # NOTE(review): OpenAQ v3 appears to document `iso` / `countries_id`
            # for country filtering -- confirm that `country` is honoured.
            "country": "US",   # optional but helps validation
        },
        timeout=60,
    )
    r.raise_for_status()
    data = r.json()

    results = data.get("results", [])
    if not results:
        break

    all_results.extend(results)
    print(f"page {page}: {len(results)}")

    page += 1

print("TOTAL returned:", len(all_results))
# Guard the preview: indexing an empty list would raise IndexError when the
# query matches no location at all.
if all_results:
    print("first name:", all_results[0].get("name"))
else:
    print("first name: <no locations returned>")

page 1: 1000
page 2: 980
TOTAL returned: 1980
first name: MMFRA1001


In [28]:
# flatten the nested location records into a table (nested keys become dotted columns)
import pandas as pd
stations_df = pd.json_normalize(data=all_results)
# preview the first rows only
stations_df.head(n=5)

Unnamed: 0,id,name,locality,timezone,isMobile,isMonitor,instruments,sensors,licenses,bounds,...,provider.id,provider.name,coordinates.latitude,coordinates.longitude,datetimeFirst.utc,datetimeFirst.local,datetimeLast.utc,datetimeLast.local,datetimeFirst,datetimeLast
0,207,MMFRA1001,,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 350, 'name': 'pm25 µg/m³', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-121.221235, 39.482481, -121.221235, 39.482481]",...,119,AirNow,39.482481,-121.221235,2016-03-06T20:00:00Z,2016-03-06T12:00:00-08:00,2016-03-16T22:00:00Z,2016-03-16T15:00:00-07:00,,
1,211,Felton Cal-Fire,,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 354, 'name': 'pm25 µg/m³', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-122.074603, 37.0481, -122.074603, 37.0481]",...,119,AirNow,37.0481,-122.074603,2016-03-07T15:00:00Z,2016-03-07T07:00:00-08:00,2022-04-08T18:00:00Z,2022-04-08T11:00:00-07:00,,
2,214,MMFRA1001,,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 357, 'name': 'pm25 µg/m³', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-121.221128, 39.482385, -121.221128, 39.482385]",...,119,AirNow,39.482385,-121.221128,2016-03-11T17:00:00Z,2016-03-11T09:00:00-08:00,2016-03-16T05:00:00Z,2016-03-15T22:00:00-07:00,,
3,230,Barstow,Riverside-San Bernardino-Ontario,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 2637, 'name': 'o3 ppm', 'parameter': {...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-117.024756, 34.894054, -117.024756, 34.894054]",...,119,AirNow,34.894054,-117.024756,2016-03-06T20:00:00Z,2016-03-06T12:00:00-08:00,2025-12-21T11:00:00Z,2025-12-21T03:00:00-08:00,,
4,231,Lucerne Valley,SAN BERNARDINO,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 7941720, 'name': 'o3 ppm', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-116.906874, 34.4101, -116.906874, 34.4101]",...,119,AirNow,34.4101,-116.906874,2016-03-13T10:00:00Z,2016-03-13T03:00:00-07:00,2025-12-21T11:00:00Z,2025-12-21T03:00:00-08:00,,


In [29]:
# persist every station returned by the bounding-box query alongside the raw data
stations_path = raw_data_dir.joinpath("openaq_stations_bbox.csv")
stations_df.to_csv(path_or_buf=stations_path, index=False)
print(f"Saved OpenAQ stations data to {stations_path}")

Saved OpenAQ stations data to data/raw/openaq_stations_bbox.csv


## Filter stations in California

In [31]:
# one geometry per county, indexed by county name and sorted alphabetically
county_geometry = (
    california_counties[['NAME', 'geometry']]
    .set_index('NAME')
    .sort_index()
)
county_geometry

Unnamed: 0_level_0,geometry
NAME,Unnamed: 1_level_1
Alameda,"POLYGON ((-122.37312 37.88388, -122.37114 37.8..."
Alpine,"POLYGON ((-120.07333 38.70109, -120.07324 38.7..."
Amador,"POLYGON ((-121.02771 38.50011, -121.0277 38.50..."
Butte,"POLYGON ((-122.06943 39.84053, -122.06874 39.8..."
Calaveras,"POLYGON ((-120.6318 38.34603, -120.63066 38.34..."
Colusa,"POLYGON ((-121.91512 38.92535, -121.91527 38.9..."
Contra Costa,"POLYGON ((-121.69732 37.78244, -121.69748 37.7..."
Del Norte,"POLYGON ((-124.31611 41.72839, -124.33061 41.7..."
El Dorado,"POLYGON ((-120.18443 39.03101, -120.1841 39.03..."
Fresno,"POLYGON ((-119.57319 36.48884, -119.57322 36.4..."


In [32]:
# add county column to stations_df
from shapely.geometry import Point
def get_county(lon, lat, county_gdf):
    """Return the index label of the first county whose geometry contains the point.

    Parameters
    ----------
    lon, lat : coordinates used to build the point as ``Point(lon, lat)``.
    county_gdf : table with a ``geometry`` column; its index labels are the
        values returned (county names in this notebook).

    Returns
    -------
    The index label of the first geometry, in index order, for which
    ``geometry.contains(point)`` is true, or ``None`` when no geometry matches.
    """
    location = Point(lon, lat)
    # walk the geometry column directly; only the label and the shape are needed
    for label, shape in county_gdf['geometry'].items():
        if shape.contains(location):
            return label
    return None
def _station_county(station):
    """Look up the county for one station row from its longitude/latitude columns."""
    return get_county(
        station['coordinates.longitude'],
        station['coordinates.latitude'],
        county_geometry,
    )

# row-wise lookup; stations outside every county polygon get no county
stations_df['county'] = stations_df.apply(_station_county, axis=1)
stations_df

Unnamed: 0,id,name,locality,timezone,isMobile,isMonitor,instruments,sensors,licenses,bounds,...,provider.name,coordinates.latitude,coordinates.longitude,datetimeFirst.utc,datetimeFirst.local,datetimeLast.utc,datetimeLast.local,datetimeFirst,datetimeLast,county
0,207,MMFRA1001,,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 350, 'name': 'pm25 µg/m³', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-121.221235, 39.482481, -121.221235, 39.482481]",...,AirNow,39.482481,-121.221235,2016-03-06T20:00:00Z,2016-03-06T12:00:00-08:00,2016-03-16T22:00:00Z,2016-03-16T15:00:00-07:00,,,Yuba
1,211,Felton Cal-Fire,,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 354, 'name': 'pm25 µg/m³', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-122.074603, 37.0481, -122.074603, 37.0481]",...,AirNow,37.048100,-122.074603,2016-03-07T15:00:00Z,2016-03-07T07:00:00-08:00,2022-04-08T18:00:00Z,2022-04-08T11:00:00-07:00,,,Santa Cruz
2,214,MMFRA1001,,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 357, 'name': 'pm25 µg/m³', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-121.221128, 39.482385, -121.221128, 39.482385]",...,AirNow,39.482385,-121.221128,2016-03-11T17:00:00Z,2016-03-11T09:00:00-08:00,2016-03-16T05:00:00Z,2016-03-15T22:00:00-07:00,,,Yuba
3,230,Barstow,Riverside-San Bernardino-Ontario,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 2637, 'name': 'o3 ppm', 'parameter': {...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-117.024756, 34.894054, -117.024756, 34.894054]",...,AirNow,34.894054,-117.024756,2016-03-06T20:00:00Z,2016-03-06T12:00:00-08:00,2025-12-21T11:00:00Z,2025-12-21T03:00:00-08:00,,,San Bernardino
4,231,Lucerne Valley,SAN BERNARDINO,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 7941720, 'name': 'o3 ppm', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-116.906874, 34.4101, -116.906874, 34.4101]",...,AirNow,34.410100,-116.906874,2016-03-13T10:00:00Z,2016-03-13T03:00:00-07:00,2025-12-21T11:00:00Z,2025-12-21T03:00:00-08:00,,,San Bernardino
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1975,6170870,North Alhambra,,America/Los_Angeles,False,False,"[{'id': 7, 'name': 'Unknown AirGradient Sensor'}]","[{'id': 14935594, 'name': 'pm1 µg/m³', 'parame...","[{'id': 41, 'name': 'CC BY 4.0', 'attribution'...","[-118.13855657348095, 34.09975528725245, -118....",...,AirGradient,34.099755,-118.138557,2025-12-19T20:00:00Z,2025-12-19T12:00:00-08:00,2025-12-21T09:00:00Z,2025-12-21T01:00:00-08:00,,,Los Angeles
1976,6172339,Carson City - Indian Hills,,America/Los_Angeles,False,False,"[{'id': 7, 'name': 'Unknown AirGradient Sensor'}]","[{'id': 14946536, 'name': 'pm1 µg/m³', 'parame...","[{'id': 41, 'name': 'CC BY 4.0', 'attribution'...","[-119.779649, 39.083528, -119.779649, 39.083528]",...,AirGradient,39.083528,-119.779649,2025-12-20T22:00:00Z,2025-12-20T14:00:00-08:00,2025-12-21T09:00:00Z,2025-12-21T01:00:00-08:00,,,
1977,6172496,SJVAir-af04,,America/Los_Angeles,False,False,"[{'id': 7, 'name': 'Unknown AirGradient Sensor'}]","[{'id': 14947719, 'name': 'pm1 µg/m³', 'parame...","[{'id': 41, 'name': 'CC BY 4.0', 'attribution'...","[-119.66648590037425, 36.821894255074376, -119...",...,AirGradient,36.821894,-119.666486,2025-12-21T01:00:00Z,2025-12-20T17:00:00-08:00,2025-12-21T09:00:00Z,2025-12-21T01:00:00-08:00,,,Fresno
1978,6172684,"Center St, Stratford",,America/Los_Angeles,False,False,"[{'id': 7, 'name': 'Unknown AirGradient Sensor'}]","[{'id': 14949136, 'name': 'pm1 µg/m³', 'parame...","[{'id': 41, 'name': 'CC BY 4.0', 'attribution'...","[-119.82181, 36.194647, -119.82181, 36.194647]",...,AirGradient,36.194647,-119.821810,2025-12-21T05:00:00Z,2025-12-20T21:00:00-08:00,2025-12-21T09:00:00Z,2025-12-21T01:00:00-08:00,,,Kings


In [34]:
# keep only the stations that were matched to a California county
has_county = stations_df['county'].notna()
california_stations_df = stations_df.loc[has_county]
california_stations_df

Unnamed: 0,id,name,locality,timezone,isMobile,isMonitor,instruments,sensors,licenses,bounds,...,provider.name,coordinates.latitude,coordinates.longitude,datetimeFirst.utc,datetimeFirst.local,datetimeLast.utc,datetimeLast.local,datetimeFirst,datetimeLast,county
0,207,MMFRA1001,,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 350, 'name': 'pm25 µg/m³', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-121.221235, 39.482481, -121.221235, 39.482481]",...,AirNow,39.482481,-121.221235,2016-03-06T20:00:00Z,2016-03-06T12:00:00-08:00,2016-03-16T22:00:00Z,2016-03-16T15:00:00-07:00,,,Yuba
1,211,Felton Cal-Fire,,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 354, 'name': 'pm25 µg/m³', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-122.074603, 37.0481, -122.074603, 37.0481]",...,AirNow,37.048100,-122.074603,2016-03-07T15:00:00Z,2016-03-07T07:00:00-08:00,2022-04-08T18:00:00Z,2022-04-08T11:00:00-07:00,,,Santa Cruz
2,214,MMFRA1001,,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 357, 'name': 'pm25 µg/m³', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-121.221128, 39.482385, -121.221128, 39.482385]",...,AirNow,39.482385,-121.221128,2016-03-11T17:00:00Z,2016-03-11T09:00:00-08:00,2016-03-16T05:00:00Z,2016-03-15T22:00:00-07:00,,,Yuba
3,230,Barstow,Riverside-San Bernardino-Ontario,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 2637, 'name': 'o3 ppm', 'parameter': {...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-117.024756, 34.894054, -117.024756, 34.894054]",...,AirNow,34.894054,-117.024756,2016-03-06T20:00:00Z,2016-03-06T12:00:00-08:00,2025-12-21T11:00:00Z,2025-12-21T03:00:00-08:00,,,San Bernardino
4,231,Lucerne Valley,SAN BERNARDINO,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 7941720, 'name': 'o3 ppm', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...","[-116.906874, 34.4101, -116.906874, 34.4101]",...,AirNow,34.410100,-116.906874,2016-03-13T10:00:00Z,2016-03-13T03:00:00-07:00,2025-12-21T11:00:00Z,2025-12-21T03:00:00-08:00,,,San Bernardino
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1974,6169863,1417 Cabrillo Ave,,America/Los_Angeles,False,False,"[{'id': 7, 'name': 'Unknown AirGradient Sensor'}]","[{'id': 14928467, 'name': 'pm1 µg/m³', 'parame...","[{'id': 41, 'name': 'CC BY 4.0', 'attribution'...","[-122.37466, 37.58592, -122.37466, 37.58592]",...,AirGradient,37.585920,-122.374660,2025-12-19T01:00:00Z,2025-12-18T17:00:00-08:00,2025-12-21T09:00:00Z,2025-12-21T01:00:00-08:00,,,San Mateo
1975,6170870,North Alhambra,,America/Los_Angeles,False,False,"[{'id': 7, 'name': 'Unknown AirGradient Sensor'}]","[{'id': 14935594, 'name': 'pm1 µg/m³', 'parame...","[{'id': 41, 'name': 'CC BY 4.0', 'attribution'...","[-118.13855657348095, 34.09975528725245, -118....",...,AirGradient,34.099755,-118.138557,2025-12-19T20:00:00Z,2025-12-19T12:00:00-08:00,2025-12-21T09:00:00Z,2025-12-21T01:00:00-08:00,,,Los Angeles
1977,6172496,SJVAir-af04,,America/Los_Angeles,False,False,"[{'id': 7, 'name': 'Unknown AirGradient Sensor'}]","[{'id': 14947719, 'name': 'pm1 µg/m³', 'parame...","[{'id': 41, 'name': 'CC BY 4.0', 'attribution'...","[-119.66648590037425, 36.821894255074376, -119...",...,AirGradient,36.821894,-119.666486,2025-12-21T01:00:00Z,2025-12-20T17:00:00-08:00,2025-12-21T09:00:00Z,2025-12-21T01:00:00-08:00,,,Fresno
1978,6172684,"Center St, Stratford",,America/Los_Angeles,False,False,"[{'id': 7, 'name': 'Unknown AirGradient Sensor'}]","[{'id': 14949136, 'name': 'pm1 µg/m³', 'parame...","[{'id': 41, 'name': 'CC BY 4.0', 'attribution'...","[-119.82181, 36.194647, -119.82181, 36.194647]",...,AirGradient,36.194647,-119.821810,2025-12-21T05:00:00Z,2025-12-20T21:00:00-08:00,2025-12-21T09:00:00Z,2025-12-21T01:00:00-08:00,,,Kings


Let's see how many stations we have in each county.

In [37]:
# number of stations in each county, largest first
stations_per_county = california_stations_df.groupby('county').size()
stations_per_county.sort_values(ascending=False)

county
Los Angeles        403
Alameda            190
Mono                94
Contra Costa        87
Sacramento          83
San Francisco       66
Monterey            50
Sonoma              48
San Diego           46
San Luis Obispo     44
Inyo                43
Riverside           41
Kern                40
San Bernardino      36
Orange              36
Santa Barbara       35
Tulare              35
Santa Clara         34
San Mateo           34
Humboldt            33
Santa Cruz          32
El Dorado           28
Ventura             26
Shasta              25
Fresno              23
Yuba                22
Napa                19
Mendocino           19
Siskiyou            18
Marin               15
Butte               13
Merced              12
Alpine              12
Imperial            11
Kings               11
Lake                10
Placer              10
Mariposa            10
San Joaquin          9
Sutter               9
Yolo                 8
San Benito           8
Nevada               6
Plum

## Request data for each station
Since there are so many stations, we shall limit our request to 1 station per county.

In [38]:
# We choose one station per county
# NOTE: `groupby('county').first()` returns the first *non-null value of each
# column* within the group, not the first row. When the first station has a
# missing field (e.g. `locality`), that field is silently filled from a
# different station, producing a row that matches no real station.
# Selecting whole rows avoids that mixing.
selected_stations = (
    california_stations_df
    .drop_duplicates(subset='county', keep='first')   # first station row per county
    .sort_values('county')                            # same ordering as the groupby result
    .reset_index(drop=True)
)
# keep `county` as the leading column, as groupby(...).first().reset_index() did
other_columns = [col for col in selected_stations.columns if col != 'county']
selected_stations = selected_stations[['county', *other_columns]]
# NOTE(review): "first" is an arbitrary choice and may pick a station that
# stopped reporting years ago (see datetimeLast.utc) -- confirm this is intended.
selected_stations

Unnamed: 0,county,id,name,locality,timezone,isMobile,isMonitor,instruments,sensors,licenses,...,provider.id,provider.name,coordinates.latitude,coordinates.longitude,datetimeFirst.utc,datetimeFirst.local,datetimeLast.utc,datetimeLast.local,datetimeFirst,datetimeLast
0,Alameda,1021,Patterson Pass,San Francisco-Oakland-Fremont,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 1840, 'name': 'no2 ppm', 'parameter': ...","[{'id': 33, 'name': 'US Public Domain', 'attri...",...,119,AirNow,37.689615,-121.631916,2016-03-06T20:00:00Z,2016-03-06T12:00:00-08:00,2017-03-30T18:00:00Z,2017-03-30T11:00:00-07:00,,
1,Alpine,4958,GBUAPCD EBAM @ Alpin,GBU,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 12646, 'name': 'pm25 µg/m³', 'paramete...","[{'id': 33, 'name': 'US Public Domain', 'attri...",...,119,AirNow,38.766278,-119.807625,2017-08-10T22:00:00Z,2017-08-10T15:00:00-07:00,2017-11-14T21:00:00Z,2017-11-14T13:00:00-08:00,,
2,Amador,905,Jackson - Clinton,AMADOR,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 1656, 'name': 'o3 ppm', 'parameter': {...","[{'id': 33, 'name': 'US Public Domain', 'attri...",...,119,AirNow,38.34,-120.7625,2016-03-06T20:00:00Z,2016-03-06T12:00:00-08:00,2019-04-18T17:00:00Z,2019-04-18T10:00:00-07:00,,
3,Butte,889,Paradise - Airport,BUTTE,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 1619, 'name': 'o3 ppm', 'parameter': {...","[{'id': 33, 'name': 'US Public Domain', 'attri...",...,119,AirNow,39.7564,-121.6044,2016-03-06T20:00:00Z,2016-03-06T12:00:00-08:00,2016-08-02T11:00:00Z,2016-08-02T04:00:00-07:00,,
4,Calaveras,6891,San Andreas,CALAVERAS,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 19798, 'name': 'o3 ppm', 'parameter': ...","[{'id': 33, 'name': 'US Public Domain', 'attri...",...,119,AirNow,38.20185,-120.680277,2019-04-18T19:00:00Z,2019-04-18T12:00:00-07:00,2025-12-21T11:00:00Z,2025-12-21T03:00:00-08:00,,
5,Colusa,320,Cortina Rancheria,COLUSA,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 537, 'name': 'pm25 µg/m³', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...",...,119,AirNow,39.015,-122.286111,2016-03-06T20:00:00Z,2016-03-06T12:00:00-08:00,2019-09-17T14:00:00Z,2019-09-17T07:00:00-07:00,,
6,Contra Costa,1520,Bethel Island,San Francisco-Oakland-Fremont,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 25749, 'name': 'co ppm', 'parameter': ...","[{'id': 33, 'name': 'US Public Domain', 'attri...",...,119,AirNow,38.006311,-121.641918,2016-03-06T20:00:00Z,2016-03-06T12:00:00-08:00,2025-12-21T07:00:00Z,2025-12-20T23:00:00-08:00,,
7,Del Norte,7860,Crescent City,DEL NORTE,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 22825, 'name': 'pm25 µg/m³', 'paramete...","[{'id': 33, 'name': 'US Public Domain', 'attri...",...,119,AirNow,41.75613,-124.20347,2020-09-08T22:00:00Z,2020-09-08T15:00:00-07:00,2024-07-11T15:00:00Z,2024-07-11T08:00:00-07:00,,
8,El Dorado,883,Echo Summit (seasona,Sacramento--Arden-Arcade--Roseville,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 1608, 'name': 'o3 ppm', 'parameter': {...","[{'id': 33, 'name': 'US Public Domain', 'attri...",...,119,AirNow,38.81161,-120.03308,2016-03-30T22:00:00Z,2016-03-30T15:00:00-07:00,2024-11-06T17:00:00Z,2024-11-06T09:00:00-08:00,,
9,Fresno,787,Parlier,Fresno,America/Los_Angeles,False,True,"[{'id': 2, 'name': 'Government Monitor'}]","[{'id': 4272286, 'name': 'no ppm', 'parameter'...","[{'id': 33, 'name': 'US Public Domain', 'attri...",...,119,AirNow,36.5975,-119.5036,2016-03-10T08:00:00Z,2016-03-10T00:00:00-08:00,2025-12-21T11:00:00Z,2025-12-21T03:00:00-08:00,,


In [66]:
# count the sensors attached to each selected station, then total them
selected_stations['sensors_count'] = selected_stations['sensors'].map(len)
selected_stations['sensors_count'].sum()

np.int64(138)

In [67]:
# The API has a rate limit of 60 per minute and 2000 per hour.
from tqdm import tqdm
import time

SENSOR_BASE = "https://api.openaq.org/v3/sensors"
aq_data_path = Path("data/openaq_data")
aq_data_path.mkdir(parents=True, exist_ok=True)
metadata_columns = ['county', 'station_id', 'sensor_id', 'pollutant', 'unit']
# Collect plain records and build the DataFrame once at the end; appending to
# a DataFrame row by row copies data on every insert.
sensor_records = []
# The context manager closes the bar when the loop ends, so it is not
# rendered a second time later on.
with tqdm(total=int(selected_stations['sensors_count'].sum())) as pbar:
    for _, row in selected_stations.iterrows():
        for sensor in row['sensors']:
            sensor_id = sensor['id']
            # Record the metadata for EVERY sensor, cached or not. Recording it
            # only after a fresh download would make a re-run overwrite
            # sensor_metadata.csv with an incomplete (possibly empty) table.
            sensor_records.append({
                'county': row['county'],
                'station_id': row['id'],
                'sensor_id': sensor_id,
                'pollutant': sensor['parameter']['name'],
                'unit': sensor['parameter']['units']
            })
            # check if sensor data file already exists
            sensor_file_path = aq_data_path / f"sensor_{sensor_id}.json"
            if sensor_file_path.exists():
                pbar.update(1)
                continue
            # NOTE(review): a single request with no paging or date parameters
            # presumably returns only the first page of measurements -- confirm
            # that this is the intended amount of data.
            r = requests.get(
                f"{SENSOR_BASE}/{sensor_id}/measurements",
                headers=headers,
                timeout=60,
            )
            r.raise_for_status()
            # `results` is a list of measurements; default to an empty list
            sensor_data = r.json().get("results", [])
            # save sensor data to file
            with open(sensor_file_path, 'w') as f:
                json.dump(sensor_data, f, indent=4)
            pbar.set_description(f"Saved sensor data to {sensor_file_path}")
            pbar.update(1)
            # back off when the response headers say the quota is used up
            if r.headers.get('X-RateLimit-Remaining') == '0':
                reset_time = int(r.headers.get('X-RateLimit-Reset', 60))
                pbar.set_description(f"Rate limit reached. Sleeping for {reset_time + 1} seconds...")
                time.sleep(reset_time + 1)
sensor_metadata_df = pd.DataFrame(sensor_records, columns=metadata_columns)
sensor_metadata_df.to_csv(aq_data_path / "sensor_metadata.csv", index=False)
print(f"Saved sensor metadata to {aq_data_path / 'sensor_metadata.csv'}")

Saved sensor data to data/openaq_data/sensor_350.json: 100%|██████████████████████████| 138/138 [01:08<00:00,  1.10s/it]

Saved sensor metadata to data/openaq_data/sensor_metadata.csv


Saved sensor data to data/openaq_data/sensor_350.json: 100%|██████████████████████████| 138/138 [01:27<00:00,  1.10s/it]