# Setting environment

In [1]:
from pathlib import Path
import os
# Move the working directory from the notebook's folder to its parent (the
# project root) so that relative paths used later in the notebook resolve.
# The notebook's folder is recorded only once per kernel session: re-running
# this cell then targets the same parent again instead of climbing one more
# directory level on every execution.
if "cfd" not in globals():
    cfd = Path.cwd()
print(f"Current file directory: {cfd}")
new_cwd = cfd.parent
os.chdir(new_cwd)
print(f"Current working directory changed to: {Path.cwd()}")

Current file directory: /mnt/c/Users/Usuario/PycharmProjects/data-life-cycle-project-2025/code
Current working directory changed to: /mnt/c/Users/Usuario/PycharmProjects/data-life-cycle-project-2025


In [2]:
# List the visible (non-dot) entries of the current working directory.
# Pure-Python replacement for the `ls` shell escape, so the cell also runs on
# systems without a POSIX shell.
project_entries = sorted(
    entry.name for entry in Path.cwd().iterdir() if not entry.name.startswith(".")
)
project_entries

LICENSE  README.md  code  data	input  tex


# Explore asthma dataset

## Inspect metadata

In [4]:
import json
import pprint

# load metadata
raw_data_dir = Path("data/raw")
# Decode explicitly as UTF-8: the description field contains non-ASCII
# punctuation (curly quotes), and relying on the platform's default text
# encoding can mis-decode or fail on systems where that default is not UTF-8.
with open(raw_data_dir / "metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)
# pretty print data
pprint.pprint(metadata)

{'@type': 'dcat:Dataset',
 'accessLevel': 'public',
 'contactPoint': {'@type': 'vcard:Contact',
                  'fn': 'California Department of Public Health, California '
                        'Breathing Asthma Program',
                  'hasEmail': 'mailto:California.Breathing@cdph.ca.gov'},
 'description': 'This dataset contains the estimated percentage of '
                'Californians with asthma (asthma prevalence). Two types of '
                'asthma prevalence are included: 1) lifetime asthma prevalence '
                'describes the percentage of people who have ever been '
                'diagnosed with asthma by a health care provider, 2) current '
                'asthma prevalence describes the percentage of people who have '
                'ever been diagnosed with asthma by a health care provider AND '
                'report they still have asthma and/or had an asthma episode or '
                'attack within the past 12 months. The tables “Lifetime Asthm

## Download and extract geographical data

In [7]:
import requests

# download county shapefiles from US Census Bureau
shapes_url = 'https://www2.census.gov/geo/tiger/TIGER2025/COUNTY/tl_2025_us_county.zip'
geo_dir = Path("data/geographical")
geo_dir.mkdir(parents=True, exist_ok=True)
shapes_path = geo_dir / "tl_2025_us_county.zip"
if shapes_path.exists():
    # The archive is cached on disk; skip the download on re-runs.
    print(f"Shapefiles already exist at {shapes_path}")
else:
    print(f"Downloading county shapefiles from {shapes_url}...")
    # Write to a temporary sibling file and rename only after the transfer
    # completed, so an interrupted or failed download is never mistaken for a
    # valid cached archive by the exists() check above.
    partial_path = shapes_path.with_name(shapes_path.name + ".part")
    with requests.get(shapes_url, stream=True, timeout=60) as r:
        # Fail loudly on HTTP errors instead of saving an error page as a zip.
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            # Stream in 1 MiB chunks rather than holding the archive in memory.
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    partial_path.replace(shapes_path)
    print(f"Downloaded to {shapes_path}")

Downloading county shapefiles from https://www2.census.gov/geo/tiger/TIGER2025/COUNTY/tl_2025_us_county.zip...
Downloaded to data/geographical/tl_2025_us_county.zip


In [8]:
import zipfile

# Unpack every member of the downloaded archive into a dedicated folder
# below geo_dir.
with zipfile.ZipFile(shapes_path) as archive:
    archive.extractall(path=geo_dir / "california counties")
print(f"Extracted shapefiles to {geo_dir / 'california counties'}")

Extracted shapefiles to data/geographical/california counties


## Inspect geographical data

In [13]:
import geopandas as gpd

# Read the extracted county layer into a GeoDataFrame and preview the first rows.
shapefile_path = geo_dir.joinpath("california counties", "tl_2025_us_county.shp")
counties_gdf = gpd.read_file(shapefile_path)
counties_gdf.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,GEOIDFQ,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,40,75,1101825,40075,0500000US40075,Kiowa,Kiowa County,6,H1,G4020,,,,A,2629039892,40296743,34.9214893,-98.9816168,"POLYGON ((-98.95506 35.11643, -98.94903 35.116..."
1,46,79,1265776,46079,0500000US46079,Lake,Lake County,6,H1,G4020,,,,A,1457916151,31746795,44.0284497,-97.1232229,"POLYGON ((-96.88886 43.9353, -96.88886 43.9351..."
2,37,33,1008542,37033,0500000US37033,Caswell,Caswell County,6,H1,G4020,,,,A,1102042927,8293623,36.3943252,-79.3396193,"POLYGON ((-79.14343 36.4422, -79.14345 36.4418..."
3,48,377,1383974,48377,0500000US48377,Presidio,Presidio County,6,H1,G4020,,,,A,9985057447,1773188,30.0058912,-104.2616192,"POLYGON ((-104.98078 30.62552, -104.98073 30.6..."
4,39,57,1074041,39057,0500000US39057,Greene,Greene County,6,H1,G4020,212.0,19430.0,,A,1071302625,6798109,39.6874785,-83.8948943,"POLYGON ((-84.10668 39.68891, -84.10662 39.689..."


In [14]:
# Keep only the rows whose state FIPS code (STATEFP) is "06", i.e. California.
is_california = counties_gdf["STATEFP"].eq("06")
california_counties = counties_gdf.loc[is_california]
california_counties

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,GEOIDFQ,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
31,6,77,277303,6077,0500000US06077,San Joaquin,San Joaquin County,6,H1,G4020,488.0,44700.0,,A,3606041616,88864241,37.9349815,-121.272244,"POLYGON ((-121.17845 37.70553, -121.17886 37.7..."
74,6,25,277277,6025,0500000US06025,Imperial,Imperial County,6,H1,G4020,,20940.0,,A,10814374223,790135128,33.0408143,-115.3554001,"POLYGON ((-114.72652 32.71827, -114.72706 32.7..."
272,6,89,1682610,6089,0500000US06089,Shasta,Shasta County,6,H1,G4020,454.0,39820.0,,A,9778891282,185818274,40.7605142,-122.0435558,"POLYGON ((-121.32288 40.58492, -121.32288 40.5..."
357,6,23,1681908,6023,0500000US06023,Humboldt,Humboldt County,6,H1,G4020,,21700.0,,A,9241141620,1254149638,40.7066554,-123.9261757,"POLYGON ((-124.482 40.44032, -124.47916 40.452..."
362,6,59,277294,6059,0500000US06059,Orange,Orange County,6,H1,G4020,348.0,31080.0,11244.0,A,2054504217,405282838,33.6756872,-117.7772068,"POLYGON ((-118.09706 33.77438, -118.09706 33.7..."
383,6,47,277288,6047,0500000US06047,Merced,Merced County,6,H1,G4020,488.0,32900.0,,A,5019448521,105236254,37.1948063,-120.7228019,"POLYGON ((-120.5417 37.0445, -120.54179 37.044..."
442,6,105,277317,6105,0500000US06105,Trinity,Trinity County,6,H1,G4020,,,,A,8234265082,73407950,40.6478582,-123.114666,"POLYGON ((-122.99825 40.41821, -122.99868 40.4..."
452,6,53,277291,6053,0500000US06053,Monterey,Monterey County,6,H1,G4020,,41500.0,,A,8499610242,1267806310,36.2401044,-121.3155781,"POLYGON ((-122.05188 36.8196, -122.04172 36.82..."
462,6,5,1675841,6005,0500000US06005,Amador,Amador County,6,H1,G4020,,,,A,1539967079,29437117,38.4435493,-120.653858,"POLYGON ((-121.02771 38.50011, -121.0277 38.50..."
817,6,71,277300,6071,0500000US06071,San Bernardino,San Bernardino County,6,H1,G4020,348.0,40140.0,,A,51976967449,96404497,34.8566615,-116.1815707,"POLYGON ((-114.82952 34.0796, -114.83076 34.07..."


## Filter OpenAQ stations to California

In [20]:
import yaml

# Read the OpenAQ API key from the local keys file (code/keys.yaml relative to
# the current working directory).
with open(Path.cwd() / "code" / "keys.yaml", 'r') as f:
    keys = yaml.safe_load(f)
openaq_key = keys['api_keys']['OpenAQ']
# Never echo the key itself: cell outputs are stored in the notebook file and
# would leak the secret to anyone the notebook is shared with. Show only a
# non-secret confirmation that a key was loaded.
f"OpenAQ API key loaded ({len(openaq_key)} characters)"

'<REDACTED: OpenAQ API key — this secret was stored in the notebook output and should be rotated>'

In [None]:
x_range = (-125, -113)
y_range = (32, 42.5)
import requests
openaq_url = "https://api.openaq.org/v2/locations"
params = {
    "country": "US",
    "coordinates": f"{(y_range[0] + y_range[1]) / 2},{(x_range[0] + x_range[1]) / 2}",
    "radius": 500000,  # in meters
    "limit": 10000,