# Setting up the environment

In [1]:
from pathlib import Path
import os
# get current file directory
# (Path.cwd() is the kernel's working directory when this cell runs)
cfd = Path.cwd()
print(f"Current file directory: {cfd}")
# The notebook lives in <project>/code, so the project root is one level up.
# Only step up while we are still inside code/: re-running this cell after the
# chdir would otherwise climb one more directory each time and break the
# relative data/ paths used by the cells below.
new_cwd = cfd.parent if cfd.name == "code" else cfd
os.chdir(new_cwd)
print(f"Current working directory changed to: {Path.cwd()}")

Current file directory: /mnt/c/Users/Usuario/PycharmProjects/data-life-cycle-project-2025/code
Current working directory changed to: /mnt/c/Users/Usuario/PycharmProjects/data-life-cycle-project-2025


In [2]:
# list the current working directory to check that it now contains code/ and data/
!ls

LICENSE  README.md  code  data	input  tex


# Explore asthma dataset

## Inspect metadata

In [4]:
# load metadata
import json
raw_data_dir = Path("data/raw")
# JSON is UTF-8 by specification and this file contains non-ASCII characters
# (e.g. curly quotes in the description), so decode explicitly instead of
# relying on the platform's default text encoding.
with open(raw_data_dir / "metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)
# pretty print data
import pprint
pprint.pprint(metadata)

{'@type': 'dcat:Dataset',
 'accessLevel': 'public',
 'contactPoint': {'@type': 'vcard:Contact',
                  'fn': 'California Department of Public Health, California '
                        'Breathing Asthma Program',
                  'hasEmail': 'mailto:California.Breathing@cdph.ca.gov'},
 'description': 'This dataset contains the estimated percentage of '
                'Californians with asthma (asthma prevalence). Two types of '
                'asthma prevalence are included: 1) lifetime asthma prevalence '
                'describes the percentage of people who have ever been '
                'diagnosed with asthma by a health care provider, 2) current '
                'asthma prevalence describes the percentage of people who have '
                'ever been diagnosed with asthma by a health care provider AND '
                'report they still have asthma and/or had an asthma episode or '
                'attack within the past 12 months. The tables “Lifetime Asthm

## Download and extract geographical data

In [7]:
import requests
# download county shapefiles from US Census Bureau
shapes_url = 'https://www2.census.gov/geo/tiger/TIGER2025/COUNTY/tl_2025_us_county.zip'
geo_dir = Path("data/geographical")
geo_dir.mkdir(parents=True, exist_ok=True)
shapes_path = geo_dir / "tl_2025_us_county.zip"
if not shapes_path.exists():
    print(f"Downloading county shapefiles from {shapes_url}...")
    # Write to a temporary ".part" file first: the exists() check above treats
    # any file at shapes_path as a finished download, so a failed or
    # interrupted transfer must never leave a file under the final name.
    partial_path = shapes_path.with_name(shapes_path.name + ".part")
    # stream=True avoids holding the whole archive in memory; the timeout keeps
    # the cell from hanging forever on a stalled connection.
    with requests.get(shapes_url, stream=True, timeout=60) as r:
        # fail loudly on 4xx/5xx instead of saving an error page as the zip
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    # promote the completed download to its final name
    partial_path.replace(shapes_path)
    print(f"Downloaded to {shapes_path}")
else:
    print(f"Shapefiles already exist at {shapes_path}")

Downloading county shapefiles from https://www2.census.gov/geo/tiger/TIGER2025/COUNTY/tl_2025_us_county.zip...
Downloaded to data/geographical/tl_2025_us_county.zip


In [8]:
# unpack the downloaded archive into its own subdirectory of geo_dir
import zipfile
extract_dir = geo_dir / "california counties"
with zipfile.ZipFile(shapes_path, 'r') as archive:
    # extractall creates the target directory if it does not exist yet
    archive.extractall(extract_dir)
print(f"Extracted shapefiles to {extract_dir}")

Extracted shapefiles to data/geographical/california counties


## Inspect geographical data

In [13]:
# read the extracted county shapefile into a GeoDataFrame with geopandas
import geopandas as gpd
shapefile_path = geo_dir.joinpath("california counties", "tl_2025_us_county.shp")
counties_gdf = gpd.read_file(shapefile_path)
# preview the first rows; as the cell's last expression this gets rich display
counties_gdf.head()

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,GEOID,GEOIDFQ,NAME,NAMELSAD,LSAD,CLASSFP,MTFCC,CSAFP,CBSAFP,METDIVFP,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,40,75,1101825,40075,0500000US40075,Kiowa,Kiowa County,6,H1,G4020,,,,A,2629039892,40296743,34.9214893,-98.9816168,"POLYGON ((-98.95506 35.11643, -98.94903 35.116..."
1,46,79,1265776,46079,0500000US46079,Lake,Lake County,6,H1,G4020,,,,A,1457916151,31746795,44.0284497,-97.1232229,"POLYGON ((-96.88886 43.9353, -96.88886 43.9351..."
2,37,33,1008542,37033,0500000US37033,Caswell,Caswell County,6,H1,G4020,,,,A,1102042927,8293623,36.3943252,-79.3396193,"POLYGON ((-79.14343 36.4422, -79.14345 36.4418..."
3,48,377,1383974,48377,0500000US48377,Presidio,Presidio County,6,H1,G4020,,,,A,9985057447,1773188,30.0058912,-104.2616192,"POLYGON ((-104.98078 30.62552, -104.98073 30.6..."
4,39,57,1074041,39057,0500000US39057,Greene,Greene County,6,H1,G4020,212.0,19430.0,,A,1071302625,6798109,39.6874785,-83.8948943,"POLYGON ((-84.10668 39.68891, -84.10662 39.689..."


In [None]:
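# California's state FIPS code is 06, so its counties are the rows with STATEFP == "06"
# (STATEFP is presumably a zero-padded string column; confirm the dtype before filtering)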
