In [1]:
# --- Cell 0: repo path shim ---
# Ensures we can import our local `etl` package no matter where Jupyter is launched.
# Looks upward from the current working directory until it finds `etl/` and `sql/`.
import sys
from pathlib import Path

def add_project_root():
    """Put the project root at the front of ``sys.path``.

    The search starts at the current working directory and walks up through
    each parent. The first directory that contains an ``etl`` sub-directory
    and an ``sql`` entry is taken as the root; it is inserted at position 0
    of ``sys.path`` unless it is already listed there.

    Raises:
        RuntimeError: if neither the working directory nor any of its
            parents contains both ``etl`` and ``sql``.
    """
    here = Path.cwd()
    candidates = (here, *here.parents)

    # Nearest match wins: the generator is consumed from cwd upwards.
    root = next(
        (d for d in candidates if (d / "etl").is_dir() and (d / "sql").exists()),
        None,
    )
    if root is None:
        raise RuntimeError("Could not find project root with 'etl' and 'sql'.")

    root_str = str(root)
    if root_str not in sys.path:
        sys.path.insert(0, root_str)

# Run the shim now so the `etl` imports in the following cells can resolve.
add_project_root()

In [2]:
# --- Cell 1: imports ---
# Registry → lookup configs (API path, schema, mapper)
# Raw I/O  → insert_raw_payloads handles API fetch + insert into ipeds_raw.<endpoint>_raw
from etl.registry import get_endpoint_config, list_endpoints
from etl.raw_io import insert_raw_payloads

In [3]:
# --- Cell 2: parameters ---
# Pick the endpoint + year range for the RAW load.
ENDPOINT = "directory"   # currently only "directory" is in registry.py
YEAR_START = 2010        # inclusive
YEAR_END   = 2023        # inclusive
FORCE_PAGE_SIZE = None   # override default chunk size (None = 500 from settings.RAW_PAGE_SIZE)

# Fail fast on an inverted range: range(YEAR_START, YEAR_END + 1) would be
# empty, so the raw load loop would finish without loading anything.
if YEAR_START > YEAR_END:
    raise ValueError(
        f"YEAR_START ({YEAR_START}) must be <= YEAR_END ({YEAR_END}); both bounds are inclusive."
    )

# Grab config for this endpoint (path, schema, pk, mapper).
cfg = get_endpoint_config(ENDPOINT)

# Echo the resolved settings so the cell output records what was loaded.
print("Loading RAW endpoint:", ENDPOINT)
print("API path template:", cfg["path"])
print("Available endpoints in registry:", list_endpoints())

Loading RAW endpoint: directory
API path template: ipeds/directory/{year}/
Available endpoints in registry: ['directory']


In [4]:
# --- Cell 3: raw load loop ---
# For each year, fetch API results, chunk into pages, and upsert into ipeds_raw.<endpoint>_raw.
# One entry per year: {"year": <int>, "pages": <value returned by insert_raw_payloads>}.
summaries = []

# YEAR_END is inclusive, hence the +1 on the range stop.
for year in range(YEAR_START, YEAR_END + 1):
    # The return value is reported below as the number of pages upserted.
    n_pages = insert_raw_payloads(
        endpoint=ENDPOINT,
        year=year,
        endpoint_path=cfg["path"],  # template such as "ipeds/directory/{year}/"
        page_size=FORCE_PAGE_SIZE,
    )
    print(f"[RAW] {ENDPOINT} {year}: {n_pages} page(s) upserted")
    summaries.append({"year": year, "pages": n_pages})

# Last expression of the cell: show the per-year summary.
summaries

[OK] Fetched 7,503 records from ipeds/directory/2010/ (year=2010)
[OK] directory 2010: inserted/updated 16 page(s) into ipeds_raw.directory_raw
[RAW] directory 2010: 16 page(s) upserted
[OK] Fetched 7,643 records from ipeds/directory/2011/ (year=2011)
[OK] directory 2011: inserted/updated 16 page(s) into ipeds_raw.directory_raw
[RAW] directory 2011: 16 page(s) upserted
[OK] Fetched 7,735 records from ipeds/directory/2012/ (year=2012)
[OK] directory 2012: inserted/updated 16 page(s) into ipeds_raw.directory_raw
[RAW] directory 2012: 16 page(s) upserted
[OK] Fetched 7,764 records from ipeds/directory/2013/ (year=2013)
[OK] directory 2013: inserted/updated 16 page(s) into ipeds_raw.directory_raw
[RAW] directory 2013: 16 page(s) upserted
[OK] Fetched 7,687 records from ipeds/directory/2014/ (year=2014)
[OK] directory 2014: inserted/updated 16 page(s) into ipeds_raw.directory_raw
[RAW] directory 2014: 16 page(s) upserted
[OK] Fetched 7,647 records from ipeds/directory/2015/ (year=2015)
[OK]

[{'year': 2010, 'pages': 16},
 {'year': 2011, 'pages': 16},
 {'year': 2012, 'pages': 16},
 {'year': 2013, 'pages': 16},
 {'year': 2014, 'pages': 16},
 {'year': 2015, 'pages': 16},
 {'year': 2016, 'pages': 16},
 {'year': 2017, 'pages': 15},
 {'year': 2018, 'pages': 14},
 {'year': 2019, 'pages': 14},
 {'year': 2020, 'pages': 13},
 {'year': 2021, 'pages': 13},
 {'year': 2022, 'pages': 13},
 {'year': 2023, 'pages': 13}]

In [5]:
# --- Cell 4: optional verification ---
# Quick sanity checks: show how many pages/records landed in RAW and preview some data.
from sqlalchemy import text
from etl.db import get_sqlalchemy_engine
import pandas as pd

eng = get_sqlalchemy_engine()

# NOTE: the table name (ipeds_raw.directory_raw) and the flattened field list
# below are specific to the `directory` endpoint; adjust both if ENDPOINT changes.

# 1. Page-level counts: one row per stored page, with the number of records
#    in that page's JSONB array payload.
df_pages = pd.read_sql(
    text("""
        SELECT year,
               page_number,
               jsonb_array_length(payload) AS records_in_chunk,
               ingested_at
        FROM ipeds_raw.directory_raw
        WHERE year BETWEEN :y0 AND :y1
        ORDER BY year, page_number
    """),
    eng, params={"y0": YEAR_START, "y1": YEAR_END}
)
print("Page counts per year:")
display(df_pages.head())

# 2. Flattened preview (first few records)
#    LIMIT without ORDER BY lets PostgreSQL return any 10 rows, so the preview
#    could change between runs. WITH ORDINALITY exposes each element's position
#    inside the payload array, which lets us order by (year, page, position).
#    Presumably (year, page_number) identifies a page uniquely - TODO confirm.
df_flat = pd.read_sql(
    text("""
        SELECT
          (t.elem->>'unitid')::int    AS unitid,
          t.elem->>'inst_name'        AS inst_name,
          t.elem->>'city'             AS city,
          t.elem->>'state_abbr'       AS state_abbr,
          (t.elem->>'sector')::int    AS sector
        FROM ipeds_raw.directory_raw r
        CROSS JOIN LATERAL jsonb_array_elements(r.payload)
             WITH ORDINALITY AS t(elem, idx)
        WHERE r.year BETWEEN :y0 AND :y1
        ORDER BY r.year, r.page_number, t.idx
        LIMIT 10
    """),
    eng, params={"y0": YEAR_START, "y1": YEAR_END}
)
print("Sample flattened records:")
display(df_flat)

Page counts per year:


Unnamed: 0,year,page_number,records_in_chunk,ingested_at
0,2010,1,500,2025-09-15 16:19:15.087658+00:00
1,2010,2,500,2025-09-15 16:19:15.121330+00:00
2,2010,3,500,2025-09-15 16:19:15.148185+00:00
3,2010,4,500,2025-09-15 16:19:15.185171+00:00
4,2010,5,500,2025-09-15 16:19:15.224061+00:00


Sample flattened records:


Unnamed: 0,unitid,inst_name,city,state_abbr,sector
0,100636,Community College of the Air Force,Montgomery,AL,4
1,100654,Alabama A & M University,Normal,AL,1
2,100663,University of Alabama at Birmingham,Birmingham,AL,1
3,100690,Amridge University,Montgomery,AL,2
4,100706,University of Alabama at Huntsville,Huntsville,AL,1
5,100724,Alabama State University,Montgomery,AL,1
6,100733,University of Alabama System Office,Tuscaloosa,AL,0
7,100751,The University of Alabama,Tuscaloosa,AL,1
8,100760,Central Alabama Community College,Alexander City,AL,4
9,100812,Athens State University,Athens,AL,1
