In [1]:
import sys, os
from pathlib import Path

def add_project_root():
    cwd = Path.cwd()
    for p in (cwd, *cwd.parents):
        if (p / "etl").is_dir() and (p / "sql").exists():
            if str(p) not in sys.path:
                sys.path.insert(0, str(p))
            print("✔ project root:", p)
            return p
    raise RuntimeError("Could not find project root with 'etl' and 'sql'.")

add_project_root()

✔ project root: C:\Users\keith\Documents\ipeds_etl\ipeds_etl


WindowsPath('C:/Users/keith/Documents/ipeds_etl/ipeds_etl')

In [2]:
from sqlalchemy.engine import make_url
from etl.config import settings, dump_settings
from etl.db import ping, list_ipeds_schemas

print("DB URL:", make_url(settings.DATABASE_URL).render_as_string(hide_password=True))
print("Ping:", ping())                  # expect ('ipeds_db', 'ipeds_loader')
print("Schemas:", list_ipeds_schemas()) # expect ipeds_raw/core/dim/vw + public

DB URL: postgresql+psycopg2://ipeds_loader:***@localhost:5432/ipeds_db
Ping: ('ipeds_db', 'ipeds_loader')
Schemas: ['ipeds_core', 'ipeds_dim', 'ipeds_meta', 'ipeds_raw', 'ipeds_vw']


In [3]:
# 1) (You already ran add_project_root)
from etl.config import settings
print("BASE:", settings.URBAN_BASE_URL)
# 2) If you JUST edited http.py, reload it (safe to run either way)
from importlib import reload
import etl.http as http
reload(http)

from etl.raw_io import insert_raw_payloads
insert_raw_payloads("directory", 2022, "ipeds/directory/")

BASE: https://educationdata.urban.org/api/v1/college-university/
[OK] Fetched 6,256 records from ipeds/directory/2022/ (year=2022)
[OK] directory 2022: inserted/updated 13 page(s) into ipeds_raw.directory_raw


13

In [4]:
from sqlalchemy import text
from etl.db import get_sqlalchemy_engine
import pandas as pd

eng = get_sqlalchemy_engine()

# page-level counts
df_pages = pd.read_sql(
    text("""
      SELECT page_number, jsonb_array_length(payload) AS records_in_chunk, ingested_at
      FROM ipeds_raw.directory_raw
      WHERE year = :y
      ORDER BY page_number
    """),
    eng, params={"y": 2022}
)
df_pages.head()

# flattened preview
df_flat = pd.read_sql(
    text("""
      SELECT
        (elem->>'unitid')::int AS unitid,
        elem->>'inst_name'     AS inst_name,
        elem->>'city'          AS city,
        elem->>'state_abbr'    AS state_abbr,
        (elem->>'sector')::int AS sector
      FROM ipeds_raw.directory_raw r
      CROSS JOIN LATERAL jsonb_array_elements(r.payload) AS elem
      WHERE r.year = :y
      LIMIT 200
    """),
    eng, params={"y": 2022}
)
df_flat.head()

Unnamed: 0,unitid,inst_name,city,state_abbr,sector
0,100654,Alabama A & M University,Normal,AL,1
1,100663,University of Alabama at Birmingham,Birmingham,AL,1
2,100690,Amridge University,Montgomery,AL,2
3,100706,University of Alabama in Huntsville,Huntsville,AL,1
4,100724,Alabama State University,Montgomery,AL,1
