In [1]:
# 00_env_check.ipynb
# -------------------------------------------------------------------
# Purpose: Verify that the environment is wired up correctly
# before running any IPEDS ETL jobs.
#
# Checks:
#   - Can we connect to the Postgres database?
#   - Do required schemas exist?
#   - Are tables visible inside those schemas?
#   - Do we have the right permissions (at least SELECT)?
#   - Are server basics (version, settings) sane?
#
# This is a "stop early" notebook: if any of these fail, 
# fix them before moving to loaders.
# -------------------------------------------------------------------

import os
from sqlalchemy import create_engine, text

# DATABASE_URL should come from your environment (.env file or OS env var).
# Example format:
#   postgresql+psycopg2://postgres:YOURPW@localhost:5432/ipeds_db
DATABASE_URL = os.getenv("DATABASE_URL")

# If DATABASE_URL isn’t set, prompt interactively.
if not DATABASE_URL:
    DATABASE_URL = input("Enter DATABASE_URL: ").strip()

def mask(url: str) -> str:
    """
    Hide user/password in logs so they don't end up
    printed to screen, notebooks, or git commits.
    """
    try:
        creds = url.split("://",1)[1].split("@",1)[0]
        return url.replace(creds, "****:****")
    except Exception:
        return url

print("Using DB connection string:", mask(DATABASE_URL))

# Create a SQLAlchemy engine (future=True = modern API)
eng = create_engine(DATABASE_URL, future=True)


Enter DATABASE_URL:  postgresql+psycopg2://ipeds_loader:password@localhost:5432/ipeds_db


Using DB connection string: postgresql+psycopg2://****:****@localhost:5432/ipeds_db


In [2]:
# Basic connectivity test: if this fails, nothing else matters.
# Confirms: username/password/host/port/db are correct, Postgres server is up.
with eng.begin() as cx:
    version = cx.execute(text("SELECT version();")).scalar_one()
print("Connected to:", version)

Connected to: PostgreSQL 17.5 on x86_64-windows, compiled by msvc-19.44.35209, 64-bit


In [3]:
# IPEDS ETL expects five schemas to exist inside ipeds_db:
#   ipeds_raw   = raw API payloads
#   ipeds_core  = cleaned/typed tables
#   ipeds_dim   = small lookup/label tables
#   ipeds_vw    = BI-friendly views
#   ipeds_meta  = lineage (load_log, source_trace)
#
# This query lists which are present and asserts that none are missing.

REQ_SCHEMAS = ("ipeds_raw","ipeds_core","ipeds_dim","ipeds_vw","ipeds_meta")

q = """
SELECT nspname AS schema
FROM pg_namespace
WHERE nspname = ANY(:schemas)
ORDER BY 1;
"""
with eng.begin() as cx:
    rows = cx.execute(text(q), {"schemas": list(REQ_SCHEMAS)}).all()

present = {r[0] for r in rows}
missing = [s for s in REQ_SCHEMAS if s not in present]

print("Schemas found :", sorted(present))
print("Schemas missing:", missing)
assert not missing, f"ERROR: Missing schemas: {missing}"

Schemas found : ['ipeds_core', 'ipeds_dim', 'ipeds_meta', 'ipeds_raw', 'ipeds_vw']
Schemas missing: []


In [4]:
# Quick inventory: show all tables we have so far.
# Early in the project this may be empty (before you’ve loaded anything).
q = """
SELECT table_schema, table_name
FROM information_schema.tables
WHERE table_schema = ANY(:schemas)
ORDER BY table_schema, table_name;
"""
with eng.begin() as cx:
    rows = cx.execute(text(q), {"schemas": list(REQ_SCHEMAS)}).all()

if rows:
    for schema, table in rows:
        print(f"{schema}.{table}")
else:
    print("(No tables yet — expected on day 0)")

ipeds_core.directory
ipeds_meta.load_log
ipeds_meta.source_trace
ipeds_raw.directory_raw


In [5]:
# Verify that the current user can SELECT from ipeds_meta.load_log.
# This is a harmless table to test permissions.
# (If the user has only read privileges, the SELECT should succeed.)
q = "SELECT COUNT(*) FROM ipeds_meta.load_log"
try:
    with eng.begin() as cx:
        cnt = cx.execute(text(q)).scalar_one()
    print(f"OK: SELECT on ipeds_meta.load_log ({cnt} rows)")
except Exception as e:
    print("Permission test failed:", e)
    print("Note: If this URL points to a reader-only role, that's fine.")

OK: SELECT on ipeds_meta.load_log (0 rows)


In [6]:
# Check a couple of read-only settings to confirm
# we’re connected to the server we expect.
with eng.begin() as cx:
    version = cx.execute(text("SHOW server_version;")).scalar_one()
    max_conn = cx.execute(text("SHOW max_connections;")).scalar_one()

print("Postgres version:", version)
print("Max connections allowed:", max_conn)

Postgres version: 17.5
Max connections allowed: 100


In [7]:
# This shows what grants are in place for the current database.
# Useful for confirming that ipeds_loader / ipeds_reader / bi_user
# got their permissions applied correctly.
q = """
SELECT grantee, table_schema, privilege_type
FROM information_schema.role_table_grants
WHERE table_schema = ANY(:schemas)
ORDER BY grantee, table_schema, privilege_type;
"""
with eng.begin() as cx:
    rows = cx.execute(text(q), {"schemas": list(REQ_SCHEMAS)}).all()

print("Sample of role grants:")
for r in rows[:40]:
    print(r)
print(f"... ({len(rows)} grants total)")

Sample of role grants:
('bi_user', 'ipeds_core', 'SELECT')
('ipeds_loader', 'ipeds_core', 'DELETE')
('ipeds_loader', 'ipeds_core', 'INSERT')
('ipeds_loader', 'ipeds_core', 'REFERENCES')
('ipeds_loader', 'ipeds_core', 'SELECT')
('ipeds_loader', 'ipeds_core', 'TRIGGER')
('ipeds_loader', 'ipeds_core', 'TRUNCATE')
('ipeds_loader', 'ipeds_core', 'UPDATE')
('ipeds_loader', 'ipeds_meta', 'INSERT')
('ipeds_loader', 'ipeds_meta', 'INSERT')
('ipeds_loader', 'ipeds_meta', 'SELECT')
('ipeds_loader', 'ipeds_meta', 'SELECT')
('ipeds_loader', 'ipeds_raw', 'DELETE')
('ipeds_loader', 'ipeds_raw', 'INSERT')
('ipeds_loader', 'ipeds_raw', 'REFERENCES')
('ipeds_loader', 'ipeds_raw', 'SELECT')
('ipeds_loader', 'ipeds_raw', 'TRIGGER')
('ipeds_loader', 'ipeds_raw', 'TRUNCATE')
('ipeds_loader', 'ipeds_raw', 'UPDATE')
('ipeds_reader', 'ipeds_core', 'SELECT')
... (20 grants total)
