In [0]:
# CONFIG
# The HUD API token is read from the environment (for example, populated from
# Databricks Secrets through the cluster configuration) so that no credential
# is ever stored in the notebook source.
import os

import requests

HUD_API_TOKEN = os.environ.get("HUD_API_TOKEN")
if not HUD_API_TOKEN:
    raise RuntimeError(
        "HUD_API_TOKEN is not set. Provide it as an environment variable before running this notebook."
    )

BASE_URL = "https://www.huduser.gov/hudapi/public"
headers = {"Authorization": f"Bearer {HUD_API_TOKEN}"}

# Connectivity / credential smoke test: list the states and show the HTTP status
# plus the first 300 characters of the body.
# A timeout is set because requests waits indefinitely by default.
r = requests.get(f"{BASE_URL}/fmr/listStates", headers=headers, timeout=30)
print("status:", r.status_code)
print(r.text[:300])

status: 200
[{"state_name":"Alabama","state_code":"AL","state_num":"1.0","category":"State"},{"state_name":"Alaska","state_code":"AK","state_num":"2.0","category":"State"},{"state_name":"American Samoa","state_code":"AS","state_num":"60.0","category":"State"},{"state_name":"Arizona","state_code":"AZ","state_num


In [0]:
#
# Ingest HUD Fair Market Rent (FMR) benchmark data from the official HUD API into a Bronze (raw) Delta table.
# Raw landing table used downstream for Silver parsing/cleaning and Gold analytics.
#
# Notes:
# - TARGET_YEAR controls which FMR vintage is pulled.
# - METRO_LIMIT is a development throttle to avoid pulling the full catalog during testing.
# - HUD_API_TOKEN should be stored in Databricks Secrets / environment variables (do not hardcode in GitHub).

import os
import requests, json
from datetime import datetime, UTC
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# CONFIG
# The token is read from the environment (for example, populated from Databricks
# Secrets through the cluster configuration) instead of being hardcoded here.
HUD_API_TOKEN = os.environ.get("HUD_API_TOKEN")
if not HUD_API_TOKEN:
    raise RuntimeError(
        "HUD_API_TOKEN is not set. Provide it as an environment variable before running this notebook."
    )
TARGET_YEAR = 2025  # FMR vintage requested from the API
METRO_LIMIT = 50    # development throttle: only the first METRO_LIMIT metros are pulled

BASE_URL = "https://www.huduser.gov/hudapi/public"
headers = {"Authorization": f"Bearer {HUD_API_TOKEN}"}

# Reuse the active Spark session if one exists, otherwise create one.
spark = SparkSession.builder.getOrCreate()

# 1) Get metro list
# A timeout is set because requests waits indefinitely by default, and HTTP errors
# (e.g. a rejected token) are raised here rather than surfacing later as a
# confusing failure while parsing the body.
metro_list_response = requests.get(
    f"{BASE_URL}/fmr/listMetroAreas", headers=headers, timeout=30
)
metro_list_response.raise_for_status()
payload = metro_list_response.json()

# The response is accepted either as a bare list of metros or as a dict that
# carries the list under "data" -> "metroareas" or under a top-level "metroareas".
if isinstance(payload, dict):
    data = payload.get("data")
    # Only look inside "data" when it is a dict; anything else has no "metroareas" key.
    nested_metros = data.get("metroareas") if isinstance(data, dict) else None
    metros = nested_metros or payload.get("metroareas")
elif isinstance(payload, list):
    metros = payload
else:
    raise ValueError(f"Unexpected response type: {type(payload)}")

# Fail with an explicit message instead of a TypeError/IndexError in the prints below.
if not metros:
    raise ValueError("HUD listMetroAreas response did not contain any metro areas")

print("metro count:", len(metros))
print("sample metro keys:", list(metros[0].keys()))
print("sample metro:", metros[0])

# STEP 2: Pull FMR data for each metro (identified by CBSA code) for the target year.
# Each API response is stored as a JSON string (no field-level transformations) along with:
# - entity identifiers (type + cbsa_code)
# - ingestion timestamp
# - HTTP status for monitoring/retries
# This is the Bronze layer: raw, auditable, and replayable.
raw_records = []
for metro in metros[:METRO_LIMIT]:
    entity_id = metro.get("cbsa_code")
    if not entity_id:
        # Without a CBSA code the request path would be built from the literal
        # "None" and a meaningless row would be stored, so skip the entry.
        print("skipping metro without cbsa_code:", metro)
        continue

    # Let requests encode the query string; the timeout prevents one stalled
    # call from blocking the whole ingestion.
    r = requests.get(
        f"{BASE_URL}/fmr/data/{entity_id}",
        params={"year": TARGET_YEAR},
        headers=headers,
        timeout=30,
    )

    # Only successful responses carry a payload; failures keep their status code
    # with a null payload so they can be retried later.
    raw_payload = None
    if r.status_code == 200:
        try:
            raw_payload = json.dumps(r.json())
        except ValueError:
            # The body was not valid JSON: keep the row (status + null payload)
            # rather than aborting and losing everything collected so far.
            raw_payload = None

    raw_records.append((
        "metro",
        str(entity_id),
        int(TARGET_YEAR),
        datetime.now(UTC).isoformat(),
        int(r.status_code),
        raw_payload,
    ))

# Persist the collected records as a Delta table using a hand-written schema.
# Declaring the schema (instead of inferring it) keeps column types stable for
# the downstream Silver parsing step.
# Each entry is (column name, Spark type, nullable).
bronze_columns = [
    ("entity_type", StringType(), False),
    ("entity_id", StringType(), False),
    ("year", IntegerType(), False),
    ("ingested_at", StringType(), False),
    ("http_status", IntegerType(), False),
    ("raw_payload", StringType(), True),
]
schema = StructType([
    StructField(column_name, column_type, is_nullable)
    for column_name, column_type, is_nullable in bronze_columns
])

df = spark.createDataFrame(raw_records, schema=schema)

# Replace any existing contents of the Bronze table with this run's records.
df.write.saveAsTable("bronze_hud_fmr_raw", format="delta", mode="overwrite")

metro count: 639
sample metro keys: ['cbsa_code', 'area_name', 'category']
sample metro: {'cbsa_code': 'METRO10180M10180', 'area_name': 'Abilene, TX MSA', 'category': 'MetroArea'}


In [0]:
# Attach descriptive metadata to the Bronze table and to each of its columns.
table_comment_sql = """
COMMENT ON TABLE bronze_hud_fmr_raw IS
'Bronze layer: raw HUD Fair Market Rent (FMR) API responses for metro areas (CBSA codes) and a specific year. Stores unmodified JSON payloads plus ingestion metadata (timestamp and HTTP status) to support auditable downstream Silver parsing and Gold analytics.'
"""
spark.sql(table_comment_sql)

# Column name -> comment text; statements are issued in this order.
column_comments = {
    "entity_type": "Entity category for the HUD request. Here: metro.",
    "entity_id": "HUD metro identifier used for requests. For metros, this is the CBSA code.",
    "year": "Target FMR year requested from HUD API.",
    "ingested_at": "UTC timestamp when the API response was ingested into Bronze.",
    "http_status": "HTTP status code returned by the HUD API request (used for monitoring and retries).",
    "raw_payload": "Raw JSON response from HUD FMR API stored as a string. Parsed into structured fields in Silver.",
}
for column_name, comment_text in column_comments.items():
    last_result = spark.sql(
        f"ALTER TABLE bronze_hud_fmr_raw ALTER COLUMN {column_name} COMMENT '{comment_text}'"
    )

# Keep the result of the final statement as the cell's value.
last_result


DataFrame[]

In [0]:
# DATA QUALITY CHECKS - BRONZE HUD FMR INGESTION
#
# Operational checks on the Bronze table, run before any downstream processing:
#   1) count of rows per HTTP status code (succeeded vs failed requests)
#   2) a sample of successful rows showing entity_id, year and ingest timestamp
#
# These queries only display results for inspection; they support reliability
# and observability and are not part of the analysis itself.
status_breakdown_sql = """
SELECT http_status, COUNT(*) AS n
FROM bronze_hud_fmr_raw
GROUP BY http_status
ORDER BY http_status
"""
spark.sql(status_breakdown_sql).show()

successful_sample_sql = """
SELECT entity_id, year, ingested_at
FROM bronze_hud_fmr_raw
WHERE http_status = 200
LIMIT 5
"""
spark.sql(successful_sample_sql).show(truncate=False)

+-----------+---+
|http_status|  n|
+-----------+---+
|        200| 50|
+-----------+---+

+----------------+----+--------------------------------+
|entity_id       |year|ingested_at                     |
+----------------+----+--------------------------------+
|METRO10180M10180|2025|2026-01-06T00:07:38.173497+00:00|
|METRO29180N22001|2025|2026-01-06T00:07:38.277228+00:00|
|METRO10380M10380|2025|2026-01-06T00:07:38.398953+00:00|
|METRO10420M10420|2025|2026-01-06T00:07:38.517883+00:00|
|METRO10500M10500|2025|2026-01-06T00:07:38.620821+00:00|
+----------------+----+--------------------------------+



In [0]:
# Total number of rows currently stored in the Bronze table.
row_count_sql = "SELECT COUNT(*) AS rows FROM bronze_hud_fmr_raw"
spark.sql(row_count_sql).show()

+----+
|rows|
+----+
|  50|
+----+



In [0]:
# Distribution of HTTP status codes across the ingested rows.
# In the recorded run every request returned HTTP 200, i.e. the Bronze table
# holds a successful raw response for each sampled metro area and the target year.
status_counts_sql = """
SELECT http_status, COUNT(*) AS n
FROM bronze_hud_fmr_raw
GROUP BY http_status
ORDER BY http_status
"""
status_counts_df = spark.sql(status_counts_sql)
status_counts_df.show()

+-----------+---+
|http_status|  n|
+-----------+---+
|        200| 50|
+-----------+---+



In [0]:
# Show the leading 120 characters of the stored JSON for a handful of rows, to
# confirm that payloads are present and look as expected before Silver parsing.
payload_preview_sql = """
SELECT entity_id, year, http_status, substr(raw_payload, 1, 120) AS payload_preview
FROM bronze_hud_fmr_raw
LIMIT 5
"""
payload_preview_df = spark.sql(payload_preview_sql)
payload_preview_df.show(truncate=False)

+----------------+----+-----------+------------------------------------------------------------------------------------------------------------------------+
|entity_id       |year|http_status|payload_preview                                                                                                         |
+----------------+----+-----------+------------------------------------------------------------------------------------------------------------------------+
|METRO10180M10180|2025|200        |{"data": {"county_name": "", "counties_msa": "Callahan County, TX; Jones County, TX; and Taylor County, TX", "town_name"|
|METRO29180N22001|2025|200        |{"data": {"county_name": "", "counties_msa": "Acadia Parish, LA; ", "town_name": "", "metro_status": "1", "metro_name": |
|METRO10380M10380|2025|200        |{"data": {"county_name": "", "counties_msa": "Aguada Municipio, PR; Aguadilla Municipio, PR; A\u00f1asco Municipio, PR; |
|METRO10420M10420|2025|200        |{"data": {"county_name"

In [0]:
# Use CBSA code as the entity identifier.
# HUD FMR metro-level endpoints require CBSA-based metro IDs.
# NOTE(review): `metro` is not assigned in this cell, so it must already exist in
# the kernel - presumably the loop variable left over from the ingestion cell
# (TODO confirm). On a fresh kernel this cell raises NameError when run alone.
# NOTE(review): no later cell visible here reads `entity_id`; this looks like a
# leftover snippet - confirm whether the cell can be removed.
entity_id = metro.get("cbsa_code")

In [0]:
# Baseline rent levels: average HUD Fair Market Rent per bedroom type, rounded to
# whole units and listed from the highest average to the lowest.
avg_fmr_by_bedroom_sql = """
    SELECT
        bedroom_label,
        ROUND(AVG(fmr), 0) AS avg_fmr
    FROM
        gold_fmr_dashboard
    GROUP BY
        bedroom_label
    ORDER BY
        avg_fmr DESC
    """
df = spark.sql(avg_fmr_by_bedroom_sql)
display(df)

bedroom_label,avg_fmr
4 Bedroom,1918.0
3 Bedroom,1645.0
2 Bedroom,1268.0
1 Bedroom,1046.0
Efficiency,966.0


Databricks visualization. Run in Databricks to view.