In [0]:
# Target environment: "dev" or "prod".
env = "dev"

# Catalog and schema names are both derived from the environment.
catalog, schema = f"{env}_catalog", f"crime_data_{env}"

# Create the schema only when it is missing (no-op when it already exists).
spark.sql(
    f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema}"
)

In [0]:
# ============ 1. Community Areas (CSV) ============
# Read the CSV: the first row is the header and column types are inferred.
df_community = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load("abfss://reference@kevintestdatabricks.dfs.core.windows.net/ref/community_areas.csv")
)

# Overwrite the Delta table (data and schema) and request Change Data Feed
# through the writer option.
(
    df_community.write
    .mode("overwrite")
    .format("delta")
    .options(**{"overwriteSchema": "true", "delta.enableChangeDataFeed": "true"})
    .saveAsTable(f"{catalog}.{schema}.ref_community_areas")
)

print("✅ ref_community_areas created")

In [0]:
# ============ 2. IUCR Codes (JSON) ============
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.functions import lpad, col

# Explicit schema so that iucr is always read as a string.
# Adjust the field list to match the actual fields of the source file.
iucr_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "iucr",
        "primary_description",
        "secondary_description",
        "index_code",
    )
])

df_iucr = (
    spark.read
    .format("json")
    .options(multiLine="true")
    .schema(iucr_schema)
    .load("abfss://reference@kevintestdatabricks.dfs.core.windows.net/ref/iucr_codes.json")
    # Left-pad iucr with "0" to a width of 4 characters.
    .withColumn("iucr", lpad(col("iucr").cast("string"), 4, "0"))
)

# Overwrite the Delta table (data and schema) and request Change Data Feed
# through the writer option.
(
    df_iucr.write
    .mode("overwrite")
    .format("delta")
    .options(**{"overwriteSchema": "true", "delta.enableChangeDataFeed": "true"})
    .saveAsTable(f"{catalog}.{schema}.ref_iucr_codes")
)

print("✅ ref_iucr_codes created")

In [0]:
# ============ 3. Wards Current (JSON) ============
# Read the JSON file in multi-line mode; the schema is inferred by Spark.
df_wards = (
    spark.read
    .format("json")
    .options(multiLine="true")
    .load("abfss://reference@kevintestdatabricks.dfs.core.windows.net/ref/wards_current.json")
)

# Overwrite the Delta table (data and schema) and request Change Data Feed
# through the writer option.
(
    df_wards.write
    .mode("overwrite")
    .format("delta")
    .options(**{"overwriteSchema": "true", "delta.enableChangeDataFeed": "true"})
    .saveAsTable(f"{catalog}.{schema}.ref_wards_current")
)

print("✅ ref_wards_current created")

In [0]:
def refresh_reference_table(source_path, table_name, file_format="json",
                            schema=None, transform=None):
    """Reload a reference table from its source file, overwriting the Delta table.

    Args:
        source_path: Location of the source file, passed to ``spark.read.load``.
        table_name: Name of the target table, passed to ``saveAsTable``.
        file_format: Spark data source name. ``"json"`` (matched
            case-insensitively) is read with ``multiLine=true``; any other
            value is read with ``header=true`` and ``inferSchema=true``.
        schema: Optional explicit schema applied to the reader. When omitted,
            Spark infers the schema, as before.
        transform: Optional callable ``DataFrame -> DataFrame`` applied after
            reading and before writing (for example, the ``lpad`` step used
            for the IUCR codes table).

    Returns:
        None. A confirmation line with the table's row count is printed.
    """
    # Compare case-insensitively so that e.g. "JSON" still gets the multiLine
    # option instead of silently falling through to the CSV options.
    if file_format.lower() == "json":
        read_options = {"multiLine": "true"}
    else:
        read_options = {"header": "true", "inferSchema": "true"}

    reader = spark.read.format(file_format).options(**read_options)
    if schema is not None:
        reader = reader.schema(schema)
    df = reader.load(source_path)

    if transform is not None:
        df = transform(df)

    (df.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        # Same writer option as the initial-load cells in this notebook.
        .option("delta.enableChangeDataFeed", "true")
        .saveAsTable(table_name)
    )

    # Count the table that was just written: calling df.count() here would
    # re-run the lazy read of the source file a second time.
    row_count = spark.table(table_name).count()
    print(f"✅ Refreshed {table_name} with {row_count} rows")