In [0]:
from pyspark.sql.functions import current_timestamp, col, lit

# ============ Parameter configuration ============
# The target environment comes from a notebook widget (default "dev").
# Surrounding whitespace is stripped so a stray space in the widget value
# does not cause a failed lookup below.
dbutils.widgets.text("env", "dev")
env = dbutils.widgets.get("env").strip()

# Per-environment settings: catalog name, schema name, and the storage root
# under which this notebook keeps its checkpoints.
config = {
    "dev": {
        "catalog": "dev_catalog",
        "schema": "crime_data_dev",
        "managed_path": "abfss://dev@kevintestdatabricks.dfs.core.windows.net/"
    },
    "prod": {
        "catalog": "prod_catalog", 
        "schema": "crime_data_prod",
        "managed_path": "abfss://prod@kevintestdatabricks.dfs.core.windows.net/"
    }
}

# Fail fast with an actionable message instead of a bare KeyError when the
# widget holds an environment that has no configuration entry.
if env not in config:
    raise ValueError(
        f"Unsupported env {env!r}; expected one of: {', '.join(sorted(config))}"
    )

# Resolve the settings of the selected environment.
env_config = config[env]
catalog = env_config["catalog"]
schema = env_config["schema"]
managed_path = env_config["managed_path"]

# Source data path (shared by both environments).
source_path = "abfss://data@kevintestdatabricks.dfs.core.windows.net/"

# Checkpoint path (kept inside each environment's own container).
checkpoint_path = f"{managed_path}_checkpoints/bronze_crime/"

# Fully qualified target table: <catalog>.<schema>.bronze_crime
target_table = f"{catalog}.{schema}.bronze_crime"

# Echo the resolved settings so the run output records what was used.
print(f"""
========================================
Environment: {env}
Source: {source_path}
Checkpoint: {checkpoint_path}
Target Table: {target_table}
========================================
""")

In [0]:
# ============ Auto Loader read ============
# Reader options for the cloudFiles (Auto Loader) source. The schema
# location is placed under the checkpoint root, next to the stream's data
# checkpoint.
autoloader_options = {
    "cloudFiles.format": "json",
    "cloudFiles.schemaLocation": f"{checkpoint_path}schema/",
    "cloudFiles.inferColumnTypes": "true",
    "multiLine": "true",
}

# Streaming DataFrame over the JSON files found under source_path.
df = (
    spark.readStream
    .format("cloudFiles")
    .options(**autoloader_options)
    .load(source_path)
)

# Metadata column expressions: ingestion timestamp, originating file path
# (taken from the source's _metadata column), and the environment tag.
ingestion_time = current_timestamp()
source_file = col("_metadata.file_path")
environment_tag = lit(env)

# Append the metadata columns to every ingested row.
df_with_metadata = (
    df.withColumn("_ingestion_time", ingestion_time)
    .withColumn("_source_file", source_file)
    .withColumn("_env", environment_tag)
)

print("✅ Stream configured")

In [0]:
# ============ Write to the Bronze table ============
# Append-mode Delta sink; the stream's checkpoint lives under the checkpoint
# root, and mergeSchema is enabled on the write.
stream_writer = (
    df_with_metadata.writeStream
    .format("delta")
    .outputMode("append")
    .options(
        checkpointLocation=f"{checkpoint_path}data/",
        mergeSchema="true",
    )
)

# The availableNow trigger is used, so the query is expected to terminate on
# its own; toTable() starts the query against the target table.
query = stream_writer.trigger(availableNow=True).toTable(target_table)

# Block until the streaming query terminates before reporting success.
query.awaitTermination()
print(f"✅ Data written to {target_table}")