In [None]:
# DDL for the `datamodel_db` schema and its three Delta tables. Each statement
# uses IF NOT EXISTS, so it is a no-op when the object is already present.
#
# NOTE(review): Databricks' CREATE TABLE reference describes CHECK constraints
# as being added with ALTER TABLE ... ADD CONSTRAINT -- confirm that the target
# runtime accepts the inline CONSTRAINT ... CHECK clauses used below.

_CREATE_SCHEMA_DDL = """CREATE SCHEMA IF NOT EXISTS datamodel_db"""

# One row per (episode_id, episode_version); `status` is limited to
# 'successful', 'degraded' or 'failed' by a CHECK constraint.
_CREATE_EPISODE_DDL = """
CREATE TABLE IF NOT EXISTS datamodel_db.episode (
  episode_id STRING NOT NULL,
  episode_version INT NOT NULL,
  created_at TIMESTAMP,
  started_at TIMESTAMP,
  ended_at TIMESTAMP,
  duration_ms BIGINT,
  status STRING NOT NULL,
  workflow_name STRING,
  model_name STRING,
  model_version STRING,
  agent_version STRING,
  is_golden BOOLEAN,
  golden_template_id STRING,
  source STRING,
  source_ref STRING,
  inputs_json STRING,
  expected_outputs_json STRING,
  actual_outputs_json STRING,
  metadata_json STRING,
  cost_usd DOUBLE,
  cost_input_usd DOUBLE,
  cost_output_usd DOUBLE,
  input_tokens BIGINT,
  output_tokens BIGINT,
  total_tokens BIGINT,
  CONSTRAINT episode_status_check CHECK (status IN ('successful', 'degraded', 'failed'))
)
USING DELTA
"""

# Steps carry the episode_id / episode_version pair plus a step_index, which a
# CHECK constraint requires to be >= 0.
_CREATE_EPISODE_STEPS_DDL = """
CREATE TABLE IF NOT EXISTS datamodel_db.episode_steps (
  episode_id STRING NOT NULL,
  episode_version INT NOT NULL,
  step_index INT NOT NULL,
  step_type STRING,
  step_name STRING,
  content STRING,
  created_at TIMESTAMP,
  tokens_in BIGINT,
  tokens_out BIGINT,
  total_tokens BIGINT,
  latency_ms BIGINT,
  score DOUBLE,
  failure_type STRING,
  invariant_violated STRING,
  metadata_json STRING,
  CONSTRAINT step_index_nonnegative CHECK (step_index >= 0)
)
USING DELTA
"""

# Evaluations carry the episode_id / episode_version pair; `match_outcome` is
# limited to 'match', 'mismatch' or 'undetermined' by a CHECK constraint.
_CREATE_EPISODE_EVALUATION_DDL = """
CREATE TABLE IF NOT EXISTS datamodel_db.episode_evaluation (
  evaluation_id STRING NOT NULL,
  episode_id STRING NOT NULL,
  episode_version INT NOT NULL,
  evaluated_at TIMESTAMP,
  evaluator_name STRING,
  evaluator_version STRING,
  mlflow_run_id STRING,
  match_outcome STRING,
  overall_score DOUBLE,
  drift_score DOUBLE,
  coherence_score DOUBLE,
  idempotency_score DOUBLE,
  artifact_uri STRING,
  metrics_json STRING,
  CONSTRAINT match_outcome_check CHECK (match_outcome IN ('match', 'mismatch', 'undetermined'))
)
USING DELTA
"""

# The schema statement comes first because every table statement names
# `datamodel_db` as its schema.
ddl_statements = [
    _CREATE_SCHEMA_DDL,
    _CREATE_EPISODE_DDL,
    _CREATE_EPISODE_STEPS_DDL,
    _CREATE_EPISODE_EVALUATION_DDL,
]

# `spark` is never assigned in this notebook; it has to be supplied by the
# runtime. Touching the name up front converts a missing session into a single
# descriptive error before any DDL is attempted.
try:
    spark
except NameError as missing_session:
    raise RuntimeError(
        'This notebook must be run on a Spark cluster (Databricks) '
        'with an active `spark` session.'
    ) from missing_session

# Execute the statements one by one, in the order they appear in the list.
for stmt in ddl_statements:
    spark.sql(stmt)

# If tables already existed, ensure new columns are present.
# (CREATE TABLE IF NOT EXISTS will not evolve schemas automatically.)

def _add_columns_if_missing(table_name: str, columns_ddl: list[tuple[str, str]]) -> None:
    """Append to ``table_name`` each column in ``columns_ddl`` that it lacks.

    The missing columns are added with a single
    ``ALTER TABLE ... ADD COLUMNS`` statement, in the order given. When every
    requested column is already present, no statement is executed.

    Names are compared case-insensitively: Spark resolves column names
    case-insensitively by default, so re-adding a column that differs from an
    existing one only by case would make the ALTER TABLE fail.

    Args:
        table_name: Table identifier. It is interpolated verbatim into the
            SQL text, so pass only trusted, hard-coded names.
        columns_ddl: ``(column_name, sql_type)`` pairs; both parts are
            interpolated verbatim into the SQL text.
    """
    # Take the names from the table's schema rather than from DESCRIBE output,
    # which mixes section headers and metadata rows in with the column rows.
    existing = {name.lower() for name in spark.table(table_name).columns}

    missing = [
        (name, dtype)
        for name, dtype in columns_ddl
        if name.lower() not in existing
    ]
    if not missing:
        return

    cols_sql = ", ".join(f"{name} {dtype}" for name, dtype in missing)
    spark.sql(f"ALTER TABLE {table_name} ADD COLUMNS ({cols_sql})")

# Bring tables that already existed up to date: each call adds whichever of the
# listed columns the table does not have yet. Every column named here also
# appears in the corresponding CREATE TABLE statement.

_add_columns_if_missing(
    "datamodel_db.episode",
    [
        ("started_at", "TIMESTAMP"),
        ("ended_at", "TIMESTAMP"),
        ("duration_ms", "BIGINT"),
        ("workflow_name", "STRING"),
        ("model_name", "STRING"),
        ("model_version", "STRING"),
        ("agent_version", "STRING"),
        ("is_golden", "BOOLEAN"),
        ("golden_template_id", "STRING"),
        ("cost_usd", "DOUBLE"),
        ("cost_input_usd", "DOUBLE"),
        ("cost_output_usd", "DOUBLE"),
    ],
)

_add_columns_if_missing(
    "datamodel_db.episode_steps",
    [
        ("step_name", "STRING"),
        ("tokens_in", "BIGINT"),
        ("tokens_out", "BIGINT"),
        ("total_tokens", "BIGINT"),
        ("latency_ms", "BIGINT"),
        ("score", "DOUBLE"),
        ("failure_type", "STRING"),
        ("invariant_violated", "STRING"),
    ],
)

_add_columns_if_missing(
    "datamodel_db.episode_evaluation",
    [
        ("artifact_uri", "STRING"),
    ],
)

# The leading glyph was mojibake (the UTF-8 bytes of a check mark decoded as
# cp1252); it is restored to the intended character.
print('✓ Created/updated schema and tables in datamodel_db')


In [None]:
# Sanity check: list the tables in the schema, then print the column layout of
# the episode table. Output is shown untruncated.
for _inspect_sql in (
    'SHOW TABLES IN datamodel_db',
    'DESCRIBE TABLE datamodel_db.episode',
):
    spark.sql(_inspect_sql).show(truncate=False)