
### Casper's Ghost Kitchen Initializer

Select `Run All` to initialize Casper's Databricks environment.

In [0]:
%pip install --upgrade databricks-sdk
# Upgrades the Databricks SDK for Python; later cells import `databricks.sdk`
# to create the generator jobs and the pipeline.

In [0]:
# Restart the Python interpreter. This runs right after the %pip upgrade so that
# later `databricks.sdk` imports load the upgraded package.
dbutils.library.restartPython()

In [0]:
# Read the three notebook parameters from the Databricks widgets in one pass:
# target catalog, events volume, and simulator schema (in that order).
CATALOG, EVENTS_VOLUME, SIMULATOR_SCHEMA = [
    dbutils.widgets.get(widget_name)
    for widget_name in ("CATALOG", "EVENTS_VOLUME", "SIMULATOR_SCHEMA")
]

In [0]:
# Be cautious about proceeding if the catalog already exists: abort instead of
# writing into a catalog that a previous run may have populated.
#
# The comparison is case-insensitive because Unity Catalog stores object names
# in lowercase. A widget value such as "Caspers" must still be recognised as
# the existing catalog "caspers" returned by SHOW CATALOGS.

catalogs = [row.catalog for row in spark.sql("SHOW CATALOGS").collect()]
if CATALOG.lower() in {existing_name.lower() for existing_name in catalogs}:
    raise Exception(f"Catalog '{CATALOG}' already exists. Please proceed with caution or choose a different catalog. Use the destroy notebook to clear out previous instances of Casper's.")


##### Create the main catalog, simulator-related schemas, and volumes

In [0]:
import os
# Create the local ./.state directory (no error if it already exists).
# Later cells append the IDs of the jobs and pipelines they create to files in it.
os.makedirs("./.state", exist_ok=True)

In [0]:
%sql
-- Create the catalog, the simulator schema inside it, and the events volume
-- inside that schema. Each statement is a no-op if the object already exists.
-- NOTE(review): ${CATALOG}, ${SIMULATOR_SCHEMA} and ${EVENTS_VOLUME} are presumably
-- substituted from the notebook widgets of the same names; confirm.
CREATE CATALOG IF NOT EXISTS ${CATALOG};
CREATE SCHEMA IF NOT EXISTS ${CATALOG}.${SIMULATOR_SCHEMA};
CREATE VOLUME IF NOT EXISTS ${CATALOG}.${SIMULATOR_SCHEMA}.${EVENTS_VOLUME};


##### Create tables from parquet data

In [0]:
import pandas as pd

# Load each dimensional parquet file with pandas and save it as a table of the
# same name in the simulator schema, overwriting any existing table.
# The table list replaces four copy-pasted statements; order is unchanged.
for table_name in ("brands", "menus", "categories", "items"):
    dimension_pdf = pd.read_parquet(f"./data/dimensional/{table_name}.parquet")
    (
        spark.createDataFrame(dimension_pdf)
        .write.mode("overwrite")
        .saveAsTable(f"{CATALOG}.{SIMULATOR_SCHEMA}.{table_name}")
    )


##### Start data generation


##### Read all `.json` files in `./data/generator/configs`. Each JSON file represents a location.

In [0]:
import glob
import os

# Get all JSON file paths under the directory.
# glob returns paths in arbitrary filesystem order, so sort them to make the
# order of job creation (and of the recorded job IDs) reproducible.
json_paths = sorted(glob.glob("data/generator/configs/*.json"))

# Fail fast when nothing matches: with no configs no generator job is started,
# and the later wait for data would only end in a timeout.
if not json_paths:
    raise FileNotFoundError(
        "No generator configs found matching data/generator/configs/*.json "
        f"(working directory: {os.getcwd()})."
    )

# Read each file's content as a string and collect into a dict mapping filename to content
config_json_map = {}
for path in json_paths:
    filename = os.path.basename(path)
    with open(path, "r", encoding="utf-8") as f:
        config_json_map[filename] = f.read()


##### Start a job for each config JSON file found in `./data/generator/configs`

In [0]:
from databricks.sdk import WorkspaceClient
import databricks.sdk.service.jobs as j
import os, json

w = WorkspaceClient()

# Resolve the workspace path of the generator notebook: take the absolute local
# path and map the workspace root (DATABRICKS_WORKSPACE_ROOT, default "/Workspace")
# onto "/Workspace".
notebook_abs_path = os.path.abspath("./data/generator/generator")
notebook_dbx_path = notebook_abs_path.replace(
    os.environ.get("DATABRICKS_WORKSPACE_ROOT", "/Workspace"),
    "/Workspace"
)

# Job IDs are appended to ./.state/jobs, one per line.
state_dir = "./.state"
os.makedirs(state_dir, exist_ok=True)
jobs_file_path = os.path.join(state_dir, "jobs")

# One job per config file; each job runs the generator notebook with that
# file's JSON passed as the SIM_CFG_JSON parameter.
for filename, json_content in config_json_map.items():
    job_name = f"Order Flow Generator: {filename}"

    # Ensure SIM_CFG_JSON is a JSON string (Jobs widget params are strings)
    sim_cfg_str = json_content if isinstance(json_content, str) else json.dumps(json_content)

    job = w.jobs.create(
        name=job_name,
        tasks=[
            j.Task(
                task_key="order_flow_generator",
                notebook_task=j.NotebookTask(
                    notebook_path=notebook_dbx_path,
                    base_parameters={
                        "CATALOG": CATALOG,
                        "VOLUME": EVENTS_VOLUME,
                        "SCHEMA": SIMULATOR_SCHEMA,
                        "SIM_CFG_JSON": sim_cfg_str,
                    },
                )
            )
        ],
    )
    print(f"Created job_id={job.job_id} for {filename}")

    # Record the job ID *before* triggering the run. If run_now fails, the job
    # already exists in the workspace and must still be listed in the state file.
    with open(jobs_file_path, "a") as f:
        f.write(f"{job.job_id}\n")

    w.jobs.run_now(job_id=job.job_id)


##### Blocking cell to wait for some data to arrive at the volume.

The Lakeflow Declarative Pipeline that comes next infers its schema from existing data, so at least one event file must be present in the volume before the pipeline is created.

In [0]:
import time

# Construct the path to the volume where JSONs will arrive,
# in the form /Volumes/<catalog>/<schema>/<volume> built from the widget values.
volume_path = f"/Volumes/{CATALOG}/{SIMULATOR_SCHEMA}/{EVENTS_VOLUME}"

def wait_for_data(path, timeout=300, poll_interval=5):
    """
    Wait until at least one non-empty file appears in the given path.

    The directory is listed with ``dbutils.fs.ls``. Entries whose path ends
    with '/' are ignored, and a remaining entry counts as data only when its
    size is greater than zero.

    The path is always checked at least once, even when ``timeout`` is 0, and
    once more when the deadline is reached. Sleeps are capped at the time
    remaining, so the function does not overshoot ``timeout`` by a full poll
    interval. Elapsed time uses the monotonic clock, so system clock
    adjustments do not affect the wait.

    Args:
        path (str): The directory to watch.
        timeout (int): Maximum seconds to wait.
        poll_interval (int): Seconds between checks.
    Raises:
        TimeoutError: If no file appears within the timeout.
    """
    deadline = time.monotonic() + timeout
    while True:
        files = dbutils.fs.ls(path)
        if any(f.size > 0 for f in files if not f.path.endswith('/')):
            print("Data arrived. Safe to proceed.")
            return

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Never sleep past the deadline; the next pass is the final check.
        time.sleep(min(poll_interval, remaining))
    raise TimeoutError(f"No data found in {path} after {timeout} seconds.")

# Block here with the function's defaults: poll every 5 seconds and raise
# TimeoutError if nothing has arrived after 300 seconds.
wait_for_data(volume_path)


##### Lakeflow Pipeline

In [0]:
import os
import time

from databricks.sdk import WorkspaceClient
from databricks.sdk.service import pipelines

w = WorkspaceClient()

# Resolve the workspace path of the pipeline source folder: take the absolute
# local path and map the workspace root (DATABRICKS_WORKSPACE_ROOT, default
# "/Workspace") onto "/Workspace".
root_abs_path = os.path.abspath("./pipelines/order_items")
root_dbx_path = root_abs_path.replace(
    os.environ.get("DATABRICKS_WORKSPACE_ROOT", "/Workspace"),
    "/Workspace"
)

# Make sure the state directory exists *before* creating the pipeline, as the
# job-creation cell does. The pipeline is continuous, so its ID must not be
# lost because the state file could not be written.
state_dir = "./.state"
os.makedirs(state_dir, exist_ok=True)
pipelines_file_path = os.path.join(state_dir, "pipelines")

# Continuous, serverless pipeline that publishes to the 'lakeflow' schema of
# CATALOG. The raw-data location is passed through the pipeline configuration,
# and every file under the pipeline root is included as source.
created = w.pipelines.create(
    catalog=CATALOG,
    schema='lakeflow',
    continuous=True,
    name="Order Items Medallion DLT",
    serverless=True,
    configuration={
        "RAW_DATA_CATALOG":CATALOG,
        "RAW_DATA_SCHEMA":SIMULATOR_SCHEMA,
        "RAW_DATA_VOLUME":EVENTS_VOLUME
    },
    root_path=root_dbx_path,
    libraries=[pipelines.PipelineLibrary(glob=pipelines.PathPattern(include=f"{root_dbx_path}/**"))]
)

print(f"Created pipeline_id={created.pipeline_id}")

# Append the pipeline ID to ./.state/pipelines, one ID per line.
with open(pipelines_file_path, "a") as f:
    f.write(f"{created.pipeline_id}\n")