
### Raw Data

This notebook bootstraps Caspers raw data into the provided catalog and schema.

In [0]:
%pip install --upgrade databricks-sdk

In [0]:
# Restart the Python process so the databricks-sdk version installed by the
# preceding %pip cell is the one that gets imported by later cells.
dbutils.library.restartPython()

In [0]:
# Notebook parameters, read from widgets of the same names.
# Widget values are strings; each lookup fails if the widget is not defined.
_widget = dbutils.widgets.get

CATALOG = _widget("CATALOG")
EVENTS_VOLUME = _widget("EVENTS_VOLUME")
SIMULATOR_SCHEMA = _widget("SIMULATOR_SCHEMA")


##### Create main catalog, simulator related schemas and volumes

In [None]:
%sql
-- Create the target catalog, the simulator schema inside it, and the events
-- volume inside that schema. Every statement uses IF NOT EXISTS, so objects
-- that already exist are left as they are.
-- NOTE(review): ${CATALOG}, ${SIMULATOR_SCHEMA} and ${EVENTS_VOLUME} are
-- presumably substituted from the notebook widgets of the same names; confirm
-- that ${...} substitution is supported on the target runtime.
CREATE CATALOG IF NOT EXISTS ${CATALOG};
CREATE SCHEMA IF NOT EXISTS ${CATALOG}.${SIMULATOR_SCHEMA};
CREATE VOLUME IF NOT EXISTS ${CATALOG}.${SIMULATOR_SCHEMA}.${EVENTS_VOLUME};


##### Create tables from CSV data

In [None]:
import pandas as pd

# Load each seed CSV with pandas, convert it to a Spark DataFrame and save it
# as <CATALOG>.<SIMULATOR_SCHEMA>.<table>, overwriting any existing table.
# A single loop replaces four copy-pasted statements, so adding a table means
# adding one name to SEED_TABLES.
#
# NOTE(review): "JMR_Scratch" is resolved relative to the notebook's working
# directory and looks like a personal scratch folder; confirm it is the
# intended location of the seed CSVs.
RAW_CSV_DIR = "JMR_Scratch"
SEED_TABLES = ("brands", "menus", "categories", "items")

for table_name in SEED_TABLES:
    csv_path = f"{RAW_CSV_DIR}/{table_name}.csv"
    target_table = f"{CATALOG}.{SIMULATOR_SCHEMA}.{table_name}"
    spark.createDataFrame(pd.read_csv(csv_path)) \
        .write.mode("overwrite").saveAsTable(target_table)


##### Start data generation


##### Read all JSON files in ../data/generator/configs. Each JSON file represents one location.

In [0]:
import glob
import os

# Collect every JSON config path under the configs directory.
# glob.glob returns paths in arbitrary (filesystem-dependent) order, so sort
# them to make job creation order and printed output reproducible across runs.
json_paths = sorted(glob.glob("../data/generator/configs/*.json"))

# Map each config's file name (e.g. "<location>.json") to its raw text content.
# The content is kept as an unparsed string.
config_json_map = {}
for path in json_paths:
    filename = os.path.basename(path)
    with open(path, "r", encoding="utf-8") as f:
        config_json_map[filename] = f.read()


##### Start a job for each selected config JSON found in ../data/generator/configs (filtered by the LOCATIONS parameter)

In [None]:
# Filter locations based on the LOCATIONS parameter.
#
# LOCATIONS is either "all" (case-insensitive, surrounding whitespace ignored)
# or a comma-separated list of config names. A name may be given with or
# without the ".json" suffix, so "foo" and "foo.json" both select "foo.json".
#
# Note: this cell narrows `config_json_map` in place. After changing LOCATIONS,
# re-run the cell that reads the config files before re-running this one.
locations_param = dbutils.widgets.get("LOCATIONS").strip()
if locations_param.lower() != "all":
    # Drop blank tokens such as the one produced by a trailing comma.
    selected = [loc.strip() for loc in locations_param.split(",") if loc.strip()]

    def _names_config(config_filename, requested):
        """Return True if `requested` refers to `config_filename`, with or without '.json'."""
        if config_filename.endswith(".json"):
            stem = config_filename[: -len(".json")]
        else:
            stem = config_filename
        return requested in (config_filename, stem)

    # Work out the unmatched names before config_json_map is narrowed.
    unmatched = [
        loc for loc in selected
        if not any(_names_config(name, loc) for name in config_json_map)
    ]
    config_json_map = {
        name: content
        for name, content in config_json_map.items()
        if any(_names_config(name, loc) for loc in selected)
    }

    if unmatched:
        # Warn instead of silently ignoring typos; no job is created for these.
        print(f"WARNING: no config file found for requested locations: {unmatched}")
    print(f"Running selected locations: {list(config_json_map.keys())}")
else:
    print(f"Running all locations: {list(config_json_map.keys())}")

In [None]:
import json
import os
import sys

import databricks.sdk.service.jobs as j
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()

# Work out the workspace path of the generator notebook from its path relative
# to this notebook. A custom DATABRICKS_WORKSPACE_ROOT prefix is swapped for
# "/Workspace"; with the default root the replacement is a no-op.
notebook_abs_path = os.path.abspath("../data/generator/generator")
workspace_root = os.environ.get("DATABRICKS_WORKSPACE_ROOT", "/Workspace")
notebook_dbx_path = notebook_abs_path.replace(workspace_root, "/Workspace")

# uc_state is a project helper module located in ../utils.
sys.path.append('../utils')
from uc_state import add

# Create and immediately start one generator job per location config.
for filename, json_content in config_json_map.items():
    job_name = f"Order Flow Generator: {filename}"

    # Notebook parameters are passed as strings, so SIM_CFG_JSON must be a JSON string.
    if isinstance(json_content, str):
        sim_cfg_str = json_content
    else:
        sim_cfg_str = json.dumps(json_content)

    generator_params = {
        "CATALOG": CATALOG,
        "VOLUME": EVENTS_VOLUME,
        "SCHEMA": SIMULATOR_SCHEMA,
        "SIM_CFG_JSON": sim_cfg_str,
    }
    generator_task = j.Task(
        task_key="order_flow_generator",
        notebook_task=j.NotebookTask(
            notebook_path=notebook_dbx_path,
            base_parameters=generator_params,
        ),
    )

    job = w.jobs.create(name=job_name, tasks=[generator_task])
    print(f"Created job_id={job.job_id} for {filename}")

    # Register the job through uc_state.add before triggering it.
    # NOTE(review): presumably this records the job for later cleanup; confirm in utils/uc_state.
    add(CATALOG, "jobs", job)
    w.jobs.run_now(job_id=job.job_id)


##### Blocking cell to wait for some data to arrive at the volume.

The Lakeflow declarative pipeline that runs next infers its schema from data that already exists in the volume, so at least one file must have arrived before it starts.

Lakeflow Jobs does not (yet) offer a file-arrival trigger at the task level, so this cell polls the volume instead.

In [0]:
import time

# Path of the volume to poll, in the form /Volumes/<catalog>/<schema>/<volume>.
# The generator jobs receive this volume as their VOLUME parameter and are
# expected to write their JSON files here.
volume_path = f"/Volumes/{CATALOG}/{SIMULATOR_SCHEMA}/{EVENTS_VOLUME}"

def wait_for_data(path, timeout=300, poll_interval=5):
    """
    Block until at least one non-empty file appears directly under `path`.

    The listing comes from `dbutils.fs.ls(path)`. Entries whose path ends with
    '/' are skipped, so files inside sub-directories are not detected, and
    zero-byte files do not count as data.

    The path is always checked at least once, even when `timeout` is 0, and a
    final check is made at the deadline, so data that arrives during the last
    sleep is not missed. The wait never sleeps past the deadline.

    Args:
        path (str): The directory to watch.
        timeout (int): Maximum seconds to wait.
        poll_interval (int): Seconds between checks.
    Raises:
        TimeoutError: If no non-empty file appears within the timeout.
    """
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout
    while True:
        entries = dbutils.fs.ls(path)
        if any(e.size > 0 for e in entries if not e.path.endswith('/')):
            print("Data arrived. Safe to proceed.")
            return

        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        # Cap the sleep at the time left so the last poll lands on the deadline.
        time.sleep(max(0, min(poll_interval, remaining)))

    raise TimeoutError(f"No data found in {path} after {timeout} seconds.")

# Block here until the volume contains at least one non-empty file, using the
# default timeout (300 s) and poll interval (5 s); raises TimeoutError otherwise.
wait_for_data(volume_path)