In [0]:
from pyspark.sql import functions as F, types as T
import json

# Notebook parameters. The four free-text widgets all default to an empty string.
for _text_widget in ("dt", "bucket", "bronze_prefix", "silver_prefix"):
    dbutils.widgets.text(_text_widget, "")

# Dropdown widget `table` with two choices; the default is chain_daily_state.
dbutils.widgets.dropdown(
    "table",
    "chain_daily_state",
    ["chain_daily_state", "active_locks_daily"],
)






In [0]:
# Resolve the notebook parameters from the widgets. Surrounding whitespace is
# stripped from every free-text value, and trailing slashes are removed from
# the prefixes so the f-strings below do not emit "//" in front of "dt=".
dt = dbutils.widgets.get("dt").strip()
bucket = dbutils.widgets.get("bucket").strip()
bronze_prefix = dbutils.widgets.get("bronze_prefix").strip().rstrip("/")
silver_prefix = dbutils.widgets.get("silver_prefix").strip().rstrip("/")
table = dbutils.widgets.get("table")

# Both widgets default to "", so fail fast here rather than building paths
# (and, later, a replaceWhere predicate) around an empty value.
if not bucket:
    raise ValueError("bucket is required")
if not dt:
    raise ValueError("dt is required")

# Root of the bronze partition for this dt, and the manifest stored beneath it.
bronze_dt = f"s3://{bucket}/{bronze_prefix}/dt={dt}"
manifest_path = f"{bronze_dt}/_MANIFEST.json"



In [0]:
%sql

-- Show the definition of the external location.
DESCRIBE EXTERNAL LOCATION `db_s3_external_databricks-s3-optio-bronze-raw`;

-- Give the `optio-devs` principal READ FILES on that location.
GRANT READ FILES ON EXTERNAL LOCATION `db_s3_external_databricks-s3-optio-bronze-raw` TO `optio-devs`;

-- List the grants currently present on the location.
SHOW GRANTS ON EXTERNAL LOCATION `db_s3_external_databricks-s3-optio-bronze-raw`;

In [0]:
def read_manifest(path: str) -> dict:
    """Load the JSON manifest stored at ``path`` and return the parsed object.

    The file is read with ``dbutils.fs.head``, which returns at most the
    requested number of bytes, so an oversized manifest comes back cut short.

    Args:
        path: Location of the manifest file, passed to ``dbutils.fs.head``.

    Returns:
        The result of ``json.loads`` applied to the file contents.

    Raises:
        json.JSONDecodeError: If the contents are not valid JSON. When the read
            reached the byte limit, the message says the file was probably
            truncated.
    """
    max_bytes = 1000000
    txt = dbutils.fs.head(path, max_bytes)
    try:
        return json.loads(txt)
    except json.JSONDecodeError as exc:
        # A read that filled the byte budget was most likely cut off mid-document.
        # Allow 3 bytes of slack for a multi-byte character split at the boundary.
        if len(txt.encode("utf-8")) >= max_bytes - 3:
            raise json.JSONDecodeError(
                f"manifest {path} was probably truncated at {max_bytes} bytes: {exc.msg}",
                exc.doc,
                exc.pos,
            ) from exc
        raise

# Load the manifest found at manifest_path and render it for inspection.
manifest = read_manifest(manifest_path)

display(manifest)


In [0]:
def dataset_run_path(dataset: str) -> str:
    """Return the S3 path of the run recorded for ``dataset`` in the manifest.

    Reads the notebook-level ``manifest``, ``bucket``, ``bronze_prefix`` and
    ``dt`` variables rather than taking them as arguments. A later cell in this
    notebook redefines ``dataset_run_path`` with an explicit-argument signature;
    once that cell has run, this one-argument form is shadowed.

    Raises:
        ValueError: If the dataset has no (non-empty) manifest entry or its
            status is not ``SUCCESS``.
    """
    entry = manifest["datasets"].get(dataset)
    succeeded = bool(entry) and entry.get("status") == "SUCCESS"
    if not succeeded:
        raise ValueError(f"Dataset not SUCCESS in manifest: {dataset}")
    partition_root = f"s3://{bucket}/{bronze_prefix}/dt={dt}"
    return f"{partition_root}/dataset={dataset}/run_id={entry['run_id']}"


# Read the part-*.jsonl.gz files of the staking_delegations_to run listed in the manifest.
df = spark.read.json(f"{dataset_run_path('staking_delegations_to')}/part-*.jsonl.gz")
# display(df.limit(5))


In [0]:
import json
from pyspark.sql import functions as F, types as T

def read_manifest(path: str) -> dict:
    """Read the manifest file at ``path`` and parse it as JSON.

    ``dbutils.fs.head`` returns no more than the requested number of bytes, so
    a manifest larger than the limit is returned incomplete.

    Args:
        path: Location of the manifest file, passed to ``dbutils.fs.head``.

    Returns:
        The value produced by ``json.loads`` for the file contents.

    Raises:
        json.JSONDecodeError: If the contents do not parse as JSON. If the read
            reached the byte limit, the message points at probable truncation.
    """
    max_bytes = 1000000
    txt = dbutils.fs.head(path, max_bytes)
    try:
        return json.loads(txt)
    except json.JSONDecodeError as exc:
        # Reaching the byte budget almost certainly means the document was cut
        # short; 3 bytes of slack cover a multi-byte character split at the end.
        if len(txt.encode("utf-8")) >= max_bytes - 3:
            raise json.JSONDecodeError(
                f"manifest {path} was probably truncated at {max_bytes} bytes: {exc.msg}",
                exc.doc,
                exc.pos,
            ) from exc
        raise

def dataset_run_path(bucket: str, bronze_prefix: str, dt: str, manifest: dict, dataset: str) -> str:
    """Return the S3 path of the run recorded for ``dataset`` in ``manifest``.

    Args:
        bucket: S3 bucket name.
        bronze_prefix: Key prefix placed between the bucket and ``dt=``.
        dt: Value of the ``dt=`` path segment.
        manifest: Parsed manifest; must contain a ``"datasets"`` mapping.
        dataset: Name of the dataset to look up.

    Returns:
        ``s3://{bucket}/{bronze_prefix}/dt={dt}/dataset={dataset}/run_id={run_id}``.

    Raises:
        ValueError: If the dataset is absent from the manifest, its status is
            not ``SUCCESS``, or its entry carries no ``run_id``.
    """
    info = manifest["datasets"].get(dataset)
    if not info:
        raise ValueError(f"Dataset missing from manifest: {dataset}")
    if info.get("status") != "SUCCESS":
        raise ValueError(f"Dataset not SUCCESS in manifest: {dataset} => {info}")
    # A null or absent run_id would otherwise produce a "run_id=None" path or a
    # bare KeyError; report it in the same style as the checks above.
    run_id = info.get("run_id")
    if run_id is None or run_id == "":
        raise ValueError(f"Dataset has no run_id in manifest: {dataset} => {info}")
    return f"s3://{bucket}/{bronze_prefix}/dt={dt}/dataset={dataset}/run_id={run_id}"

def read_bronze_jsonl_gz(run_path: str):
    """Read every ``part-*.jsonl.gz`` file under ``run_path`` via ``spark.read.json``."""
    part_files_glob = f"{run_path}/part-*.jsonl.gz"
    json_reader = spark.read
    return json_reader.json(part_files_glob)



In [0]:
# Reload the manifest for this dt and print a short summary of it.
manifest_path = f"s3://{bucket}/{bronze_prefix}/dt={dt}/_MANIFEST.json"
manifest = read_manifest(manifest_path)

print(f"snapshot_height: {manifest['snapshot_height']}")
print(f"datasets: {list(manifest['datasets'])}")

# Locate the staking_delegations_to run named in the manifest and read it.
run_path = dataset_run_path(
    bucket=bucket,
    bronze_prefix=bronze_prefix,
    dt=dt,
    manifest=manifest,
    dataset="staking_delegations_to",
)
print(f"run_path: {run_path}")

df = read_bronze_jsonl_gz(run_path)
print(f"rows: {df.count()}")
display(df.limit(5))


In [0]:
# Decimal type with precision 38 and scale 0, applied by as_dec_uopt.
DEC_UOPT = T.DecimalType(precision=38, scale=0)

def as_dec_uopt(col):
    """Return the column named ``col`` cast to ``DEC_UOPT``."""
    named_column = F.col(col)
    return named_column.cast(DEC_UOPT)

def as_ts(col):
    """Return the column named ``col`` converted with ``F.to_timestamp``."""
    named_column = F.col(col)
    return F.to_timestamp(named_column)

def as_date(col):
    """Return the column named ``col`` converted with ``F.to_date``."""
    named_column = F.col(col)
    return F.to_date(named_column)

In [0]:
# Typed projection of the rows held in `df`: height as bigint, extracted_at as
# a timestamp, and the two numeric fields cast to decimals.
delegation_columns = [
    F.col("dt"),
    F.col("snapshot_height").cast("bigint").alias("height"),
    as_ts("extracted_at").alias("snapshot_ts"),
    F.col("delegator_address"),
    F.col("validator_operator_address"),
    as_dec_uopt("balance_amount_uopt").alias("balance_amount_uopt_dec"),
    F.col("shares").cast(T.DecimalType(38, 18)).alias("shares_dec"),
]
df2 = df.select(*delegation_columns)

# display(df2.limit(5))

In [0]:
%sql
-- Default catalog and schema for the statements in this cell.
USE CATALOG optio_warehouse;
USE SCHEMA silver;

-- Delta table for the active-locks output, partitioned by dt.
CREATE TABLE IF NOT EXISTS optio_warehouse.silver.silver_active_locks_daily (
  dt            STRING,
  height        BIGINT,
  snapshot_ts   TIMESTAMP,
  address       STRING,
  unlock_date   DATE,
  denom         STRING,
  amount_uopt   DECIMAL(38,0),
  lock_id       STRING,
  source_run_id STRING
)
USING DELTA
PARTITIONED BY (dt);

-- Delta table for the chain daily state output, partitioned by dt.
CREATE TABLE IF NOT EXISTS optio_warehouse.silver.silver_chain_daily_state (
  dt                STRING,
  height            BIGINT,
  snapshot_ts       TIMESTAMP,
  total_supply_uopt DECIMAL(38,0),
  total_locked_uopt DECIMAL(38,0),
  bonded_uopt       DECIMAL(38,0),
  not_bonded_uopt   DECIMAL(38,0),
  source_run_ids    MAP<STRING, STRING>
)
USING DELTA
PARTITIONED BY (dt);


In [0]:
%sql
-- List the tables present in the silver schema.
SHOW TABLES FROM optio_warehouse.silver;

In [0]:
from pyspark.sql import functions as F, types as T

def build_silver_active_locks_daily(bucket: str, bronze_prefix: str, dt: str, manifest: dict):
    """Build the silver active-locks DataFrame for ``dt`` from the bronze ``lockup_active_locks`` run.

    Args:
        bucket: S3 bucket name, forwarded to ``dataset_run_path``.
        bronze_prefix: Bronze key prefix, forwarded to ``dataset_run_path``.
        dt: Partition value; written to the ``dt`` column of every row.
        manifest: Parsed manifest containing the ``lockup_active_locks`` entry.

    Returns:
        A DataFrame with columns ``dt``, ``height``, ``snapshot_ts``,
        ``address``, ``unlock_date``, ``denom``, ``amount_uopt``,
        ``source_run_id`` and ``lock_id``, de-duplicated on ``(dt, lock_id)``.

    Raises:
        ValueError: Propagated from ``dataset_run_path`` when the dataset is
            missing from the manifest or is not marked ``SUCCESS``.
    """
    dataset = "lockup_active_locks"
    # Validate the manifest entry before indexing into it: dataset_run_path
    # raises a descriptive ValueError for a missing or non-SUCCESS dataset,
    # whereas reading run_id first would surface a bare KeyError.
    run_path = dataset_run_path(bucket, bronze_prefix, dt, manifest, dataset)
    run_id = manifest["datasets"][dataset]["run_id"]
    bronze = read_bronze_jsonl_gz(run_path)

    df = (bronze
        .select(
            F.lit(dt).alias("dt"),
            F.col("snapshot_height").cast("bigint").alias("height"),
            as_ts("extracted_at").alias("snapshot_ts"),
            F.col("address").cast("string").alias("address"),
            as_date("unlock_date").alias("unlock_date"),
            F.col("amount_denom").cast("string").alias("denom"),
            as_dec_uopt("amount_uopt").alias("amount_uopt"),
            F.lit(run_id).alias("source_run_id"),
        )
        # lock_id is the SHA-256 of the "|"-joined key fields, so the same input
        # row always hashes to the same id.
        # NOTE(review): concat_ws skips null inputs, so rows with nulls in
        # different key fields could hash identically — confirm these columns
        # are never null.
        .withColumn(
            "lock_id",
            F.sha2(
                F.concat_ws("|",
                    F.col("dt"),
                    F.col("height").cast("string"),
                    F.col("address"),
                    F.col("unlock_date").cast("string"),
                    F.col("denom"),
                    F.col("amount_uopt").cast("string"),
                ),
                256
            )
        )
        # Rows sharing every hashed field collapse into one row here.
        .dropDuplicates(["dt", "lock_id"])
    )
    return df


In [0]:
from pyspark.sql import functions as F, types as T

# Decimal types: precision 38 with scale 18, and precision 38 with scale 0.
DEC_SHARES = T.DecimalType(precision=38, scale=18)
DEC_UOPT = T.DecimalType(precision=38, scale=0)

def as_dec_uopt(col):
    """Cast the column named ``col`` to ``DEC_UOPT`` and return it."""
    source_column = F.col(col)
    return source_column.cast(DEC_UOPT)

def as_ts(col):
    """Apply ``F.to_timestamp`` to the column named ``col`` and return the result."""
    source_column = F.col(col)
    return F.to_timestamp(source_column)

def as_date(col):
    """Apply ``F.to_date`` to the column named ``col`` and return the result."""
    source_column = F.col(col)
    return F.to_date(source_column)

def build_silver_active_locks_daily(bucket: str, bronze_prefix: str, dt: str, manifest: dict):
    """Build the silver active-locks rows for ``dt`` from the bronze ``lockup_active_locks`` run.

    Args:
        bucket: S3 bucket name, forwarded to ``dataset_run_path``.
        bronze_prefix: Bronze key prefix, forwarded to ``dataset_run_path``.
        dt: Partition value; stored in the ``dt`` column of every row.
        manifest: Parsed manifest containing the ``lockup_active_locks`` entry.

    Returns:
        A DataFrame with columns ``dt``, ``height``, ``snapshot_ts``,
        ``address``, ``unlock_date``, ``denom``, ``amount_uopt``,
        ``source_run_id`` and ``lock_id``, de-duplicated on ``(dt, lock_id)``.

    Raises:
        ValueError: Propagated from ``dataset_run_path`` when the dataset is
            missing from the manifest or is not marked ``SUCCESS``.
    """
    dataset = "lockup_active_locks"
    # Resolve (and thereby validate) the run path before reading run_id, so a
    # missing or non-SUCCESS dataset raises dataset_run_path's descriptive
    # ValueError instead of a bare KeyError from the lookup below.
    run_path = dataset_run_path(bucket, bronze_prefix, dt, manifest, dataset)
    run_id = manifest["datasets"][dataset]["run_id"]

    # Use the shared reader so the part-file glob is defined in one place.
    bronze = read_bronze_jsonl_gz(run_path)

    # Create deterministic lock_id from stable business fields: the SHA-256 of
    # the "|"-joined key columns.
    # NOTE(review): concat_ws skips null inputs, so rows with nulls in different
    # key fields could hash identically — confirm these columns are never null.
    df = (bronze
        .select(
            F.lit(dt).alias("dt"),
            F.col("snapshot_height").cast("bigint").alias("height"),
            as_ts("extracted_at").alias("snapshot_ts"),
            F.col("address").cast("string").alias("address"),
            as_date("unlock_date").alias("unlock_date"),
            F.col("amount_denom").cast("string").alias("denom"),
            as_dec_uopt("amount_uopt").alias("amount_uopt"),
            F.lit(run_id).alias("source_run_id"),
        )
        .withColumn(
            "lock_id",
            F.sha2(
                F.concat_ws("|",
                    F.col("dt"),
                    F.col("height").cast("string"),
                    F.col("address"),
                    F.col("unlock_date").cast("string"),
                    F.col("denom"),
                    F.col("amount_uopt").cast("string"),
                ),
                256
            )
        )
        # Rows sharing every hashed field collapse into one row here.
        .dropDuplicates(["dt", "lock_id"])
    )
    return df


In [0]:
# Build the active-locks rows for this dt, then print the total row count next
# to the number of distinct lock_id values and preview a few rows.
df_locks = build_silver_active_locks_daily(
    bucket=bucket,
    bronze_prefix=bronze_prefix,
    dt=dt,
    manifest=manifest,
)
print(f"rows: {df_locks.count()}")
print(f"distinct lock_id: {df_locks.select('lock_id').distinct().count()}")
display(df_locks.limit(10))

In [0]:
target_table = "optio_warehouse.silver.silver_active_locks_daily"

# Overwrite mode combined with a replaceWhere predicate on dt, so the write is
# scoped to the rows of this dt.
dt_predicate = f"dt = '{dt}'"
locks_writer = (
    df_locks.write
    .format("delta")
    .mode("overwrite")
    .option("replaceWhere", dt_predicate)
)
locks_writer.saveAsTable(target_table)


In [0]:
# Number of rows in the target table for this dt (shown as the cell output).
spark.table(target_table).where(F.col("dt") == dt).count()


In [0]:
%sql
-- Row count per dt in the active-locks table.
-- NOTE(review): the date below is hard-coded rather than taken from the `dt`
-- widget — confirm this is intended.
SELECT
  dt,
  count(*) AS n
FROM optio_warehouse.silver.silver_active_locks_daily
WHERE dt = '2026-01-25'
GROUP BY dt;
