# Billing
- https://docs.databricks.com/en/admin/system-tables/billing.html
- https://docs.databricks.com/en/admin/system-tables/jobs-cost.html (Example queries)

In [0]:
-- Most expensive jobs from the last 30 days.
-- Source: https://docs.databricks.com/en/admin/system-tables/jobs-cost.html#most-expensive-jobs-from-the-last-30-days
-- Ranks jobs by their total list-price spend over the trailing 30 days.
WITH job_spend AS (
  -- Usage joined to the list price in effect for the usage window,
  -- rolled up to one row per (workspace_id, job_id).
  SELECT
    usg.workspace_id,
    usg.usage_metadata.job_id AS job_id,
    COUNT(DISTINCT usg.usage_metadata.job_run_id) AS runs,
    SUM(usg.usage_quantity * price.pricing.default) AS list_cost,
    FIRST(usg.identity_metadata.run_as, TRUE) AS run_as,
    FIRST(usg.custom_tags, TRUE) AS custom_tags,
    MAX(usg.usage_end_time) AS last_seen_date
  FROM system.billing.usage AS usg
  INNER JOIN system.billing.list_prices AS price
    ON usg.cloud = price.cloud
    AND usg.sku_name = price.sku_name
    AND usg.usage_start_time >= price.price_start_time
    -- A NULL price_end_time is treated as an open-ended price period.
    AND (price.price_end_time IS NULL OR usg.usage_end_time <= price.price_end_time)
  WHERE usg.usage_date >= CURRENT_DATE() - INTERVAL 30 DAY
    AND usg.usage_metadata.job_id IS NOT NULL
    AND usg.sku_name LIKE '%JOBS%'
  GROUP BY ALL
),
latest_job AS (
  -- Row with the greatest change_time per job; only used to look up the job name.
  SELECT
    workspace_id,
    job_id,
    name
  FROM system.lakeflow.jobs
  QUALIFY ROW_NUMBER() OVER (PARTITION BY workspace_id, job_id ORDER BY change_time DESC) = 1
)
SELECT
  job.name,
  spend.job_id,
  spend.workspace_id,
  spend.runs,
  spend.run_as,
  SUM(spend.list_cost) AS list_cost,
  spend.last_seen_date
FROM job_spend AS spend
-- LEFT JOIN keeps jobs that have billing rows but no row in system.lakeflow.jobs (name is NULL).
LEFT JOIN latest_job AS job
  ON spend.workspace_id = job.workspace_id
  AND spend.job_id = job.job_id
GROUP BY ALL
ORDER BY list_cost DESC


In [0]:
-- Most expensive job runs from the last 30 days.
-- Ranks individual job runs by their list-price spend over the trailing 30 days.
WITH run_spend AS (
  -- Usage joined to the list price in effect for the usage window,
  -- rolled up to one row per (workspace_id, job_id, run_id).
  SELECT
    usg.workspace_id,
    usg.usage_metadata.job_id AS job_id,
    usg.usage_metadata.job_run_id AS run_id,
    SUM(usg.usage_quantity * price.pricing.default) AS list_cost,
    FIRST(usg.identity_metadata.run_as, TRUE) AS run_as,
    FIRST(usg.custom_tags, TRUE) AS custom_tags,
    MAX(usg.usage_end_time) AS last_seen_date
  FROM system.billing.usage AS usg
  INNER JOIN system.billing.list_prices AS price
    ON usg.cloud = price.cloud
    AND usg.sku_name = price.sku_name
    AND usg.usage_start_time >= price.price_start_time
    -- A NULL price_end_time is treated as an open-ended price period.
    AND (price.price_end_time IS NULL OR usg.usage_end_time <= price.price_end_time)
  WHERE usg.usage_date >= CURRENT_DATE() - INTERVAL 30 DAY
    AND usg.usage_metadata.job_id IS NOT NULL
    AND usg.usage_metadata.job_run_id IS NOT NULL
    AND usg.sku_name LIKE '%JOBS%'
  GROUP BY ALL
),
latest_job AS (
  -- Row with the greatest change_time per job; only used to look up the job name.
  SELECT
    workspace_id,
    job_id,
    name
  FROM system.lakeflow.jobs
  QUALIFY ROW_NUMBER() OVER (PARTITION BY workspace_id, job_id ORDER BY change_time DESC) = 1
)
SELECT
  spend.workspace_id,
  job.name,
  spend.job_id,
  spend.run_id,
  spend.run_as,
  SUM(spend.list_cost) AS list_cost,
  spend.last_seen_date
FROM run_spend AS spend
-- LEFT JOIN keeps runs that have billing rows but no row in system.lakeflow.jobs (name is NULL).
LEFT JOIN latest_job AS job
  ON spend.workspace_id = job.workspace_id
  AND spend.job_id = job.job_id
GROUP BY ALL
ORDER BY list_cost DESC


In [0]:
-- Dashboard: system-tables:jobs ; Query: Job Run Cost
-- One row per billing usage record attributed to a job run for a single run-as identity,
-- priced at list price, newest usage date first.
-- NOTE: usg.* re-emits usage_date and usage_quantity, so those column names appear twice
-- in the result set.
SELECT
  usg.usage_date,
  usg.usage_metadata.job_id AS job_id,
  usg.usage_metadata.job_run_id AS run_id,
  usg.identity_metadata.run_as AS run_as,
  usg.usage_quantity AS usage_quantity,
  price.pricing.default AS pricing,
  usg.usage_quantity * price.pricing.default AS list_cost,
  usg.*
FROM system.billing.usage AS usg
INNER JOIN system.billing.list_prices AS price
  ON usg.cloud = price.cloud
  AND usg.sku_name = price.sku_name
  AND usg.usage_start_time >= price.price_start_time
  -- A NULL price_end_time is treated as an open-ended price period.
  AND (price.price_end_time IS NULL OR usg.usage_end_time <= price.price_end_time)
WHERE usg.identity_metadata.run_as = 'juan.lamadrid@databricks.com'
  AND usg.usage_metadata.job_id IS NOT NULL
  AND usg.usage_metadata.job_run_id IS NOT NULL
  AND usg.sku_name LIKE '%JOBS%'
ORDER BY usg.usage_date DESC;


In [0]:
-- DBU by job details
-- Daily usage_quantity total per SKU for one job and one run-as identity, newest day first.
SELECT
  usage_date,
  sku_name,
  SUM(usage_quantity) AS total_dbu
FROM system.billing.usage
WHERE identity_metadata.run_as = 'juan.lamadrid@databricks.com'
  -- Optional extra filter, currently disabled: billing_origin_product IN ('JOBS')
  -- The job id is single-quoted on purpose: a double-quoted token is parsed as an
  -- identifier (not a string) when ANSI double-quoted identifiers are enabled.
  AND usage_metadata.job_id = '164851159737624'
GROUP BY
  usage_date,
  sku_name
ORDER BY usage_date DESC

In [0]:
-- All job details
-- Raw billing usage records for one job and one run-as identity, restricted to the
-- JOBS billing origin product, most recent usage window first.
SELECT
  sku_name,
  usage_start_time,
  usage_end_time,
  usage_date,
  usage_quantity,
  usage_metadata,
  billing_origin_product,
  product_features,
  usage_type,
  ingestion_date,
  record_type
FROM system.billing.usage
WHERE identity_metadata.run_as = 'juan.lamadrid@databricks.com'
  AND billing_origin_product IN ('JOBS')
  -- The job id is single-quoted on purpose: a double-quoted token is parsed as an
  -- identifier (not a string) when ANSI double-quoted identifiers are enabled.
  AND usage_metadata.job_id = '164851159737624'
ORDER BY usage_start_time DESC

In [0]:
-- What was the result of each job run, how much did it cost, and how many DBUs did it use?
-- Every join below is keyed on (workspace_id, job_id, run_id): job ids are only unique
-- within a workspace, so matching on (job_id, run_id) alone can pair billing rows with
-- a run from a different workspace.
WITH run_cost AS (
  -- List cost and usage quantity per (workspace_id, job_id, run_id).
  SELECT
    u.workspace_id,
    u.usage_metadata.job_id AS job_id,
    u.usage_metadata.job_run_id AS run_id,
    SUM(lp.pricing.default * u.usage_quantity) AS list_cost,
    SUM(u.usage_quantity) AS dbus
  FROM system.billing.usage AS u
  INNER JOIN system.billing.list_prices AS lp
    ON u.cloud = lp.cloud
    AND u.sku_name = lp.sku_name
    AND u.usage_start_time >= lp.price_start_time
    -- A NULL price_end_time is treated as an open-ended price period.
    AND (u.usage_end_time <= lp.price_end_time OR lp.price_end_time IS NULL)
  WHERE u.sku_name LIKE '%JOB%'
    AND u.usage_metadata.job_id IS NOT NULL
  GROUP BY ALL
),
run_result AS (
  -- Timeline rows that carry a result_state; runs with no such row are dropped
  -- by the inner join below.
  SELECT
    workspace_id,
    job_id,
    run_id,
    result_state
  FROM system.lakeflow.job_run_timeline
  WHERE result_state IS NOT NULL
),
run_span AS (
  -- Earliest period start and latest period end across all timeline rows of a run.
  SELECT
    workspace_id,
    job_id,
    run_id,
    MIN(period_start_time) AS start_time,
    MAX(period_end_time) AS end_time
  FROM system.lakeflow.job_run_timeline
  GROUP BY
    workspace_id,
    job_id,
    run_id
)
SELECT
  workspace_id,
  job_id,
  run_id,
  result_state,
  start_time,
  end_time,
  list_cost,
  dbus
FROM run_cost
INNER JOIN run_result USING (workspace_id, job_id, run_id)
INNER JOIN run_span USING (workspace_id, job_id, run_id)