# Databricks system data

This notebook ingests data from the [Databricks Runtime release notes](https://learn.microsoft.com/en-us/azure/databricks/release-notes/runtime/) and the Databricks API. It is executed by Databricks Workflows, as defined in `resources/data_platform_tools_job.yml`.

In [8]:
from pyspark.sql.functions import lit, col, when, expr, get, substring_index, regexp_replace
from databricks.sdk import AccountClient
from data_platform_tools.databricks_system_data import DatabricksSystemData

# Notebook parameters: the catalog and schema targeted by the CREATE
# statements below. The second argument of each widget is its default value.
dbutils.widgets.text("catalog", "auk_dataplatform")
dbutils.widgets.text("schema", "system")

# --- CONFIG ----
CATALOG_PATH = dbutils.widgets.get("catalog")
SCHEMA_PATH = dbutils.widgets.get("schema")
ACCOUNT_HOST = "accounts.azuredatabricks.net"
ACCOUNT_ID = "42ba6f6a-250d-4e87-9433-3ab73685b3f6"
TENANT_ID = "c3588c15-f840-4591-875f-b3d42610f22f"
CLIENT_ID = "22a10d55-9e76-464d-96bc-3e6c3e44cc35"
# The client secret is fetched from a secret scope at run time instead of
# being written into the notebook.
CLIENT_SECRET = dbutils.secrets.get(scope="kv-redkic-ne-test", key="DatabricksAPI")
# --- END CONFIG ---

# Create the target catalog and schema when they do not exist yet.
spark.sql(f"CREATE CATALOG IF NOT EXISTS {CATALOG_PATH}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_PATH}.{SCHEMA_PATH}")

# Databricks SDK account client, configured with the Azure client id,
# client secret and tenant id defined above.
account_client = AccountClient(
    host=ACCOUNT_HOST,
    account_id=ACCOUNT_ID,
    azure_tenant_id=TENANT_ID,
    azure_client_id=CLIENT_ID,
    azure_client_secret=CLIENT_SECRET,
)

# Project helper that is handed the Spark session and the account client;
# it provides the get_*() calls used in the rest of this notebook.
dsd = DatabricksSystemData(spark=spark, account_client=account_client)

In [None]:
# Store the runtime versions with an extra "spark_version_number" column:
# the "version" value with every " LTS" occurrence removed.
# createOrReplace() overwrites the table when it already exists.
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.runtime_versions"
df = dsd.get_runtime_versions()
df = df.withColumn(
    "spark_version_number",
    regexp_replace("version", " LTS", ""),
)
df.writeTo(table_name).createOrReplace()

In [None]:
# Store the users with the nested "name" value flattened into top-level
# "givenname" / "familyname" columns; "name" and "emails" are dropped.
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.users"
df = dsd.get_users()
df = (
    df.withColumn("givenname", df["name"]["givenname"])
    .withColumn("familyname", df["name"]["familyname"])
    .drop("name", "emails")
)
df.writeTo(table_name).createOrReplace()



In [None]:
# Store the workspaces exactly as returned by DatabricksSystemData;
# createOrReplace() overwrites the table when it already exists.
df = dsd.get_workspaces()
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.workspaces"
df.writeTo(table_name).createOrReplace()

In [None]:
# Store the workspace privileges with the nested "permissions" and
# "principal" columns flattened into scalar columns.
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.workspace_privileges"
df = dsd.get_workspace_privileges()
df = (
    # Only the first entry of "permissions" is kept; the column itself is
    # dropped at the end of the chain.
    df.withColumn("privilege_type", df["permissions"][0])
    .withColumn("principal_id", df["principal"]["principal_id"])
    # Classify the principal by the first name field that is populated,
    # checked in this order: group, service principal, user.
    .withColumn(
        "type",
        when(col("principal")["group_name"].isNotNull(), lit("Group"))
        .when(
            col("principal")["service_principal_name"].isNotNull(),
            lit("Service Principal"),
        )
        .when(col("principal")["user_name"].isNotNull(), lit("User"))
        .otherwise(lit("UNKNOWN")),
    )
    # Users are labelled with their user_name, every other principal type
    # with the principal's display_name.
    .withColumn(
        "display_name",
        when(col("type") == "User", df["principal"]["user_name"]).otherwise(
            df["principal"]["display_name"]
        ),
    )
    .drop("permissions", "principal")
)

df.writeTo(table_name).createOrReplace()

In [None]:
# Store the clusters with two derived columns:
#   owner                - the "owner" entry of custom_tags
#   spark_version_number - the part of spark_version before the first ".x"
#                          (e.g. a value like "13.3.x-scala2.12" gives "13.3")
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.clusters"
df = dsd.get_clusters()
df = df.withColumn("owner", df["custom_tags"]["owner"]).withColumn(
    "spark_version_number",
    substring_index("spark_version", ".x", 1),
)
df.writeTo(table_name).createOrReplace()

In [None]:
# Store the warehouses with an "owner" column taken from the custom tags:
# filter() keeps the tags.custom_tags entries whose key is 'owner', get(.., 0)
# picks the first of them, and its "value" field becomes the owner.
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.warehouses"
df = dsd.get_warehouses()
df = df.withColumn(
    "owner",
    get(expr("filter(tags.custom_tags, x -> x.key == 'owner')"), 0)["value"],
)
df.writeTo(table_name).createOrReplace()

In [None]:
# Store the cluster privileges exactly as returned by DatabricksSystemData;
# createOrReplace() overwrites the table when it already exists.
df = dsd.get_cluster_privileges()
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.cluster_privileges"
df.writeTo(table_name).createOrReplace()

In [None]:
# Store the warehouse privileges exactly as returned by DatabricksSystemData;
# createOrReplace() overwrites the table when it already exists.
df = dsd.get_warehouse_privileges()
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.warehouse_privileges"
df.writeTo(table_name).createOrReplace()