# Databricks system data

This notebook ingests data from the [Databricks Runtime release notes](https://learn.microsoft.com/en-us/azure/databricks/release-notes/runtime/) and the Databricks API. It is executed by Databricks Workflows, as defined in `resources/data_platform_tools_job.yml`.

In [8]:
import pyspark.sql.functions as F
from databricks.sdk import AccountClient
from data_platform_tools.databricks_system_data import DatabricksSystemData

# Notebook parameters, registered as text widgets with their default values.
# Declaration order is preserved so the widgets are created in the same order.
WIDGET_DEFAULTS = (
    ("catalog", "auk_dataplatform"),
    ("schema", "system"),
    ("tenant_id", "c8e4341f-53a8-4254-9cf6-3707c9077857"),
    ("account_id", "49d4d377-8db8-478e-bf89-18b6cd393ada"),
    ("client_id", "9a8fa6d0-b47b-4e38-8fe3-7b8c3cdbb6f9"),
    ("secret_scope", "kvredkic01"),
    ("api_secret_key", "sp-dp-databricksapi"),
)
for widget_name, widget_default in WIDGET_DEFAULTS:
    dbutils.widgets.text(widget_name, widget_default)


# Target location for every table written by this notebook.
CATALOG_PATH = dbutils.widgets.get("catalog")
SCHEMA_PATH = dbutils.widgets.get("schema")

# Make sure the catalog exists before the schema that lives inside it.
for ddl_statement in (
    f"CREATE CATALOG IF NOT EXISTS {CATALOG_PATH}",
    f"CREATE SCHEMA IF NOT EXISTS {CATALOG_PATH}.{SCHEMA_PATH}",
):
    spark.sql(ddl_statement)

# Account-level API client using Azure client credentials. The identifiers
# come from the notebook widgets; the client secret is read from the
# configured secret scope and is never stored in a variable of its own.
account_client = AccountClient(
    host="accounts.azuredatabricks.net",
    account_id=dbutils.widgets.get("account_id"),
    azure_client_id=dbutils.widgets.get("client_id"),
    azure_client_secret=dbutils.secrets.get(
        scope=dbutils.widgets.get("secret_scope"),
        key=dbutils.widgets.get("api_secret_key"),
    ),
    azure_tenant_id=dbutils.widgets.get("tenant_id"),
)

# Data-access helper; the cells below call its get_*() methods and persist
# each returned DataFrame as a table.
dsd = DatabricksSystemData(spark=spark, account_client=account_client)

In [None]:
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.runtime_versions"
df = dsd.get_runtime_versions()
# Add a version number column: `version` with every " LTS" occurrence removed.
df = df.withColumn("spark_version_number", F.regexp_replace("version", " LTS", ""))
# Create the table, or replace it if it already exists.
df.writeTo(table_name).createOrReplace()

In [None]:
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.users"
df = dsd.get_users()
# Promote the two nested `name` fields to top-level columns, then discard
# the `name` and `emails` source columns.
df = df.withColumns(
    {
        "givenname": df["name"]["givenname"],
        "familyname": df["name"]["familyname"],
    }
).drop("name", "emails")
df.writeTo(table_name).createOrReplace()

In [None]:
# Persist the workspaces DataFrame unchanged, replacing any existing table.
df = dsd.get_workspaces()
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.workspaces"
df.writeTo(table_name).createOrReplace()

In [None]:
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.workspace_privileges"
df = dsd.get_workspace_privileges()

# Classify the principal by whichever identifying field is populated.
# The checks run in this order, so the first non-null field wins.
principal_type = (
    F.when(F.col("principal")["group_name"].isNotNull(), F.lit("Group"))
    .when(F.col("principal")["service_principal_name"].isNotNull(), F.lit("Service Principal"))
    .when(F.col("principal")["user_name"].isNotNull(), F.lit("User"))
    .otherwise(F.lit("UNKNOWN"))
)

# Users are labelled with their user_name; all other principals with display_name.
# This reads the "type" column, so it must be added after that column exists.
principal_label = F.when(
    F.col("type") == "User", df["principal"]["user_name"]
).otherwise(df["principal"]["display_name"])

df = (
    df
    # Only the first entry of `permissions` is kept.
    .withColumn("privilege_type", df["permissions"][0])
    .withColumn("principal_id", df["principal"]["principal_id"])
    .withColumn("type", principal_type)
    .withColumn("display_name", principal_label)
    .drop("permissions", "principal")
)

df.writeTo(table_name).createOrReplace()

In [None]:
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.clusters"
df = dsd.get_clusters()
df = df.withColumns(
    {
        # Owner is taken from the cluster's custom tags.
        "owner": df["custom_tags"]["owner"],
        # Everything in `spark_version` before the first ".x".
        "spark_version_number": F.substring_index("spark_version", ".x", 1),
    }
)
df.writeTo(table_name).createOrReplace()

In [None]:
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.warehouses"
df = dsd.get_warehouses()
# Custom tags are key/value entries: keep those keyed 'owner' and take the
# value of the first match.
owner_tags = F.expr("filter(tags.custom_tags, x -> x.key == 'owner')")
df = df.withColumn("owner", F.get(owner_tags, 0)["value"])
df.writeTo(table_name).createOrReplace()

In [None]:
# Persist the cluster privileges DataFrame unchanged, replacing any existing table.
df = dsd.get_cluster_privileges()
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.cluster_privileges"
df.writeTo(table_name).createOrReplace()

In [None]:
# Persist the warehouse privileges DataFrame unchanged, replacing any existing table.
df = dsd.get_warehouse_privileges()
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.warehouse_privileges"
df.writeTo(table_name).createOrReplace()

In [None]:
# Persist the dashboard privileges DataFrame unchanged, replacing any existing table.
df = dsd.get_dashboard_privileges()
table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.dashboard_privileges"
df.writeTo(table_name).createOrReplace()

In [None]:
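# NOTE(review): volume privileges ingestion is currently disabled. The getter
# presumably has to be called like the other dsd.get_*() methods (the original
# commented-out line was missing "()") - confirm before re-enabling.
# table_name = f"{CATALOG_PATH}.{SCHEMA_PATH}.volume_privileges"
# df = dsd.get_volume_privileges()
# df.writeTo(table_name).createOrReplace()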