# ConversionCentral Managed Profiling
Run this notebook from a Databricks Repo so backend deployments control profiling logic.

In [None]:
# Collect parameters passed by the FastAPI backend
# Each widget is declared up front so Databricks jobs can safely supply overrides.
# Every widget is a text widget defaulting to the empty string; the values are
# read back with dbutils.widgets.get(...) and validated further down.
dbutils.widgets.text("table_group_id", "")
dbutils.widgets.text("profile_run_id", "")
dbutils.widgets.text("data_quality_schema", "")
dbutils.widgets.text("payload_path", "")
dbutils.widgets.text("payload_base_path", "")
dbutils.widgets.text("callback_url", "")
dbutils.widgets.text("callback_base_url", "")
dbutils.widgets.text("callback_token", "")
dbutils.widgets.text("payload_storage", "")
dbutils.widgets.text("callback_behavior", "")
dbutils.widgets.text("catalog", "")
dbutils.widgets.text("schema_name", "")
dbutils.widgets.text("connection_id", "")
dbutils.widgets.text("connection_name", "")
dbutils.widgets.text("system_id", "")
dbutils.widgets.text("project_key", "")
dbutils.widgets.text("http_path", "")

from datetime import datetime
import json
import requests
from pyspark.sql import SparkSession

# Reuse the active Spark session, or create one when none exists yet.
spark = SparkSession.builder.getOrCreate()
# Run identifiers are read verbatim (no stripping) and validated below.
table_group_id = dbutils.widgets.get("table_group_id")
profile_run_id = dbutils.widgets.get("profile_run_id")
# Remaining widgets are whitespace-trimmed; optional ones collapse to None when blank.
dq_schema = (dbutils.widgets.get("data_quality_schema") or "").strip()
raw_payload_path = (dbutils.widgets.get("payload_path") or "").strip()
payload_path = raw_payload_path or None
payload_base_path = (dbutils.widgets.get("payload_base_path") or "").strip() or None
callback_url = (dbutils.widgets.get("callback_url") or "").strip() or None
callback_base_url = (dbutils.widgets.get("callback_base_url") or "").strip() or None
callback_token = (dbutils.widgets.get("callback_token") or "").strip() or None
# Catalog/schema stay as (possibly empty) strings rather than None.
connection_catalog = (dbutils.widgets.get("catalog") or "").strip()
connection_schema = (dbutils.widgets.get("schema_name") or "").strip()

# Fail fast: the run cannot proceed without its identifiers and metadata schema.
if not table_group_id or not profile_run_id:
    raise ValueError("Required widgets missing: table_group_id/profile_run_id")
if not dq_schema:
    raise ValueError("Data quality schema widget is required for profiling runs.")

In [None]:
# Profile the tables registered for this table group and build the result payload.
from datetime import datetime
import re
from contextlib import suppress
from typing import Iterable

import datetime as dt
import hashlib
import json
import math

from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, BinaryType, MapType, StructType
from pyspark.sql import types as T
from pyspark.sql.utils import AnalysisException

# Limits and thresholds for the profiling cells.
# NOTE(review): most consumers of these constants are outside this excerpt;
# only the two annotated below are used by code visible here.
MAX_COLUMNS_TO_PROFILE = 25
NULL_RATIO_ALERT_THRESHOLD = 0.5
HIGH_NULL_RATIO_THRESHOLD = 0.9
VALUE_DISTRIBUTION_LIMIT = 25  # caps the top-values slice taken in _build_value_rows
VALUE_DISTRIBUTION_DISTINCT_THRESHOLD = 1000
VALUE_DISTRIBUTION_MAX_ROWS = 5_000_000
MAX_VALUE_DISPLAY_LENGTH = 256  # default truncation length used by _stringify_value

# Field names for per-column profile rows. The names and their order mirror
# PROFILE_COLUMNS_SCHEMA below and the dict keys produced by _build_column_row.
PROFILE_COLUMN_FIELDS = [
    "profile_run_id",
    "schema_name",
    "table_name",
    "column_name",
    "qualified_name",
    "data_type",
    "general_type",
    "ordinal_position",
    "row_count",
    "null_count",
    "non_null_count",
    "distinct_count",
    "min_value",
    "max_value",
    "avg_value",
    "stddev_value",
    "median_value",
    "p95_value",
    "true_count",
    "false_count",
    "min_length",
    "max_length",
    "avg_length",
    "non_ascii_ratio",
    "min_date",
    "max_date",
    "date_span_days",
    "metrics_json",
    "generated_at",
]

# Spark schema for per-column profile rows. Only profile_run_id, table_name
# and column_name are declared non-nullable.
PROFILE_COLUMNS_SCHEMA = T.StructType(
    [
        T.StructField("profile_run_id", T.StringType(), False),
        T.StructField("schema_name", T.StringType(), True),
        T.StructField("table_name", T.StringType(), False),
        T.StructField("column_name", T.StringType(), False),
        T.StructField("qualified_name", T.StringType(), True),
        T.StructField("data_type", T.StringType(), True),
        T.StructField("general_type", T.StringType(), True),
        T.StructField("ordinal_position", T.IntegerType(), True),
        T.StructField("row_count", T.LongType(), True),
        T.StructField("null_count", T.LongType(), True),
        T.StructField("non_null_count", T.LongType(), True),
        T.StructField("distinct_count", T.LongType(), True),
        T.StructField("min_value", T.StringType(), True),
        T.StructField("max_value", T.StringType(), True),
        T.StructField("avg_value", T.DoubleType(), True),
        T.StructField("stddev_value", T.DoubleType(), True),
        T.StructField("median_value", T.DoubleType(), True),
        T.StructField("p95_value", T.DoubleType(), True),
        T.StructField("true_count", T.LongType(), True),
        T.StructField("false_count", T.LongType(), True),
        T.StructField("min_length", T.IntegerType(), True),
        T.StructField("max_length", T.IntegerType(), True),
        T.StructField("avg_length", T.DoubleType(), True),
        T.StructField("non_ascii_ratio", T.DoubleType(), True),
        T.StructField("min_date", T.DateType(), True),
        T.StructField("max_date", T.DateType(), True),
        T.StructField("date_span_days", T.IntegerType(), True),
        T.StructField("metrics_json", T.StringType(), True),
        T.StructField("generated_at", T.TimestampType(), True),
    ]
)

# Field names for value-distribution rows (top values and histogram buckets).
# The names and their order mirror PROFILE_COLUMN_VALUES_SCHEMA below and the
# dict keys produced by _build_value_rows.
PROFILE_COLUMN_VALUES_FIELDS = [
    "profile_run_id",
    "schema_name",
    "table_name",
    "column_name",
    "value",
    "value_hash",
    "frequency",
    "relative_freq",
    "rank",
    "bucket_label",
    "bucket_lower_bound",
    "bucket_upper_bound",
    "generated_at",
]

# Spark schema for value-distribution rows. Only profile_run_id, table_name
# and column_name are declared non-nullable.
PROFILE_COLUMN_VALUES_SCHEMA = T.StructType(
    [
        T.StructField("profile_run_id", T.StringType(), False),
        T.StructField("schema_name", T.StringType(), True),
        T.StructField("table_name", T.StringType(), False),
        T.StructField("column_name", T.StringType(), False),
        T.StructField("value", T.StringType(), True),
        T.StructField("value_hash", T.StringType(), True),
        T.StructField("frequency", T.LongType(), True),
        T.StructField("relative_freq", T.DoubleType(), True),
        T.StructField("rank", T.IntegerType(), True),
        T.StructField("bucket_label", T.StringType(), True),
        T.StructField("bucket_lower_bound", T.DoubleType(), True),
        T.StructField("bucket_upper_bound", T.DoubleType(), True),
        T.StructField("generated_at", T.TimestampType(), True),
    ]
)

# Field names for anomaly rows; names and order mirror PROFILE_ANOMALIES_SCHEMA below.
PROFILE_ANOMALIES_FIELDS = [
    "profile_run_id",
    "table_name",
    "column_name",
    "anomaly_type",
    "severity",
    "description",
    "detected_at",
]

# Spark schema for anomaly rows. Only profile_run_id is declared non-nullable.
PROFILE_ANOMALIES_SCHEMA = T.StructType(
    [
        T.StructField("profile_run_id", T.StringType(), False),
        T.StructField("table_name", T.StringType(), True),
        T.StructField("column_name", T.StringType(), True),
        T.StructField("anomaly_type", T.StringType(), True),
        T.StructField("severity", T.StringType(), True),
        T.StructField("description", T.StringType(), True),
        T.StructField("detected_at", T.TimestampType(), True),
    ]
)

def _split_identifier(value: str | None) -> list[str]:
    cleaned = (value or "").replace("`", "").strip()
    if not cleaned:
        return []
    return [segment.strip() for segment in cleaned.split(".") if segment.strip()]


def _catalog_component(value: str | None) -> str | None:
    """Return the leading segment of an identifier with two or more parts, else ``None``."""
    segments = _split_identifier(value)
    return segments[0] if len(segments) >= 2 else None

In [None]:
# Persist payload and call back into the API

from datetime import datetime
import re
import socket
from contextlib import suppress
from functools import lru_cache
from urllib.parse import urlparse, urlunparse

from pyspark.sql import functions as F
from pyspark.sql.utils import AnalysisException

# Roots used when probing/redirecting payload storage (see _redirect_dbfs_path).
DEFAULT_PRIVATE_PAYLOAD_ROOT = "dbfs:/tmp/conversioncentral/profiles"
DEFAULT_DRIVER_PAYLOAD_ROOT = "file:/databricks/driver/conversioncentral/profiles"
# Fallback returned by _resolve_callback_behavior when neither widget nor setting is given.
DEFAULT_CALLBACK_BEHAVIOR = "metadata_only"

# Fallback returned by _resolve_payload_storage_mode.
DEFAULT_PAYLOAD_STORAGE_MODE = "inline"


# Lower-cased error-message fragments that the probe helpers treat as "storage disabled".
DBFS_DISABLED_MESSAGES = ("public dbfs root is disabled", "access is denied")
DRIVER_DISABLED_MESSAGES = ("local filesystem access is forbidden", "workspacelocalfilesystem")
# Matches a leading "<scheme>:/" prefix, case-insensitively.
URI_SCHEME_PATTERN = re.compile(r"^[a-z][a-z0-9+.\-]*:/", re.IGNORECASE)
# One-shot flags so each storage notice is printed at most once.
_DBFS_REDIRECT_NOTICE_EMITTED = False
_STORAGE_DISABLED_NOTICE_EMITTED = False


def _looks_like_dns_failure(error: BaseException) -> bool:
    """Detect DNS resolution failures from nested request exceptions.

    Walks the exception chain (``__cause__`` first, then ``__context__``) and
    returns ``True`` as soon as one link is a ``socket.gaierror``, has a class
    name containing "nameresolution", or carries the standard "temporary
    failure in name resolution" message.

    Args:
        error: Outermost exception to inspect.

    Returns:
        ``True`` when any link in the chain looks like a DNS failure.
    """
    # Exception chains can be made cyclic by assigning __cause__/__context__
    # manually; remember visited links so the walk always terminates.
    visited: set[int] = set()
    current = error
    while current is not None and id(current) not in visited:
        visited.add(id(current))
        if isinstance(current, socket.gaierror):
            return True
        if "nameresolution" in current.__class__.__name__.lower():
            return True
        if "temporary failure in name resolution" in str(current).lower():
            return True
        current = getattr(current, "__cause__", None) or getattr(current, "__context__", None)
    return False


def _rewrite_heroku_app_host(url: str | None) -> str | None:
    """Fallback to canonical Heroku hostname when review-app hosts fail DNS."""
    if not url:
        return None
    parsed = urlparse(url)
    host = parsed.hostname
    if not host:
        return None
    match = re.match(r"^(?P<base>[a-z0-9-]+?)-[0-9a-f]{12}\.herokuapp\.com$", host)
    if not match:
        return None
    canonical_host = f"{match.group('base')}.herokuapp.com"
    netloc = canonical_host
    if parsed.port:
        netloc = f"{canonical_host}:{parsed.port}"
    if parsed.username:
        auth = parsed.username
        if parsed.password:
            auth = f"{auth}:{parsed.password}"
        netloc = f"{auth}@{netloc}"
    scheme = parsed.scheme or "https"
    if scheme.lower() == "http":
        scheme = "https"
    return urlunparse(parsed._replace(netloc=netloc, scheme=scheme))


def _is_dbfs_path(path: str | None) -> bool:
    return bool(path and path.lower().startswith("dbfs:/"))


def _has_uri_scheme(value: str | None) -> bool:
    """Report whether the trimmed value begins with a ``<scheme>:/`` prefix."""
    if not value:
        return False
    return URI_SCHEME_PATTERN.match(value.strip()) is not None


@lru_cache(maxsize=1)
def _dbfs_root_is_disabled() -> bool:
    """Probe the DBFS payload root once and report whether access is blocked.

    Creates and removes a probe directory. A failure counts as "disabled" only
    when its message contains one of ``DBFS_DISABLED_MESSAGES``; any other
    failure is reported as not disabled. The result is cached.
    """
    probe_dir = f"{DEFAULT_PRIVATE_PAYLOAD_ROOT}/_dbfs_access_probe"
    try:
        dbutils.fs.mkdirs(probe_dir)
        dbutils.fs.rm(probe_dir, True)
    except Exception as probe_error:  # noqa: BLE001 - Databricks surfaces JVM errors generically
        lowered = str(probe_error).lower()
        for fragment in DBFS_DISABLED_MESSAGES:
            if fragment in lowered:
                return True
        return False
    return False


@lru_cache(maxsize=1)
def _driver_fs_is_disabled() -> bool:
    """Probe the driver-local payload root once and report whether access is blocked.

    Creates and removes a probe directory. A failure counts as "disabled" only
    when its message contains one of ``DRIVER_DISABLED_MESSAGES``; any other
    failure is reported as not disabled. The result is cached.
    """
    probe_dir = f"{DEFAULT_DRIVER_PAYLOAD_ROOT}/_driver_access_probe"
    try:
        dbutils.fs.mkdirs(probe_dir)
        dbutils.fs.rm(probe_dir, True)
    except Exception as probe_error:  # noqa: BLE001 - Databricks surfaces JVM errors generically
        lowered = str(probe_error).lower()
        for fragment in DRIVER_DISABLED_MESSAGES:
            if fragment in lowered:
                return True
        return False
    return False


def _warn_storage_disabled(message: str) -> None:
    """Print ``message`` the first time this is called; later calls are silent."""
    global _STORAGE_DISABLED_NOTICE_EMITTED
    if _STORAGE_DISABLED_NOTICE_EMITTED:
        return
    print(message)
    _STORAGE_DISABLED_NOTICE_EMITTED = True


def _redirect_dbfs_path(path: str) -> str | None:
    """Re-home a ``dbfs:/`` path onto the driver filesystem when DBFS is blocked.

    Returns the path unchanged when it is not a DBFS path or DBFS is usable,
    ``None`` when neither DBFS nor the driver filesystem is writable, and
    otherwise the equivalent location under ``DEFAULT_DRIVER_PAYLOAD_ROOT``.
    """
    global _DBFS_REDIRECT_NOTICE_EMITTED
    # The DBFS probe only runs for dbfs:/ paths thanks to short-circuiting.
    if not _is_dbfs_path(path) or not _dbfs_root_is_disabled():
        return path
    if _driver_fs_is_disabled():
        _warn_storage_disabled(
            "DBFS root access and driver filesystem writes are both disabled; payload artifacts will be skipped unless "
            "a cloud storage payload_base_path is provided."
        )
        return None
    if not _DBFS_REDIRECT_NOTICE_EMITTED:
        print(
            "DBFS root access is disabled on this workspace; persisting profiling artifacts to the driver filesystem "
            "instead."
        )
        _DBFS_REDIRECT_NOTICE_EMITTED = True
    relative = path[len("dbfs:/") :].lstrip("/")
    if relative:
        target = f"{DEFAULT_DRIVER_PAYLOAD_ROOT}/{relative}"
    else:
        target = DEFAULT_DRIVER_PAYLOAD_ROOT
    return target.rstrip("/")


def _mkdirs_if_supported(target_path: str) -> None:
    """Create the parent directory of ``target_path`` on DBFS or the driver filesystem.

    Does nothing for other schemes, or when the matching filesystem has been
    probed as disabled.
    """
    lowered_path = target_path.lower()
    on_dbfs = lowered_path.startswith("dbfs:/")
    on_driver = lowered_path.startswith("file:/")
    if on_dbfs and _dbfs_root_is_disabled():
        return
    if on_driver and _driver_fs_is_disabled():
        return
    if on_dbfs or on_driver:
        dbutils.fs.mkdirs(target_path.rsplit("/", 1)[0])


def _ensure_https_base_url(value: str) -> str:
    """Normalize a base URL to an ``https`` URL without a trailing slash.

    Args:
        value: Raw base URL; may lack a scheme or use ``http``.

    Returns:
        The normalized URL, or the empty string when ``value`` is blank.
        Scheme-less input gets ``https://`` prepended, and ``http`` is
        upgraded to ``https``.
    """
    normalized = (value or "").strip()
    if not normalized:
        return normalized
    parsed = urlparse(normalized)
    scheme = parsed.scheme.lower()
    # On Python 3.9+ "host:8000/path" parses as scheme="host" with an empty
    # netloc, so a scheme check alone would leave such values without
    # "https://". Treat any non-http(s) "scheme" lacking a netloc as scheme-less.
    if not scheme or (not parsed.netloc and scheme not in {"http", "https"}):
        normalized = f"https://{normalized.lstrip('/')}"
        parsed = urlparse(normalized)
    if parsed.scheme.lower() == "http":
        parsed = parsed._replace(scheme="https")
    return urlunparse(parsed).rstrip("/")


def _lookup_metadata_setting(setting_key: str) -> str | None:
    """Fetch one value from the ``dq_settings`` metadata table.

    Args:
        setting_key: Setting name; matched case-insensitively against ``key``.

    Returns:
        The trimmed setting value, or ``None`` when the key is blank, the
        table helper is not defined yet, the table cannot be resolved, no row
        matches, or the stored value is not a non-blank string.
    """
    normalized_key = (setting_key or "").strip().lower()
    if not normalized_key:
        return None
    try:
        settings_table = _metadata_table("dq_settings")
    except NameError:
        # _metadata_table is defined in a later cell and may not exist yet.
        return None
    try:
        rows = (
            spark.table(settings_table)
            .where(F.lower(F.col("key")) == normalized_key)
            .select("value")
            .limit(1)
            .collect()
        )
    except AnalysisException:
        return None
    if not rows:
        return None
    # pyspark Row is a tuple subclass without a dict-style .get(); calling
    # .get("value") raises AttributeError, so index by field name instead.
    value = rows[0]["value"]
    if isinstance(value, str):
        return value.strip() or None
    return None




def _normalize_payload_storage_mode(value: str | None) -> str | None:
    normalized = (value or "").strip().lower()
    if not normalized:
        return None
    if normalized in {"inline", "database", "db"}:
        return "inline"
    if normalized in {"artifact", "artifacts", "file", "files", "path", "paths", "dbfs", "cloud"}:
        return "artifact"
    return None


def _resolve_payload_storage_mode() -> str:
    """Pick the payload storage mode: widget first, then metadata setting, then the default."""
    # `or` short-circuits, so the settings table is only queried when the
    # widget gives no recognized mode.
    return (
        _normalize_payload_storage_mode(dbutils.widgets.get("payload_storage"))
        or _normalize_payload_storage_mode(_lookup_metadata_setting("profile_payload_storage_mode"))
        or DEFAULT_PAYLOAD_STORAGE_MODE
    )


def _payload_storage_is_artifact(mode: str) -> bool:
    """Report whether ``mode`` (trimmed, case-insensitive) equals ``"artifact"``."""
    normalized = (mode or "").strip().lower()
    return normalized == "artifact"


def _encode_payload_json(payload: dict[str, object]) -> str | None:
    try:
        return json.dumps(payload, separators=(",", ":"))
    except TypeError as exc:
        print(f"Unable to serialize profiling payload: {exc}")
        return None


def _resolve_callback_behavior() -> str:
    """Pick the callback behavior: widget first, then metadata setting, then the default."""
    # `or` short-circuits, so the settings table is only queried when the
    # widget is blank.
    return (
        (dbutils.widgets.get("callback_behavior") or "").strip().lower()
        or (_lookup_metadata_setting("profile_callback_behavior") or "").strip().lower()
        or DEFAULT_CALLBACK_BEHAVIOR
    )


def _callbacks_enabled(behavior: str) -> bool:
    """Report whether ``behavior`` turns API callbacks on.

    Unrecognized values print a notice and are treated as disabled.
    """
    enabling = {"api", "callback", "legacy"}
    disabling = {"metadata_only", "metadata", "skip", "disabled", "off"}
    if behavior in enabling:
        return True
    if behavior not in disabling:
        print(f"Unknown callback behavior '{behavior}'; defaulting to metadata_only.")
    return False





def _sql_string_literal(value: str | None) -> str:
    if value is None:
        return "NULL"
    escaped = str(value).replace("'", "''")
    return f"'{escaped}'"


def _sql_numeric_literal(value: int | float | None) -> str:
    if value is None:
        return "NULL"
    try:
        return str(int(value))
    except (TypeError, ValueError):
        return "NULL"


def _normalize_temp_view_name(suffix: str | None) -> str:
    cleaned = re.sub(r"[^a-zA-Z0-9_]", "_", (suffix or "profile_run"))
    return f"_profile_anomalies_{cleaned}"


def _parse_anomaly_timestamp(value: str | None) -> datetime | None:
    if not value:
        return None
    candidate = value.strip()
    if not candidate:
        return None
    if candidate.endswith("Z"):
        candidate = f"{candidate[:-1]}+00:00"
    with suppress(ValueError):
        return datetime.fromisoformat(candidate)
    return None




In [None]:
# Column/value persistence helpers and overrides
import datetime as dt
from datetime import datetime

def _escape_identifier(identifier: str) -> str:
    """Wrap an identifier in backticks after trimming it and dropping embedded backticks.

    Raises:
        ValueError: If nothing is left after cleaning.
    """
    bare = (identifier or "").strip().replace("`", "")
    if bare:
        return f"`{bare}`"
    raise ValueError("Metadata identifiers cannot be empty.")

def _metadata_schema_reference() -> str:
    """Return the escaped metadata schema, prefixed with the escaped catalog when one is set.

    Raises:
        ValueError: If ``dq_schema`` is empty or an identifier cleans to nothing.
    """
    if not dq_schema:
        raise ValueError("data_quality_schema widget must be set before resolving metadata tables.")
    catalog_name = (connection_catalog or "").strip()
    if not catalog_name:
        return _escape_identifier(dq_schema)
    return ".".join((_escape_identifier(catalog_name), _escape_identifier(dq_schema)))

def _metadata_table(table_name: str) -> str:
    """Return the fully qualified, backtick-escaped name of a metadata table."""
    schema_reference = _metadata_schema_reference()
    return ".".join((schema_reference, _escape_identifier(table_name)))

def _first_non_empty(*values):
    """Return the first usable argument, or ``None`` when there is none.

    Strings are trimmed and skipped when blank; any other non-``None`` value
    (including ``0`` and ``False``) is returned as-is.
    """
    for candidate in values:
        if candidate is None:
            continue
        if not isinstance(candidate, str):
            return candidate
        trimmed = candidate.strip()
        if trimmed:
            return trimmed
    return None

def _coerce_int(value):
    """Best-effort conversion of a payload value to ``int``.

    Args:
        value: ``None``, bool, int, float or numeric text (thousands
            separators allowed).

    Returns:
        The integer, or ``None`` when the value is missing, non-finite,
        unparseable or of an unsupported type. Floats are rounded with
        ``round()``; decimal text is truncated toward zero.
    """
    if value is None:
        return None
    # bool is a subclass of int; handle it first so True/False become 1/0.
    if isinstance(value, bool):
        return int(value)
    if isinstance(value, int):
        return value
    if isinstance(value, float):
        if not math.isfinite(value):
            return None
        return int(round(value))
    if isinstance(value, str):
        candidate = value.strip().replace(",", "")
        if not candidate:
            return None
        try:
            if "." in candidate:
                return int(float(candidate))
            return int(candidate)
        except (ValueError, OverflowError):
            # OverflowError: text such as "1.0e999" parses to infinity.
            return None
    return None

def _coerce_float(value):
    """Best-effort conversion of a payload value to a finite ``float``.

    Accepts bools, ints, floats and numeric text (thousands separators
    allowed). Returns ``None`` for missing, blank, unparseable, non-finite or
    unsupported input.
    """
    if value is None:
        return None
    if isinstance(value, str):
        text = value.strip().replace(",", "")
        if not text:
            return None
        try:
            parsed = float(text)
        except ValueError:
            return None
    elif isinstance(value, (bool, int, float)):
        parsed = float(value)
    else:
        return None
    return parsed if math.isfinite(parsed) else None

def _coerce_date(value):
    """Best-effort conversion of a payload value to a ``datetime.date``.

    Args:
        value: ``None``, a ``date``, a ``datetime`` or ISO-8601 text (a
            trailing ``Z`` is accepted as UTC).

    Returns:
        A plain ``date``, or ``None`` when the value is missing, blank,
        unparseable or of an unsupported type.
    """
    if value is None:
        return None
    # datetime subclasses date, so it must be tested first; otherwise the
    # datetime would be returned as-is instead of being reduced to a date.
    if isinstance(value, datetime):
        return value.date()
    if isinstance(value, dt.date):
        return value
    if isinstance(value, str):
        candidate = value.strip()
        if not candidate:
            return None
        normalized = f"{candidate[:-1]}+00:00" if candidate.endswith("Z") else candidate
        try:
            return datetime.fromisoformat(normalized).date()
        except ValueError:
            pass
        # Fall back to a strict calendar-date parse.
        with suppress(ValueError):
            return datetime.strptime(candidate, "%Y-%m-%d").date()
    return None

def _stringify_value(value, limit: int | None = MAX_VALUE_DISPLAY_LENGTH):
    """Render a value as trimmed display text, truncated to ``limit`` characters.

    Bytes are decoded as UTF-8 with replacement, dates and datetimes use ISO
    format, and everything else goes through ``str()``. Returns ``None`` for
    ``None`` or text that is blank after trimming; ``limit=None`` disables
    truncation.
    """
    if value is None:
        return None
    if isinstance(value, bytes):
        text = value.decode("utf-8", errors="replace")
    elif isinstance(value, (datetime, dt.date)):
        text = value.isoformat()
    else:
        text = str(value)
    text = text.strip()
    if not text:
        return None
    within_limit = limit is None or len(text) <= limit
    return text if within_limit else text[:limit]

def _hash_value(value: str | None) -> str | None:
    if not value:
        return None
    return hashlib.sha1(value.encode("utf-8")).hexdigest()

def _collect_metrics(column_entry: dict[str, object]) -> dict[str, object]:
    """Merge the ``metrics`` and ``summary`` sub-dicts of a column entry.

    ``None`` values are skipped, and for duplicate names the first non-``None``
    value wins (``metrics`` is read before ``summary``).
    """
    merged: dict[str, object] = {}
    for section_name in ("metrics", "summary"):
        section = column_entry.get(section_name)
        if not isinstance(section, dict):
            continue
        for name, metric in section.items():
            if metric is not None and name not in merged:
                merged[name] = metric
    return merged

def _metric_lookup(column_entry: dict[str, object], metrics_map: dict[str, object], *keys):
    """Return the first non-``None`` value found under any of ``keys``.

    Keys are tried in order; for each key the merged metrics map is checked
    before the column entry itself.
    """
    for name in keys:
        for source in (metrics_map, column_entry):
            found = source.get(name)
            if found is not None:
                return found
    return None

def _infer_general_type(data_type: str | None) -> str | None:
    if not data_type:
        return None
    lowered = data_type.lower()
    if any(token in lowered for token in ("int", "decimal", "number", "double", "float")):
        return "N"
    if any(token in lowered for token in ("date", "time", "timestamp")):
        return "D"
    if "bool" in lowered or "bit" in lowered:
        return "B"
    if any(token in lowered for token in ("char", "string", "text")):
        return "A"
    return "X"

def _encode_metrics_blob(metrics_map: dict[str, object]) -> str | None:
    if not metrics_map:
        return None
    normalized: dict[str, object] = {}
    for key, value in metrics_map.items():
        if isinstance(value, (datetime, dt.date)):
            normalized[key] = value.isoformat()
        elif isinstance(value, (int, float, bool)) or value is None:
            normalized[key] = value
        else:
            normalized[key] = _stringify_value(value, limit=None)
    try:
        return json.dumps(normalized, separators=(",", ":"))
    except TypeError:
        return None

def _extract_column_name(column_entry: dict[str, object]) -> str | None:
    """Return the column's name from the first populated alias key, or ``None``."""
    alias_keys = ("column_name", "column", "name", "columnName")
    return _first_non_empty(*[column_entry.get(alias) for alias in alias_keys])

def _extract_table_entries(results_payload: dict[str, object] | list) -> list[dict[str, object]]:
    if isinstance(results_payload, dict):
        for key in ("tables", "table_profiles", "tablesProfiled"):
            candidate = results_payload.get(key)
            if isinstance(candidate, list):
                return [entry for entry in candidate if isinstance(entry, dict)]
        single = results_payload.get("table")
        if isinstance(single, dict):
            return [single]
    if isinstance(results_payload, list):
        return [entry for entry in results_payload if isinstance(entry, dict)]
    return []

def _collect_column_entries(table_entry: dict[str, object]) -> list[dict[str, object]]:
    """Return the column dicts of a table entry.

    Uses the first of ``columns``, ``column_profiles`` or ``columnsProfiled``
    that holds a list or a dict (dict values are used); non-dict items are
    dropped. Returns an empty list when no key qualifies.
    """
    for container_key in ("columns", "column_profiles", "columnsProfiled"):
        container = table_entry.get(container_key)
        if isinstance(container, dict):
            container = list(container.values())
        if isinstance(container, list):
            return [item for item in container if isinstance(item, dict)]
    return []

def _extract_table_context(table_entry: dict[str, object]) -> dict[str, object]:
    """Resolve schema, table and qualified names for a table entry.

    Each name comes from the first populated alias key. The schema falls back
    to ``connection_schema``, and the qualified name falls back to
    ``"<schema>.<table>"`` when both parts are known.
    """

    def from_entry(*alias_keys):
        return [table_entry.get(alias) for alias in alias_keys]

    schema_name = _first_non_empty(
        *from_entry("schema_name", "schema", "schemaName"),
        connection_schema,
    )
    table_name = _first_non_empty(
        *from_entry("table_name", "table", "name", "tableName", "physical_name", "physicalName")
    )
    qualified_name = _first_non_empty(
        *from_entry("qualified_name", "qualifiedName", "physical_name", "physicalName")
    )
    if not qualified_name and schema_name and table_name:
        qualified_name = f"{schema_name}.{table_name}"
    return {
        "schema_name": schema_name,
        "table_name": table_name,
        "qualified_name": qualified_name,
    }

def _build_column_row(
    context: dict[str, object],
    column_entry: dict[str, object],
    generated_at: datetime,
    ordinal_fallback: int,
):
    """Flatten one column profile into a row dict keyed like ``PROFILE_COLUMN_FIELDS``.

    Args:
        context: Table context with ``schema_name``, ``table_name`` and
            ``qualified_name``.
        column_entry: Raw column profile from the payload.
        generated_at: Timestamp used when the entry carries none of its own.
        ordinal_fallback: Position used when the entry has no ordinal.

    Returns:
        The row dict, or ``None`` when the column or table name is missing.
    """
    column_name = _extract_column_name(column_entry)
    table_name = context.get("table_name")
    if not (column_name and table_name):
        return None

    schema_name = context.get("schema_name") or connection_schema or dq_schema
    metrics_map = _collect_metrics(column_entry)

    def metric(*aliases):
        # Look in the merged metrics first, then on the entry itself.
        return _metric_lookup(column_entry, metrics_map, *aliases)

    def from_entry(*aliases):
        return [column_entry.get(alias) for alias in aliases]

    # Counts; non_null_count is derived when only rows and nulls are known.
    row_count = _coerce_int(metric("row_count", "rows", "total_rows"))
    null_count = _coerce_int(metric("null_count"))
    non_null_count = _coerce_int(metric("non_null_count", "nonnull_count", "valid_count"))
    if non_null_count is None and row_count is not None and null_count is not None:
        non_null_count = max(row_count - null_count, 0)
    distinct_count = _coerce_int(metric("distinct_count", "cardinality"))

    # Value statistics.
    min_value = _stringify_value(metric("min_value", "min"))
    max_value = _stringify_value(metric("max_value", "max"))
    avg_value = _coerce_float(metric("avg_value", "avg", "mean"))
    stddev_value = _coerce_float(metric("stddev_value", "std_dev", "stddev"))
    median_value = _coerce_float(metric("median_value", "median", "p50"))
    p95_value = _coerce_float(metric("p95_value", "p95"))
    true_count = _coerce_int(metric("true_count", "trues"))
    false_count = _coerce_int(metric("false_count", "falses"))

    # Text statistics.
    min_length = _coerce_int(metric("min_length", "length_min"))
    max_length = _coerce_int(metric("max_length", "length_max"))
    avg_length = _coerce_float(metric("avg_length", "length_avg"))
    non_ascii_ratio = _coerce_float(metric("non_ascii_ratio", "nonAsciiPercent"))

    # Date statistics; the span is derived when both bounds are known.
    min_date = _coerce_date(metric("min_date", "minDate"))
    max_date = _coerce_date(metric("max_date", "maxDate"))
    date_span_days = _coerce_int(metric("date_span_days", "dateSpanDays"))
    if date_span_days is None and min_date and max_date:
        date_span_days = (max_date - min_date).days

    data_type = _first_non_empty(*from_entry("data_type", "type", "dataType"))
    general_type = _first_non_empty(
        *from_entry("general_type", "type_category", "typeCategory"),
        _infer_general_type(data_type),
    )
    qualified_name = _first_non_empty(
        *from_entry("qualified_name", "qualifiedName"),
        context.get("qualified_name"),
    )

    ordinal_position = _coerce_int(metric("ordinal_position", "position", "index", "ordinal"))
    if ordinal_position is None:
        ordinal_position = ordinal_fallback

    # Prefer the entry's own timestamp; otherwise use the run-level one.
    supplied_timestamp = column_entry.get("generated_at")
    if not isinstance(supplied_timestamp, datetime):
        supplied_timestamp = _parse_anomaly_timestamp(supplied_timestamp)
    row_generated_at = supplied_timestamp or generated_at

    metrics_json = column_entry.get("metrics_json") or _encode_metrics_blob(metrics_map)

    return {
        "profile_run_id": profile_run_id,
        "schema_name": schema_name,
        "table_name": table_name,
        "column_name": column_name,
        "qualified_name": qualified_name,
        "data_type": data_type,
        "general_type": general_type,
        "ordinal_position": ordinal_position,
        "row_count": row_count,
        "null_count": null_count,
        "non_null_count": non_null_count,
        "distinct_count": distinct_count,
        "min_value": min_value,
        "max_value": max_value,
        "avg_value": avg_value,
        "stddev_value": stddev_value,
        "median_value": median_value,
        "p95_value": p95_value,
        "true_count": true_count,
        "false_count": false_count,
        "min_length": min_length,
        "max_length": max_length,
        "avg_length": avg_length,
        "non_ascii_ratio": non_ascii_ratio,
        "min_date": min_date,
        "max_date": max_date,
        "date_span_days": date_span_days,
        "metrics_json": metrics_json,
        "generated_at": row_generated_at,
    }

def _normalize_ratio(value):
    """Coerce ``value`` to a float ratio, dividing anything above 1.0 by 100.

    Returns ``None`` when the value cannot be coerced to a finite float.
    """
    ratio = _coerce_float(value)
    if ratio is not None and ratio > 1.0:
        # Values above 1 are treated as percentages.
        return ratio / 100.0
    return ratio

def _extract_top_values(column_entry: dict[str, object]) -> list[dict[str, object]]:
    """Return the dict items of the first list found under a top-values key."""
    for alias in ("top_values", "frequencies", "topValues"):
        listed = column_entry.get(alias)
        if not isinstance(listed, list):
            continue
        return [item for item in listed if isinstance(item, dict)]
    return []

def _extract_histogram_entries(column_entry: dict[str, object]) -> list[dict[str, object]]:
    """Return the dict items of the first list found under ``histogram`` or ``bins``."""
    for alias in ("histogram", "bins"):
        listed = column_entry.get(alias)
        if not isinstance(listed, list):
            continue
        return [item for item in listed if isinstance(item, dict)]
    return []

def _extract_bucket_bound(item: dict[str, object], *keys):
    """Return the first value under ``keys`` that coerces to a finite float, else ``None``."""
    # The generator is lazy, so coercion stops at the first usable bound.
    coerced = (_coerce_float(item.get(alias)) for alias in keys)
    return next((bound for bound in coerced if bound is not None), None)

def _build_value_rows(
    context: dict[str, object],
    column_entry: dict[str, object],
    generated_at: datetime,
    row_count: int | None,
):
    """Build value-distribution rows (top values, then histogram buckets) for a column.

    Alias fallbacks use ``_first_non_empty`` rather than ``or`` so legitimate
    falsy data (a value of ``0`` or ``False``, a frequency of ``0``, a ratio
    of ``0.0``) is kept instead of being treated as missing.

    Args:
        context: Table context with ``schema_name`` and ``table_name``.
        column_entry: Raw column profile from the payload.
        generated_at: Timestamp stamped on every row.
        row_count: Column row count, used to derive a relative frequency when
            the payload gives none.

    Returns:
        Row dicts keyed like ``PROFILE_COLUMN_VALUES_FIELDS``; empty when the
        column or table name is missing.
    """
    column_name = _extract_column_name(column_entry)
    table_name = context.get("table_name")
    if not column_name or not table_name:
        return []

    schema_name = context.get("schema_name") or connection_schema or dq_schema
    rows: list[dict[str, object]] = []

    def pick(item: dict[str, object], *aliases):
        # First alias holding a non-None, non-blank value.
        return _first_non_empty(*[item.get(alias) for alias in aliases])

    def relative_frequency(item: dict[str, object], frequency: int | None):
        # Prefer the payload's ratio; otherwise derive it from the counts.
        ratio = _normalize_ratio(pick(item, "relative_freq", "ratio", "percentage", "percent"))
        if ratio is None and row_count and row_count > 0 and frequency is not None:
            ratio = frequency / float(row_count)
        return ratio

    top_values = _extract_top_values(column_entry)[:VALUE_DISTRIBUTION_LIMIT]
    for idx, item in enumerate(top_values, start=1):
        value_text = _stringify_value(pick(item, "value", "label"))
        frequency = _coerce_int(pick(item, "frequency", "count", "value_count"))
        # A missing (or zero) rank falls back to the 1-based list position.
        rank = _coerce_int(item.get("rank")) or idx
        relative_freq = relative_frequency(item, frequency)
        if value_text is None and frequency is None:
            continue
        rows.append({
            "profile_run_id": profile_run_id,
            "schema_name": schema_name,
            "table_name": table_name,
            "column_name": column_name,
            "value": value_text,
            "value_hash": _hash_value(value_text),
            "frequency": frequency,
            "relative_freq": relative_freq,
            "rank": rank,
            "bucket_label": None,
            "bucket_lower_bound": None,
            "bucket_upper_bound": None,
            "generated_at": generated_at,
        })

    for item in _extract_histogram_entries(column_entry):
        bucket_label = _stringify_value(pick(item, "bucket_label", "bucketLabel", "label", "range"))
        value_text = _stringify_value(_first_non_empty(item.get("value"), bucket_label))
        frequency = _coerce_int(pick(item, "frequency", "count"))
        relative_freq = relative_frequency(item, frequency)
        if bucket_label is None and value_text is None and frequency is None:
            continue
        rows.append({
            "profile_run_id": profile_run_id,
            "schema_name": schema_name,
            "table_name": table_name,
            "column_name": column_name,
            "value": value_text,
            "value_hash": _hash_value(value_text or bucket_label),
            "frequency": frequency,
            "relative_freq": relative_freq,
            "rank": None,
            "bucket_label": bucket_label,
            "bucket_lower_bound": _extract_bucket_bound(item, "bucket_lower_bound", "lower", "min"),
            "bucket_upper_bound": _extract_bucket_bound(item, "bucket_upper_bound", "upper", "max"),
            "generated_at": generated_at,
        })

    return rows

def _build_profile_detail_rows(results_payload: dict[str, object] | list):
    """Flatten a profiling payload into column-level and value-level rows.

    Each table entry found in ``results_payload`` is resolved to a context;
    entries whose context has no ``table_name``, or that yield no column
    entries, are ignored. Every row produced by a single call shares one
    ``generated_at`` timestamp.

    Returns:
        A ``(column_rows, value_rows)`` pair of lists of dicts. Both lists are
        empty when the payload contains no table entries.
    """
    table_entries = _extract_table_entries(results_payload)
    if not table_entries:
        return [], []

    stamp = datetime.utcnow()
    column_rows: list[dict[str, object]] = []
    value_rows: list[dict[str, object]] = []

    for entry in table_entries:
        table_context = _extract_table_context(entry)
        if not table_context.get("table_name"):
            continue
        # A falsy column collection simply produces no iterations.
        column_entries = _collect_column_entries(entry) or ()
        for position, column_entry in enumerate(column_entries, start=1):
            built = _build_column_row(table_context, column_entry, stamp, ordinal_fallback=position)
            if built:
                column_rows.append(built)
                # The column's row_count is forwarded to the value-row builder.
                value_rows.extend(
                    _build_value_rows(table_context, column_entry, stamp, built.get("row_count"))
                )

    return column_rows, value_rows

def _normalize_temp_view_name(suffix: str | None, *, prefix: str = "profile_anomalies") -> str:
    cleaned_suffix = re.sub(r"[^a-zA-Z0-9_]", "_", (suffix or "profile_run"))
    cleaned_prefix = re.sub(r"[^a-zA-Z0-9_]", "_", prefix)
    return f"_{cleaned_prefix}_{cleaned_suffix}"

def _coerce_payload_structure(payload_candidate: object | None) -> dict[str, object] | list | None:
    if isinstance(payload_candidate, (dict, list)):
        return payload_candidate
    if isinstance(payload_candidate, (bytes, bytearray)):
        payload_candidate = payload_candidate.decode("utf-8", errors="replace")
    if isinstance(payload_candidate, str):
        candidate = payload_candidate.strip()
        if candidate:
            with suppress(json.JSONDecodeError):
                decoded = json.loads(candidate)
                if isinstance(decoded, (dict, list)):
                    return decoded
    return None


def _load_payload_from_location(payload_location: str | None) -> dict[str, object] | list | None:
    trimmed = (payload_location or "").strip()
    if not trimmed:
        return None
    try:
        if trimmed.startswith("/dbfs/"):
            with open(trimmed, "r", encoding="utf-8") as handle:
                return _coerce_payload_structure(handle.read())
        if trimmed.startswith("dbfs:/") or trimmed.startswith("file:/") or trimmed.startswith("abfss:/") or trimmed.startswith("s3:/"):
            with dbutils.fs.open(trimmed, "r") as handle:
                return _coerce_payload_structure(handle.read())
        if _has_uri_scheme(trimmed):
            response = requests.get(trimmed, timeout=30)
            response.raise_for_status()
            return _coerce_payload_structure(response.text)
    except Exception as exc:
        print(f"Unable to load profiling payload from {trimmed}: {exc}")
    return None


def _persist_profile_detail_tables(results_payload: dict[str, object] | list | None, payload_location: str | None = None) -> None:
    """Replace the stored column and column-value detail rows for the current run.

    The inline ``results_payload`` is preferred; when it cannot be coerced to
    a dict/list, the payload is loaded from ``payload_location`` instead. If
    neither source yields a payload, nothing is deleted or inserted.

    Existing rows for ``profile_run_id`` are deleted from both detail tables
    (an ``AnalysisException`` during a delete is printed and ignored). The
    freshly built rows are then inserted through temp views, which are
    dropped afterwards whether or not the insert succeeded.
    """
    structured = _coerce_payload_structure(results_payload)
    if structured is None:
        structured = _load_payload_from_location(payload_location)
        if structured is None:
            print("No profiling payload available for detail persistence; skipping column/value inserts.")
            return

    columns_table = _metadata_table("dq_profile_columns")
    values_table = _metadata_table("dq_profile_column_values")

    column_rows, value_rows = _build_profile_detail_rows(structured)
    run_filter = f"WHERE profile_run_id = {_sql_string_literal(profile_run_id)}"

    # Clear any rows left by a previous attempt at this run.
    for detail_table in (columns_table, values_table):
        try:
            spark.sql(f"DELETE FROM {detail_table} {run_filter}")
        except AnalysisException as exc:
            print(f"Unable to delete existing rows from {detail_table}: {exc}")

    if not column_rows:
        print(f"No column metrics extracted for run {profile_run_id}.")
    else:
        staged_columns = spark.createDataFrame(column_rows, PROFILE_COLUMNS_SCHEMA).select(*PROFILE_COLUMN_FIELDS)
        columns_view = _normalize_temp_view_name(profile_run_id, prefix="profile_columns")
        try:
            staged_columns.createOrReplaceTempView(columns_view)
            column_list = ", ".join(PROFILE_COLUMN_FIELDS)
            insert_columns_sql = (
                f"INSERT INTO {columns_table} ({column_list}) SELECT {column_list} FROM {columns_view}"
            )
            spark.sql(insert_columns_sql)
            print(f"Persisted {len(column_rows)} column metrics for run {profile_run_id}.")
        finally:
            # Dropping the view is best-effort; a failure here must not mask the insert result.
            with suppress(Exception):
                spark.catalog.dropTempView(columns_view)

    if not value_rows:
        print(f"No column value distributions extracted for run {profile_run_id}.")
    else:
        staged_values = spark.createDataFrame(value_rows, PROFILE_COLUMN_VALUES_SCHEMA).select(*PROFILE_COLUMN_VALUES_FIELDS)
        values_view = _normalize_temp_view_name(profile_run_id, prefix="profile_column_values")
        try:
            staged_values.createOrReplaceTempView(values_view)
            value_list = ", ".join(PROFILE_COLUMN_VALUES_FIELDS)
            insert_values_sql = (
                f"INSERT INTO {values_table} ({value_list}) SELECT {value_list} FROM {values_view}"
            )
            spark.sql(insert_values_sql)
            print(f"Persisted {len(value_rows)} column value rows for run {profile_run_id}.")
        finally:
            with suppress(Exception):
                spark.catalog.dropTempView(values_view)

def _persist_results_to_metadata(results_payload: dict[str, object] | None, payload_location: str | None) -> None:
    """Write the outcome of the current profiling run to the metadata tables.

    Steps, in order:
      1. Update the run's ``dq_profiles`` row (status, completion time, row and
         anomaly counts, payload path). Summary fields are read only from a
         dict payload; any other payload is treated as empty, so the status
         becomes ``'unknown'``.
      2. Rebuild the column/value detail tables from the raw payload (or from
         ``payload_location`` when no inline payload is usable).
      3. Replace the run's rows in ``dq_profile_anomalies``.

    Args:
        results_payload: Inline profiling payload, if one is available.
        payload_location: Path/URI of the stored payload, recorded on the run.

    Raises:
        ValueError: If ``profile_run_id`` is empty.
    """
    if not profile_run_id:
        raise ValueError("profile_run_id widget is required before persisting profiling metadata.")

    raw_payload = results_payload
    payload = raw_payload if isinstance(raw_payload, dict) else {}

    profiles_table = _metadata_table("dq_profiles")
    anomalies_table = _metadata_table("dq_profile_anomalies")

    assignments = [
        f"status = {_sql_string_literal(payload.get('status') or 'unknown')}",
        "completed_at = current_timestamp()",
        f"row_count = {_sql_numeric_literal(payload.get('row_count'))}",
        f"anomaly_count = {_sql_numeric_literal(payload.get('anomaly_count'))}",
        f"payload_path = {_sql_string_literal(payload_location)}",
    ]

    update_sql = (
        f"UPDATE {profiles_table} "
        f"SET {', '.join(assignments)} "
        f"WHERE profile_run_id = {_sql_string_literal(profile_run_id)}"
    )
    spark.sql(update_sql)
    print(f"Updated dq_profiles entry for run {profile_run_id}.")

    # The raw payload (not the dict-only view) is passed on so list payloads still work.
    _persist_profile_detail_tables(raw_payload, payload_location)

    anomalies = list(payload.get("anomalies") or [])
    delete_sql = f"DELETE FROM {anomalies_table} WHERE profile_run_id = {_sql_string_literal(profile_run_id)}"
    spark.sql(delete_sql)

    # Only dict entries can be mapped to a row. A malformed entry used to raise
    # AttributeError here, after the run's existing anomalies were already
    # deleted, so skip such entries and report them instead.
    anomaly_rows = [
        {
            "profile_run_id": profile_run_id,
            "table_name": anomaly.get("table_name"),
            "column_name": anomaly.get("column_name"),
            "anomaly_type": anomaly.get("anomaly_type"),
            "severity": anomaly.get("severity"),
            "description": anomaly.get("description"),
            "detected_at": _parse_anomaly_timestamp(anomaly.get("detected_at")) or datetime.utcnow(),
        }
        for anomaly in anomalies
        if isinstance(anomaly, dict)
    ]
    skipped = len(anomalies) - len(anomaly_rows)
    if skipped:
        print(f"Skipped {skipped} malformed anomaly entries for run {profile_run_id}.")

    if not anomaly_rows:
        print(f"No anomalies to persist for run {profile_run_id}.")
        return

    anomalies_df = spark.createDataFrame(anomaly_rows, PROFILE_ANOMALIES_SCHEMA).select(*PROFILE_ANOMALIES_FIELDS)
    view_name = _normalize_temp_view_name(profile_run_id)
    try:
        anomalies_df.createOrReplaceTempView(view_name)
        spark.sql(
            f"INSERT INTO {anomalies_table} "
            "(profile_run_id, table_name, column_name, anomaly_type, severity, description, detected_at) "
            f"SELECT profile_run_id, table_name, column_name, anomaly_type, severity, description, detected_at FROM {view_name}"
        )
    finally:
        # Dropping the view is best-effort cleanup.
        with suppress(Exception):
            spark.catalog.dropTempView(view_name)

    print(f"Persisted {len(anomaly_rows)} anomalies for run {profile_run_id}.")

In [None]:
# Finalize the profiling job by persisting payload artifacts and metadata updates.
# `os` and `re` are used by the artifact helpers defined below in this cell.
import os
import re

# Progress marker for the job log; `profile_run_id` comes from the notebook widgets.
print(f"Finalizing profiling run {profile_run_id}...")


def _guess_profiling_payload():
    for key in ("profiling_payload", "profile_payload", "results_payload", "payload", "raw_payload"):
        candidate = globals().get(key)
        if isinstance(candidate, (dict, list)):
            return candidate
    return None


def _sanitize_segment(value, fallback):
    cleaned = (value or fallback or "").strip() or fallback
    return re.sub(r"[^a-zA-Z0-9_.-]", "_", cleaned)


def _default_artifact_path():
    """Build the default artifact location ``<root>/<table_group>/<profile_run>.json``.

    The root is the ``payload_base_path`` widget value if set, otherwise
    ``DEFAULT_PRIVATE_PAYLOAD_ROOT``, otherwise a fixed DBFS temp directory;
    trailing slashes on the root are removed. The group and run identifiers are
    sanitized before being used as path segments.
    """
    root = payload_base_path or DEFAULT_PRIVATE_PAYLOAD_ROOT or "dbfs:/tmp/conversioncentral/profiles"
    segments = (
        root.rstrip("/"),
        _sanitize_segment(table_group_id, "table_group"),
        _sanitize_segment(profile_run_id, "profile_run") + ".json",
    )
    return "/".join(segments)


def _materialize_payload_artifact(payload_obj, target_path):
    """Serialize ``payload_obj`` as JSON at ``target_path`` and return the final location.

    A ``/dbfs/...`` path is first rewritten to its ``dbfs:/`` form, and every
    ``dbfs:/`` destination is passed through ``_redirect_dbfs_path``, whose
    result replaces the destination.

    Args:
        payload_obj: The profiling payload to write.
        target_path: Destination path or URI.

    Returns:
        The location actually written to, or ``None`` when a ``dbfs:/``
        destination could not be redirected (nothing is written in that case).
    """
    destination = target_path
    if destination.startswith("/dbfs/"):
        # len("/dbfs/") == 6: keep everything after the mount prefix.
        destination = f"dbfs:/{destination[6:]}"
    if destination.startswith("dbfs:/"):
        redirected = _redirect_dbfs_path(destination)
        if not redirected:
            return None
        destination = redirected

    # Fall back to plain json.dumps when the project encoder returns nothing.
    encoded = _encode_payload_json(payload_obj) or json.dumps(payload_obj, default=str)

    if destination.startswith(("dbfs:/", "file:/")):
        _mkdirs_if_supported(destination)
        dbutils.fs.put(destination, encoded, True)
        return destination
    if destination.startswith(("abfss:/", "s3:/")):
        # Object-store URIs are not local paths. They previously fell through
        # to os.makedirs/open below, which wrote into a stray local directory
        # while this function reported the remote URI as written. Write them
        # with dbutils.fs.put instead.
        dbutils.fs.put(destination, encoded, True)
        return destination

    # Anything else is treated as a local filesystem path.
    directory = os.path.dirname(destination)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(destination, "w", encoding="utf-8") as handle:
        handle.write(encoded)
    return destination


# Gather the inputs for persistence: an in-memory payload left in the notebook
# globals (if any), the payload_path widget value, and the payload storage mode.
# NOTE(review): `results_payload` is itself one of the names that
# _guess_profiling_payload() inspects, so re-running this cell can pick up the
# value assigned by a previous run.
results_payload = _guess_profiling_payload()
payload_location = payload_path or None
storage_mode = _resolve_payload_storage_mode()

# No location was supplied but a payload exists in memory: write it out as an
# artifact when _payload_storage_is_artifact() says the storage mode calls for one.
if not payload_location and results_payload is not None and _payload_storage_is_artifact(storage_mode):
    target_path = _default_artifact_path()
    resolved_location = _materialize_payload_artifact(results_payload, target_path)
    if resolved_location:
        payload_location = resolved_location
        print(f"Persisted profiling payload artifact to {payload_location}.")
    else:
        # A falsy result means nothing was written; the in-memory payload is still persisted below.
        print("Unable to persist payload artifact; continuing with inline payload only.")

# Normalize a local /dbfs/ path to its dbfs:/ URI form before it is recorded as payload_path.
if payload_location and payload_location.startswith("/dbfs/"):
    payload_location = f"dbfs:/{payload_location[6:]}"

# With neither a payload nor a location there is nothing to build detail rows from.
if results_payload is None and not payload_location:
    print("WARNING: no profiling payload or artifact path detected; detail tables will remain empty.")

# Update dq_profiles, the detail tables, and the anomalies table for this run.
_persist_results_to_metadata(results_payload, payload_location)
print("Profiling metadata persistence completed.")