# ConversionCentral Managed Profiling
Run this notebook from a Databricks Repo so backend deployments control profiling logic.

In [None]:
# Collect parameters passed by the FastAPI backend


# Each widget is declared up front so Databricks jobs can safely supply overrides.
# All of them are text widgets whose default value is the empty string.
for _widget_name in (
    "table_group_id",
    "profile_run_id",
    "data_quality_schema",
    "payload_path",
    "payload_base_path",
    "callback_url",
    "callback_base_url",
    "callback_token",
    "payload_storage",
    "callback_behavior",
    "catalog",
    "schema_name",
    "connection_id",
    "connection_name",
    "system_id",
    "project_key",
    "http_path",
):
    dbutils.widgets.text(_widget_name, "")


from datetime import datetime
import json
import requests
from pyspark.sql import SparkSession


spark = SparkSession.builder.getOrCreate()


# Ordered (stage label, human-readable cell hint) pairs. The hint text is
# interpolated into the error raised by _ensure_notebook_stage.
_NOTEBOOK_STAGE_SEQUENCE = (
    ("parameters", "Cell 2 (widget and Spark initialization)"),
    ("profiling", "Cell 3 (profiling logic)"),
    ("payload_persistence", "Cell 4 (payload persistence and callbacks)"),
    ("metadata_helpers", "Cell 5 (metadata helper definitions)"),
    ("finalization", "Cell 6 (final persistence and callbacks)")
)
# Stage label -> (position in the sequence, cell hint).
_NOTEBOOK_STAGE_LOOKUP = {label: (idx, hint) for idx, (label, hint) in enumerate(_NOTEBOOK_STAGE_SEQUENCE)}
# Global names each stage is expected to have defined once it has run.
# _ensure_notebook_stage looks these up in globals() for every stage that
# precedes the one being entered; stages without an entry require nothing.
_STAGE_SYMBOL_REQUIREMENTS = {
    "parameters": ("table_group_id", "profile_run_id", "dq_schema"),
    "profiling": ("MAX_COLUMNS_TO_PROFILE",),
    "payload_persistence": ("_resolve_payload_storage_mode", "_payload_storage_is_artifact"),
    "metadata_helpers": ("_persist_results_to_metadata",),
}


def _ensure_notebook_stage(stage_label: str) -> None:
    """Check that every stage preceding ``stage_label`` has already run.

    For each earlier stage, the names listed in ``_STAGE_SYMBOL_REQUIREMENTS``
    must be present in ``globals()``.

    Raises:
        ValueError: ``stage_label`` is not a known stage.
        RuntimeError: an earlier stage is missing one or more required names.
    """
    entry = _NOTEBOOK_STAGE_LOOKUP.get(stage_label)
    if entry is None:
        raise ValueError(f"Unknown notebook stage '{stage_label}'.")

    position, current_hint = entry
    namespace = globals()
    for earlier_label, earlier_hint in _NOTEBOOK_STAGE_SEQUENCE[:position]:
        absent = sorted(
            name
            for name in _STAGE_SYMBOL_REQUIREMENTS.get(earlier_label, ())
            if name not in namespace
        )
        if not absent:
            continue
        # Report the first incomplete stage; later stages depend on it anyway.
        raise RuntimeError(
            "Profiling notebook Cells 1-6 must run sequentially. "
            f"Run {earlier_hint} before {current_hint} (missing: {', '.join(absent)})."
        )


# Resolve widget values into module-level names used by later cells.
# The two identifiers are stripped like every other widget so that a
# whitespace-only value is rejected by the validation below instead of being
# written into metadata and callback URLs.
table_group_id = (dbutils.widgets.get("table_group_id") or "").strip()
profile_run_id = (dbutils.widgets.get("profile_run_id") or "").strip()
dq_schema = (dbutils.widgets.get("data_quality_schema") or "").strip()
raw_payload_path = (dbutils.widgets.get("payload_path") or "").strip()
# Optional values collapse to None when the widget is blank.
payload_path = raw_payload_path or None
payload_base_path = (dbutils.widgets.get("payload_base_path") or "").strip() or None
callback_url = (dbutils.widgets.get("callback_url") or "").strip() or None
callback_base_url = (dbutils.widgets.get("callback_base_url") or "").strip() or None
callback_token = (dbutils.widgets.get("callback_token") or "").strip() or None
connection_catalog = (dbutils.widgets.get("catalog") or "").strip()
connection_schema = (dbutils.widgets.get("schema_name") or "").strip()


# Both identifiers and the data quality schema are mandatory for a run.
if not table_group_id or not profile_run_id:
    raise ValueError("Required widgets missing: table_group_id/profile_run_id")
if not dq_schema:
    raise ValueError("Data quality schema widget is required for profiling runs.")


_ensure_notebook_stage("parameters")

In [None]:
# Profile the tables registered for this table group and build the result payload.
from datetime import datetime
import re
from contextlib import suppress
from typing import Iterable


# Fail with a readable message when the stage helper has not been defined yet
# (i.e. the cell that defines _ensure_notebook_stage was not executed).
if "_ensure_notebook_stage" not in globals():
    raise RuntimeError("Profiling notebook Cells 1-6 must run sequentially; run Cell 2 before profiling.")


# Verify that the names required from the stages before "profiling" exist.
_ensure_notebook_stage("profiling")


import datetime as dt
import hashlib
import json
import math


from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.utils import AnalysisException


# Profiling limits and thresholds.
# NOTE(review): the code that consumes these values is not visible in this
# section; the meanings below are inferred from the names and should be
# confirmed against the profiling logic.
# MAX_COLUMNS_TO_PROFILE is also the marker _ensure_notebook_stage uses to
# detect that this cell has run.
MAX_COLUMNS_TO_PROFILE = 25
NULL_RATIO_ALERT_THRESHOLD = 0.5
HIGH_NULL_RATIO_THRESHOLD = 0.9
VALUE_DISTRIBUTION_LIMIT = 25
VALUE_DISTRIBUTION_DISTINCT_THRESHOLD = 1000
VALUE_DISTRIBUTION_MAX_ROWS = 5_000_000
MAX_VALUE_DISPLAY_LENGTH = 256


# Field names for a per-column profile record.
# NOTE(review): presumably the column order of the persisted column-profile
# table — verify against the code that writes it.
PROFILE_COLUMN_FIELDS = [
    "profile_run_id",
    "schema_name",
    "table_name",
    "column_name",
    "qualified_name",
    "data_type",
    "general_type",
    "ordinal_position",
    "row_count",
    "null_count",
    "non_null_count",
    "distinct_count",
    "min_value",
    "max_value",
    "avg_value",
    "stddev_value",
    "median_value",
    "p95_value",
    "true_count",
    "false_count",
    "min_length",
    "max_length",
    "avg_length",
    "non_ascii_ratio",
    "min_date",
    "max_date",
    "date_span_days",
    "metrics_json",
    "generated_at",
]


...

In [None]:
# Persist payload and call back into the API


# Fail with a readable message when the stage helper has not been defined yet.
if "_ensure_notebook_stage" not in globals():
    raise RuntimeError("Profiling notebook Cells 1-6 must run sequentially; run prior cells before payload persistence.")


# Verify that the names required from the stages before "payload_persistence" exist.
_ensure_notebook_stage("payload_persistence")


from datetime import datetime
import re
import socket
from contextlib import suppress
from functools import lru_cache
from urllib.parse import urlparse, urlunparse
from typing import Optional


from pyspark.sql import functions as F
from pyspark.sql.utils import AnalysisException


# Default storage roots for payload artifacts.
# NOTE(review): the code that writes to these locations is not visible here.
DEFAULT_PRIVATE_PAYLOAD_ROOT = "dbfs:/tmp/conversioncentral/profiles"
DEFAULT_DRIVER_PAYLOAD_ROOT = "file:/databricks/driver/conversioncentral/profiles"
# Returned by _resolve_callback_behavior when the widget is blank or unrecognized.
DEFAULT_CALLBACK_BEHAVIOR = "metadata_only"
# Returned by _resolve_payload_storage_mode when nothing else selects a mode.
DEFAULT_PAYLOAD_STORAGE_MODE = "inline"
# Canonical storage modes accepted verbatim from the "payload_storage" widget.
_VALID_PAYLOAD_STORAGE_MODES = {"inline", "artifact", "both"}
def _clean_widget_value(value: Optional[str]) -> str:
    return (value or "").strip()
def _resolve_payload_storage_mode() -> str:
    """Resolve the payload storage mode from the ``payload_storage`` widget.

    Canonical modes are returned as-is and known aliases are mapped to their
    canonical mode. Any other value falls back to ``"artifact"`` when a payload
    path was supplied, otherwise to ``DEFAULT_PAYLOAD_STORAGE_MODE``.
    """
    requested = _clean_widget_value(dbutils.widgets.get("payload_storage")).lower()
    if requested in _VALID_PAYLOAD_STORAGE_MODES:
        return requested

    aliases = {
        "inline_only": "inline",
        "inline_metadata": "inline",
        "artifact_only": "artifact",
        "artifact_metadata": "artifact",
    }
    aliased = aliases.get(requested)
    if aliased is not None:
        return aliased

    # No usable widget value: an explicit payload path implies artifact storage.
    return "artifact" if payload_path else DEFAULT_PAYLOAD_STORAGE_MODE
def _payload_storage_is_artifact(mode: str) -> bool:
    normalized = (mode or DEFAULT_PAYLOAD_STORAGE_MODE).strip().lower()
    return normalized in {"artifact", "both"}
# Lower-case message fragments.
# NOTE(review): presumably matched against storage error text to detect
# disabled DBFS / driver filesystem access — the consuming code is not
# visible in this section.
DBFS_DISABLED_MESSAGES = ("public dbfs root is disabled", "access is denied")
DRIVER_DISABLED_MESSAGES = ("local filesystem access is forbidden", "workspacelocalfilesystem")
# Matches a leading URI scheme: a letter, then letters/digits/"+"/"."/"-",
# followed by ":/" (case-insensitive).
URI_SCHEME_PATTERN = re.compile(r"^[a-z][a-z0-9+.\-]*:/", re.IGNORECASE)
# One-shot flags, initially unset.
# NOTE(review): presumably flipped after the first notice is printed — confirm.
_DBFS_REDIRECT_NOTICE_EMITTED = False
_STORAGE_DISABLED_NOTICE_EMITTED = False




def _looks_like_dns_failure(error: BaseException) -> bool:
    """Detect DNS resolution failures from nested request exceptions."""
    current = error
    while current:
        if isinstance(current, socket.gaierror):
            return True
        name = current.__class__.__name__.lower()
        if "nameresolution" in name:
            return True
        message = str(current).lower()
        if "temporary failure in name resolution" in message:
            return True
        current = getattr(current, "__cause__", None) or getattr(current, "__context__", None)
    return False




def _rewrite_heroku_app_host(url: Optional[str]) -> Optional[str]:
    """Fallback to canonical Heroku hostname when review-app hosts fail DNS."""
    if not url:
        return None
    parsed = urlparse(url)
    host = parsed.hostname
    if not host:
        return None
    match = re.match(r"^(?P<base>[a-z0-9-]+?)-[0-9a-f]{12}\.herokuapp\.com$", host)
    if not match:
        return None
    canonical_host = f"{match.group('base')}.herokuapp.com"
    netloc = canonical_host
    if parsed.port:
        netloc = f"{canonical_host}:{parsed.port}"
    if parsed.username:
        auth = parsed.username
        if parsed.password:
            auth = f"{auth}:{parsed.password}"
        netloc = f"{auth}@{netloc}"
    scheme = parsed.scheme or "https"
    if scheme.lower() == "http":
        scheme = "https"
    return urlunparse(parsed._replace(netloc=netloc, scheme=scheme))

In [None]:
# Column/value persistence helpers and overrides
import datetime as dt
from datetime import datetime
from contextlib import suppress
from typing import Any, Mapping

# Fail with a readable message when the stage helper has not been defined yet.
if "_ensure_notebook_stage" not in globals():
    raise RuntimeError("Profiling notebook Cells 1-6 must run sequentially; run earlier cells before defining metadata helpers.")


# Verify that the names required from the stages before "metadata_helpers" exist.
_ensure_notebook_stage("metadata_helpers")


def _escape_identifier(identifier: str) -> str:
    cleaned = (identifier or "").strip().replace("`", "")
    if not cleaned:
        raise ValueError("Metadata identifiers cannot be empty.")
    return f"`{cleaned}`"


def _metadata_schema_reference() -> str:
    """Return the escaped metadata schema, qualified by the catalog when one is set.

    Raises:
        ValueError: ``dq_schema`` is empty.
    """
    if not dq_schema:
        raise ValueError("data_quality_schema widget must be set before resolving metadata tables.")
    catalog_name = (connection_catalog or "").strip()
    name_parts = [catalog_name, dq_schema] if catalog_name else [dq_schema]
    return ".".join(_escape_identifier(part) for part in name_parts)


def _metadata_table(table_name: str) -> str:
    """Return the fully qualified, backtick-escaped name of a metadata table."""
    schema_reference = _metadata_schema_reference()
    escaped_table = _escape_identifier(table_name)
    return schema_reference + "." + escaped_table


def _first_non_empty(*values):
    for value in values:
        if isinstance(value, str):
            candidate = value.strip()
            if candidate:
                return candidate
        elif value is not None:
            return value
    return None


def _coerce_int(value):
    if value is None:
        return None
    if isinstance(value, bool):
        return int(value)
    if isinstance(value, int):
        return value
    if isinstance(value, float):
        if not math.isfinite(value):
            return None
        return int(round(value))
    if isinstance(value, str):
        candidate = value.strip().replace(",", "")
        if not candidate:
            return None
        try:
            if "." in candidate:
                return int(float(candidate))
            return int(candidate)
        except ValueError:
            return None
    return None


def _coerce_float(value):
    if value is None:
        return None
    if isinstance(value, bool):
        return float(value)
    if isinstance(value, (int, float)):
        numeric = float(value)
        if math.isfinite(numeric):
            return numeric
        return None
    if isinstance(value, str):
        candidate = value.strip().replace(",", "")
        if not candidate:
            return None
        try:
            numeric = float(candidate)
        except ValueError:
            return None
        return numeric if math.isfinite(numeric) else None
    return None


def _sql_literal(value) -> str:
    if value is None:
        return "NULL"
    if isinstance(value, datetime):
        return f"'{value.strftime('%Y-%m-%d %H:%M:%S')}'"
    text = str(value).replace("'", "''")
    return f"'{text}'"


def _sql_number(value) -> str:
    if value is None:
        return "NULL"
    return str(value)


def _coerce_timestamp_value(value) -> datetime | None:
    if isinstance(value, datetime):
        return value
    if isinstance(value, (int, float)):
        numeric = float(value)
        if abs(numeric) > 1_000_000_000_000:
            numeric /= 1000.0
        with suppress(Exception):
            return datetime.utcfromtimestamp(numeric)
        return None
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return None
        normalized = text[:-1] + "+00:00" if text.endswith("Z") else text
        with suppress(ValueError):
            return datetime.fromisoformat(normalized)
        for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S"):
            with suppress(ValueError):
                return datetime.strptime(normalized, fmt)
    return None


def _resolve_databricks_run_id() -> str | None:
    """Return the current Databricks run id as a string, or ``None``.

    Tries, in order: the ``spark.databricks.job.runId`` Spark conf, the
    notebook context's run id, and ``dbutils.jobs.taskRunId()``. Each lookup is
    best-effort: a failure or an empty value moves on to the next source.
    """
    lookups = (
        lambda: spark.conf.get("spark.databricks.job.runId"),
        lambda: dbutils.notebook.entry_point.getDbutils().notebook().getContext().runId().get(),
        lambda: dbutils.jobs.taskRunId(),
    )
    for lookup in lookups:
        with suppress(Exception):
            found = lookup()
            if found:
                return str(found)
    return None


def _extract_profile_summary(payload: Any) -> Mapping[str, Any]:
    if isinstance(payload, Mapping):
        for key in ("summary", "profile_summary", "profileSummary", "metadata", "run"):
            nested = payload.get(key)
            if isinstance(nested, Mapping):
                return nested
        return payload
    if isinstance(payload, list):
        for item in payload:
            if isinstance(item, Mapping):
                return item
    return {}


def _persist_results_to_metadata(results_payload, payload_location):
    """Replace the ``dq_profiles`` row for the current profile run.

    Derives status, timestamps, row count, and anomaly count from the summary
    extracted from ``results_payload``. It then deletes any existing row for
    ``profile_run_id`` and inserts a fresh one.

    Args:
        results_payload: Profiling payload (mapping, list, or ``None``) from
            which the summary is extracted.
        payload_location: Preferred payload reference; falls back to the
            summary's ``payload_path``/``payloadPath``.

    Raises:
        ValueError: ``table_group_id`` or ``profile_run_id`` is empty, or the
            metadata schema/table identifiers cannot be resolved.
    """
    if not table_group_id:
        raise ValueError("table_group_id must be defined before persisting metadata.")
    if not profile_run_id:
        raise ValueError("profile_run_id must be defined before persisting metadata.")

    summary = _extract_profile_summary(results_payload) if results_payload is not None else {}

    def _first_int(*keys):
        # Coerce each candidate on its own so a legitimate 0 is kept (an
        # ``or`` chain treats 0 as missing) and unparseable values fall
        # through to the next key.
        for key in keys:
            coerced = _coerce_int(summary.get(key))
            if coerced is not None:
                return coerced
        return None

    status = _first_non_empty(summary.get("status"), summary.get("state"), "completed")
    started_at = _coerce_timestamp_value(summary.get("started_at") or summary.get("startedAt"))
    completed_at = _coerce_timestamp_value(summary.get("completed_at") or summary.get("completedAt"))
    row_count = _first_int("row_count", "rowCount", "rows", "total_rows", "totalRows")
    anomaly_count = _first_int("anomaly_count", "anomalyCount")
    if anomaly_count is None:
        # No explicit count: fall back to the length of the anomalies list.
        anomalies = summary.get("anomalies")
        if isinstance(anomalies, (list, tuple)):
            anomaly_count = len(anomalies)

    # Missing timestamps default to "now" in naive UTC.
    if started_at is None:
        started_at = datetime.utcnow()
    if completed_at is None:
        completed_at = datetime.utcnow()

    payload_ref = _first_non_empty(payload_location, summary.get("payload_path"), summary.get("payloadPath"))
    profiles_table = _metadata_table("dq_profiles")
    profile_literal = _sql_literal(profile_run_id)
    # Delete-then-insert keeps a single row per run when the notebook is re-run.
    spark.sql(
        f"DELETE FROM {profiles_table} WHERE {_escape_identifier('profile_run_id')} = {profile_literal}"
    )

    columns = (
        "profile_run_id",
        "table_group_id",
        "status",
        "started_at",
        "completed_at",
        "row_count",
        "anomaly_count",
        "payload_path",
        "databricks_run_id",
    )
    # Values are listed in the same order as ``columns``.
    values = [
        _sql_literal(profile_run_id),
        _sql_literal(table_group_id),
        _sql_literal(status),
        _sql_literal(started_at),
        _sql_literal(completed_at),
        _sql_number(row_count),
        _sql_number(anomaly_count),
        _sql_literal(payload_ref),
        _sql_literal(_resolve_databricks_run_id()),
    ]
    columns_sql = ", ".join(_escape_identifier(column) for column in columns)
    values_sql = ", ".join(values)
    spark.sql(f"INSERT INTO {profiles_table} ({columns_sql}) VALUES ({values_sql})")

    ref_label = payload_ref or "inline"
    print(
        f"Persisted metadata for profile run {profile_run_id} with status '{status}' and payload reference {ref_label}."
    )

In [None]:
# Final metadata persistence and callback dispatch
import json
from typing import Any

# Fail with a readable message when the stage helper has not been defined yet.
if "_ensure_notebook_stage" not in globals():
    raise RuntimeError("Profiling notebook Cells 1-6 must run sequentially; run prior cells before finalization.")


# Verify that the names required from the stages before "finalization" exist.
_ensure_notebook_stage("finalization")


def _resolve_results_payload() -> Any:
    """Pick the richest profiling payload produced by earlier cells."""
    for name in (
        "results_payload",
        "profiling_payload",
        "profile_payload",
        "profiling_results",
        "profile_results",
        "results",
    ):
        if name in globals():
            return globals()[name]
    return None


def _resolve_payload_reference() -> str | None:
    for candidate in (
        globals().get("persisted_payload_path"),
        globals().get("payload_reference"),
        globals().get("payload_location"),
        globals().get("payload_artifact_path"),
        payload_path,
        raw_payload_path,
    ):
        if isinstance(candidate, str):
            normalized = candidate.strip()
            if normalized:
                return normalized
    return None


def _resolve_callback_behavior() -> str:
    """Normalize the ``callback_behavior`` widget to a canonical behavior.

    Returns one of ``none``, ``inline``, ``artifact``, or ``metadata_only``.
    A blank or unrecognized value yields ``DEFAULT_CALLBACK_BEHAVIOR``.
    """
    requested = (dbutils.widgets.get("callback_behavior") or DEFAULT_CALLBACK_BEHAVIOR).strip().lower()
    synonym_groups = (
        ("none", ("none", "disabled", "off")),
        ("inline", ("inline", "payload", "inline_payload", "payload_inline")),
        ("artifact", ("artifact", "artifact_only")),
        ("metadata_only", ("metadata", "metadata_only")),
    )
    for canonical, synonyms in synonym_groups:
        if requested in synonyms:
            return canonical
    return DEFAULT_CALLBACK_BEHAVIOR


def _resolve_callback_target() -> str | None:
    if callback_url:
        return callback_url
    if callback_base_url and profile_run_id:
        base = callback_base_url.rstrip("/")
        return f"{base}/{profile_run_id}"
    return None


def _post_callback(target_url: str, payload: dict[str, Any]) -> dict[str, Any]:
    """POST ``payload`` as JSON to ``target_url`` and summarize the response.

    A bearer ``Authorization`` header is added when ``callback_token`` is set.
    The request uses a 30-second timeout, and HTTP error statuses raise via
    ``raise_for_status()``.

    Returns:
        A dict with the response ``status_code`` and the first 512 characters
        of the response ``text``.
    """
    auth_header = {"Authorization": f"Bearer {callback_token}"} if callback_token else {}
    reply = requests.post(
        target_url,
        headers={"Content-Type": "application/json", **auth_header},
        json=payload,
        timeout=30,
    )
    reply.raise_for_status()
    return {"status_code": reply.status_code, "text": reply.text[:512]}


def _post_callback_with_retry(target_url: str, payload: dict[str, Any]) -> dict[str, Any]:
    """Post the callback, retrying once against the canonical Heroku host.

    The retry happens only when the first attempt fails with what looks like a
    DNS failure and the URL can be rewritten to a different canonical host.
    Any other request error, and any error from the retry itself, propagates.
    """
    try:
        return _post_callback(target_url, payload)
    except requests.RequestException as exc:
        canonical_url = _rewrite_heroku_app_host(target_url) if _looks_like_dns_failure(exc) else None
        if not canonical_url or canonical_url == target_url:
            raise
        print(f"Retrying callback via canonical host: {canonical_url}")
        return _post_callback(canonical_url, payload)


# Resolve the storage mode, the payload produced by earlier cells, and its summary.
resolved_storage_mode = _resolve_payload_storage_mode()
results_payload = _resolve_results_payload()
results_summary = _extract_profile_summary(results_payload) if results_payload is not None else {}
status = _first_non_empty(results_summary.get("status"), results_summary.get("state"), "completed")
payload_reference = _resolve_payload_reference()
# NOTE(review): _resolve_payload_reference() already considers payload_path, so
# when it returned None this fallback looks like it can only assign a
# blank/None value — confirm whether it is still needed.
if not payload_reference and _payload_storage_is_artifact(resolved_storage_mode):
    payload_reference = payload_path

# Metadata is written before the callback is attempted.
_persist_results_to_metadata(results_payload, payload_reference)

callback_behavior = _resolve_callback_behavior()
callback_target = _resolve_callback_target()
callback_result = None

# Notify the backend unless callbacks are disabled or no target is configured.
if callback_target and callback_behavior != "none":
    callback_payload: dict[str, Any] = {
        "profile_run_id": profile_run_id,
        "table_group_id": table_group_id,
        "status": status,
        "payload_reference": payload_reference,
        "payload_storage_mode": resolved_storage_mode,
        "metadata_schema": dq_schema,
        "summary": results_summary,
        "databricks_run_id": _resolve_databricks_run_id(),
    }
    # The full results are attached only for the "inline" behavior.
    if callback_behavior == "inline" and results_payload is not None:
        callback_payload["results"] = results_payload
    try:
        callback_result = _post_callback_with_retry(callback_target, callback_payload)
    except Exception as exc:  # pragma: no cover - surface callback errors without stopping metadata persistence
        # The failure is recorded and printed instead of being re-raised.
        callback_result = {"error": str(exc), "target": callback_target}
        print(f"Callback to {callback_target} failed: {exc}")

# Summary of the run, printed as JSON at the end of the cell.
FINALIZATION_CONTEXT = {
    "profile_run_id": profile_run_id,
    "table_group_id": table_group_id,
    "status": status,
    "payload_reference": payload_reference,
    "payload_storage_mode": resolved_storage_mode,
    "callback_behavior": callback_behavior,
    "callback_target": callback_target,
    "callback_result": callback_result,
}

print(json.dumps(FINALIZATION_CONTEXT, indent=2, sort_keys=True))