In [0]:
# Databricks notebook source
%pip install -U langchain==0.3.7 langgraph==0.5.3 langchain_community langchain-databricks

Collecting langchain==0.3.7
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting langgraph==0.5.3
  Downloading langgraph-0.5.3-py3-none-any.whl.metadata (6.9 kB)
Collecting langchain_community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-databricks
  Downloading langchain_databricks-0.1.2-py3-none-any.whl.metadata (3.3 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain==0.3.7)
  Downloading sqlalchemy-2.0.45-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl.metadata (9.5 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain==0.3.7)
  Downloading aiohttp-3.13.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl.metadata (8.1 kB)
Collecting langchain-core<0.4.0,>=0.3.15 (from langchain==0.3.7)
  Downloading langchain_core-0.3.83-py3-none-any.whl.metadata (3.2 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain==0.3.7)
  Downloading 

In [0]:
dbutils.library.restartPython()

### Configs

In [0]:
from langchain_community.chat_models import ChatDatabricks
from langchain.schema import HumanMessage


In [0]:

# Endpoint name handed to ChatDatabricks below.
# NOTE(review): LLM_ENDPOINT_NAME and llm are both assigned again in a later
# cell, where ChatDatabricks is imported from langchain_databricks instead of
# langchain_community; on a top-to-bottom run the later assignment is the one
# the judge cells use.
LLM_ENDPOINT_NAME = "databricks-meta-llama-3-1-8b-instruct" 

# Chat client used for all LLM calls; temperature is set to 0.2.
llm = ChatDatabricks(
    endpoint=LLM_ENDPOINT_NAME,
    temperature=0.2,
)

  llm = ChatDatabricks(


In [0]:
import time
from typing import Any, Callable, Tuple

In [0]:
# Cell 2 — Exact-match "accuracy" baseline

def evaluate_response_accuracy(agent_output: str, expected_output: str) -> float:
    """Score a response by exact match (baseline metric).

    Both strings are stripped of surrounding whitespace and lower-cased
    before being compared.

    Returns:
        1.0 when the normalized strings are identical, otherwise 0.0.
    """
    produced = agent_output.strip().lower()
    wanted = expected_output.strip().lower()
    if produced == wanted:
        return 1.0
    return 0.0

# Quick sanity test
# The two sentences state the same fact with a different word order, so the
# exact-match baseline scores them 0.0.
agent_response = "The capital of France is Paris."
ground_truth = "Paris is the capital of France."
score = evaluate_response_accuracy(agent_response, ground_truth)

print(f"Response accuracy (exact match baseline): {score}")

Response accuracy (exact match baseline): 0.0


In [0]:
def timed_agent_action(agent_function: Callable, *args, **kwargs) -> Tuple[Any, float]:
    """Call ``agent_function`` and measure its wall-clock latency.

    Args:
        agent_function: Any callable. Plain functions are reported by their
            ``__name__``; callables without one (callable instances,
            ``functools.partial`` objects, ...) are reported by their ``repr``.
        *args: Positional arguments forwarded to ``agent_function``.
        **kwargs: Keyword arguments forwarded to ``agent_function``.

    Returns:
        A ``(result, latency_ms)`` tuple: the callable's return value and the
        elapsed time in milliseconds, measured with ``time.perf_counter``.

    Raises:
        Whatever ``agent_function`` raises; in that case nothing is printed.
    """
    # Not every callable has __name__, so resolve a label without assuming it.
    action_name = getattr(agent_function, "__name__", None) or repr(agent_function)

    start_time = time.perf_counter()
    result = agent_function(*args, **kwargs)
    latency_ms = (time.perf_counter() - start_time) * 1000

    print(f"Action '{action_name}' took {latency_ms:.2f} ms")
    return result, latency_ms

# Dummy agent/tool simulation
def simulated_tool_call(query: str) -> str:
    """Stand-in for a real tool call: sleeps briefly, then returns a canned string.

    The 0.15 s sleep imitates I/O or model latency so the timing helper has
    something measurable to report.
    """
    simulated_delay_s = 0.15
    time.sleep(simulated_delay_s)
    return "Result for '{}'".format(query)

# Time the simulated tool; the measured latency is dominated by its 0.15 s sleep.
result, latency = timed_agent_action(simulated_tool_call, "get weather")
print(f"Tool call result: {result}")


Action 'simulated_tool_call' took 150.08 ms
Tool call result: Result for 'get weather'


In [0]:
# Cell 4 — Simple interaction monitor (placeholder token counting)

class LLMInteractionMonitor:
    """
    Running tally of approximate prompt/response token usage.

    The "token" counts are whitespace-separated word counts, used purely as a
    placeholder. Accurate numbers should come from the LLM API's usage fields
    or from a real tokenizer.
    """

    def __init__(self):
        # Cumulative totals across every recorded interaction.
        self.total_input_tokens = 0
        self.total_output_tokens = 0

    def record_interaction(self, prompt: str, response: str):
        """Add one prompt/response pair to the totals and print its counts."""
        n_in = len(prompt.split())      # placeholder: words, not tokens
        n_out = len(response.split())   # placeholder: words, not tokens

        self.total_input_tokens += n_in
        self.total_output_tokens += n_out

        summary = "Recorded interaction: " + f"input_tokens≈{n_in}, output_tokens≈{n_out}"
        print(summary)

    def get_total_tokens(self):
        """Return the cumulative ``(input_tokens, output_tokens)`` pair."""
        totals = (self.total_input_tokens, self.total_output_tokens)
        return totals


# Example usage
# Counts below are whitespace-separated word counts, so "France?" and "Paris."
# each count as a single token.
monitor = LLMInteractionMonitor()
monitor.record_interaction("What is the capital of France?", "The capital of France is Paris.")
monitor.record_interaction(
    "Tell me a joke.",
    "Why don't scientists trust atoms? Because they make up everything!"
)

# Totals accumulate across both recorded interactions.
input_t, output_t = monitor.get_total_tokens()
print(f"Total input tokens≈{input_t}, Total output tokens≈{output_t}")

Recorded interaction: input_tokens≈6, output_tokens≈6
Recorded interaction: input_tokens≈4, output_tokens≈10
Total input tokens≈10, Total output tokens≈16


In [0]:
import re

def normalize_text(text: str) -> str:
    """Lower-case ``text``, drop punctuation, and collapse whitespace.

    Letters and digits from any script are kept (``\\w`` is Unicode-aware for
    ``str`` patterns), so accented words such as "Zürich" or "Genève" survive
    intact. Everything that is neither a word character nor whitespace, and
    the underscore, is replaced by a space.

    Args:
        text: Input string; ``None`` or an empty string yields ``""``.

    Returns:
        The normalized string with single spaces between tokens and no
        leading or trailing whitespace.
    """
    lowered = (text or "").lower()
    # "_" is part of \w but is treated as punctuation here.
    without_punct = re.sub(r"[^\w\s]|_", " ", lowered)
    # Strip last: removing trailing punctuation leaves a space behind.
    return re.sub(r"\s+", " ", without_punct).strip()

def jaccard_similarity(a: str, b: str) -> float:
    """Jaccard index of the two strings' token sets.

    Each string is passed through ``normalize_text`` and split on whitespace;
    the score is ``|A ∩ B| / |A ∪ B|`` over the resulting sets.

    Returns:
        1.0 when both token sets are empty, 0.0 when the sets share no token
        (including when exactly one is empty), otherwise the overlap ratio.
    """
    left = set(normalize_text(a).split())
    right = set(normalize_text(b).split())

    combined = left | right
    if not combined:
        # Both inputs normalized to nothing: treat as identical.
        return 1.0

    shared = left & right
    if not shared:
        return 0.0
    return len(shared) / len(combined)

def evaluate_response_accuracy_jaccard(agent_output: str, expected_output: str, threshold: float = 0.6) -> float:
    """
    Binary accuracy based on token-set overlap.

    Computes ``jaccard_similarity`` between the two strings and returns 1.0
    when it is greater than or equal to ``threshold`` (default 0.6), else 0.0.
    """
    similarity = jaccard_similarity(agent_output, expected_output)
    if similarity >= threshold:
        return 1.0
    return 0.0


# quick check
# Same sentence pair as the exact-match sanity test: the token sets are
# identical, so the raw Jaccard score and the thresholded score are both 1.0.
print("Jaccard:", jaccard_similarity("The capital of France is Paris.", "Paris is the capital of France."))
print("Binary:", evaluate_response_accuracy_jaccard("The capital of France is Paris.", "Paris is the capital of France."))

Jaccard: 1.0
Binary: 1.0


In [0]:

import json
import logging
from typing import Optional, Dict, Any

from langchain_databricks import ChatDatabricks
from langchain.schema import SystemMessage, HumanMessage

# Configure root logging at INFO with a timestamp/level prefix; the judge
# class logs through the module-level logging.* functions.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# NOTE(review): LLM_ENDPOINT_NAME and llm were already assigned in an earlier
# cell; this cell rebinds both, now using the ChatDatabricks class imported
# from langchain_databricks in this cell.
LLM_ENDPOINT_NAME = "databricks-meta-llama-3-1-8b-instruct"

llm = ChatDatabricks(
    endpoint=LLM_ENDPOINT_NAME,
    temperature=0.2,
)

print("✅ Databricks LLM configured:", LLM_ENDPOINT_NAME)

✅ Databricks LLM configured: databricks-meta-llama-3-1-8b-instruct


  llm = ChatDatabricks(


In [0]:
# System prompt for the LLM judge: five criteria, each described on a 1-5
# scale, followed by a strict JSON output contract.
# The keys listed under "Output Format" are the same keys that
# REQUIRED_SCHEMA checks and that the repair prompt in judge_with_repair
# repeats; keep all three in sync when changing the contract.
LEGAL_SURVEY_RUBRIC = """
You are an expert legal survey methodologist and a critical legal reviewer. Your task is to evaluate the quality of a given legal survey question.

Provide a score from 1 to 5 for overall quality, along with a detailed rationale and specific feedback.
Focus on the following criteria:

1. **Clarity & Precision (Score 1-5):**
   - 1: Extremely vague, highly ambiguous, or confusing.
   - 3: Moderately clear, but could be more precise.
   - 5: Perfectly clear, unambiguous, and precise in its legal terminology (if applicable) and intent.

2. **Neutrality & Bias (Score 1-5):**
   - 1: Highly leading or biased, clearly influencing the respondent towards a specific answer.
   - 3: Slightly suggestive or could be interpreted as leading.
   - 5: Completely neutral, objective, and free from any leading language or loaded terms.

3. **Relevance & Focus (Score 1-5):**
   - 1: Irrelevant to the stated survey topic or out of scope.
   - 3: Loosely related but could be more focused.
   - 5: Directly relevant to the survey's objectives and well-focused on a single concept.

4. **Completeness (Score 1-5):**
   - 1: Omits critical information needed to answer accurately or provides insufficient context.
   - 3: Mostly complete, but minor details are missing.
   - 5: Provides all necessary context and information for the respondent to answer thoroughly.

5. **Appropriateness for Audience (Score 1-5):**
   - 1: Uses jargon inaccessible to the target audience or is overly simplistic for experts.
   - 3: Generally appropriate, but some terms might be challenging or oversimplified.
   - 5: Perfectly tailored to the assumed legal knowledge and background of the target survey audience.

**Output Format (STRICT):**
Return ONLY a valid JSON object with EXACTLY these keys:
- overall_score (integer 1-5)
- rationale (string)
- detailed_feedback (array of strings; each bullet covers one criterion)
- concerns (array of strings)
- recommended_action (string)

No markdown. No extra text.
""".strip()

print("✅ Rubric loaded (strict JSON format enforced)")

✅ Rubric loaded (strict JSON format enforced)


In [0]:
# Cell 3 — LLM Judge class (Databricks-native, strict JSON handling)

class LLMJudgeForLegalSurvey:
    """
    Evaluates legal survey questions using a Databricks-hosted chat LLM.

    The rubric in ``LEGAL_SURVEY_RUBRIC`` is sent as the system message and the
    question as the human message. The reply is expected to be a single JSON
    object; minor formatting noise around that object (markdown fences,
    leading or trailing chatter) is tolerated.
    """

    def __init__(self, llm: "ChatDatabricks"):
        """Store the chat model; it must expose ``invoke(messages)``."""
        self.llm = llm

    def _build_messages(self, survey_question: str):
        """Build the [system rubric, human question] message list for one evaluation."""
        return [
            SystemMessage(content=LEGAL_SURVEY_RUBRIC),
            HumanMessage(
                content=(
                    "LEGAL SURVEY QUESTION TO EVALUATE:\n"
                    f"{survey_question}\n\n"
                    "Remember: return ONLY valid JSON with the specified keys."
                )
            ),
        ]

    @staticmethod
    def _parse_json_object(raw_text: str) -> Optional[Dict[str, Any]]:
        """Return the JSON object contained in ``raw_text``, or ``None``.

        Only a JSON *object* counts: arrays, strings and numbers parse as valid
        JSON but are not a judgment, so they are rejected.
        """
        # First attempt: the whole reply is the JSON document.
        try:
            parsed = json.loads(raw_text)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed

        # Fallback: decode the object that starts at the first "{" and ignore
        # whatever follows it, so trailing text (even text containing "}")
        # cannot break the parse.
        start = raw_text.find("{")
        if start == -1:
            return None
        try:
            candidate, _ = json.JSONDecoder().raw_decode(raw_text[start:])
        except json.JSONDecodeError:
            return None
        return candidate if isinstance(candidate, dict) else None

    def judge(self, survey_question: str) -> Optional[Dict[str, Any]]:
        """Ask the LLM to evaluate ``survey_question``.

        Returns:
            The parsed judgment as a dict, or ``None`` when the request fails
            or no JSON object can be recovered from the reply. Failures are
            logged, not raised.
        """
        messages = self._build_messages(survey_question)

        try:
            logging.info("Sending survey question to Databricks LLM for evaluation...")
            response = self.llm.invoke(messages)
            raw_text = (response.content or "").strip()

            parsed = self._parse_json_object(raw_text)
            if parsed is None:
                logging.error("JSON parsing failed. Raw response:\n%s", raw_text)
            return parsed

        except Exception as e:
            # Best-effort by design: request or response-shape errors must not
            # stop the notebook; the caller sees None.
            logging.error("Unexpected error during LLM judgment: %s", e)
            return None


# Instantiate the judge
# Later cells use this module-level `judge` directly, including its
# _build_messages helper.
judge = LLMJudgeForLegalSurvey(llm)
print("✅ LLM Judge initialized")

✅ LLM Judge initialized


In [0]:
# Example 1: a scoped question with an explicit assumption and a five-point
# agree/disagree scale.
good_legal_survey_question = """
To what extent do you agree or disagree that current intellectual property laws in Switzerland
adequately protect emerging AI-generated content, assuming the content meets the originality
criteria established by the Federal Supreme Court?
(Select one: Strongly Disagree, Disagree, Neutral, Agree, Strongly Agree)
""".strip()

# Example 2: a leading question ("Don't you agree...") with loaded wording
# ("overly restrictive") and only a Yes/No choice.
biased_legal_survey_question = """
Don't you agree that overly restrictive data privacy laws like the FADP are hindering essential
technological innovation and economic growth in Switzerland?
(Select one: Yes, No)
""".strip()

# Example 3: an open-ended question with no scope or answer format.
vague_legal_survey_question = """
What are your thoughts on legal tech?
""".strip()

print("✅ Example survey questions prepared")


✅ Example survey questions prepared


In [0]:
# Cell 5 — Run evaluations + pretty-print JSON

def pretty_print(label: str, result: Optional[Dict[str, Any]]):
    """Print ``label`` between two 80-character rules, followed by the judgment.

    A ``None`` result is reported as a failure line; any other result is
    dumped as JSON with a two-space indent.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(label)
    print(rule)

    if result is not None:
        print(json.dumps(result, indent=2))
        return
    print("❌ No judgment returned (JSON parse failed or request error).")

# Each line makes one LLM request through judge.judge; a None result (request
# or parse failure) is rendered by pretty_print as a failure message.
pretty_print("Evaluating GOOD legal survey question", judge.judge(good_legal_survey_question))
pretty_print("Evaluating BIASED legal survey question", judge.judge(biased_legal_survey_question))
pretty_print("Evaluating VAGUE legal survey question", judge.judge(vague_legal_survey_question))


2026-01-16 20:40:08,135 - INFO - Sending survey question to Databricks LLM for evaluation...
2026-01-16 20:40:09,892 - INFO - Sending survey question to Databricks LLM for evaluation...



Evaluating GOOD legal survey question
{
  "overall_score": 4,
  "rationale": "The question is well-structured and clear, but it could benefit from a slight adjustment to ensure neutrality.",
  "detailed_feedback": [
    "Clarity & Precision: 5 - The question is perfectly clear and precise in its legal terminology and intent.",
    "Neutrality & Bias: 4 - The question is mostly neutral, but the assumption that the content meets the originality criteria might slightly influence the respondent's answer.",
    "Relevance & Focus: 5 - The question is directly relevant to the survey's objectives and well-focused on a single concept.",
    "Completeness: 5 - The question provides all necessary context and information for the respondent to answer thoroughly.",
    "Appropriateness for Audience: 5 - The question is perfectly tailored to the assumed legal knowledge and background of the target survey audience."
  ],
  "concerns": [
    "The assumption that the content meets the originality crit

2026-01-16 20:40:11,819 - INFO - Sending survey question to Databricks LLM for evaluation...



Evaluating BIASED legal survey question
{
  "overall_score": 2,
  "rationale": "The question has some issues with clarity, neutrality, and relevance.",
  "detailed_feedback": [
    "Clarity & Precision: The question is somewhat ambiguous due to the use of 'overly restrictive' and 'hindering essential technological innovation and economic growth', which could be interpreted in different ways.",
    "Neutrality & Bias: The question is leading and biased towards a negative view of the FADP, using words like 'hindering' and 'overly restrictive', which may influence respondents to choose 'Yes'.",
    "Relevance & Focus: The question is somewhat off-topic, as it focuses on the impact of the FADP on innovation and economic growth, rather than the data privacy laws themselves.",
    "Completeness: The question lacks context and information about what is meant by 'essential technological innovation and economic growth', making it difficult for respondents to answer accurately.",
    "Appropria

In [0]:
# Cell 6 — Strict schema validation for judge output

from typing import List

# Required top-level keys of a judgment and the Python type each value must
# have after json.loads.
REQUIRED_SCHEMA = {
    "overall_score": int,
    "rationale": str,
    "detailed_feedback": list,
    "concerns": list,
    "recommended_action": str,
}

def validate_judge_output(result: Dict[str, Any]) -> List[str]:
    """
    Validates the LLM judge output against the required schema.

    Checks performed:
      * ``result`` is a dict (a JSON object);
      * every key in ``REQUIRED_SCHEMA`` is present with the expected type,
        where a JSON boolean is NOT accepted for an integer field;
      * ``overall_score`` is an integer in the range 1..5;
      * ``detailed_feedback`` and ``concerns``, when they are lists, contain
        only strings.

    Args:
        result: Parsed judge output. Non-dict values are reported as an error
            instead of raising.

    Returns:
        A list of human-readable validation errors (empty if valid).
    """
    if not isinstance(result, dict):
        return [f"Judge output must be a JSON object, got {type(result).__name__}"]

    errors: List[str] = []

    for key, expected_type in REQUIRED_SCHEMA.items():
        if key not in result:
            errors.append(f"Missing required key: '{key}'")
            continue

        value = result[key]
        # bool is a subclass of int, so JSON true/false would otherwise pass
        # as an integer score.
        bool_as_int = expected_type is int and isinstance(value, bool)
        if bool_as_int or not isinstance(value, expected_type):
            errors.append(
                f"Key '{key}' has wrong type: "
                f"expected {expected_type.__name__}, got {type(value).__name__}"
            )

    # Additional semantic checks
    if "overall_score" in result:
        score = result["overall_score"]
        is_real_int = isinstance(score, int) and not isinstance(score, bool)
        if not is_real_int or not (1 <= score <= 5):
            errors.append("overall_score must be an integer between 1 and 5")

    for list_key in ("detailed_feedback", "concerns"):
        items = result.get(list_key)
        # Only inspect elements of real lists: a wrong container type is
        # already reported above, and iterating e.g. an int would raise.
        if isinstance(items, list) and not all(isinstance(x, str) for x in items):
            errors.append(f"{list_key} must be a list of strings")

    return errors


# Example validation run
# judge.judge returns None on request/parse failure; the truthiness test below
# maps that (and an empty dict) to a single "No result" error.
sample_result = judge.judge(good_legal_survey_question)
validation_errors = validate_judge_output(sample_result) if sample_result else ["No result"]

# Header line depends on whether any errors were found; each error is then
# listed as a bullet.
print("Validation errors:" if validation_errors else "✅ Output is schema-valid")
for err in validation_errors:
    print("-", err)

2026-01-16 20:40:35,704 - INFO - Sending survey question to Databricks LLM for evaluation...


✅ Output is schema-valid


In [0]:
# Cell 7 — Auto-repair + retry when JSON is invalid (Databricks-native)

import time

def timed_invoke(messages):
    """Invoke the module-level ``llm`` with ``messages`` and time the call.

    Returns:
        ``(response, latency_ms)`` where the latency is measured with
        ``time.perf_counter`` and expressed in milliseconds.
    """
    t0 = time.perf_counter()
    response = llm.invoke(messages)
    t1 = time.perf_counter()
    elapsed_ms = (t1 - t0) * 1000
    return response, elapsed_ms


def _extract_json_object(text: str) -> Optional[Dict[str, Any]]:
    text = (text or "").strip()
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start = text.find("{")
        end = text.rfind("}")
        if start != -1 and end != -1 and end > start:
            try:
                return json.loads(text[start : end + 1])
            except json.JSONDecodeError:
                return None
        return None


def judge_with_repair(
    survey_question: str,
    max_attempts: int = 2,
    print_debug: bool = True,
) -> Optional[Dict[str, Any]]:
    """
    Judge a survey question, asking the model to repair its output if needed.

    1) Attempt 1 is a normal judge call: messages come from the module-level
       ``judge._build_messages`` and are sent through ``timed_invoke``.
    2) If the reply cannot be parsed by ``_extract_json_object`` or fails
       ``validate_judge_output``, a repair prompt is sent that contains the
       rubric, the question and the previous raw output, asking for corrected
       JSON only. Repair passes are numbered from 2 up to ``max_attempts``, so
       the default of 2 allows exactly one repair pass and a value below 2
       disables repair.

    Args:
        survey_question: The question text to evaluate.
        max_attempts: Total number of model calls allowed (initial + repairs).
        print_debug: When True, print status, latency, validation errors and
            up to 1200 characters of any unparseable reply.

    Returns:
        The first parsed output that passes schema validation, or ``None`` if
        every attempt fails.

    Note:
        Exceptions raised by the model call are not caught here.
    """
    # Attempt 1: normal judge
    messages = judge._build_messages(survey_question)  # reuse existing builder
    resp, latency_ms = timed_invoke(messages)
    raw = (resp.content or "").strip()
    parsed = _extract_json_object(raw)

    if parsed is not None:
        errs = validate_judge_output(parsed)
        if not errs:
            if print_debug:
                print(f"✅ Judge OK (attempt 1). Latency: {latency_ms:.2f} ms")
            return parsed
        # Parsed but schema-invalid: fall through to the repair loop.
        if print_debug:
            print(f"⚠️ Schema invalid (attempt 1). Latency: {latency_ms:.2f} ms")
            for e in errs:
                print("-", e)
    else:
        if print_debug:
            print(f"⚠️ JSON parse failed (attempt 1). Latency: {latency_ms:.2f} ms")
            print("Raw response:\n", raw[:1200])

    # Repair attempts
    attempt = 2
    while attempt <= max_attempts:
        # NOTE(review): this f-string has no placeholders, and the validation
        # errors found above are not included in the repair prompt; the model
        # only sees its previous raw output.
        repair_instructions = f"""
You previously attempted to evaluate a legal survey question, but your output was INVALID.

Your job now:
- Output ONLY a valid JSON object (no markdown, no extra text).
- The JSON must contain EXACTLY these keys with correct types:
  - overall_score (integer 1-5)
  - rationale (string)
  - detailed_feedback (array of strings)
  - concerns (array of strings)
  - recommended_action (string)

Fix any formatting issues and ensure the schema is correct.
"""

        # The rubric stays as the system message; the human message carries the
        # repair instructions, the question, and the most recent invalid reply.
        repair_messages = [
            SystemMessage(content=LEGAL_SURVEY_RUBRIC),
            HumanMessage(
                content=(
                    f"{repair_instructions}\n\n"
                    "LEGAL SURVEY QUESTION:\n"
                    f"{survey_question}\n\n"
                    "YOUR PREVIOUS (INVALID) OUTPUT:\n"
                    f"{raw}\n\n"
                    "Return ONLY valid JSON."
                )
            ),
        ]

        resp2, latency_ms2 = timed_invoke(repair_messages)
        raw2 = (resp2.content or "").strip()
        parsed2 = _extract_json_object(raw2)

        if parsed2 is not None:
            errs2 = validate_judge_output(parsed2)
            if not errs2:
                if print_debug:
                    print(f"✅ Repair OK (attempt {attempt}). Latency: {latency_ms2:.2f} ms")
                return parsed2
            if print_debug:
                print(f"⚠️ Repair schema still invalid (attempt {attempt}). Latency: {latency_ms2:.2f} ms")
                for e in errs2:
                    print("-", e)
        else:
            if print_debug:
                print(f"⚠️ Repair JSON parse failed (attempt {attempt}). Latency: {latency_ms2:.2f} ms")
                print("Raw repair response:\n", raw2[:1200])

        # Feed the latest failed reply into the next repair prompt.
        raw = raw2
        attempt += 1

    # Every attempt produced unparseable or schema-invalid output.
    return None


# Example: run with repair enabled
# max_attempts=2 allows the initial call plus at most one repair pass.
# NOTE(review): `result` was also assigned by the earlier timing demo cell; this rebinds it.
result = judge_with_repair(good_legal_survey_question, max_attempts=2, print_debug=True)
pretty_print("GOOD question (judge_with_repair)", result)

✅ Judge OK (attempt 1). Latency: 1771.06 ms

GOOD question (judge_with_repair)
{
  "overall_score": 4,
  "rationale": "The question is well-structured and clear, but it could benefit from a slight adjustment to ensure neutrality.",
  "detailed_feedback": [
    "Clarity & Precision: 5 - The question is perfectly clear and precise in its legal terminology and intent.",
    "Neutrality & Bias: 4 - The question is mostly neutral, but the phrase 'adequately protect' could be interpreted as slightly leading.",
    "Relevance & Focus: 5 - The question is directly relevant to the survey's objectives and well-focused on a single concept.",
    "Completeness: 5 - The question provides all necessary context and information for the respondent to answer thoroughly.",
    "Appropriateness for Audience: 5 - The question is perfectly tailored to the assumed legal knowledge and background of the target survey audience."
  ],
  "concerns": [
    "The use of 'adequately protect' might influence responden

In [0]:
# Cell 8 — Add observability: latency + (approx) token counts + structured record

import uuid
from datetime import datetime

def approx_tokens(text: str) -> int:
    """Rough token estimate: the number of whitespace-separated words.

    This is a placeholder; swap in real token usage if the serving stack
    exposes it. ``None`` and empty strings count as 0.
    """
    if not text:
        return 0
    words = text.split()
    return len(words)

def run_judge_observed(survey_question: str) -> Dict[str, Any]:
    """Run one judge evaluation and return a structured observability record.

    The first model call is timed and its raw reply is parsed and validated.
    If the reply is unparseable or schema-invalid, ``judge_with_repair`` is
    called (debug printing off) and its result becomes the final output.

    Returns:
        A dict with a trace id, UTC timestamp, endpoint name, latency of the
        first call, character and approximate token counts for prompt and
        reply, whether the repair path was taken, the schema errors of the
        first reply (``None`` if it could not be parsed), the final judge
        output, and the first raw reply capped at 2000 characters.
    """
    trace_id = str(uuid.uuid4())
    timestamp_utc = datetime.utcnow().isoformat() + "Z"

    # First (timed) call to the model.
    prompt_messages = judge._build_messages(survey_question)
    response, latency_ms = timed_invoke(prompt_messages)
    raw_output = (response.content or "").strip()

    # Decide whether the first reply is usable as-is.
    first_pass = _extract_json_object(raw_output)
    schema_errors = None if first_pass is None else validate_judge_output(first_pass)
    needs_repair = first_pass is None or bool(schema_errors)

    if needs_repair:
        judge_output = judge_with_repair(survey_question, max_attempts=2, print_debug=False)
    else:
        judge_output = first_pass

    prompt_texts = [message.content for message in prompt_messages]
    return {
        "trace_id": trace_id,
        "timestamp_utc": timestamp_utc,
        "endpoint": LLM_ENDPOINT_NAME,
        "latency_ms": latency_ms,
        "prompt_chars": sum(len(text) for text in prompt_texts),
        "response_chars": len(raw_output),
        "prompt_tokens_approx": sum(approx_tokens(text) for text in prompt_texts),
        "response_tokens_approx": approx_tokens(raw_output),
        "repaired": needs_repair,
        "initial_schema_errors": schema_errors,
        "judge_output": judge_output,
        "raw_output": raw_output[:2000],  # keep capped for logs
    }


# Example observed run
# The record contains only JSON-serializable values, so it can be dumped directly.
observed = run_judge_observed(good_legal_survey_question)
print(json.dumps(observed, indent=2))


{
  "trace_id": "ad305c37-8257-4d30-82cc-75955feefbc0",
  "timestamp_utc": "2026-01-16T20:42:01.856302Z",
  "endpoint": "databricks-meta-llama-3-1-8b-instruct",
  "latency_ms": 1556.6365420017974,
  "prompt_chars": 2393,
  "response_chars": 1310,
  "prompt_tokens_approx": 359,
  "response_tokens_approx": 171,
  "repaired": false,
  "initial_schema_errors": [],
  "judge_output": {
    "overall_score": 4,
    "rationale": "The question is well-structured and clear, but it could benefit from a more precise definition of 'emerging AI-generated content' to ensure respondents understand the scope.",
    "detailed_feedback": [
      "Clarity & Precision: The question is mostly clear, but the term 'emerging AI-generated content' could be more precisely defined.",
      "Neutrality & Bias: The question is generally neutral, but the assumption that the content meets the originality criteria might subtly influence respondents.",
      "Relevance & Focus: The question is highly relevant to the sur