In [0]:
# Databricks notebook source
%pip install -U langchain==0.3.7 langgraph==0.5.3 langchain_community langchain-databricks

Collecting langchain==0.3.7
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting langgraph==0.5.3
  Downloading langgraph-0.5.3-py3-none-any.whl.metadata (6.9 kB)
Collecting langchain_community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-databricks
  Downloading langchain_databricks-0.1.2-py3-none-any.whl.metadata (3.3 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain==0.3.7)
  Downloading sqlalchemy-2.0.45-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl.metadata (9.5 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain==0.3.7)
  Downloading aiohttp-3.13.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl.metadata (8.1 kB)
Collecting langchain-core<0.4.0,>=0.3.15 (from langchain==0.3.7)
  Downloading langchain_core-0.3.83-py3-none-any.whl.metadata (3.2 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain==0.3.7)
  Downloading 

In [0]:
dbutils.library.restartPython()

### Configs

In [0]:
# Cell 1 — Databricks setup + imports + LLM config (no OpenAI)

import json
import re
from typing import Optional, Dict, Any, List

from langchain_databricks import ChatDatabricks
from langchain.schema import SystemMessage, HumanMessage

LLM_ENDPOINT_NAME = "databricks-meta-llama-3-1-8b-instruct"

llm = ChatDatabricks(
    endpoint=LLM_ENDPOINT_NAME,
    temperature=0.2,
)

print("✅ Databricks LLM configured:", LLM_ENDPOINT_NAME)


✅ Databricks LLM configured: databricks-meta-llama-3-1-8b-instruct


  llm = ChatDatabricks(


In [0]:
# Cell 2 — Databricks-native LLM query helper + robust JSON extraction

def query_model(system_prompt: str, prompt: str) -> str:
    """
    Databricks-native chat call using ChatDatabricks.
    Returns the raw text content.
    """
    resp = llm.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=prompt),
    ])
    return (resp.content or "").strip()


def extract_json_from_review_text(text: str) -> Optional[Dict[str, Any]]:
    """
    Tries to parse the JSON inside the model's response.
    The prompt asks for:
      THOUGHT: ...
      REVIEW JSON:
      ```json
      { ... }
      ```
    We robustly extract the first JSON object found.
    """
    if not text:
        return None

    # Prefer fenced ```json blocks
    m = re.search(r"```json\s*(\{.*?\})\s*```", text, flags=re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1))
        except json.JSONDecodeError:
            pass

    # Fallback: take the first {...} span
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end != -1 and end > start:
        candidate = text[start:end+1]
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return None

    return None


print("✅ Helpers ready: query_model(), extract_json_from_review_text()")

✅ Helpers ready: query_model(), extract_json_from_review_text()


In [0]:
# --------------------------------------------------------------------
# LLM QUERY + JSON EXTRACTION
# --------------------------------------------------------------------

def query_model(system_prompt: str, prompt: str) -> str:
    """Databricks-native chat call."""
    resp = llm.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=prompt),
    ])
    return (resp.content or "").strip()


def extract_json_from_review_text(text: str) -> Optional[Dict[str, Any]]:
    """Extract JSON block from reviewer output."""
    if not text:
        return None

    # Prefer fenced JSON
    match = re.search(r"```json\s*(\{.*?\})\s*```", text, flags=re.DOTALL)
    if match:
        try:
            return json.loads(match.group(1))
        except json.JSONDecodeError:
            pass

    # Fallback: first {...}
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end != -1 and end > start:
        try:
            return json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            return None

    return None

In [0]:
# --------------------------------------------------------------------
# SCORING PROMPT
# --------------------------------------------------------------------

TEMPLATE_INSTRUCTIONS = """
Respond in the following format:

THOUGHT:
<THOUGHT>

REVIEW JSON:
```json
<JSON>
```

In <JSON>, include the following fields exactly:
- Summary
- Strengths
- Weaknesses
- Originality (1–4)
- Quality (1–4)
- Clarity (1–4)
- Significance (1–4)
- Questions
- Limitations
- Ethical Concerns (boolean)
- Soundness (1–4)
- Presentation (1–4)
- Contribution (1–4)
- Overall (1–10)
- Confidence (1–5)
- Decision (Accept or Reject)

Return ONLY this format.
""".strip()

In [0]:
# --------------------------------------------------------------------
# CORE SCORING FUNCTION
# --------------------------------------------------------------------

def get_score(
    outlined_plan: str,
    latex: str,
    reward_model_llm: str = None,
    reviewer_type: Optional[str] = None,
    attempts: int = 3,
) -> Dict[str, Any]:
    last_error = None

    for _ in range(attempts):
        sys_prompt = (reviewer_type or "You are a harsh but fair reviewer.") + "\n" + TEMPLATE_INSTRUCTIONS

        user_prompt = f"""
OUTLINED PLAN:
{outlined_plan}

LATEX REPORT:
{latex}
""".strip()

        raw = query_model(sys_prompt, user_prompt)
        parsed = extract_json_from_review_text(raw)

        if parsed is not None:
            return {
                "raw_text": raw,
                "review_json": parsed,
            }

        last_error = "JSON parsing failed"

    return {
        "raw_text": "",
        "review_json": None,
        "error": last_error,
    }

In [0]:
# --------------------------------------------------------------------
# REVIEWERS AGENT
# --------------------------------------------------------------------

class ReviewersAgent:
    def __init__(self, model: str = "databricks", notes: Optional[List[str]] = None):
        self.notes = notes or []
        self.model = model

    def inference(self, plan: str, report: str) -> str:
        reviewer_1 = "You are a harsh but fair reviewer and expect good experiments that lead to insights."
        reviewer_2 = "You are a harsh and critical but fair reviewer looking for impactful ideas."
        reviewer_3 = "You are a harsh but fair open-minded reviewer seeking novel ideas."

        r1 = get_score(plan, report, reviewer_type=reviewer_1)
        r2 = get_score(plan, report, reviewer_type=reviewer_2)
        r3 = get_score(plan, report, reviewer_type=reviewer_3)

        return (
            f"Reviewer #1:\n{json.dumps(r1['review_json'], indent=2)}\n\n"
            f"Reviewer #2:\n{json.dumps(r2['review_json'], indent=2)}\n\n"
            f"Reviewer #3:\n{json.dumps(r3['review_json'], indent=2)}"
        )

In [0]:
agent = ReviewersAgent()
demo_plan = "Evaluate transformer pruning techniques."
demo_report = "\\section{Introduction} This paper explores..."

print(agent.inference(demo_plan, demo_report))

Reviewer #1:
{
  "Summary": "This paper evaluates various transformer pruning techniques, but the introduction lacks context and the report is incomplete.",
  "Strengths": [
    "Clear research question",
    "Good selection of pruning techniques"
  ],
  "Weaknesses": [
    "Brief and lacking introduction",
    "Incomplete report"
  ],
  "Originality": 2,
  "Quality": 3,
  "Clarity": 2,
  "Significance": 2,
  "Questions": [
    "What are the specific research questions being addressed?",
    "Why are these pruning techniques being evaluated?"
  ],
  "Limitations": [
    "Incomplete report",
    "Lack of context in introduction"
  ],
  "Ethical Concerns": false,
  "Soundness": 2,
  "Presentation": 2,
  "Contribution": 2,
  "Overall": 6,
  "Confidence": 3,
  "Decision": "Reject"
}

Reviewer #2:
{
  "Summary": "The paper explores transformer pruning techniques, but the evaluation aspect is unclear.",
  "Strengths": [
    "Clear introduction to transformer pruning techniques",
    "Potenti

In [0]:
# Cell 1 — Setup: imports + Databricks LLM config (AgentLaboratory-style backend)

import json
import re
import uuid
from datetime import datetime
from typing import Optional, Dict, Any, List

from langchain_databricks import ChatDatabricks
from langchain.schema import SystemMessage, HumanMessage

LLM_ENDPOINT_NAME = "databricks-meta-llama-3-1-8b-instruct"

llm = ChatDatabricks(
    endpoint=LLM_ENDPOINT_NAME,
    temperature=0.2,
)

print("✅ Databricks LLM configured:", LLM_ENDPOINT_NAME)


✅ Databricks LLM configured: databricks-meta-llama-3-1-8b-instruct


In [0]:
# Cell 2 — Core helpers: query_model + JSON extraction (used by all agents)

def query_model(system_prompt: str, prompt: str) -> str:
    """Databricks-native LLM call."""
    resp = llm.invoke([
        SystemMessage(content=system_prompt),
        HumanMessage(content=prompt),
    ])
    return (resp.content or "").strip()


def extract_first_json_object(text: str) -> Optional[Dict[str, Any]]:
    if not text:
        return None

    # 1) Prefer fenced JSON blocks: ```json ... ```
    start_tag = "```json"
    end_tag = "```"

    start = text.find(start_tag)
    if start != -1:
        # Find the closing fence after the start fence
        fence_start_end = text.find("\n", start)
        if fence_start_end != -1:
            end = text.find(end_tag, fence_start_end)
            if end != -1:
                block = text[fence_start_end:end].strip()
                # Some models include leading/trailing text; try to isolate {...}
                s = block.find("{")
                e = block.rfind("}")
                if s != -1 and e != -1 and e > s:
                    try:
                        return json.loads(block[s:e+1])
                    except json.JSONDecodeError:
                        pass

    # 2) Fallback: first {...} in the whole text
    s = text.find("{")
    e = text.rfind("}")
    if s != -1 and e != -1 and e > s:
        try:
            return json.loads(text[s:e+1])
        except json.JSONDecodeError:
            return None

    return None


print("✅ Helpers ready: query_model(), extract_first_json_object()")


✅ Helpers ready: query_model(), extract_first_json_object()


In [0]:
# -----------------------------
# REVIEWER PROMPT
# -----------------------------

TEMPLATE_INSTRUCTIONS = """
Respond in the following format:

THOUGHT:
<THOUGHT>

REVIEW JSON:
```json
<JSON>
```

In <THOUGHT>, briefly discuss your reasoning specific to this paper.

In <JSON>, provide the review in JSON format with the following fields in the order:
- Summary
- Strengths
- Weaknesses
- Originality (1-4)
- Quality (1-4)
- Clarity (1-4)
- Significance (1-4)
- Questions
- Limitations
- Ethical Concerns (boolean)
- Soundness (1-4)
- Presentation (1-4)
- Contribution (1-4)
- Overall (1-10)
- Confidence (1-5)
- Decision (Accept or Reject)

Return ONLY the required format. The JSON must be valid.
""".strip()

In [0]:
def get_score(
    outlined_plan: str,
    latex: str,
    reviewer_type: str,
    attempts: int = 2,
) -> Dict[str, Any]:
    """Run a reviewer scoring pass."""
    last_error = None

    for attempt in range(1, attempts + 1):
        system_prompt = f"{reviewer_type}\n\n{TEMPLATE_INSTRUCTIONS}"
        user_prompt = f"""
OUTLINED PLAN:
{outlined_plan}

LATEX REPORT:
{latex}
""".strip()

        raw = query_model(system_prompt, user_prompt)
        parsed = extract_first_json_object(raw)

        if parsed is not None:
            return {
                "raw_text": raw,
                "review_json": parsed,
            }

        last_error = f"Attempt {attempt}: failed to parse JSON"

    return {
        "raw_text": "",
        "review_json": None,
        "error": last_error,
    }

In [0]:
plan = "Evaluate pruning methods for transformers."
report = "\\section{Introduction} This paper explores..."
review = get_score(
    outlined_plan=plan,
    latex=report,
    reviewer_type="You are a harsh but fair reviewer."
)
print(review["review_json"])

{'Summary': 'This paper explores pruning methods for transformers, but the introduction is too generic and lacks concrete information.', 'Strengths': ['Well-structured outline', 'Interesting topic'], 'Weaknesses': ['Lack of concrete information in the introduction', 'Unclear methodology'], 'Originality': 2, 'Quality': 2, 'Clarity': 3, 'Significance': 2, 'Questions': ['What specific pruning methods are explored?', 'How are the results evaluated?'], 'Limitations': ['Insufficient information in the introduction', 'Unclear methodology'], 'Ethical Concerns': False, 'Soundness': 2, 'Presentation': 3, 'Contribution': 2, 'Overall': 6, 'Confidence': 3, 'Decision': 'Major Revisions Required'}


In [0]:
# Cell 4 — ReviewersAgent (3 reviewers) using reviewer_scoring.get_score

import json
class ReviewersAgent:
    def __init__(self, notes=None):
        self.notes = notes or []

    def inference(self, plan: str, report: str) -> str:
        reviewer_1 = "You are a harsh but fair reviewer and expect good experiments that lead to insights for the research topic."
        reviewer_2 = "You are a harsh and critical but fair reviewer who is looking for an idea that would be impactful in the field."
        reviewer_3 = "You are a harsh but fair open-minded reviewer that is looking for novel ideas that have not been proposed before."

        r1 = get_score(outlined_plan=plan, latex=report, reviewer_type=reviewer_1, attempts=2)
        r2 = get_score(outlined_plan=plan, latex=report, reviewer_type=reviewer_2, attempts=2)
        r3 = get_score(outlined_plan=plan, latex=report, reviewer_type=reviewer_3, attempts=2)

        def fmt(label, r):
            if r.get("review_json") is None:
                return f"{label}:\n❌ No JSON parsed. Error: {r.get('error')}\nRaw:\n{r.get('raw_text','')}"
            return f"{label}:\n{json.dumps(r['review_json'], indent=2)}"

        return "\n\n".join([
            fmt("Reviewer #1", r1),
            fmt("Reviewer #2", r2),
            fmt("Reviewer #3", r3),
        ])


reviewers_agent = ReviewersAgent()
print("✅ ReviewersAgent ready")


✅ ReviewersAgent ready


In [0]:
# Cell 5 — Minimal demo: run Exploration & Discovery (review loop)

# Example inputs (replace with real plan / LaTeX report)
plan = """
We propose to study pruning strategies for transformer-based language models.
The goal is to reduce inference cost while preserving downstream task accuracy.
Experiments will compare structured vs unstructured pruning across multiple layers.
"""

report = r"""
\section{Introduction}
Transformer models are expensive to deploy. This paper explores pruning methods
to reduce inference cost while maintaining performance.

\section{Method}
We evaluate magnitude-based pruning and structured head pruning on several benchmarks.

\section{Results}
Preliminary results show structured pruning preserves accuracy better at high sparsity.
"""

# Run reviewers
output = reviewers_agent.inference(plan, report)

print("=== Reviewer Feedback ===")
print(output)


=== Reviewer Feedback ===
Reviewer #1:
{
  "Summary": "This paper proposes to study pruning strategies for transformer-based language models to reduce inference cost while preserving downstream task accuracy. However, the proposal lacks concrete details and a clear direction.",
  "Strengths": [
    "Exploring pruning methods for transformer models is a timely and relevant research topic"
  ],
  "Weaknesses": [
    "Lack of concrete details about experiments, evaluation metrics, and analysis",
    "Introduction is too brief and lacks context",
    "Method section is too concise"
  ],
  "Originality": 2,
  "Quality": 2,
  "Clarity": 2,
  "Significance": 2,
  "Questions": [
    "What are the specific pruning strategies being evaluated?",
    "What are the evaluation metrics and benchmarks used?",
    "How will the results be analyzed and interpreted?"
  ],
  "Limitations": [
    "Lack of concrete details and a clear direction",
    "Preliminary results may not be robust or reliable"
  ],


In [0]:
# Cell 6 — Aggregate reviewer JSON into a single decision + quick score summary

import re

def safe_get(d: dict, key: str, default=None):
    return d.get(key, default) if isinstance(d, dict) else default

def aggregate_reviews(reviews: list) -> dict:
    """
    reviews: list of review_json dicts (can include None)
    Returns a compact aggregate summary.
    """
    valid = [r for r in reviews if isinstance(r, dict)]
    if not valid:
        return {"error": "No valid review_json objects to aggregate."}

    overalls = [safe_get(r, "Overall") for r in valid if isinstance(safe_get(r, "Overall"), (int, float))]
    confidences = [safe_get(r, "Confidence") for r in valid if isinstance(safe_get(r, "Confidence"), (int, float))]
    decisions = [safe_get(r, "Decision") for r in valid if isinstance(safe_get(r, "Decision"), str)]

    # Majority decision
    decision_norm = [d.strip().lower() for d in decisions]
    accept_count = sum(d == "accept" for d in decision_norm)
    reject_count = sum(d == "reject" for d in decision_norm)
    majority = "Accept" if accept_count > reject_count else "Reject"

    summary = {
        "n_reviews": len(valid),
        "overall_avg": sum(overalls) / len(overalls) if overalls else None,
        "overall_min": min(overalls) if overalls else None,
        "overall_max": max(overalls) if overalls else None,
        "confidence_avg": sum(confidences) / len(confidences) if confidences else None,
        "decision_majority": majority,
        "decision_counts": {"Accept": accept_count, "Reject": reject_count},
    }
    return summary

r1 = get_score(plan, report, "You are a harsh but fair reviewer and expect good experiments that lead to insights for the research topic.", attempts=2)
r2 = get_score(plan, report, "You are a harsh and critical but fair reviewer who is looking for an idea that would be impactful in the field.", attempts=2)
r3 = get_score(plan, report, "You are a harsh but fair open-minded reviewer that is looking for novel ideas that have not been proposed before.", attempts=2)

agg = aggregate_reviews([r1.get("review_json"), r2.get("review_json"), r3.get("review_json")])

print("=== Aggregate Summary ===")
print(json.dumps(agg, indent=2))


=== Aggregate Summary ===
{
  "n_reviews": 3,
  "overall_avg": 6.0,
  "overall_min": 6,
  "overall_max": 6,
  "confidence_avg": 3.0,
  "decision_majority": "Reject",
  "decision_counts": {
    "Accept": 0,
    "Reject": 0
  }
}


In [0]:
# Cell 7 — Revise the plan based on reviewer feedback (Databricks-native)

def parse_json_strict(raw: str) -> Dict[str, Any]:
    """
    Parse JSON from model output that may be wrapped in ```json ... ``` fences.
    Assumes the content inside the fence is valid JSON.
    """
    if not raw:
        raise ValueError("Empty model output")

    text = raw.strip()

    # Strip ```json / ``` fences if present
    if text.startswith("```"):
        lines = text.splitlines()

        # remove opening fence (``` or ```json)
        if lines[0].startswith("```"):
            lines = lines[1:]

        # remove closing fence
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]

        text = "\n".join(lines).strip()

    return json.loads(text)

def revise_plan_with_reviews(original_plan: str, review_jsons: list) -> dict:
    valid_reviews = [r for r in review_jsons if isinstance(r, dict)]

    sys_prompt = """
You are a research lead improving an experiment plan based on peer review feedback.

Return ONLY valid JSON (no markdown, no extra text) with keys:
- revised_plan (string)
- change_log (array of strings)
""".strip()

    prompt = f"""
ORIGINAL PLAN:
{original_plan}

REVIEWER FEEDBACK (JSON):
{json.dumps(valid_reviews, indent=2)}

Return ONLY JSON.
""".strip()

    raw = query_model(sys_prompt, prompt)

    try:
        return parse_json_strict(raw)
    except Exception as e:
        return {
            "error": f"Failed to parse JSON: {e}",
            "raw": raw,
        }


In [0]:
revised = revise_plan_with_reviews(
    original_plan=plan,
    review_jsons=[r1.get("review_json"), r2.get("review_json"), r3.get("review_json")]
)

print("=== Revised Plan Output ===")
print(json.dumps(revised, indent=2))

=== Revised Plan Output ===
{
  "revised_plan": "We propose to study pruning strategies for transformer-based language models. The goal is to reduce inference cost while preserving downstream task accuracy. Experiments will compare structured vs unstructured pruning across multiple layers, with a focus on implementing and evaluating pruning strategies using specific quantitative metrics, such as accuracy and inference cost. We will also investigate the implications of high sparsity on downstream task accuracy and explore the trade-off between inference cost and accuracy.",
  "change_log": [
    "Added specific quantitative metrics to evaluate pruning strategies",
    "Investigated implications of high sparsity on downstream task accuracy",
    "Explored trade-off between inference cost and accuracy",
    "Provided more depth and detail in the paper",
    "Presented preliminary results in a more thorough and analyzed manner"
  ]
}


In [0]:
# Cell C — Iteration loop: review -> aggregate -> revise (stop on Accept or threshold)

def run_iteration_loop(
    initial_plan: str,
    initial_report: str,
    max_rounds: int = 3,
    overall_threshold: float = 7.5,
):
    plan_cur = initial_plan
    report_cur = initial_report  # keep constant here; you can revise report similarly later

    history = []

    reviewer_1 = "You are a harsh but fair reviewer and expect good experiments that lead to insights for the research topic."
    reviewer_2 = "You are a harsh and critical but fair reviewer who is looking for an idea that would be impactful in the field."
    reviewer_3 = "You are a harsh but fair open-minded reviewer that is looking for novel ideas that have not been proposed before."

    for rnd in range(1, max_rounds + 1):
        print(f"\n================= ROUND {rnd} =================")

        r1 = get_score(plan_cur, report_cur, reviewer_1, attempts=2)
        r2 = get_score(plan_cur, report_cur, reviewer_2, attempts=2)
        r3 = get_score(plan_cur, report_cur, reviewer_3, attempts=2)

        reviews = [r1.get("review_json"), r2.get("review_json"), r3.get("review_json")]
        agg = aggregate_reviews(reviews)

        print("Aggregate:", json.dumps(agg, indent=2))

        history.append({
            "round": rnd,
            "plan": plan_cur,
            "agg": agg,
            "reviews": reviews,
        })

        # Stop conditions
        if agg.get("decision_majority") == "Accept":
            print("✅ Stop: majority decision is Accept")
            break

        overall_avg = agg.get("overall_avg")
        if isinstance(overall_avg, (int, float)) and overall_avg >= overall_threshold:
            print(f"✅ Stop: overall_avg >= threshold ({overall_avg:.2f} >= {overall_threshold})")
            break

        # Revise plan based on reviews
        revised = revise_plan_with_reviews(plan_cur, reviews)
        if "error" in revised:
            print("⚠️ Revise failed, stopping. Error:", revised["error"])
            break

        plan_cur = revised["revised_plan"]
        print("\n--- Revised plan (first 600 chars) ---")
        print(plan_cur[:600])

    return {"final_plan": plan_cur, "history": history}


result = run_iteration_loop(
    initial_plan=plan,
    initial_report=report,
    max_rounds=3,
    overall_threshold=7.5,
)

print("\n=== FINAL PLAN (first 800 chars) ===")
print(result["final_plan"][:800])



Aggregate: {
  "n_reviews": 3,
  "overall_avg": 6.0,
  "overall_min": 6,
  "overall_max": 6,
  "confidence_avg": 3.0,
  "decision_majority": "Reject",
  "decision_counts": {
    "Accept": 0,
    "Reject": 0
  }
}

--- Revised plan (first 600 chars) ---
We propose to study pruning strategies for transformer-based language models, focusing on a novel comparison of structured vs unstructured pruning across multiple layers. Experiments will be designed to ensure robust results, with a clear research question and expected outcomes. The pruning strategies will be evaluated on multiple tasks, and the computational requirements will be analyzed.

Aggregate: {
  "n_reviews": 3,
  "overall_avg": 6.0,
  "overall_min": 6,
  "overall_max": 6,
  "confidence_avg": 3.0,
  "decision_majority": "Reject",
  "decision_counts": {
    "Accept": 0,
    "Reject": 1
  }
}

--- Revised plan (first 600 chars) ---
We propose to study pruning strategies for transformer-based language models, focusing on a novel c