# Synthetic Document + Comment Thread Generator

This notebook builds a reproducible pipeline for generating synthetic documents and associated comment threads using a large language model (LLM). Each dataset entry mirrors the procedure described in the project specification.

## Procedure Overview

The generator retrieves one of ten predefined topics, creates a four-paragraph document, highlights a contentious sentence with the peer's first remark, and then alternates between author and peer comments until the thread reaches five peer inputs and four author responses.

In [17]:
import json
import random
import re
import textwrap
from dataclasses import dataclass
from typing import Callable, Dict, Iterable, List, Optional

try:
    from openai import OpenAI
except ImportError:
    OpenAI = None


TOPICS: List[str] = [
    "a news report on a local incident",
    "a restaurant review for a food blog",
    "a quarterly update for the executive team",
    "an email to all the company employees communicating a new office policy",
    "an instructions manual for an electronics product",
    "an advertisement for a new product launch",
    "a statement from a political campaign",
    "an execution plan for a project at work",
    "a new research proposal for a grant",
    "an opinion piece for a newspaper",
]


DOCUMENT_PROMPT_TEMPLATE = textwrap.dedent(
    """
    Write a short document (3 paragraphs) that is {topic}. The document should be realistic and some of the points should be at least somewhat controversial.
    """
).strip()


PEER_DISAGREEMENT_PROMPT_TEMPLATE = textwrap.dedent(
    """

    A peer is reviewing the following document and they disagree with one specific sentence inside it.

    Document:
    {document}

    A peer is reviewing the document and they disagree with some part of it. The disagreement centers around one specific sentence in the document, and could be about style, content, framing, or another thing. The peer will highlight that sentence in the doc and leave a comment for the author to review. Please indicate what sentence is highlighted and what the comment is. The comment should be brief, no more than 3 sentences.

    Respond with a JSON object that contains exactly the keys "highlighted_sentence" and "comment".
    """).strip()


AUTHOR_RESPONSE_PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are the author of the document below. One of your peers highlighted thae specific sentence and left comments that you must answer.

    Document:
    {document}

    Highlighted sentence:
    {highlighted_sentence}

    Comment thread so far:
    {thread}

    As the author, you would like to keep the doc as it is. What do they respond to the comment? 3 sentences max.
    """
).strip()


PEER_FOLLOWUP_PROMPT_TEMPLATE = textwrap.dedent(
    """
    You are the peer reviewer who highlighted the sentence in the document shown below. You still disagree with the author.

    Document:
    {document}

    Highlighted sentence:
    {highlighted_sentence}

    Comment thread so far:
    {thread}

    You still disagree and will respond again. What do you respond to the last comment? Remember to keep comments short, 3 sentences max.
    """
).strip()


@dataclass
class CommentTurn:
    speaker: str
    text: str

    def to_dict(self) -> Dict[str, str]:
        return {"speaker": self.speaker, "text": self.text}


@dataclass
class DatasetEntry:
    topic: str
    document: str
    highlighted_sentence: str
    comment_thread: List[CommentTurn]

    def to_dict(self) -> Dict[str, object]:
        return {
            "topic": self.topic,
            "document": self.document,
            "highlighted_sentence": self.highlighted_sentence,
            "comment_thread": [turn.to_dict() for turn in self.comment_thread],
        }

    def to_json(self) -> str:
        return json.dumps(self.to_dict(), ensure_ascii=False)


def _format_thread_for_prompt(thread: Iterable[CommentTurn]) -> str:
    if not thread:
        return "No comments yet."
    lines = []
    for turn in thread:
        lines.append(f"{turn.speaker.title()}: {turn.text}")
    return '\n'.join(lines)


def _extract_json_object(text: str) -> Dict[str, str]:
    text = text.strip()
    if not text:
        raise ValueError("LLM returned an empty string when JSON was expected.")

    match = re.search(r"\{.*\}", text, re.DOTALL)
    if not match:
        raise ValueError(f"Could not find a JSON object in:\n{text}")

    data = json.loads(match.group(0))
    if not isinstance(data, dict):
        raise ValueError("Expected a JSON object from the LLM response.")
    return data


class LLMClient:
    def __init__(
        self,
        model: str = "gpt-4o-mini",
        temperature: float = 0.8,
        responder: Optional[Callable[..., str]] = None,
        client: Optional[object] = None,
    ) -> None:
        self.model = model
        self.temperature = temperature
        self._responder = responder
        self._client = client

    def complete(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_output_tokens: int = 800,
    ) -> str:
        if self._responder is not None:
            return self._responder(
                prompt=prompt,
                system_prompt=system_prompt,
                max_output_tokens=max_output_tokens,
            )

        if OpenAI is None:
            raise RuntimeError(
                "openai package is not installed. Install openai>=1.0.0 and provide an API key."
            )

        if self._client is None:
            self._client = OpenAI()

        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt.strip()})
        messages.append({"role": "user", "content": prompt.strip()})

        response = self._client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=self.temperature,
            max_tokens=max_output_tokens,
        )
        return response.choices[0].message.content.strip()


def generate_document(llm: LLMClient, topic: str) -> str:
    return llm.complete(DOCUMENT_PROMPT_TEMPLATE.format(topic=topic))


def generate_peer_disagreement(llm: LLMClient, document: str) -> Dict[str, str]:
    prompt = PEER_DISAGREEMENT_PROMPT_TEMPLATE.format(document=document)
    raw = llm.complete(prompt)
    data = _extract_json_object(raw)
    try:
        highlighted_sentence = data["highlighted_sentence"].strip()
        comment = data["comment"].strip()
    except KeyError as exc:
        raise KeyError(f"Missing key in peer disagreement response: {exc}") from exc
    return {"highlighted_sentence": highlighted_sentence, "comment": comment}


def generate_author_response(
    llm: LLMClient,
    document: str,
    highlighted_sentence: str,
    thread: List[CommentTurn],
) -> str:
    prompt = AUTHOR_RESPONSE_PROMPT_TEMPLATE.format(
        document=document,
        highlighted_sentence=highlighted_sentence,
        thread=_format_thread_for_prompt(thread),
    )
    return llm.complete(prompt)


def generate_peer_followup(
    llm: LLMClient,
    document: str,
    highlighted_sentence: str,
    thread: List[CommentTurn],
) -> str:
    prompt = PEER_FOLLOWUP_PROMPT_TEMPLATE.format(
        document=document,
        highlighted_sentence=highlighted_sentence,
        thread=_format_thread_for_prompt(thread),
    )
    return llm.complete(prompt)


def generate_dataset_entry(
    llm: LLMClient,
    selected_topic: Optional[str] = None,
) -> DatasetEntry:
    document = generate_document(llm, selected_topic)
    disagreement = generate_peer_disagreement(llm, document)
    highlighted_sentence = disagreement["highlighted_sentence"]
    comment_thread: List[CommentTurn] = [
        CommentTurn(speaker="peer", text=disagreement["comment"])
    ]

    peer_comments = 1
    author_responses = 0

    while peer_comments < 5 or author_responses < 4:
        if author_responses < 4:
            author_reply = generate_author_response(
                llm, document, highlighted_sentence, comment_thread
            )
            comment_thread.append(
                CommentTurn(speaker="author", text=author_reply.strip())
            )
            author_responses += 1

        if peer_comments < 5:
            peer_reply = generate_peer_followup(
                llm, document, highlighted_sentence, comment_thread
            )
            comment_thread.append(
                CommentTurn(speaker="peer", text=peer_reply.strip())
            )
            peer_comments += 1

    return DatasetEntry(
        topic=selected_topic,
        document=document.strip(),
        highlighted_sentence=highlighted_sentence.strip(),
        comment_thread=comment_thread,
    )


def save_dataset_to_jsonl(entries: Iterable[DatasetEntry], path: str) -> None:
    with open(path, "w", encoding="utf-8") as handle:
        for entry in entries:
            handle.write(entry.to_json())
            handle.write('')


class MockLLMClient(LLMClient):
    def __init__(self) -> None:
        super().__init__(responder=self._mock_responder)

    def _mock_responder(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_output_tokens: int = 800,
    ) -> str:
        if "Write a short document" in prompt:
            return (
                "Paragraph 1: This mock document introduces a controversial workplace policy.\n\nParagraph 2: The policy will reduce budgets by 15 percent, even if teams object.\n\nParagraph 3: Leaders believe sharper cuts will motivate better performance."
            )
        if "Respond with a JSON object" in prompt:
            return json.dumps(
                {
                    "highlighted_sentence": "The policy will reduce budgets by 15 percent, even if teams object.",
                    "comment": "This sentence feels too dismissive of the teams' concerns; can we soften it?",
                }
            )
        if "The author would like to keep the doc as it is" in prompt:
            return (
                "I understand the concern, but the firm directive comes straight from leadership and we need to reflect that reality."
            )
        if "The peer still disagrees" in prompt:
            return (
                "We still need to flag that dismissing objections may alienate the staff; please acknowledge the risk."
            )
        return "This is a placeholder response from the mock client."


def preview_mock_entry() -> DatasetEntry:
    return generate_dataset_entry(MockLLMClient(), topic=TOPICS[0])

def generate_entries(size: int) -> list[DatasetEntry]:
    samples: list[DatasetEntry] = []
    llm = LLMClient()
    for i in range(size):
        samples.append(generate_dataset_entry(llm, selected_topic=TOPICS[i % size]))
    return samples


In [None]:
mock_entry = preview_mock_entry()
print(json.dumps(mock_entry.to_dict(), indent=2))

{
  "topic": "a news report on a local incident",
  "document": "**Local News Report: Community Divided Over Proposed Skate Park**\n\n*Cityville, October 15, 2023* \u2014 A heated debate erupted at last night\u2019s City Council meeting as residents weighed in on a proposal to build a new skate park in Downtown Cityville. The plan, which aims to provide a recreational space for local youth, has garnered both enthusiastic support from skateboarders and their families, as well as fierce opposition from nearby business owners and community members concerned about noise and safety.\n\nSupporters of the skate park, including parents and teenagers, argue that the facility would not only promote healthy outdoor activities but also create a positive outlet for youth engagement in the community. \u201cThere are few places for kids to skateboard safely, and this park would give them a space to express themselves and hone their skills,\u201d said Jenna Timmons, a local mother whose son is an avid

In [None]:
generated_data = generate_entries(100)
print(json.dumps(generated_data[0].to_dict(), indent=2))

In [None]:
save_dataset_to_jsonl(generated_data, './results/synthetic_docs.jsonl')