In [None]:
import os

# Location of the Markdown source document that the rest of the notebook mines for claims.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that exact file exists.
path = "/Users/ivanmkc/code/adk-samples/output/companies/blue_ridge_outfitters/run_0/Blue Ridge Outfitters_ _Gear Up for Your Adventure..md"
# Decode explicitly as UTF-8 instead of relying on the platform's default
# locale encoding, which can mis-decode or reject non-ASCII characters.
with open(path, "r", encoding="utf-8") as f:
    text = f.read()

In [None]:
# Number of lines in the document: one more than the number of newline
# separators, i.e. how many pieces text.split("\n") would produce.
text.count("\n") + 1

In [None]:
# Total length of the loaded document, in characters.
len(text)

In [None]:
import dspy
import mlflow
# Turn on MLflow's DSPy autologging and select the experiment used for this notebook's runs.
mlflow.dspy.autolog()
mlflow.set_experiment("extract_claims")

# Model identifier handed to dspy.LM; the commented-out line is an alternative model.
model_name = "gemini/gemini-2.5-pro-preview-03-25"
# model_name = "gemini/gemini-2.0-flash"
lm = dspy.LM(
    model=model_name,
    # Optional settings, currently disabled:
    # max_tokens=65535,
    # allowed_openai_params=["thinking"],
    # thinking={"type": "enabled", "budget_tokens": 1024},
)
# Register this LM as the default used by DSPy modules.
# NOTE(review): a later cell calls dspy.configure again with a different LM
# configuration, so on a top-to-bottom run that later one is what the
# extraction cells actually use.
dspy.configure(lm=lm)

In [None]:
from typing import List, Optional, Iterator
import dspy
from pydantic import BaseModel, Field
import asyncio
from tqdm.asyncio import tqdm

# --- DSPy Signature for Claim Extraction ---
# Signature for the claim-extraction task: one text input, one list-of-strings output.
# NOTE: DSPy uses a Signature's docstring as the task instructions given to the LM,
# so the docstring and the `desc` strings below are prompt text; editing them
# changes model behavior, not just documentation.
class ClaimExtractionSignature(dspy.Signature):
    """
    Given a source text, extract verifiable claims.
    Ensure each quote is supported by the source text.
    Aim to extract distinct and significant claims.
    Avoid claims that include ambiguity or speculation.
    """

    # Input passage to mine for claims. VerifiableClaimExtractor passes one
    # batch of lines at a time here rather than the whole document.
    source_text: str = dspy.InputField(
        desc="The full text from which to extract claims."
    )
    # Output claims; the desc asks the model to replace pronouns and other
    # context-dependent words so each claim can be read on its own.
    extracted_claims: List[str] = dspy.OutputField(
        desc="A list of extracted claims. Each claim needs to be self-contained such that context-dependent words/pronouns (e.g. this, that, it, etc) without explicit noun phrases need to be replaced with the thing/concept they refer to."
    )


def batch_text(text: str, batch_size: int) -> Iterator[str]:
    """
    Split ``text`` on newlines and yield it back in chunks of ``batch_size`` lines.

    Args:
        text: The text to split. Lines are separated on "\\n" only.
        batch_size: Number of lines per chunk.

    Yields:
        Each chunk as a single string with its lines re-joined by "\\n".
        The final chunk may hold fewer than ``batch_size`` lines.
    """
    batch = []

    for line in text.split("\n"):
        batch.append(line)

        if len(batch) >= batch_size:
            yield "\n".join(batch)
            batch = []

    if batch:
        # Join the leftover lines too, so every yielded item is a str
        # (previously the remainder was yielded as a raw list).
        yield "\n".join(batch)

# --- DSPy Module for Verifiable Claim Extraction ---
class VerifiableClaimExtractor(dspy.Module):
    """Extracts claims from a text by running the LM over small line batches concurrently."""

    def __init__(self, batch_size: int = 4, max_concurrency: int = 5):
        """
        Args:
            batch_size: Number of source lines sent to the LM per request.
            max_concurrency: Maximum number of LM requests in flight at once.
        """
        super().__init__()
        self.batch_size = batch_size
        self.max_concurrency = max_concurrency
        self.claim_extractor = dspy.ChainOfThought(ClaimExtractionSignature)

    async def forward(self, source_text: str) -> List[str]:
        """
        Extracts verifiable claims from the given source text.

        The text is split into batches of ``self.batch_size`` lines and each
        batch is sent to the LM as a separate request.

        Args:
            source_text: The text to extract claims from.

        Returns:
            A flat list of claim strings collected from every batch
            (a plain list, not a dspy.Prediction).
        """
        # Caps how many LM calls run at the same time.
        semaphore = asyncio.Semaphore(self.max_concurrency)
        claim_extractor_async = dspy.asyncify(self.claim_extractor)

        async def extract_claims(text: str) -> dspy.Prediction:
            async with semaphore:
                return await claim_extractor_async(source_text=text)

        extraction_tasks = [
            extract_claims(batch)
            for batch in batch_text(source_text, batch_size=self.batch_size)
        ]

        # Await every batch while showing a progress bar.
        predictions = await tqdm.gather(*extraction_tasks)

        # Flatten the per-batch claim lists into one list.
        return [
            claim
            for prediction in predictions
            for claim in prediction.extracted_claims
        ]

In [None]:
# for batch in batch_text(text, 10):
#     print(batch)
#     print("---------")

In [None]:
import dspy
# lm = dspy.LM(model="gemini/gemini-2.5-pro-preview-03-25")
# Rebuild the LM with an explicit max_tokens value and a "thinking" budget, then
# make it the active DSPy default in place of the configuration set earlier.
lm = dspy.LM(
    model="gemini/gemini-2.5-pro-preview-03-25",
    max_tokens=65535,
    # NOTE(review): presumably needed so the non-standard `thinking` argument
    # is forwarded to the provider instead of being rejected — confirm.
    allowed_openai_params=['thinking'],
    thinking={"type": "enabled", "budget_tokens": 2048},
)
dspy.configure(lm=lm)

In [None]:
claim_extractor_module = VerifiableClaimExtractor()

extracted_claims = await claim_extractor_module.forward(
    source_text=text,
)

In [None]:
# Display the claims extracted from the document.
extracted_claims

In [None]:
# extractor = dspy.ChainOfThought(ClaimExtractionSignature)
# extractor_async = dspy.asyncify(extractor)
# prediction = await extractor_async(source_text="Hello world.")

In [None]:
# prediction.extracted_claims

In [None]:
# Signature for generating deliberately false claims that contradict a text.
# NOTE: DSPy uses a Signature's docstring as the task instructions given to the LM,
# so the docstring and the `desc` strings below are prompt text; editing them
# changes model behavior, not just documentation.
class FalseClaimSignature(dspy.Signature):
    """
    Given a source text, make up falsehoods that contradict the text.
    Ensure each quote is contradicted by the source text.
    Aim to create distinct and significant fake claims.
    Avoid claims that include ambiguity or speculation.
    """

    # Input passage to contradict. FalsehoodExtractor passes one batch of lines
    # at a time here rather than the whole document.
    # NOTE(review): the desc still says "extract claims"; it looks copied from
    # the extraction signature — confirm whether that wording is intended.
    source_text: str = dspy.InputField(
        desc="The full text from which to extract claims."
    )
    # Output falsehoods; the desc asks the model to replace pronouns and other
    # context-dependent words so each claim can be read on its own.
    false_claims: List[str] = dspy.OutputField(
        desc="A list of false claims. Each claim needs to be self-contained such that context-dependent words/pronouns (e.g. this, that, it, etc) without explicit noun phrases need to be replaced with the thing/concept they refer to."
    )


def batch_text(text: str, batch_size: int) -> Iterator[str]:
    """
    Split ``text`` on newlines and yield it back in chunks of ``batch_size`` lines.

    Note: this redefines the helper of the same name from the claim-extraction
    cell; whichever cell ran last is the one in effect.

    Args:
        text: The text to split. Lines are separated on "\\n" only.
        batch_size: Number of lines per chunk.

    Yields:
        Each chunk as a single string with its lines re-joined by "\\n".
        The final chunk may hold fewer than ``batch_size`` lines.
    """
    batch = []

    for line in text.split("\n"):
        batch.append(line)

        if len(batch) >= batch_size:
            yield "\n".join(batch)
            batch = []

    if batch:
        # Join the leftover lines too, so every yielded item is a str
        # (previously the remainder was yielded as a raw list).
        yield "\n".join(batch)

# --- DSPy Module for Falsehood Generation ---
class FalsehoodExtractor(dspy.Module):
    """Generates false claims contradicting a text by running the LM over small line batches concurrently."""

    def __init__(self, batch_size: int = 4, max_concurrency: int = 5):
        """
        Args:
            batch_size: Number of source lines sent to the LM per request.
            max_concurrency: Maximum number of LM requests in flight at once.
        """
        super().__init__()
        self.batch_size = batch_size
        self.max_concurrency = max_concurrency
        self.claim_extractor = dspy.ChainOfThought(FalseClaimSignature)

    async def forward(self, source_text: str) -> List[str]:
        """
        Create incorrect information against the given source text.

        The text is split into batches of ``self.batch_size`` lines and each
        batch is sent to the LM as a separate request.

        Args:
            source_text: The text to contradict.

        Returns:
            A flat list of false-claim strings collected from every batch
            (a plain list, not a dspy.Prediction).
        """
        # Caps how many LM calls run at the same time.
        semaphore = asyncio.Semaphore(self.max_concurrency)
        claim_extractor_async = dspy.asyncify(self.claim_extractor)

        async def extract_claims(text: str) -> dspy.Prediction:
            async with semaphore:
                return await claim_extractor_async(source_text=text)

        extraction_tasks = [
            extract_claims(batch)
            for batch in batch_text(source_text, batch_size=self.batch_size)
        ]

        # Await every batch while showing a progress bar.
        predictions = await tqdm.gather(*extraction_tasks)

        # Flatten the per-batch claim lists into one list.
        return [
            claim
            for prediction in predictions
            for claim in prediction.false_claims
        ]

In [None]:
false_claim_generator_module = FalsehoodExtractor()

false_claims = await false_claim_generator_module.forward(
    source_text=text,
)

In [None]:
# Display the fabricated (false) claims generated from the document.
false_claims

In [None]:
# Signature for rewriting one claim to be self-contained and labelling it
# against a source text.
# NOTE: DSPy uses a Signature's docstring as the task instructions given to the LM,
# so the docstring and the `desc` strings below are prompt text; editing them
# changes model behavior, not just documentation.
class ClaimRewriterSignature(dspy.Signature):
    """
    Given a source text and an input claim:
    1. Analyze the claim. If it's underspecified (e.g., uses pronouns like 'it', 'this', 'they' without clear antecedents, or refers to concepts vaguely), rewrite it to be self-contained and unambiguous by incorporating necessary context from the source text.
    2. The rewritten claim MUST be clearly and unambiguously either be supported by or contradicted by the source text.
    3. Provide a verdict ('supported' or 'contradicted') for the rewritten claim against the source text.
    """

    # Input: the text the claim is checked against.
    source_text: str = dspy.InputField(
        desc="The source text for context and verification."
    )
    # Input: the single claim to rewrite and verify.
    claim: str = dspy.InputField(
        desc="The input claim to analyze and rewrite."
    )
    # Output: the claim after rewriting.
    rewritten_claim: str = dspy.OutputField(
        desc="The rewritten, self-contained, and unambiguous claim."
    )
    # Output: the verdict as a boolean.
    is_supported: bool = dspy.OutputField(
        # Only two outcomes are modelled (supported / contradicted); there is
        # no third value for "cannot tell".
        desc="Verdict for the rewritten_claim: True if 'supported' and False if 'contradicted' by the source_text."
    )
    # Output: free-text justification.
    # NOTE(review): ClaimRewriter wraps this signature in dspy.ChainOfThought,
    # which also produces a `reasoning` output — confirm how the two interact.
    reasoning: str = dspy.OutputField(
        desc="Brief reasoning for the verdict and any significant rewrites made to achieve clarity and verifiability."
    )

class ClaimRewriter(dspy.Module):
    """Rewrites a single claim to be self-contained and labels it as supported or contradicted by a source text."""

    def __init__(self):
        super().__init__()
        # ChainOfThought is used here rather than a plain dspy.Predict: Predict
        # might suffice if the prompt is strong enough, but CoT is generally
        # more robust for complex reasoning.
        self.rewriter_predictor = dspy.ChainOfThought(ClaimRewriterSignature)

    async def forward(self, claim: str, source_text: str) -> dspy.Prediction:
        """
        Rewrites a claim for clarity and verifies if it's supported or contradicted by the source text.

        Args:
            claim: The claim string to process.
            source_text: The source text to use for context and verification.

        Returns:
            A dspy.Prediction object containing 'rewritten_claim', 'is_supported', and 'reasoning'.
        """
        # dspy.Predict/ChainOfThought are not async by default.
        # We need to use dspy.asyncify for them if we want to await their calls.
        # Each call to this method handles exactly one claim.
        rewriter_predictor_async = dspy.asyncify(self.rewriter_predictor)
        
        # source_text is forwarded to the LM as-is; nothing here chunks or trims it.
        # NOTE(review): the notebook cell that calls this passes the entire
        # document for every claim, so a very long source may strain the LM —
        # consider passing only the relevant chunk.
        prediction = await rewriter_predictor_async(claim=claim, source_text=source_text)
        return prediction


In [None]:
rewriter = ClaimRewriter()
all_claims = extracted_claims + false_claims
rewritten_predictions = await tqdm.gather(*[rewriter.forward(claim=claim, source_text=text) for claim in all_claims])

In [69]:
# Sanity check: after rewriting, every extracted claim should still be judged
# supported and every fabricated claim contradicted. The expected labels follow
# the order of all_claims (extracted first, then false).
assert all(
    [
        expected == prediction.is_supported
        for expected, prediction in zip(
            [True] * len(extracted_claims) + [False] * len(false_claims),
            rewritten_predictions,
        )
    ]
), "Pre and post rewriting is_supported are not consistent"

In [None]:
# Display only the rewritten claim text from each prediction.
# NOTE(review): the output saved with this cell lists whole Prediction objects
# rather than strings, so it appears to come from a different expression —
# re-run the cell to refresh it.
[prediction.rewritten_claim for prediction in rewritten_predictions]

[Prediction(
     reasoning='The claim is a direct statement found in the first sentence of the source text. It is self-contained and unambiguous.',
     rewritten_claim='Blue Ridge Outfitters has been equipping outdoor enthusiasts for their journeys for over half a century.',
     is_supported=True
 ),
 Prediction(
     reasoning='The claim is directly supported by the first sentence of the source text, which states, "For over half a century, Blue Ridge Outfitters has been equipping outdoor enthusiasts for their journeys," and the second sentence, "Founded more than 50 years ago by the avid mountaineer Jedediah \'Jed\' Stone...". The original claim is already self-contained and unambiguous.',
     rewritten_claim='Blue Ridge Outfitters was founded more than 50 years ago.',
     is_supported=True
 ),
 Prediction(
     reasoning='The claim is self-contained and directly verifiable from the source text. The first paragraph of the source text explicitly states that Blue Ridge Outfitters w