In [23]:
# Universal parameters
_use_sample = True
dataset_name = "cuad"
vectorstore_path = f"./vectorstore/faiss_store_{'sample_' if _use_sample else ''}{dataset_name}"
directory_path = f"../data/{'sample_' if _use_sample else ''}corpus/{dataset_name}"
test_file = f"../data/{'sample_' if _use_sample else ''}benchmarks/{dataset_name}.json"
rephrased_file = f"../data/{'sample_' if _use_sample else ''}benchmarks/{dataset_name}_rephrased.json"

print(f"use sample: {_use_sample}")
print(f"dataset: {dataset_name}")
print(f"vector store: {vectorstore_path}")
print(f"corpus path: {directory_path}")
print(f"test file: {test_file}")

use sample: True
dataset: cuad
vector store: ./vectorstore/faiss_store_sample_cuad
corpus path: ../data/sample_corpus/cuad
test file: ../data/sample_benchmarks/cuad.json


In [13]:
import json
from typing import List, Tuple
from pydantic import BaseModel

class QASnippet(BaseModel):
    file_path: str
    span: Tuple[int, int]
    answer: str

class QAGroundTruth(BaseModel):
    query: str
    snippets: List[QASnippet]

def load_groundtruth(json_file_path: str) -> List[QAGroundTruth]:
    """
    Loads the QA ground-truth data from a JSON file.
    Expected JSON format:
    {
        "tests": [
            {
                "query": "Your query...",
                "snippets": [
                    {
                        "file_path": "path/to/file.txt",
                        "span": [start, end],
                        "answer": "The answer text..."
                    },
                    ...
                ]
            },
            ...
        ]
    }
    """
    with open(json_file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    groundtruth_tests = []

    try:
        tests = data.get("tests")
    except Exception as e:
        tests = data
    
    for test in tests:
        snippets = [QASnippet(**snippet) for snippet in test["snippets"]]
        groundtruth_tests.append(QAGroundTruth(query=test["query"], snippets=snippets))
    return groundtruth_tests

# Build VectorStore

In [9]:
import os
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2") 

# sentence-transformers/all-MiniLM-L6-v2
# Linq-AI-Research/Linq-Embed-Mistral
# thenlper/gte-base

def load_documents_with_spans(directory: str, chunk_size: int = 1000, chunk_overlap: int = 0):
    """
    Loads .txt files from a directory, splits each document's text into chunks using
    RecursiveCharacterTextSplitter, computes the span (start, end) for each chunk, and
    returns a list of Document objects with metadata (including filename, source, and span).
    """
    documents = []
    # Initialize the splitter with the desired separators and parameters.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", "!", "?", ".", ":", ";", ",", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
        strip_whitespace=False,
    )
    
    # Process each .txt file in the directory.
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r", encoding="utf-8") as f:
                text = f.read()
            
            # Split text into chunks.
            text_splits = splitter.split_text(text)
            
            # Verify that the chunks concatenate to the original text.
            assert "".join(text_splits) == text, "Concatenated splits do not match the original text."
            
            # Compute spans and create Document objects.
            prev_index = 0
            for i, chunk_text in enumerate(text_splits):
                span = (prev_index, prev_index + len(chunk_text))
                prev_index += len(chunk_text)
                doc = Document(
                    page_content=chunk_text,
                    metadata={
                        "filename": filename,
                        "filepath": f"{dataset_name}/{filename}",
                        "span": span,  # Stores the (start, end) positions of the chunk.
                        "id": f"{filename}_chunk_{i}"
                    }
                )
                documents.append(doc)
    return documents


if os.path.exists(vectorstore_path):
    print(f"The vectorstore_path '{vectorstore_path}' already exists. Please delete it first if you wish to continue.")
else:
    # Load the documents, splitting each into chunks with span metadata.
    documents = load_documents_with_spans(directory_path, chunk_size=500, chunk_overlap=0)
    print(f"Loaded {len(documents)} document chunks with spans.")

    # Build the FAISS vector store using the list of Document objects.
    vectorstore = FAISS.from_documents(documents, embeddings)

    # Save the FAISS vector store locally for later retrieval.
    vectorstore.save_local(vectorstore_path)
    print(f"FAISS vector store saved locally at '{vectorstore_path}'.")

Loaded 461 document chunks with spans.
FAISS vector store saved locally at './vectorstore/faiss_store_sample_privacy_qa'.


In [None]:
# to check 5 first samples
loaded_vectorstore = FAISS.load_local(vectorstore_path, embeddings, allow_dangerous_deserialization=True)
docstore_ids = dict(loaded_vectorstore.index_to_docstore_id)
loaded_vectorstore.get_by_ids(list(docstore_ids.values())[:5])

In [None]:
## TO DELETE A VECTOR STORE, RUN THIS CELL ##

# import shutil
# import os

# # Check if the directory exists
# if os.path.exists(vectorstore_path):
#     shutil.rmtree(vectorstore_path)
#     print(f"Deleted the FAISS vector store at: {vectorstore_path}")
# else:
#     print(f"No FAISS vector store found at: {vectorstore_path}")

# Query Rewriter (Simple Extractor)

In [24]:
import os
import re
import difflib
from collections import Counter
from rapidfuzz import fuzz
from typing import List, Tuple, Callable
from sentence_transformers import SentenceTransformer, util
import nltk
from tqdm import tqdm
import random
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)

def extract_tgt_corpus(query: str) -> str:
    """
    Extracts the target corpus description from the query and preprocesses it
    by removing the term "Non-Disclosure Agreement" (case-insensitive) and common English stopwords.
    
    For example:
      "Consider the Non-Disclosure Agreement between Artop and Inno; Does the document permit..."
    returns (after preprocessing):
      "between Artop Inno"
    """
    # Extract text between "Consider the " and the first semicolon
    pattern = r"^Consider (.*?);"
    match = re.match(pattern, query)
    if not match:
        return ""
    
    tgt = match.group(1).strip()
    
    # Remove the term "Non-Disclosure Agreement" (case-insensitive)
    tgt = re.sub(r"(?i)Non-Disclosure Agreement", "", tgt).strip()
    
    # Load common English stopwords from NLTK
    stop_words = set(stopwords.words("english"))
    
    # Tokenize the text (here using simple whitespace splitting)
    tokens = tgt.split()
    
    # Filter out stopwords
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # Join tokens back into a string
    processed_tgt = " ".join(filtered_tokens)
    return processed_tgt

def find_best_corpus(tgt_corpus: str, corpus_files: List[str]) -> Tuple[str, float]:
    """
    Given a target corpus description and a list of corpus file names,
    returns the file name with the highest similarity score and that score.
    """
    best_match = None
    best_ratio = 0.0
    for file in corpus_files:
        ratio = difflib.SequenceMatcher(None, tgt_corpus.lower(), file.lower()).ratio()
        if ratio > best_ratio:
            best_ratio = ratio
            best_match = file
    return best_match, best_ratio

def find_best_corpus_rapid(tgt_corpus: str, corpus_files: List[str]) -> Tuple[str, float]:
    """
    Uses RapidFuzz's token_set_ratio to compute a similarity score between the target corpus and each file name.
    Returns the best matching file and its score (normalized between 0 and 1).
    """
    best_match = None
    best_score = 0.0
    for file in corpus_files:
        # token_set_ratio handles unordered tokens and common token removal well.
        score = fuzz.token_set_ratio(tgt_corpus, file)
        if score > best_score:
            best_score = score
            best_match = file
    # Normalize the score to [0, 1] (RapidFuzz returns a value in [0,100])
    return best_match, best_score / 100.0

def find_best_corpus_embeddings(tgt_corpus: str, corpus_files: List[str],
                                model: SentenceTransformer) -> Tuple[str, float]:
    """
    Embeds the target corpus description and each file name using a sentence transformer,
    then computes cosine similarities to find the best matching file.
    """
    # Embed the target description.
    tgt_embedding = model.encode(tgt_corpus, convert_to_tensor=True)
    # Embed all candidate file names.
    file_embeddings = model.encode(corpus_files, convert_to_tensor=True)
    # Compute cosine similarities.
    cosine_scores = util.cos_sim(tgt_embedding, file_embeddings)[0]
    # Get the index of the best matching file.
    best_idx = int(cosine_scores.argmax())
    best_score = float(cosine_scores[best_idx])
    return corpus_files[best_idx], best_score


def evaluate_corpus_matching(ground_truths: List[QAGroundTruth],
                             candidate_files: List[str],
                             threshold: float,
                             match_fn: Callable[[str, List[str]], Tuple[str, float]]
                             ) -> List[int]:
    """
    For each QAGroundTruth:
      - Extract the target corpus from the query.
      - Find the best matching file using the provided match_fn and its similarity score.
      - If the similarity score is below the threshold, assign a score of 0.
      - If above the threshold:
          * Assign 1 if the best matching file is among the actual file paths.
          * Assign -1 if it does not match.
    Returns a list of scores.
    """
    outputs = []
    for gt in tqdm(ground_truths, desc="Evaluating queries"):
        tgt_corpus = extract_tgt_corpus(gt.query)
        best_file, similarity = match_fn(tgt_corpus, candidate_files)
        # Get the set of actual file paths from the ground truth snippets.
        actual_files = {snippet.file_path for snippet in gt.snippets}
        
        if similarity >= threshold:
            score = 1 if best_file in actual_files else -1
        else:
            score = 0

        result = {
            "query": gt.query,
            "targeted_corpus": tgt_corpus,
            "best_file": best_file,
            "similarity": similarity,
            "actual_files": list(actual_files),
            "score": score
        }
        outputs.append(result)
    return outputs


In [25]:
groundtruth_tests = load_groundtruth(test_file) # test_file, rephrased_file
test_queries = [gt.query for gt in groundtruth_tests]
list_corpus = [os.path.join(f"{dataset_name}", filename) for filename in os.listdir(directory_path) if filename.endswith(".txt")]

In [26]:
threshold = 0.3
model = SentenceTransformer("all-MiniLM-L6-v2")
match_fn_embeddings = lambda tgt, files: find_best_corpus_embeddings(tgt, files, model)

results = evaluate_corpus_matching(groundtruth_tests, list_corpus, threshold, match_fn_embeddings)

for sample in random.sample(results, 5):
    print(json.dumps(sample, indent=2))

scores = [results[i]["score"] for i in range(len(results))]
counts = Counter(scores)
print("dataset: ", dataset_name)
print("Final Score: ", counts)

Evaluating queries: 100%|██████████| 194/194 [00:07<00:00, 26.72it/s]

{
  "query": "Consider the Product Development and Co-Branding Agreement between Integrity Incorporated and Time Life, Inc. for 'Songs 4 Worship' Series; Does this contract include an exclusivity agreement?",
  "targeted_corpus": "Product Development Co-Branding Agreement Integrity Incorporated Time Life, Inc. 'Songs 4 Worship' Series",
  "best_file": "cuad/IntegrityMediaInc_20010329_10-K405_EX-10.17_2373875_EX-10.17_Co-Branding Agreement.txt",
  "similarity": 0.45198339223861694,
  "actual_files": [
    "cuad/IntegrityMediaInc_20010329_10-K405_EX-10.17_2373875_EX-10.17_Co-Branding Agreement.txt"
  ],
  "score": 1
}
{
  "query": "Consider the Co-Branding and Services Agreement between RSL COM PrimeCall, Inc. and deltathree.com, Inc.; Is there an anti-assignment clause in this contract?",
  "targeted_corpus": "Co-Branding Services Agreement RSL COM PrimeCall, Inc. deltathree.com, Inc.",
  "best_file": "cuad/DeltathreeInc_19991102_S-1A_EX-10.19_6227850_EX-10.19_Co-Branding Agreement_ Ser




# Query Rewriter (Small Language Model)

In [9]:
import os
import re
import difflib
from collections import Counter
from rapidfuzz import fuzz
from typing import List, Tuple, Callable
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import nltk
from tqdm import tqdm
import random
from nltk.corpus import stopwords
nltk.download('stopwords', quiet=True)

def split_question(query: str, model) -> Tuple[str, str]:
    """
    Splits a query into two parts using a language model with few-shot prompt engineering.
    
    The function identifies:
      - targeted_corpus: a concise phrase that identifies the relevant document or agreement (e.g., "Evelozcity's Non-Disclosure Agreement" or "EFCA's Non-Disclosure Agreement").
      - original_question: the actual question about that document.
    
    The prompt provides examples for both semicolon-separated queries and naturally phrased queries.
    Output is expected as a JSON object with keys 'targeted_corpus' and 'original_question'.
    """
    prompt = (
        "Split the following query into two parts and output a JSON object with keys 'targeted_corpus' and 'original_question'.\n"
        "The targeted_corpus should be a short phrase describing the document or agreement being referenced, and the original_question should be the question part.\n\n"
        "Example 1 (semicolon-delimited):\n"
        "Input: \"Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document indicate that the Agreement does not grant any rights to the Confidential Information?\"\n"
        "Output: {\"targeted_corpus\": \"CopAcc and ToP Mentors\", \"original_question\": \"Does the document indicate that the Agreement does not grant any rights to the Confidential Information?\"}\n\n"
        "Example 2 (natural language):\n"
        "Input: \"Is the Confidential Information covered in Evelozcity's Non-Disclosure Agreement? Are there any specific examples of technical information that is covered?\"\n"
        "Output: {\"targeted_corpus\": \"Evelozcity\", \"original_question\": \"Does the document state that Confidential Information shall only include technical information?\"}\n\n"
        "Example 3 (another natural language example):\n"
        "Input: \"Does the Data Use Agreement in New York City specify if the Receiving Party must return or destroy Confidential Information upon termination?\"\n"
        "Output: {\"targeted_corpus\": \"Data Use Agreement in New York City\", \"original_question\": \"Does the Data Use Agreement in New York City specify if the Receiving Party must return or destroy Confidential Information upon termination?\"}\n\n"
        "Now, split the following query:\n"
        f"Input: \"{query}\"\n\n"
        "Output:"
    )
    
    # Increase max_new_tokens to allow a longer answer and use sampling.
    output = model(prompt, max_new_tokens=250, do_sample=True, temperature=0.5)
    generated_text = output[0]['generated_text'].strip()
    
    # Debug print (optional):
    # print("Raw generated text:", generated_text)
    
    # If the generated text does not start with a curly brace, add them.
    if not generated_text.startswith("{"):
        # Try to extract the JSON-like part using regex (optional improvement).
        json_like = re.search(r"\{.*\}", generated_text, re.DOTALL)
        if json_like:
            generated_text = json_like.group(0)
        else:
            generated_text = "{" + generated_text + "}"
    
    try:
        result = json.loads(generated_text)
        targeted_corpus = result.get("targeted_corpus", "").strip()
        original_question = result.get("original_question", "").strip()
    except Exception as e:
        # Fallback: if JSON parsing fails, fall back to a heuristic split on the semicolon.
        parts = query.split(";", 1)
        targeted_corpus = parts[0].replace("Consider", "").strip() if parts else ""
        original_question = parts[1].strip() if len(parts) > 1 else ""
    
    return targeted_corpus, original_question

def find_best_corpus(tgt_corpus: str, corpus_files: List[str]) -> Tuple[str, float]:
    """
    Given a target corpus description and a list of corpus file names,
    returns the file name with the highest similarity score and that score.
    """
    best_match = None
    best_ratio = 0.0
    for file in corpus_files:
        ratio = difflib.SequenceMatcher(None, tgt_corpus.lower(), file.lower()).ratio()
        if ratio > best_ratio:
            best_ratio = ratio
            best_match = file
    return best_match, best_ratio

def find_best_corpus_rapid(tgt_corpus: str, corpus_files: List[str]) -> Tuple[str, float]:
    """
    Uses RapidFuzz's token_set_ratio to compute a similarity score between the target corpus and each file name.
    Returns the best matching file and its score (normalized between 0 and 1).
    """
    best_match = None
    best_score = 0.0
    for file in corpus_files:
        # token_set_ratio handles unordered tokens and common token removal well.
        score = fuzz.token_set_ratio(tgt_corpus, file)
        if score > best_score:
            best_score = score
            best_match = file
    # Normalize the score to [0, 1] (RapidFuzz returns a value in [0,100])
    return best_match, best_score / 100.0

def find_best_corpus_embeddings(tgt_corpus: str, corpus_files: List[str],
                                model: SentenceTransformer) -> Tuple[str, float]:
    """
    Embeds the target corpus description and each file name using a sentence transformer,
    then computes cosine similarities to find the best matching file.
    """
    # Embed the target description.
    tgt_embedding = model.encode(tgt_corpus, convert_to_tensor=True)
    # Embed all candidate file names.
    file_embeddings = model.encode(corpus_files, convert_to_tensor=True)
    # Compute cosine similarities.
    cosine_scores = util.cos_sim(tgt_embedding, file_embeddings)[0]
    # Get the index of the best matching file.
    best_idx = int(cosine_scores.argmax())
    best_score = float(cosine_scores[best_idx])
    return corpus_files[best_idx], best_score


def evaluate_corpus_matching(ground_truths: List[QAGroundTruth],
                             candidate_files: List[str],
                             threshold: float,
                             match_fn: Callable[[str, List[str]], Tuple[str, float]]
                             ) -> List[int]:
    """
    For each QAGroundTruth:
      - Extract the target corpus from the query.
      - Find the best matching file using the provided match_fn and its similarity score.
      - If the similarity score is below the threshold, assign a score of 0.
      - If above the threshold:
          * Assign 1 if the best matching file is among the actual file paths.
          * Assign -1 if it does not match.
    Returns a list of scores.
    """
    outputs = []
    model = pipeline("text2text-generation", model="google/flan-t5-large")
    for gt in tqdm(ground_truths, desc="Evaluating queries"):
        tgt_corpus, orig = split_question(gt.query, model)
        best_file, similarity = match_fn(tgt_corpus, candidate_files)
        # Get the set of actual file paths from the ground truth snippets.
        actual_files = {snippet.file_path for snippet in gt.snippets}
        
        if similarity >= threshold:
            score = 1 if best_file in actual_files else -1
        else:
            score = 0

        result = {
            "query": gt.query,
            "targeted_corpus": tgt_corpus,
            "best_file": best_file,
            "similarity": similarity,
            "actual_files": list(actual_files),
            "score": score
        }
        outputs.append(result)
    return outputs

In [10]:
groundtruth_tests = load_groundtruth(rephrased_file) # test_file, rephrased_file
test_queries = [gt.query for gt in groundtruth_tests]
list_corpus = [os.path.join(f"{dataset_name}", filename) for filename in os.listdir(directory_path) if filename.endswith(".txt")]

In [11]:
threshold = 0.3
model = SentenceTransformer("all-MiniLM-L6-v2")
match_fn_embeddings = lambda tgt, files: find_best_corpus_embeddings(tgt, files, model)

results = evaluate_corpus_matching(groundtruth_tests, list_corpus, threshold, match_fn_embeddings)

for sample in random.sample(results, 5):
    print(json.dumps(sample, indent=2))

scores = [results[i]["score"] for i in range(len(results))]
counts = Counter(scores)
print("dataset: ", dataset_name)
print("Final Score: ", counts)

Device set to use mps:0
Evaluating queries: 100%|██████████| 194/194 [26:38<00:00,  8.24s/it]

{
  "query": "Consider EFCA's Non-Disclosure Agreement; Does the document allow the Receiving Party to independently develop information that is similar to the Confidential Information?",
  "targeted_corpus": "EFCA",
  "best_file": "contractnli/EFCAConfidentialityAgreement.txt",
  "similarity": 0.4112617075443268,
  "actual_files": [
    "contractnli/EFCAConfidentialityAgreement.txt"
  ],
  "score": 1
}
{
  "query": "\"In the Non-Disclosure Agreement between CopAcc and ToP Mentors, does it explicitly state that the Receiving Party is not granted any rights to the Confidential Information?\"",
  "targeted_corpus": "\"In the Non-Disclosure Agreement between CopAcc and ToP Mentors, does it explicitly state that the Receiving Party is not granted any rights to the Confidential Information?\"",
  "best_file": "contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt",
  "similarity": 0.5668238401412964,
  "actual_files": [
    "contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt"
  ],
  "score":




# Rephrase Question

In [50]:
import json
import random
from tqdm import tqdm
from typing import List
from transformers import pipeline

def rephrase_question(question: str, model) -> str:
    """
    Rephrases a question into a more natural, real-world style while preserving two distinct parts:
      - A part providing details about the relevant document or agreement (targeted corpus).
      - The actual query regarding that document.
      
    The function uses prompt engineering with a few-shot approach. The output is expected to be exactly the rephrased question.
    
    Few-shot examples:
    Example 1:
      Original: "In the Non-Disclosure Agreement between CopAcc and ToP Mentors, does it explicitly state that the Receiving Party is not granted any rights to the Confidential Information?"
      Rephrased: "In the Non-Disclosure Agreement between CopAcc and ToP Mentors, does it explicitly state that the Receiving Party is not granted any rights to the Confidential Information?"
      
    Example 2:
      Original: "Consider EFCA's Non-Disclosure Agreement; Does the document mention that some obligations of the Agreement may survive the termination of the Agreement?"
      Rephrased: "Does EFCA's Non-Disclosure Agreement mention whether certain obligations continue even after the Agreement is terminated?"
      
    Example 3:
      Original: "Consider the Data Use Agreement in New York City; Does the document specify whether the Receiving Party is required to destroy or return Confidential Information upon the termination of the Agreement?"
      Rephrased: "In the Data Use Agreement for New York City, does the document specify if the Receiving Party must destroy or return Confidential Information once the Agreement ends?"
    
    Now rephrase the following question:
      Original: "{question}"
    
    Output exactly as:
      Rephrased: "<your rephrased question here>"
    """
    prompt = (
        "Example 1:\n"
        "Original: \"Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document indicate that the Agreement does not grant the Receiving Party any rights to the Confidential Information?\"\n"
        "Rephrased: \"In the Non-Disclosure Agreement between CopAcc and ToP Mentors, does it explicitly state that the Receiving Party is not granted any rights to the Confidential Information?\"\n\n"
        "Example 2:\n"
        "Original: \"Consider EFCA's Non-Disclosure Agreement; Does the document mention that some obligations of the Agreement may survive the termination of the Agreement?\"\n"
        "Rephrased: \"Does EFCA's Non-Disclosure Agreement mention whether certain obligations continue even after the Agreement is terminated?\"\n\n"
        "Example 3:\n"
        "Original: \"Consider the Data Use Agreement in New York City; Does the document specify whether the Receiving Party is required to destroy or return Confidential Information upon the termination of the Agreement?\"\n"
        "Rephrased: \"In the Data Use Agreement for New York City, does the document specify if the Receiving Party must destroy or return Confidential Information once the Agreement ends?\"\n\n"
        "Now, rephrase the following question in a natural, conversational style while preserving the two parts (the document details and the query):\n"
        f"Original: \"{question}\"\n\n"
        "Rephrased: "
    )
    
    # Use sampling to allow creative rephrasing.
    output = model(prompt, max_length=150, do_sample=True, temperature=0.8)
    generated_text = output[0]['generated_text']
    
    # Attempt to parse the output if it follows our exact format
    # Here we assume the model's output starts with "Rephrased:" and then the text.
    if generated_text.strip().lower().startswith("rephrased:"):
        rephrased = generated_text.strip()[len("Rephrased:"):].strip()
    else:
        rephrased = generated_text.strip()
    
    return rephrased

def rephrase_groundtruth_queries(groundtruths: List[QAGroundTruth],
                                 model,
                                 percentage: float = 0.5) -> List[QAGroundTruth]:
    """
    Rephrases a given percentage of queries in the ground truth.
    
    Args:
      groundtruths: List of QAGroundTruth objects.
      model: A text-to-text generation pipeline.
      percentage: Fraction of queries to rephrase (e.g., 0.5 means 50%).
    
    Returns:
      The updated list of QAGroundTruth objects.
    """
    num_to_rephrase = int(len(groundtruths) * percentage)
    indices = random.sample(range(len(groundtruths)), num_to_rephrase)
    
    for idx in tqdm(indices, desc="Rephrasing queries"):
        original_query = groundtruths[idx].query
        new_query = rephrase_question(original_query, model)
        groundtruths[idx].query = new_query
    return groundtruths

def save_groundtruth(groundtruths: List[QAGroundTruth], output_path: str):
    """
    Saves the list of QAGroundTruth objects to a JSON file in the expected format.
    """
    tests = [gt.dict() for gt in groundtruths]
    output_data = {"tests": tests}
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_data, f, indent=2)
    print(f"Saved rephrased groundtruth data to {output_path}")


In [51]:
# Load the ground truth test data from a JSON file.
groundtruths = load_groundtruth(test_file)

# Load a text-to-text generation pipeline using an open-source model.
model = pipeline("text2text-generation", model="google/flan-t5-large")

# Rephrase a specified percentage (e.g., 50%) of the queries.
rephrased_groundtruths = rephrase_groundtruth_queries(groundtruths, model, percentage=0.5)

# Save the updated groundtruth test data.
rephrased_file = f"../data/{'sample_' if _use_sample else ''}benchmarks/{dataset_name}_rephrased.json"
save_groundtruth(rephrased_groundtruths, rephrased_file)

Device set to use mps:0
Rephrasing queries: 100%|██████████| 97/97 [09:53<00:00,  6.12s/it]

Saved rephrased groundtruth data to ../data/sample_benchmarks/contractnli_rephrased.json



/var/folders/hk/j9r7jggx4dxgt8gmzj_c2z080000gn/T/ipykernel_2252/3494889406.py:89: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.10/migration/
  tests = [gt.dict() for gt in groundtruths]


In [52]:
groundtruths_rephrased = load_groundtruth(rephrased_file)

len(groundtruths_rephrased)

194

In [55]:
groundtruths_rephrased

[QAGroundTruth(query='Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document indicate that the Agreement does not grant the Receiving Party any rights to the Confidential Information?', snippets=[QASnippet(file_path='contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt', span=(11461, 11963), answer='Any and all proprietary rights, including but not limited to rights to and in inventions, patent rights, utility models, copyrights, trademarks and trade secrets, in and to any Confidential Information shall be and remain with the Participants respectively, and Mentor shall not have any right, license, title or interest in or to any Confidential Information, except the limited right to review, assess and help develop such Confidential Information in connection with the Copernicus Accelerator 2017.')]),
 QAGroundTruth(query='"In the Non-Disclosure Agreement between CopAcc and ToP Mentors, does it explicitly state that the Receiving Party is not granted any rig