# Load Dependencies

In [1]:
import os
import json
import random
from collections import defaultdict
from typing import List, Tuple
from pydantic import BaseModel, computed_field

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph

import pandas as pd
import json

# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()
# NOTE(review): the looked-up value is discarded, so this line neither validates nor
# displays the key. Presumably the OpenAI clients below read OPENAI_API_KEY from the
# environment themselves — confirm.
os.environ.get("OPENAI_API_KEY")

# Benchmark / result locations derived from the dataset name.
# NOTE(review): neither test_file nor result_file is referenced again in the visible
# part of this notebook; the later cells read from ./data/json_output instead.
dataset_name = "contractnli"
test_file = f"../data/benchmarks/{dataset_name}.json"
result_file = f"../data/results/qa_results.json"

# Embedding model handed to the Chroma store below.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Directory passed to Chroma as its persist_directory.
persist_path = "./vectorstore/chroma_openai_embed_3_small"

# Vector store bound to the embedding function and persistence directory above.
vector_store = Chroma(
        embedding_function=embeddings, 
        persist_directory=persist_path
    )
# Alternative chat model kept for reference:
# llm = ChatOpenAI(model="gpt-3.5-turbo-0125")
# NOTE(review): ResponseGeneration (defined later) builds its own ChatOpenAI instance,
# so this module-level llm is not used by the visible generation cells.
llm = ChatOpenAI(model="gpt-4o-mini")

# Generate response from retrieved context

## Get query and context from JSON

In [2]:
import json
import random

# Load the retrieval output (one record per query) produced by the upstream pipeline.
# The encoding is given explicitly because the chunk texts contain non-ASCII characters
# (e.g. curly quotes); without it, decoding depends on the platform's default encoding.
with open("./data/json_output/contractnli.json", encoding="utf-8") as f:
    QnA_data = json.load(f)

In [3]:
import json

# Peek at the first record: its keys, the query text, and the retrieved chunks.
print(QnA_data[0].keys())
print(QnA_data[0]['query'])

print(f"number of chunks: {len(QnA_data[0]['retrieved_chunks_unranked'])}")

# Each chunk dict is printed whole; switch to the commented line to print only its text.
for chunk in QnA_data[0]['retrieved_chunks_unranked']:
    # print(chunk['text'])
    print(chunk)

dict_keys(['query', 'snippets', 'file_set', 'query_rewriter', 'feature_extraction', 'retrieved_chunks_unranked'])
Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document indicate that the Agreement does not grant the Receiving Party any rights to the Confidential Information?
number of chunks: 3
{'chunk_id': 21561, 'filepath': 'contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt', 'span': [7711, 8160], 'text': '4 Definition of Confidential Information\n“Confidential Information” means any Idea disclosed to Mentor, all data and information, know-how, business concepts, software, procedures, products, services, development projects, and programmes contained in such Idea and/or its description and any conclusions. Confidential Information does not include:\n> information already known or independently developed by Mentor prior to the disclosure of any Idea;'}
{'chunk_id': 21539, 'filepath': 'contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt', 'span': [359

In [4]:
# NOTE(review): this prints the entire loaded dataset into the cell output, which bloats
# the saved notebook; consider printing a slice such as QnA_data[:1] instead.
print(QnA_data)

[{'query': 'Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document indicate that the Agreement does not grant the Receiving Party any rights to the Confidential Information?', 'snippets': [{'file_path': 'contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt', 'span': [11461, 11963], 'answer': 'Any and all proprietary rights, including but not limited to rights to and in inventions, patent rights, utility models, copyrights, trademarks and trade secrets, in and to any Confidential Information shall be and remain with the Participants respectively, and Mentor shall not have any right, license, title or interest in or to any Confidential Information, except the limited right to review, assess and help develop such Confidential Information in connection with the Copernicus Accelerator 2017.'}], 'file_set': ['CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt'], 'query_rewriter': [{'best_file_path': 'contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt', 'file_locator': '

In [5]:
# Number of records in the loaded dataset (printed for reference).
QnA_data_len = len(QnA_data)
print(QnA_data_len)

def get_dataset_from_json(index=0, data=None):
    """Return the query and its retrieved contexts for one dataset record.

    Args:
        index: Position of the record to read.
        data: Optional list of records to read from. Each record must have a
            ``"query"`` entry and a ``"retrieved_chunks_unranked"`` list whose items
            carry ``"filepath"``, ``"span"`` and ``"text"``. When omitted, the
            notebook-level ``QnA_data`` is used (the original behaviour).

    Returns:
        A ``(query, contexts)`` tuple where ``contexts`` is a list of dicts with the
        keys ``"file_path"``, ``"span"`` and ``"chunk"``.

    Raises:
        IndexError: If ``index`` is outside the record list.
        KeyError: If a record or chunk lacks one of the expected keys.
    """
    # Fall back to the global only when no explicit dataset is supplied, so existing
    # calls such as get_dataset_from_json(index=3) keep working unchanged.
    records = QnA_data if data is None else data
    record = records[index]

    qna_data_query = record["query"]
    # Re-key each retrieved chunk into the shape the generation cells expect.
    contexts_from_json = [
        {
            "file_path": chunk["filepath"],
            "span": chunk["span"],
            "chunk": chunk["text"],
        }
        for chunk in record["retrieved_chunks_unranked"]
    ]

    return qna_data_query, contexts_from_json

194


## Answer Generation Pipeline

In [6]:
from langchain import hub
from typing_extensions import List, TypedDict
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate

class ResponseGeneration:
    """Single-node LangGraph pipeline that answers a question from retrieved context.

    The compiled graph is exposed as ``self.graph``; invoke it with a dict holding
    ``"question"`` and ``"context"`` and read the model response from ``"answer"``.
    """

    class State(TypedDict):
        # Question text substituted into the prompt.
        question : str
        # Retrieved context items; `generate` accepts plain strings as well as
        # Document-like objects exposing `page_content`.
        context : List[Document]
        # NOTE: `generate` stores the raw response object returned by the LLM here
        # (callers in this notebook read its `.content`), not a plain string.
        answer: str

    def __init__(self, prompt: PromptTemplate, model = "gpt-4o-mini", temperature = 0.2):
        """Create the chat model and compile the one-step generation graph.

        Args:
            prompt: Template invoked with ``question`` and ``context`` values.
            model: Model name passed to ``ChatOpenAI``.
            temperature: Sampling temperature passed to ``ChatOpenAI``.
        """
        self.llm = ChatOpenAI(model=model, temperature=temperature)
        self.prompt = prompt

        graph_builder = StateGraph(self.State)
        graph_builder.add_sequence([self.generate])
        graph_builder.add_edge(START, "generate")
        self.graph = graph_builder.compile()

    def generate(self, state : State):
        """Fill the prompt with the question and joined context, then call the LLM.

        Args:
            state: Mapping with ``"question"`` and ``"context"``; context items may be
                strings or objects with a ``page_content`` attribute.

        Returns:
            ``{"answer": response}`` where ``response`` is whatever ``self.llm.invoke``
            returned.
        """
        # The state schema declares Documents but the notebook passes raw strings;
        # joining Documents directly would raise TypeError, so normalise both to text.
        context_doc_message = "\n\n".join(
            doc if isinstance(doc, str) else doc.page_content
            for doc in state["context"]
        )
        message = self.prompt.invoke({"question": state["question"], "context": context_doc_message})
        response = self.llm.invoke(message)

        return {"answer": response}
    

In [7]:
# Baseline prompt: instructs the model to answer from the retrieved context in at most
# three sentences. The template takes two variables: {question} and {context}.
# NOTE(review): the text begins with the literal word "HUMAN", and because this is an
# indented triple-quoted string the rendered prompt also contains the leading spaces of
# every line plus both the escaped "\n" and the real line breaks. Confirm whether that
# is intended before changing it — any edit alters what the model receives.
baseline_prompt = PromptTemplate.from_template("""HUMAN\n
                                               You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\n
                                               Question: {question}\n 
                                               Context: {context}\n 
                                               Answer:
                                               """)

In [8]:
# Baseline response test: build the generator with the baseline prompt and run it on a
# single record to check the pipeline end to end.
baseline_response_generation = ResponseGeneration(baseline_prompt)

# index=193 is the last record when the dataset holds 194 entries (the length printed
# by the earlier cell). Only the chunk texts are passed to the graph as context.
query, contexts = get_dataset_from_json(index=193)
output = baseline_response_generation.graph.invoke({"question":query, "context":[context["chunk"] for context in contexts]})

In [9]:
# Prints the whole response object stored under "answer" (content plus metadata), not
# just the answer text; use output["answer"].content for the text alone.
print(output["answer"])

content='The Mutual Non-Disclosure Agreement does not explicitly allow the Receiving Party to acquire information similar to the Confidential Information from a third party. The Recipient is required to use the Confidential Information solely for the specified Purpose and cannot make any other use without prior written consent from the Discloser. Therefore, obtaining similar information from a third party would likely violate the terms of the agreement.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 75, 'prompt_tokens': 309, 'total_tokens': 384, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_b8bc95a0ac', 'finish_reason': 'stop', 'logprobs': None} id='run-2042796b-8414-479b-97da-0c377287d20e-0' usage_metadata={'input_tok

In [10]:
# generate responses for all the queries in the dataset

def generate_full_response(response_generator: ResponseGeneration, size = 10, JSON_CoT=False):
    """Run the generator over the first ``size`` dataset entries and tabulate the results.

    Args:
        response_generator: Object whose ``graph.invoke`` is called once per entry with
            ``{"question": query, "context": [chunk texts]}``.
        size: Number of entries to process (indices ``0 .. size - 1``).
        JSON_CoT: When true, the model output is parsed as JSON and its ``"answer"``
            field becomes the response; otherwise the raw message content is used.

    Returns:
        A DataFrame with the columns ``user_input``, ``response`` and
        ``retrieved_contexts``, one row per processed entry. With ``size == 0`` the
        frame is empty but still has those columns.

    Raises:
        json.JSONDecodeError: If ``JSON_CoT`` is true and a response is not valid JSON.
        KeyError: If ``JSON_CoT`` is true and the parsed JSON has no ``"answer"`` key.
    """
    columns = ["user_input", "response", "retrieved_contexts"]
    qna_context_list = []

    # TODO: wrap this range in tqdm to show progress on long runs.
    for i in range(size):
        query, contexts = get_dataset_from_json(index=i)
        retrieved_contexts = [context["chunk"] for context in contexts]
        # Pass a copy so the stored row never aliases the list handed to the graph.
        output = response_generator.graph.invoke(
            {"question": query, "context": list(retrieved_contexts)}
        )

        content = output["answer"].content
        response = json.loads(content)["answer"] if JSON_CoT else content

        qna_context_list.append([query, response, retrieved_contexts])

    # Build the frame once, after the loop: rebuilding it on every iteration was wasted
    # work and left the result undefined when the loop body never ran (size == 0).
    return pd.DataFrame(qna_context_list, columns=columns)

In [11]:
# Generate baseline answers for every record. The count comes from the loaded dataset
# rather than a hard-coded 194, so the call stays correct if the input file changes.
query_answer_baseline = generate_full_response(baseline_response_generation, len(QnA_data))


In [12]:
# Sanity check: show the column names of the generated results frame.
display(query_answer_baseline.columns)

Index(['user_input', 'response', 'retrieved_contexts'], dtype='object')

In [14]:
# Persist the baseline results as a JSON list of records (one object per row, indent 4).
query_answer_baseline.to_json("./data/json_output/contractNLI_RG_baseline.json", orient="records", indent=4)

In [15]:
# Row count of the results frame; it should equal the number of processed records.
print(len(query_answer_baseline))

194
