# Load Dependencies

In [1]:
import os
import json
import random
from collections import defaultdict
from typing import List, Tuple
from pydantic import BaseModel, computed_field

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with a clear message instead of reading the key and discarding it:
# the OpenAI embedding and chat clients created below need OPENAI_API_KEY.
if not os.environ.get("OPENAI_API_KEY"):
    raise EnvironmentError(
        "OPENAI_API_KEY is not set; add it to your .env file or export it "
        "before running this notebook."
    )

# Benchmark input and results output locations (relative to the notebook).
dataset_name = "contractnli"
test_file = f"../data/benchmarks/{dataset_name}.json"
result_file = "../data/results/qa_results.json"

# Embedding model used by the persisted Chroma vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

persist_path = "./vectorstore/chroma_openai_embed_3_small"

vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_path,
)
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

# RAGAS Eval (Faithfulness + Answer Relevance) using JSON

## Load JSON

In [2]:
import json
import random

# Read the benchmark from `test_file` (set in the setup cell) rather than
# repeating the path here. Decode explicitly as UTF-8: the snippets contain
# non-ASCII punctuation and the platform default encoding may differ.
with open(test_file, encoding="utf-8") as f:
    QnA_data = json.load(f)

## Get datasets from JSON

In [3]:
# Number of test cases in the loaded benchmark (length of QnA_data["tests"]).
QnA_data_len = len(QnA_data["tests"])

def get_dataset_from_json_random():
    """Return the query and context snippets of a randomly chosen test case.

    Picks an index uniformly from ``0 .. QnA_data_len - 1``, prints it so the
    same sample can be fetched again by index, and delegates to
    ``get_dataset_from_json`` so both accessors build the context records in
    exactly the same way.

    Returns:
        tuple: ``(query, contexts)`` as returned by ``get_dataset_from_json``
        for the chosen index.
    """
    random_index = random.randint(0, QnA_data_len - 1)
    print(f"random index {random_index}")

    return get_dataset_from_json(index=random_index)

def get_dataset_from_json(index=0):
    """Fetch one benchmark test case by position.

    Args:
        index: Position of the test case inside ``QnA_data["tests"]``.

    Returns:
        tuple: ``(query, contexts)`` where ``query`` is the test's question and
        ``contexts`` is a list of dicts with the keys ``file_path``, ``span``
        and ``chunk`` (the snippet's ``answer`` text), in snippet order.
    """
    test_case = QnA_data["tests"][index]
    question = test_case["query"]

    # Keep only the three fields used downstream, renaming "answer" to "chunk".
    snippet_records = [
        {
            "file_path": snippet["file_path"],
            "span": snippet["span"],
            "chunk": snippet["answer"],
        }
        for snippet in test_case["snippets"]
    ]

    return question, snippet_records


# Example: fetch a single test case by position. As the cell's last expression,
# the returned (query, contexts) tuple is shown as the cell output.
get_dataset_from_json(index=15)

("Consider DBT's Mutual Non-Disclosure Agreement; Does the document require the Receiving Party to notify the Disclosing Party if they are required by law, regulation, or judicial process to disclose any Confidential Information?",
 [{'file_path': 'contractnli/DBT%20Mutual%20NDA.txt',
   'span': [1910, 1975],
   'chunk': 'Confidential Information shall not include any information that: '},
  {'file_path': 'contractnli/DBT%20Mutual%20NDA.txt',
   'span': [2493, 2707],
   'chunk': '(v) the Confidential Information is required to be disclosed pursuant to a requirement of a governmental agency or law so long as the other party is provided notice of such requirement prior to any such disclosure.'}])

## Dummy RAG Pipeline

In [9]:
from langchain import hub
from typing_extensions import List, TypedDict
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate

# Baseline QA prompt with two template variables, {question} and {context}.
# The template text (including its whitespace) is sent to the model as-is.
baseline_prompt = PromptTemplate.from_template("""HUMAN\n
                                               You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\n
                                               Question: {question}\n 
                                               Context: {context}\n 
                                               Answer:
                                               """)

class ResponseGeneration:
    """One-node LangGraph pipeline that answers a question from given context.

    The compiled graph is exposed as ``self.graph``. Invoke it with a dict
    holding ``question`` and ``context`` and read the model reply from the
    ``answer`` key of the result.
    """

    class State(TypedDict):
        question : str
        # Items may be Document objects or plain strings; `generate` handles both.
        context : List[Document]
        # Holds the message object returned by the chat model (not a bare
        # string); callers in this notebook read its `.content`.
        answer: str

    def __init__(self, prompt: PromptTemplate, model = "gpt-4o-mini", temperature = 0.2, top_p = 0.6):
        """Create the chat model and compile the single-step graph.

        Args:
            prompt: Template that is invoked with ``question`` and ``context``.
            model: Chat model name passed to ``ChatOpenAI``.
            temperature: Sampling temperature passed to ``ChatOpenAI``.
            top_p: Nucleus-sampling value passed to ``ChatOpenAI``.
        """
        self.llm = ChatOpenAI(model=model, temperature=temperature, top_p=top_p)
        self.prompt = prompt

        graph_builder = StateGraph(self.State)
        graph_builder.add_sequence([self.generate])
        graph_builder.add_edge(START, "generate")
        self.graph = graph_builder.compile()

    def generate(self, state : State):
        """Fill the prompt with the question and joined context, then call the model.

        Returns:
            dict: ``{"answer": <model response>}`` to be merged into the state.
        """
        # Accept both Document objects (use their page_content) and plain
        # strings; joining Documents directly would raise a TypeError.
        context_texts = [getattr(doc, "page_content", doc) for doc in state["context"]]
        context_doc_message = "\n\n".join(context_texts)
        message = self.prompt.invoke({"question": state["question"], "context": context_doc_message})
        response = self.llm.invoke(message)

        return {"answer": response}
    

In [10]:
# Baseline generator: baseline prompt with the class's default model settings.
baseline_response_generation = ResponseGeneration(baseline_prompt)

# Run the pipeline once on a single benchmark question.
query, contexts = get_dataset_from_json(index=33)
baseline_inputs = {
    "question": query,
    "context": [snippet["chunk"] for snippet in contexts],
}
output = baseline_response_generation.graph.invoke(baseline_inputs)

In [11]:
# Prints the whole response object (text plus metadata); use
# output["answer"].content to get only the answer text.
print(output["answer"])

content='Yes, the Non-Disclosure Agreement allows the Receiving Party to acquire information similar to the Confidential Information from a third party, provided that the information is lawfully received free of restriction from a source that has the right to furnish it. This is outlined in the context where it states that restrictions do not apply to information independently developed or lawfully received from another source.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 73, 'prompt_tokens': 290, 'total_tokens': 363, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_06737a9306', 'finish_reason': 'stop', 'logprobs': None} id='run-775a7269-8f62-41f4-8a45-14946f063b52-0' usage_metadata={'input_tokens': 290, 'output_tokens'

# Build QnA Baseline for RAGAS

In [13]:
import pandas as pd
import json

def build_dataset(response_generator: "ResponseGeneration", size = 10, JSON_CoT=False):
    """Generate answers for the first ``size`` benchmark questions.

    Args:
        response_generator: Pipeline whose ``graph.invoke`` accepts a dict with
            ``question`` and ``context`` and returns a dict whose ``"answer"``
            exposes the reply text as ``.content``.
        size: Number of test cases to run, taken in order starting at index 0.
        JSON_CoT: When true, the reply text is parsed as JSON and only its
            ``"answer"`` field is kept as the response.

    Returns:
        pandas.DataFrame: Columns ``user_input``, ``response`` and
        ``retrieved_contexts``, one row per test case. Empty (but with the
        same columns) when ``size`` is 0.
    """
    columns = ["user_input", "response", "retrieved_contexts"]
    rows = []

    # TODO: wrap this loop in tqdm for a progress bar.
    for i in range(size):
        query, contexts = get_dataset_from_json(index=i)
        retrieved_contexts = [context["chunk"] for context in contexts]
        output = response_generator.graph.invoke({"question": query, "context": retrieved_contexts})

        reply_text = output["answer"].content
        response = json.loads(reply_text)["answer"] if JSON_CoT else reply_text

        rows.append([query, response, retrieved_contexts])

    # Build the frame once after the loop: avoids re-creating it on every
    # iteration and guarantees a defined result when the loop never runs.
    return pd.DataFrame(rows, columns=columns)

In [14]:
# Generate baseline answers for the first 50 benchmark questions.
query_answer_baseline = build_dataset(baseline_response_generation, 50)

In [15]:
# Show the generated baseline question/answer/context table.
display(query_answer_baseline)

Unnamed: 0,user_input,response,retrieved_contexts
0,Consider the Non-Disclosure Agreement between ...,"Yes, the document indicates that the Agreement...","[Any and all proprietary rights, including but..."
1,Consider the Non-Disclosure Agreement between ...,"No, the document does not state that Confident...",[“Confidential Information” means any Idea dis...
2,Consider the Non-Disclosure Agreement between ...,The context does not explicitly mention that o...,[Notwithstanding the termination of this Agree...
3,Consider the Non-Disclosure Agreement between ...,The context does not indicate that the Receivi...,"[At Organiser’s first request, Mentor shall:, ..."
4,Consider the Non-Disclosure Agreement between ...,"No, the Non-Disclosure Agreement prohibits the...",[Mentor shall not disclose any Confidential In...
5,Consider the Non-Disclosure Agreement between ...,"Yes, the document requires the Receiving Party...","[If Mentor is required by mandatory, non-appea..."
6,Consider the Non-Disclosure Agreement between ...,"Yes, the Non-Disclosure Agreement allows the R...","[Confidential Information does not include:, >..."
7,Consider the Non-Disclosure Agreement between ...,"No, the Non-Disclosure Agreement prohibits the...",[Mentor shall not disclose any Confidential In...
8,Consider the Non-Disclosure Agreement between ...,"Yes, the Non-Disclosure Agreement restricts th...",[Mentor shall not use any Confidential Informa...
9,Consider DBT's Mutual Non-Disclosure Agreement...,"Yes, the document indicates that the Agreement...",[5. No Further Rights All Confidential Informa...


# Evaluate Baseline using RAGAS (Faithfulness and Answer Relevancy)

In [None]:
# RAGAS
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_precision,
    context_recall,
)
from ragas import evaluate
from datasets import Dataset    


# Convert the baseline answers into a `datasets.Dataset` for evaluation.
dataset_ragas = Dataset.from_pandas(query_answer_baseline)
print(dataset_ragas)

# Score only the two generation metrics used throughout this notebook.
result = evaluate(dataset_ragas, metrics=[faithfulness, answer_relevancy])


  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['user_input', 'response', 'retrieved_contexts'],
    num_rows: 50
})


Evaluating: 100%|██████████| 100/100 [00:59<00:00,  1.68it/s]


In [17]:
# Aggregate baseline scores from the evaluation above.
print(result)

{'faithfulness': 0.8160, 'answer_relevancy': 0.8623}


# Zero-shot Chain of Thought

In [None]:
# Zero-shot chain-of-thought prompt: asks for a JSON object with "thought" and
# "answer" fields. The doubled braces are literal braces in the template.
zeroshot_CoT_prompt = PromptTemplate.from_template("""HUMAN\n
                                                You are an assistant for question-answering tasks. Think step by step on each of retrieved context on how they help answer the question. Then use only relevant information to answer the question as a sentence or maximal of two sentences. Provide your answer in JSON format {{ "thought":,"answer": }} Just say you don't know if you cannot generate meaningful and factual answer\n
                                                Question: {question}\n
                                                Context: {context}\n
                                                Step-by-step reasoning: Let's think step by step on the context.\n
                                                Answer: 
                                                """)

# Zero-shot CoT generator: same pipeline class, chain-of-thought prompt.
zero_shot_CoT_response_generation = ResponseGeneration(zeroshot_CoT_prompt)

# Try it on one benchmark question and show the inputs alongside the reply.
query, contexts = get_dataset_from_json(index=2)
print(query)
print(contexts)

cot_inputs = {
    "question": query,
    "context": [snippet["chunk"] for snippet in contexts],
}
output = zero_shot_CoT_response_generation.graph.invoke(cot_inputs)

print(output["answer"].content)

Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document mention that some obligations of the Agreement may survive the termination of the Agreement?
{
  "thought": "The context indicates that even after the termination of the Agreement, there are obligations regarding the confidentiality of information that must be upheld. This suggests that some obligations of the Agreement do indeed survive its termination.",
  "answer": "Yes, the document mentions that some obligations of the Agreement may survive the termination, specifically regarding the confidentiality of information."
}


In [51]:
# Show the context snippets of the most recently fetched test case.
print(contexts)


[{'file_path': 'contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt', 'span': [12076, 12328], 'chunk': 'Notwithstanding the termination of this Agreement, any Confidential Information must be kept confidential for as long as such Confidential Information is not publicly known unless it becomes part of the public domain through no wrongful act of Mentor. '}]


In [None]:
# Second zero-shot CoT spot check, on a different benchmark question.
query, contexts = get_dataset_from_json(index=14)
print(query)

cot_inputs = {
    "question": query,
    "context": [snippet["chunk"] for snippet in contexts],
}
output = zero_shot_CoT_response_generation.graph.invoke(cot_inputs)

print(output["answer"].content)

Consider DBT's Mutual Non-Disclosure Agreement; Does the document permit the Receiving Party to retain some Confidential Information even after its return or destruction?
{
  "thought": "The context specifies that the Receiving Party must return or certify the destruction of all Confidential Information upon the Disclosing Party's request. There is no mention of any allowance for the Receiving Party to retain any Confidential Information after its return or destruction.",
  "answer": "No, the document does not permit the Receiving Party to retain any Confidential Information after its return or destruction."
}


In [21]:
# Generate zero-shot CoT answers for the first 50 questions (keeping only the
# JSON "answer" field) and score them with the same two metrics.
exp2_query_answer = build_dataset(zero_shot_CoT_response_generation, size=50, JSON_CoT=True)

dataset_ragas = Dataset.from_pandas(exp2_query_answer)
print(dataset_ragas)

result = evaluate(dataset_ragas, metrics=[faithfulness, answer_relevancy])

Dataset({
    features: ['user_input', 'response', 'retrieved_contexts'],
    num_rows: 50
})


Evaluating: 100%|██████████| 100/100 [00:51<00:00,  1.93it/s]


In [22]:
# Aggregate zero-shot CoT scores from the evaluation above.
print(result)

{'faithfulness': 0.6433, 'answer_relevancy': 0.8959}


In [64]:
# Show the zero-shot CoT question/answer/context table.
display(exp2_query_answer)

Unnamed: 0,user_input,response,retrieved_contexts
0,Consider the Non-Disclosure Agreement between ...,"Yes, the document indicates that the Agreement...","[Any and all proprietary rights, including but..."
1,Consider the Non-Disclosure Agreement between ...,"No, the document does not state that Confident...",[“Confidential Information” means any Idea dis...
2,Consider the Non-Disclosure Agreement between ...,"Yes, the document mentions that some obligatio...",[Notwithstanding the termination of this Agree...
3,Consider the Non-Disclosure Agreement between ...,"No, the document does not permit the Receiving...","[At Organiser’s first request, Mentor shall:, ..."
4,Consider the Non-Disclosure Agreement between ...,"No, the document does not allow the Receiving ...",[Mentor shall not disclose any Confidential In...
5,Consider the Non-Disclosure Agreement between ...,"Yes, the document requires the Receiving Party...","[If Mentor is required by mandatory, non-appea..."
6,Consider the Non-Disclosure Agreement between ...,"Yes, the document allows the Receiving Party t...","[Confidential Information does not include:, >..."
7,Consider the Non-Disclosure Agreement between ...,"No, the document does not allow the Receiving ...",[Mentor shall not disclose any Confidential In...
8,Consider the Non-Disclosure Agreement between ...,"Yes, the document restricts the use of Confide...",[Mentor shall not use any Confidential Informa...
9,Consider DBT's Mutual Non-Disclosure Agreement...,"Yes, the document indicates that the Agreement...",[5. No Further Rights All Confidential Informa...


Faithfulness measures the fraction of claims in the final response that are supported by the retrieved contexts.  
Answer Relevancy is the mean cosine similarity between the original question and a number of artificial questions generated from the response.

In [68]:
# display(exp2_query_answer[0:4])

from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import Faithfulness
from ragas.llms import LangchainLLMWrapper

sample_index = 3

sample = SingleTurnSample(
    user_input=exp2_query_answer["user_input"][sample_index],
    response=exp2_query_answer["response"][sample_index],
    retrieved_contexts=exp2_query_answer["retrieved_contexts"][sample_index]
)

evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", temperature=0))
scorer = Faithfulness(llm=evaluator_llm)
print(f"faithfulness: {await scorer.single_turn_ascore(sample)}")

print(exp2_query_answer["user_input"][sample_index])
print(exp2_query_answer["retrieved_contexts"][sample_index])
print(exp2_query_answer["response"][sample_index])


faithfulness: 0.0
Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document permit the Receiving Party to retain some Confidential Information even after its return or destruction?
['At Organiser’s first request, Mentor shall:', '(d) erase and/or destroy any Confidential Information contained in computer memory or data storage apparatus of, under control of or used by Mentor;\n(e) remove the Confidential Information from any software or data base of, under control of/or used by Mentor that incorporates or uses the Confidential Information in whole or in part; and']
No, the document does not permit the Receiving Party to retain any Confidential Information after its return or destruction.


In [None]:
# TODO: test on samples whose retrieved context is meaningless for the question.
# TODO: clarify what faithfulness and answer relevancy actually measure.
# TODO: find out which evaluation metrics the original LegalBench uses.

# Varying Temperature

In [24]:
# Quick comparison of two temperatures on a single question. This reuses the
# `baseline_prompt` defined with the baseline pipeline earlier in the notebook
# instead of redefining an identical copy here.
temp_0_response_generation = ResponseGeneration(baseline_prompt, temperature=0)
temp_8_response_generation = ResponseGeneration(baseline_prompt, temperature=0.8)

# Fetch the sample once; both generators get the same question and context.
query, contexts = get_dataset_from_json(index=33)
sample_inputs = {"question": query, "context": [context["chunk"] for context in contexts]}

# Print the temperature-0 answer first, then the temperature-0.8 answer.
for response_generation in (temp_0_response_generation, temp_8_response_generation):
    output = response_generation.graph.invoke(sample_inputs)
    print(output["answer"].content)

Yes, the Non-Disclosure Agreement allows the Receiving Party to acquire information similar to the Confidential Information from a third party, provided that the information is lawfully received free of restriction from a source that has the right to furnish it. This is outlined in the context where it states that restrictions do not apply to information independently developed or lawfully received.
Yes, the Non-Disclosure Agreement allows the Receiving Party to acquire information similar to the Confidential Information from a third party, provided that it is lawfully received free of restriction from a source that has the right to furnish it. This is outlined in the exceptions to the restrictions on the use or disclosure of Confidential Information.


### Temperature 0

In [26]:
# Temperature 0, run 1: answer the first 20 questions and score them.
temp_0_response_generation = ResponseGeneration(baseline_prompt, temperature=0)
exp3_temp0_query_answer_1 = build_dataset(temp_0_response_generation, size=20)

dataset_ragas = Dataset.from_pandas(exp3_temp0_query_answer_1)
print(dataset_ragas)

result = evaluate(dataset_ragas, metrics=[faithfulness, answer_relevancy])
print(result)

Dataset({
    features: ['user_input', 'response', 'retrieved_contexts'],
    num_rows: 20
})


Evaluating: 100%|██████████| 40/40 [00:23<00:00,  1.70it/s]


{'faithfulness': 0.9104, 'answer_relevancy': 0.8644}


In [27]:
# Temperature 0, run 2: repeat generation and scoring with the same generator.
exp3_temp0_query_answer_2 = build_dataset(temp_0_response_generation, size=20)

dataset_ragas = Dataset.from_pandas(exp3_temp0_query_answer_2)
print(dataset_ragas)

result = evaluate(dataset_ragas, metrics=[faithfulness, answer_relevancy])
print(result)

Dataset({
    features: ['user_input', 'response', 'retrieved_contexts'],
    num_rows: 20
})


Evaluating: 100%|██████████| 40/40 [00:28<00:00,  1.39it/s]


{'faithfulness': 0.8271, 'answer_relevancy': 0.9097}


In [28]:
# Temperature 0, run 3: repeat generation and scoring with the same generator.
exp3_temp0_query_answer_3 = build_dataset(temp_0_response_generation, size=20)

dataset_ragas = Dataset.from_pandas(exp3_temp0_query_answer_3)
print(dataset_ragas)

result = evaluate(dataset_ragas, metrics=[faithfulness, answer_relevancy])
print(result)

Dataset({
    features: ['user_input', 'response', 'retrieved_contexts'],
    num_rows: 20
})


Evaluating: 100%|██████████| 40/40 [00:31<00:00,  1.28it/s]


{'faithfulness': 0.8133, 'answer_relevancy': 0.9079}


### Temperature 0.25

In [29]:
# Temperature 0.25, run 1: answer the first 20 questions and score them.
temp_2_5_response_generation = ResponseGeneration(baseline_prompt, temperature=0.25)
exp3_temp2_5_query_answer_1 = build_dataset(temp_2_5_response_generation, size=20)

dataset_ragas = Dataset.from_pandas(exp3_temp2_5_query_answer_1)
print(dataset_ragas)

result = evaluate(dataset_ragas, metrics=[faithfulness, answer_relevancy])
print(result)

Dataset({
    features: ['user_input', 'response', 'retrieved_contexts'],
    num_rows: 20
})


Evaluating: 100%|██████████| 40/40 [00:21<00:00,  1.88it/s]


{'faithfulness': 0.8771, 'answer_relevancy': 0.8177}


In [30]:
# Temperature 0.25, run 2: repeat generation and scoring with the same generator.
exp3_temp2_5_query_answer_2 = build_dataset(temp_2_5_response_generation, size=20)

dataset_ragas = Dataset.from_pandas(exp3_temp2_5_query_answer_2)
print(dataset_ragas)

result = evaluate(dataset_ragas, metrics=[faithfulness, answer_relevancy])
print(result)

Dataset({
    features: ['user_input', 'response', 'retrieved_contexts'],
    num_rows: 20
})


Evaluating: 100%|██████████| 40/40 [00:22<00:00,  1.78it/s]


{'faithfulness': 0.8521, 'answer_relevancy': 0.8174}


In [31]:
# Temperature 0.25, run 3: repeat generation and scoring with the same generator.
exp3_temp2_5_query_answer_3 = build_dataset(temp_2_5_response_generation, size=20)

dataset_ragas = Dataset.from_pandas(exp3_temp2_5_query_answer_3)
print(dataset_ragas)

result = evaluate(dataset_ragas, metrics=[faithfulness, answer_relevancy])
print(result)

Dataset({
    features: ['user_input', 'response', 'retrieved_contexts'],
    num_rows: 20
})


Evaluating: 100%|██████████| 40/40 [00:24<00:00,  1.66it/s]


{'faithfulness': 0.8042, 'answer_relevancy': 0.8629}


### Temperature 0.9

In [32]:
# Temperature 0.9, run 1: answer the first 20 questions and score them.
temp_9_response_generation = ResponseGeneration(baseline_prompt, temperature=0.9)
exp3_temp9_query_answer_1 = build_dataset(temp_9_response_generation, size=20)

dataset_ragas = Dataset.from_pandas(exp3_temp9_query_answer_1)
print(dataset_ragas)

result = evaluate(dataset_ragas, metrics=[faithfulness, answer_relevancy])
print(result)

Dataset({
    features: ['user_input', 'response', 'retrieved_contexts'],
    num_rows: 20
})


Evaluating: 100%|██████████| 40/40 [00:21<00:00,  1.85it/s]


{'faithfulness': 0.8729, 'answer_relevancy': 0.8156}


In [33]:
# Temperature 0.9, run 2: repeat generation and scoring with the same generator.
exp3_temp9_query_answer_2 = build_dataset(temp_9_response_generation, size=20)

dataset_ragas = Dataset.from_pandas(exp3_temp9_query_answer_2)
print(dataset_ragas)

result = evaluate(dataset_ragas, metrics=[faithfulness, answer_relevancy])
print(result)

Dataset({
    features: ['user_input', 'response', 'retrieved_contexts'],
    num_rows: 20
})


Evaluating: 100%|██████████| 40/40 [00:22<00:00,  1.75it/s]


{'faithfulness': 0.8625, 'answer_relevancy': 0.8617}


In [34]:
# Temperature 0.9, run 3. Stored under its own `_3` name (like the third runs
# for the other temperatures) so it no longer overwrites the run-2 answers.
exp3_temp9_query_answer_3 = build_dataset(temp_9_response_generation, 20)

dataset_ragas = Dataset.from_pandas(exp3_temp9_query_answer_3)
print(dataset_ragas)
result = evaluate(
    dataset_ragas,
    metrics=[
        faithfulness,
        answer_relevancy
    ]
)

print(result)

Dataset({
    features: ['user_input', 'response', 'retrieved_contexts'],
    num_rows: 20
})


Evaluating: 100%|██████████| 40/40 [00:24<00:00,  1.62it/s]


{'faithfulness': 0.8792, 'answer_relevancy': 0.9068}


In [None]:
# Temperature does not seem to change faithfulness in a consistent way,
# but it may correlate more with response relevancy.

# TODO: study the faithfulness and response relevancy metrics in more detail.