In [1]:
import os
import json
import random
from collections import defaultdict
from typing import List, Tuple
from pydantic import BaseModel, computed_field

from dotenv import load_dotenv
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph

import pandas as pd
import json

from langchain import hub
from typing_extensions import List, TypedDict
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.language_models.chat_models import BaseChatModel


## Import model dependencies and load API Key

In [2]:
# for OPENAI

from langchain_openai import ChatOpenAI

# Load variables from a .env file (if any) into the process environment, then
# read the OpenAI key; the result is None when the variable is not set.
load_dotenv()
api_key_openai = os.getenv("OPENAI_API_KEY")

## Load the information-retrieval output (each query has at most K retrieved context chunks)

In [3]:
# Retrieval output to generate answers for; keep exactly one assignment active.
# NOTE(review): the "contractnli_gpt4omini" label passed to
# generate_responses_for_k later in this notebook is hard-coded separately, so
# update it as well when switching datasets here.
current_file = "./data/final_generation/contractnli.json"
# current_file = "./data/final_generation/cuad.json"
# current_file = "./data/final_generation/maud.json"
# current_file = "./data/final_generation/privacy_qa.json"

In [4]:
# JSON is UTF-8 by specification; pin the encoding so decoding does not depend
# on the platform's default locale (the contract text contains non-ASCII
# characters such as curly quotes).
with open(current_file, encoding="utf-8") as f:
    QnA_data = json.load(f)

In [23]:
# get query and K number of context
def get_query_from_json_at_K(index=0, k_context=3, data=None):
    """Return one record's query, its first ``k_context`` chunks and its readability label.

    Args:
        index: Position of the record to read.
        k_context: Maximum number of retrieved chunks to return. Values larger
            than the number of available chunks return all of them; zero or
            negative values return none.
        data: Optional list of records to read from. When omitted, the
            module-level ``QnA_data`` is used.

    Returns:
        A ``(query, contexts, expertise_level)`` tuple. ``contexts`` is a list
        of dicts with the keys ``file_path``, ``span`` and ``chunk``.

    Raises:
        IndexError: if ``index`` is out of range for the records.
        KeyError: if a record or chunk lacks one of the expected keys.
    """
    records = QnA_data if data is None else data
    record = records[index]

    qna_data_query = record["query"]
    retrieved_chunks = record["retrieved_chunks_unranked"]
    expertise_level = record["feature_extraction"][0]["readability"]

    # max(..., 0) keeps "negative k means no context" rather than letting a
    # negative slice bound drop chunks from the end.
    contexts_from_json = [
        {
            "file_path": chunk["filepath"],
            "span": chunk["span"],
            "chunk": chunk["text"],
        }
        for chunk in retrieved_chunks[:max(k_context, 0)]
    ]

    return qna_data_query, contexts_from_json, expertise_level

In [24]:
# example use: first record, at most 10 retrieved chunks
get_query_from_json_at_K(index=0, k_context=10)

('Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document indicate that the Agreement does not grant the Receiving Party any rights to the Confidential Information?',
 [{'file_path': 'contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt',
   'span': [7711, 8160],
   'chunk': '4 Definition of Confidential Information\n“Confidential Information” means any Idea disclosed to Mentor, all data and information, know-how, business concepts, software, procedures, products, services, development projects, and programmes contained in such Idea and/or its description and any conclusions. Confidential Information does not include:\n> information already known or independently developed by Mentor prior to the disclosure of any Idea;'},
  {'file_path': 'contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt',
   'span': [359, 765],
   'chunk': '3 Payment ........................................................................................................................

## Response generator class

In [69]:
class ResponseGenerator:
    """One-node LangGraph pipeline that answers a question from retrieved context.

    The compiled graph is exposed as ``self.graph``. Invoke it with a dict
    holding ``question``, ``context`` and ``expertise_level``; the result
    carries the model output under ``answer``.
    """

    class State(TypedDict):
        # State passed into and out of the ``generate`` node.
        question : str
        # Retrieved chunks; ``generate`` accepts plain strings as well as
        # Document objects.
        context : List[Document]
        expertise_level : str
        # NOTE(review): annotated as str, but ``generate`` stores whatever
        # ``llm.invoke`` returns unchanged, and callers in this file read
        # ``.content`` from it.
        answer: str

    def __init__(self, prompt : PromptTemplate, llm : BaseChatModel):
        """Keep the prompt and model, then build and compile the graph.

        Args:
            prompt: Template invoked with ``question``, ``context`` and
                ``expertise_level``.
            llm: Chat model invoked with the rendered prompt.
        """
        self.llm = llm
        self.prompt = prompt

        graph_builder = StateGraph(self.State)
        # add_sequence registers the node under the function's name, "generate".
        graph_builder.add_sequence([self.generate])
        graph_builder.add_edge(START, "generate")
        self.graph = graph_builder.compile()

    def generate(self, state : State):
        """Render the prompt from ``state`` and return the model output.

        Returns:
            ``{"answer": response}`` where ``response`` is the unmodified
            result of ``self.llm.invoke``.
        """
        # str.join only accepts strings, so unwrap Document-like items to their
        # text; plain strings (what the callers in this file pass) go through
        # untouched.
        chunk_texts = [
            chunk if isinstance(chunk, str) else chunk.page_content
            for chunk in state["context"]
        ]
        context_doc_message = "\n\n".join(chunk_texts)
        message = self.prompt.invoke({
            "question": state["question"],
            "context": context_doc_message,
            "expertise_level": state["expertise_level"],
        })
        response = self.llm.invoke(message)

        return {"answer": response}

## Generate response functions

In [70]:
from tqdm import tqdm  

def generate_response_with_context_at_K(response_generator: ResponseGenerator, size=10, k_context=3, JSON_CoT=False):
    """Generate answers for the first ``size`` records and collect them in a DataFrame.

    Args:
        response_generator: Generator whose ``graph`` is invoked once per record.
        size: Number of records to process, starting at index 0.
        k_context: Maximum number of retrieved chunks passed as context.
        JSON_CoT: When true, the answer text is whitespace-stripped and stored
            in a ``raw_response`` column (for later parsing) instead of
            ``response``.

    Returns:
        DataFrame with the columns ``user_input``, ``response`` (or
        ``raw_response``) and ``retrieved_contexts``, one row per record.
        With ``size == 0`` an empty frame with those columns is returned.
    """
    response_column = "raw_response" if JSON_CoT else "response"
    rows = []

    # Wrap the range iterator with tqdm for progress tracking
    for i in tqdm(range(0, size), desc="Generating responses"):
        query, contexts, expertise_level = get_query_from_json_at_K(index=i, k_context=k_context)
        retrieved_contexts = [context["chunk"] for context in contexts]

        output = response_generator.graph.invoke({
            "question": query,
            "context": retrieved_contexts,
            "expertise_level": expertise_level,
        })

        answer_text = output["answer"].content
        if JSON_CoT:
            # Keep the raw text, minus surrounding whitespace, for later parsing.
            answer_text = answer_text.strip()

        rows.append([query, answer_text, retrieved_contexts])

    # Build the frame once, after the loop: this avoids re-creating it on every
    # iteration and guarantees a return value even when no record was processed.
    return pd.DataFrame(rows, columns=["user_input", response_column, "retrieved_contexts"])

## Define the model

In [71]:
# OpenAI chat model shared by the response generators defined in this notebook.
llm_openAI = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.2,
    api_key=api_key_openai,
)

## Initialize prompt and response generator

In [72]:
# ======================================================================================
# ========================        HUMAN-TUNED PROMPT        =======================
# ======================================================================================

# Template variables: {expertise_level}, {context}, {question}.
# Everything between the triple quotes is part of the prompt text, including
# each line's leading whitespace and the explicit "\n" escapes.
human_tuned_prompt = PromptTemplate.from_template("""### Instruction:\n
                                                       You are an AI assistant specializing in legal contract analysis. Your task is to carefully examine the *provided Retrieved Chunk* and *answer the user's question accurately*.\n
                                                       Follow these guidelines:\n
                                                       
                                                       Read the clause carefully. Identify any terms, conditions, or restrictions related to the user's question.\n
                                                       Answer explicitly based on the clause. If the clause clearly states the information being asked, explain it clearly and accurately.\n
                                                       Do not ignore relevant details. If the clause contains conditions, restrictions, or exceptions, **mention them in your answer**.\n
                                                       If the clause does not provide a direct answer, say so. Do not assume or infer information that is not stated.\n
                                                       Support your answer with key phrases from the clause when necessary.\n
                                                       Avoid unnecessary repetition or legal jargon. The goal is to make the answer **clear and understandable**.\n
                                                       Articulate response according to the user expertise level: you may need to use simpler language for non-expert user\n
                                                       The user for this query is an {expertise_level} in legal domain\n
                                                       
                                                       ### Retrieved Chunk:\n
                                                       {context}\n
                                                       
                                                       ### User's Question:\n
                                                       {question}\n
                                                       ### Answer:\n                                                  
                                                       """)

# Generator pairing the human-tuned prompt with the OpenAI chat model.
human_tuned_response_generator_gpt = ResponseGenerator(prompt=human_tuned_prompt, llm=llm_openAI)


## Generate all responses at all K

In [73]:
def generate_responses_for_k(sample_size, k_values, model_name, human_tuned_response_generator):
    """Run generation once per entry of ``k_values`` and dump each result to JSON.

    For every ``k`` the generator is run over ``sample_size`` records with
    ``k_context=k``; the resulting frame is written, as a list of records, to
    ``query_answer_human_tuned_{model_name}_k{k}.json`` relative to the
    working directory. Nothing is returned.
    """
    for k in k_values:
        # Manually written (human-tuned) prompt
        answers_df = generate_response_with_context_at_K(
            human_tuned_response_generator,
            sample_size,
            k_context=k,
        )
        output_path = f'query_answer_human_tuned_{model_name}_k{k}.json'
        answers_df.to_json(output_path, orient="records", indent=4)

## Generate for GPT MODEL

In [74]:
# Run over 194 records with k=10 context chunks; the output file name is
# tagged with "contractnli_gpt4omini".
generate_responses_for_k(
    sample_size=194,
    k_values=[10],
    model_name="contractnli_gpt4omini",
    human_tuned_response_generator=human_tuned_response_generator_gpt,
)

Generating responses:   2%|▏         | 4/194 [00:23<18:55,  5.97s/it]


KeyboardInterrupt: 