In [38]:
!pip show langchain

Name: langchain
Version: 0.3.17
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages
Requires: aiohttp, langchain-core, langchain-text-splitters, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: langchain-community, ragas


In [1]:
import os
import json
import random
from collections import defaultdict
from typing import List, Tuple
from pydantic import BaseModel, computed_field

# llm=mistral, embedding=gte-base, vecetorstore=faiss
# from langchain.docstore.document import Document
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.vectorstores import FAISS

# llm=openai, embedding=openai, vectorstore=chroma
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import START, StateGraph

load_dotenv()
os.environ.get("OPENAI_API_KEY")

# corpus
dataset_name = "contractnli"
test_file = f"../data/benchmarks/{dataset_name}.json"
result_file = f"../data/results/qa_results.json"

# embeddings
# embeddings = HuggingFaceEmbeddings(model_name="thenlper/gte-base") 
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# vector store
# vectorstore_path = "./vectorstore/faiss_store_gte_base"
persist_path = "./vectorstore/chroma_openai_embed_3_small"
# Chroma is just good for PoC and hobby
vector_store = Chroma(
        embedding_function=embeddings, 
        persist_directory=persist_path
    )

# LLM
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

In [2]:
def load_documents_with_spans(directory: str, chunk_size: int = 1000, chunk_overlap: int = 0):
    """
    Loads .txt files from a directory, splits each document's text into chunks using
    RecursiveCharacterTextSplitter, computes the span (start, end) for each chunk, and
    returns a list of Document objects with metadata (including filename, source, and span).
    """
    documents = []
    # Initialize the splitter with the desired separators and parameters.
    splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", "!", "?", ".", ":", ";", ",", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
        strip_whitespace=False,
    )
    
    # Process each .txt file in the directory.
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            filepath = os.path.join(directory, filename)
            with open(filepath, "r", encoding="utf-8") as f:
                text = f.read()
            
            # Split text into chunks.
            text_splits = splitter.split_text(text)
            
            # Verify that the chunks concatenate to the original text.
            assert "".join(text_splits) == text, "Concatenated splits do not match the original text."
            
            # Compute spans and create Document objects.
            prev_index = 0
            for i, chunk_text in enumerate(text_splits):
                span_start = prev_index
                span_end = prev_index + len(chunk_text)
                prev_index += len(chunk_text)
                doc = Document(
                    page_content=chunk_text,
                    metadata={
                        "filename": filename,
                        "filepath": f"{dataset_name}/{filename}",
                        "span_start": span_start,
                        "span_end":  span_end,  # Stores the (start, end) positions of the chunk.
                        "id": f"{filename}_chunk_{i}"
                    }
                )
                documents.append(doc)
    return documents

In [3]:
# Update this to the folder where your ContractNLI .txt files reside.
directory_path = f"../data/corpus/{dataset_name}"

# Load the documents, splitting each into chunks with span metadata.
documents = load_documents_with_spans(directory_path, chunk_size=500, chunk_overlap=0)
print(f"Loaded {len(documents)} document chunks with spans.")
print(documents[:5])

Loaded 3307 document chunks with spans.
[Document(metadata={'filename': 'QEP-Williston-Form-of-Confidentiality-Agreement-BMO.txt', 'filepath': 'contractnli/QEP-Williston-Form-of-Confidentiality-Agreement-BMO.txt', 'span_start': 0, 'span_end': 451, 'id': 'QEP-Williston-Form-of-Confidentiality-Agreement-BMO.txt_chunk_0'}, page_content='CONFIDENTIALITY AND NONDISCLOSURE AGREEMENT\nThis Confidentiality and Nondisclosure Agreement (this “Agreement”) is dated _______ ___, 2018 (the “Effective Date”), and is between QEP Energy Company (“Owner”), a Delaware corporation, and _____________________ (the “Receiving Company”), a ______ ______________. Owner and the Receiving Company are sometimes referred to herein individually as a “Party” and collectively as the “Parties.”\nR E C I T A L S'), Document(metadata={'filename': 'QEP-Williston-Form-of-Confidentiality-Agreement-BMO.txt', 'filepath': 'contractnli/QEP-Williston-Form-of-Confidentiality-Agreement-BMO.txt', 'span_start': 451, 'span_end': 751

In [4]:
print(documents[0])

page_content='CONFIDENTIALITY AND NONDISCLOSURE AGREEMENT
This Confidentiality and Nondisclosure Agreement (this “Agreement”) is dated _______ ___, 2018 (the “Effective Date”), and is between QEP Energy Company (“Owner”), a Delaware corporation, and _____________________ (the “Receiving Company”), a ______ ______________. Owner and the Receiving Company are sometimes referred to herein individually as a “Party” and collectively as the “Parties.”
R E C I T A L S' metadata={'filename': 'QEP-Williston-Form-of-Confidentiality-Agreement-BMO.txt', 'filepath': 'contractnli/QEP-Williston-Form-of-Confidentiality-Agreement-BMO.txt', 'span_start': 0, 'span_end': 451, 'id': 'QEP-Williston-Form-of-Confidentiality-Agreement-BMO.txt_chunk_0'}


In [5]:
# add the documents (chunks) to the vector store
_ = vector_store.add_documents(documents=documents)

# Retrieve Samples

In [6]:
with open(test_file, "r", encoding="utf-8") as file:
    test_data = json.load(file)
print("Test data loaded successfully.")

Test data loaded successfully.


In [12]:
qidx = random_number = random.randint(1, 10)
sample_test = test_data["tests"][qidx]
query_test = sample_test["query"]
answer_test = sample_test["snippets"][0]["answer"]
answer_file_path_test = sample_test["snippets"][0]["file_path"]

print(f"query: {query_test}")
print(f"answer: {answer_test}")
print(f"file_path: {answer_file_path_test}")

query: Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document mention that some obligations of the Agreement may survive the termination of the Agreement?
answer: Notwithstanding the termination of this Agreement, any Confidential Information must be kept confidential for as long as such Confidential Information is not publicly known unless it becomes part of the public domain through no wrongful act of Mentor. 
file_path: contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt


In [37]:
qidx = random_number = random.randint(1, 150)
sample_test = test_data["tests"][qidx]
query_test = sample_test["query"]
answer_test = sample_test["snippets"][0]["answer"]
answer_file_path_test = sample_test["snippets"][0]["file_path"]

# query_test = "Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document restrict the use of Confidential Information to the purposes stated in the Agreement?"
retrieved_docs = vector_store.similarity_search_with_relevance_scores(query_test, 3, filter={"filepath": answer_file_path_test})

print(f"query: {query_test}")
print(f"answer: {answer_test}")
print(f"file_path: {answer_file_path_test}")
print("\n\n\n")
for doc, score in retrieved_docs:
    # print(f"filename: {doc.metadata}")
    print(f"filename: {doc.metadata['filename']}")
    print(f"answer: {doc.page_content}")
    print(f"score: {score}")
    print("-"*50, '\n')

query: Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document allow the Receiving Party to share some Confidential Information with third parties, including consultants, agents, and professional advisors?
answer: Mentor shall not disclose any Confidential Information to any third party or to Mentor’s employees and/or employer without the prior written consent of the Participants. 
file_path: contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt




filename: CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt
answer: 
Mentor shall not disclose any Confidential Information to any third party or to Mentor’s employees and/or employer without the prior written consent of the Participants. Mentor shall require his/her employees who will have access to Confidential Information to commit to a non-disclosure agreement that protects the Confidential Information to at least the same degree as this Agreement
score: 0.5104152956142991
--------------------------------------------

In [28]:
print(sample_test)

{'query': 'Consider the Non-Disclosure Agreement between CopAcc and ToP Mentors; Does the document allow the Receiving Party to share some Confidential Information with third parties, including consultants, agents, and professional advisors?', 'snippets': [{'file_path': 'contractnli/CopAcc_NDA-and-ToP-Mentors_2.0_2017.txt', 'span': [8667, 8837], 'answer': 'Mentor shall not disclose any Confidential Information to any third party or to Mentor’s employees and/or employer without the prior written consent of the Participants. '}]}


# Full RAG Pipeline

In [6]:
from langchain import hub
from typing_extensions import List, TypedDict

prompt = hub.pull("rlm/rag-prompt")

class State(TypedDict):
    question : str
    context : List[Document]
    answer: str

def retrieve(state : State):
    retrieved_documents = vector_store.similarity_search(state["question"])
    return({"context":retrieved_documents})

def generate(state : State):
    context_doc_message = "\n\n".join(doc.page_content for doc in state["context"])
    message = prompt.invoke({"question":state["question"], "context":context_doc_message})
    response = llm.invoke(message)
    return({"answer":response})

graph_builder = StateGraph(State)
graph_builder.add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()



In [29]:
qidx = random_number = random.randint(1, 10)
sample_test = test_data["tests"][qidx]
query_test = sample_test["query"]
answer_test = sample_test["snippets"][0]["answer"]
answer_file_path_test = sample_test["snippets"][0]["file_path"]

response = graph.invoke({"question": query_test})
print(response)

print(sample_test["snippets"][0]["file_path"])
# print(response["question"])
# print(response["answer"])

{'question': "Consider DBT's Mutual Non-Disclosure Agreement; Does the document indicate that the Agreement does not grant the Receiving Party any rights to the Confidential Information?", 'context': [Document(id='34a8fa79-3b00-4b47-8519-c373186ec79c', metadata={'filename': 'NDA-ONSemi_IndustryAnalystConf-2011.txt', 'filepath': 'contractnli/NDA-ONSemi_IndustryAnalystConf-2011.txt', 'id': 'NDA-ONSemi_IndustryAnalystConf-2011.txt_chunk_0', 'span_end': 45, 'span_start': 0}, page_content=' Confidentiality and Non-Disclosure Agreement'), Document(id='7b214ea7-0a1b-4391-a672-5743b1099f6d', metadata={'filename': 'NDA-ONSemi_IndustryAnalystConf-2011.txt', 'filepath': 'contractnli/NDA-ONSemi_IndustryAnalystConf-2011.txt', 'id': 'NDA-ONSemi_IndustryAnalystConf-2011.txt_chunk_0', 'span_end': 45, 'span_start': 0}, page_content=' Confidentiality and Non-Disclosure Agreement'), Document(id='fa7e4920-6283-40a7-b4e2-4a8c85842011', metadata={'filename': 'simply-fashion---standard-nda.txt', 'filepath': 

TypeError: 'Document' object is not subscriptable