In [1]:
import os
os.chdir("../")

from src.chroma_store import initialize_vectorstore, load_documents_from_dir
from src.chunking_strategies import chunk_by_semantic, chunk_by_recursive_split
from src.rag_pipeline import create_rank_fusion_chain, generate_answer
from langchain_openai import ChatOpenAI
from src.utils import format_tuple_docs_to_text, format_docs_to_text

In [2]:
from sentence_transformers import SentenceTransformer
from langchain.chains import HypotheticalDocumentEmbedder, LLMChain
from langchain_openai import OpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate


  from tqdm.autonotebook import tqdm, trange
2024-07-10 00:28:12 - datasets - INFO - PyTorch version 2.3.1 available.


In [3]:
from langchain.chains import HypotheticalDocumentEmbedder, LLMChain

base_embeddings = OpenAIEmbeddings()
llm = OpenAI(model="gpt-4o")

In [4]:
prompt_template = """Please answer the user's question about the legal contract
Question: {question}
Answer:"""
prompt = PromptTemplate(input_variables=["question"], template=prompt_template)
llm_chain = LLMChain(llm=llm, prompt=prompt)

  warn_deprecated(


In [5]:
embeddings = HypotheticalDocumentEmbedder(
    llm_chain=llm_chain, base_embeddings=base_embeddings
)

In [6]:
# multi_llm = OpenAI(n=4, best_of=4)
# embeddings = HypotheticalDocumentEmbedder.from_llm(
#     multi_llm, base_embeddings, "web_search"
# )

In [4]:
# Load the documents from the data directory.
documents = load_documents_from_dir("data/content")

chunks = chunk_by_recursive_split(documents, chunk_size=800)

vectorstore = initialize_vectorstore(chunks)

--INFO-- Loading documents from data/content


2024-07-10 00:28:22 - src.chroma_store - INFO - Clearing out the chroma database.
2024-07-10 00:28:22 - src.chroma_store - INFO - Creating a new chroma database.


--INFO-- Loaded 1 documents
Split 1 documents into 26 chunks.
Advisor shall be solely responsible for any income taxes or other assessments made or imposed by any governmental authority on Advisor with respect to the Services rendered and the compensation received hereunder, and any and all expenses and costs of himself, employees, agents and representatives, including, without limitation, any salary, overtime, severance or social benefits payable thereto, and marketing costs incurred in connection with the performance of obligations hereunder.

Confidentiality, Non-Competition and IP Ownership Undertaking: In connection with the performance of Advisor’s obligations under this Agreement, the Advisor shall execute a Confidentiality, Non-Competition and IP Ownership Undertaking in the form attached hereto as Exhibit A.
{'source': 'data/content/Robinson Advisory.docx', 'start_index': 5516}


2024-07-10 00:28:25 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-07-10 00:28:29 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [5]:
similarity_count = 5
retriever = vectorstore.as_retriever(search_kwargs={"k": similarity_count})

### Question

In [6]:
question = "Who are the parties to the Agreement and what are their defined names?"


#### Test answer generation

In [5]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.llms import Cohere
from langchain_cohere import ChatCohere


llm = ChatCohere(
    model="command",
    temperature=0,
)

retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 5,
    }
)
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

compressed_docs = compression_retriever.invoke(
    question
)
# pretty_print_docs(compressed_docs)

2024-07-08 12:40:03 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-08 12:40:04 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK"


In [10]:
llm = ChatOpenAI(model="gpt-4o", temperature=0)



# retrieval_chain = create_rank_fusion_chain(question, llm, compression_retriever)
docs = retriever.invoke({"question": question})
print(docs)

context_text = format_docs_to_text(docs)

answer = generate_answer(question, context_text, llm=llm)

2024-07-10 00:20:32 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 404 Not Found"


NotFoundError: Error code: 404 - {'error': {'message': 'This is a chat model and not supported in the v1/completions endpoint. Did you mean to use v1/chat/completions?', 'type': 'invalid_request_error', 'param': 'model', 'code': None}}

In [15]:
answer

'**Question:** Who are the parties to the Agreement and what are their defined names?\n\n**Answer:** The parties to the Agreement are "Cloud Investments Ltd." and "Jack Robinson." This is explicitly stated in the signature block at the end of the Agreement, where "Cloud Investments Ltd." is represented by Silvan Joseph, and the "Advisor" is Jack Robinson.'

### Ragas for testing

In [19]:
from src.ragas_pipeline import run_evaluation

results = run_evaluation(retriever=retriever,
               file_path="data/evaluation_sets/Robinson_Q&A.docx",
               llm=llm
               )

results.head()

  warn_deprecated(
2024-07-09 23:01:01 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
2024-07-09 23:01:02 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-09 23:01:11 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-09 23:01:15 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
2024-07-09 23:01:16 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-09 23:01:18 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-09 23:01:22 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/completions "HTTP/1.1 200 OK"
2024-07-09 23:01:23 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-09 23:01:29 - httpx - INFO - HTTP Request: POST https://api.openai.com

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

2024-07-09 23:02:35 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-09 23:02:36 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-09 23:02:36 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-09 23:02:36 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-09 23:02:36 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-09 23:02:36 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-09 23:02:36 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-09 23:02:36 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-09 23:02:37 - httpx - INFO - HTTP Request: POST https://

Unnamed: 0,question,answer,contexts,ground_truth,answer_correctness,faithfulness,context_precision,context_recall
0,Who are the parties to the Agreement and what ...,**Answer:**\n\nThe parties to the Agreement ar...,[Entire Agreement; No Waiver or Assignment: Th...,Cloud Investments Ltd. (“Company”) and Jack Ro...,0.725368,0.75,1.0,1.0
1,What is the termination notice?,**Question:** What is the termination notice?\...,[Term: The term of this Agreement shall commen...,According to section 4:14 days for convenience...,0.661744,0.666667,0.804167,0.5
2,What are the payments to the Advisor under the...,"**Answer:**\n\nUnder the Agreement, the paymen...",[As full and sole consideration for the Servic...,According to section 6: 1. Fees of $9 per hour...,0.218246,0.818182,0.95,1.0
3,Can the Agreement or any of its obligations be...,**Answer:**\n\nThe Agreement and its obligatio...,[Entire Agreement; No Waiver or Assignment: Th...,1. Under section 1.1 the Advisor can’t assign ...,0.729789,0.8,1.0,1.0
4,Who owns the IP?,**Question:** Who owns the IP?\n\n**Answer:** ...,"[IP: Any Work Product, upon creation, shall be...",According to section 4 of the Undertaking (App...,0.862393,0.666667,1.0,1.0


In [17]:
print(results)

NameError: name 'results' is not defined