In [1]:
import os
os.chdir("../")

from src.chroma_store import initialize_vectorstore, load_documents_from_dir
from src.chunking_strategies import chunk_by_semantic, chunk_by_recursive_split
from src.rag_pipeline import create_rank_fusion_chain, generate_answer
from langchain_openai import ChatOpenAI
from src.utils import format_tuple_docs_to_text, format_docs_to_text

In [2]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Text splitter built on an OpenAIEmbeddings instance; used below to chunk
# the loaded documents.
text_splitter = SemanticChunker(OpenAIEmbeddings())



In [3]:
from langchain_cohere import CohereEmbeddings

# Cohere embedding client configured for the "embed-english-v3.0" model.
# NOTE(review): `embeddings` is not passed to any call in the visible cells
# (initialize_vectorstore receives only the chunks) — confirm it is still needed.
embeddings = CohereEmbeddings(model="embed-english-v3.0")


In [4]:
# Read the source documents from data/content.
documents = load_documents_from_dir("data/content")

# Chunk the raw page text with the semantic splitter. Only the text of each
# document is handed to the splitter here.
# Alternative strategy kept for comparison:
#   chunks = chunk_by_recursive_split(documents, chunk_size=800)
chunks = text_splitter.create_documents(
    [doc.page_content for doc in documents]
)

# Build the vector store from the chunks.
vectorstore = initialize_vectorstore(chunks)

--INFO-- Loading documents from data/content
--INFO-- Loaded 1 documents


2024-07-10 08:03:37 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-10 08:03:38 - src.chroma_store - INFO - Clearing out the chroma database.
2024-07-10 08:03:38 - src.chroma_store - INFO - Creating a new chroma database.
2024-07-10 08:03:41 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-07-10 08:03:43 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [5]:
# Sanity check: show the text of the first chunk produced by the splitter.
print(chunks[0].page_content)

- 2-

ADVISORY SERVICES AGREEMENT

This Advisory Services Agreement is entered into as of June 15th, 2023 (the “Effective Date”), by and between Cloud Investments Ltd., ID 51-426526-3, an Israeli company (the "Company"), and Mr. Jack Robinson, Passport Number 780055578, residing at 1 Rabin st, Tel Aviv, Israel, Email: jackrobinson@gmail.com ("Advisor"). Whereas,	Advisor has expertise and/or knowledge and/or relationships, which are relevant to the Company’s business and the Company has asked Advisor to provide it with certain Advisory services, as described in this Agreement; and

Whereas, 	Advisor has agreed to provide the Company with such services, subject to the terms set forth in this Agreement.


In [6]:
# Value passed to the retriever as its `k` search argument.
similarity_count = 5
retriever = vectorstore.as_retriever(
    search_kwargs={"k": similarity_count},
)

### Question

In [7]:
# Sample question reused by the retrieval and answer-generation cells below.
question = "Who are the parties to the Agreement and what are their defined names?"


#### Test answer generation

In [6]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.llms import Cohere
from langchain_cohere import ChatCohere


# Cohere chat model ("command") with temperature 0.
llm = ChatCohere(model="command", temperature=0)

# Similarity retriever over the vector store, k=5.
# Note: this rebinds the `retriever` name.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

# Wrap the base retriever in a ContextualCompressionRetriever that uses
# CohereRerank as its compressor.
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=retriever,
)

# Run the wrapped retriever on the sample question.
compressed_docs = compression_retriever.invoke(question)
# pretty_print_docs(compressed_docs)

2024-07-10 07:46:19 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-10 07:46:40 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-10 07:47:03 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-10 07:47:07 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-10 07:47:11 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-10 07:47:16 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-10 07:47:20 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-10 07:47:24 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-10 07:47:28 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"
2024-07-10 07:47:32 - httpx - INFO - HTTP Requ

2024-07-10 07:46:20 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK"


In [10]:
# Rebind `llm` to an OpenAI chat model (gpt-4o, temperature 0); cells that run
# after this one use it instead of the ChatCohere instance assigned earlier.
llm = ChatOpenAI(model="gpt-4o", temperature=0)



In [9]:


# Build the rank-fusion retrieval chain and run it on the sample question.
retrieval_chain = create_rank_fusion_chain(question, llm, retriever)
docs = retrieval_chain.invoke({"question": question})
# Alternative without rank fusion:
# docs = retriever.invoke(question)
print(docs)

# Format the retrieved results as text for answer generation.
# (`docs` prints as a list of (Document, score) tuples in the cell output.)
context_text = format_tuple_docs_to_text(docs)

# Generate the answer from the question and the formatted context.
answer = generate_answer(question, context_text, llm=llm)

2024-07-10 00:59:25 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 00:59:26 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-10 00:59:26 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-10 00:59:26 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-10 00:59:26 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-10 00:59:26 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


[(Document(page_content='Entire Agreement; No Waiver or Assignment: This Agreement together with the Exhibits, which are attached hereto and incorporated herein, set forth the entire Agreement between the parties and shall supersede all previous communications and agreements between the parties, either oral or written. This Agreement may be modified only by a written amendment executed by both parties. This Agreement may not be assigned, sold, delegated or transferred in any manner by Advisor for any reason whatsoever. The Company may assign the Agreement to a successor of all or substantially all of its assets or business, provided the assignee has assumed the Company’s obligations under this Agreement.', metadata={'source': 'data/content/Robinson Advisory.docx', 'start_index': 7616}), 0.08253968253968252), (Document(page_content='Governing Law and Jurisdiction:  This Agreement shall be governed by the laws of the State of Israel, without giving effect to the rules respecting conflict

2024-07-10 00:59:28 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [10]:
# Display the answer generated in the previous cell.
answer

'The parties to the Agreement are Cloud Investments Ltd. ("Company") and Jack Robinson ("Advisor").'

### Ragas for testing

In [11]:
from ragas import evaluate
from datasets import Dataset
from ragas.metrics import (
    answer_correctness,
    faithfulness,
    context_precision,
    context_recall,
    context_relevancy,
    answer_relevancy,
)

from src.utils import extract_qa_pairs_to_df, format_tuple_docs_to_text, format_docs_to_text
# from backend.app.rag.rag_utils import generate_answer
from src.rag_pipeline import generate_answer

def evaluate_metrics(dataset, metrics=None):
    """Score an evaluation dataset with Ragas and return the results as a DataFrame.

    Args:
        dataset: Dataset passed straight through to ``evaluate``.
        metrics: Optional list of Ragas metrics to compute. When omitted, the
            fixed set this notebook has always used is applied:
            answer_correctness, faithfulness, context_precision and
            context_recall.

    Returns:
        The value of ``evaluate(...).to_pandas()``.
    """
    if metrics is None:
        # Default metric set; resolved at call time from the notebook's imports.
        metrics = [
            answer_correctness,
            faithfulness,
            context_precision,
            context_recall,
        ]

    result = evaluate(dataset=dataset, metrics=metrics)
    return result.to_pandas()
import time
def run_evaluation(retriever,
                   file_path="data/evaluation_sets/Robinson_Q&A.docx",
                   llm=None,
                   test_size=None,
                   sleep_seconds=2):
    """Answer every question in a Q&A file and score the answers with Ragas.

    For each question the retriever is queried, the retrieved documents are
    formatted into a context string, and ``generate_answer`` produces an
    answer. Questions, answers, contexts and ground truths are then packed
    into a ``Dataset`` and passed to ``evaluate_metrics``.

    Args:
        retriever: Object exposing ``invoke(query)`` that returns documents
            with a ``page_content`` attribute.
        file_path: Path of the Q&A file given to ``extract_qa_pairs_to_df``.
        llm: Model forwarded to ``generate_answer``.
        test_size: If truthy, only the first ``test_size`` rows are evaluated.
        sleep_seconds: Pause before each query (default 2, the value that was
            previously hard-coded). Presumably a rate-limit throttle — confirm.
            Pass 0 to disable.

    Returns:
        Whatever ``evaluate_metrics`` returns for the assembled dataset.
    """
    qa_df = extract_qa_pairs_to_df(file_path)
    if test_size:
        qa_df = qa_df.head(test_size)

    questions = qa_df["question"].tolist()
    ground_truths = qa_df["ground_truths"].tolist()

    answers = []
    contexts = []
    for query in questions:
        if sleep_seconds:
            time.sleep(sleep_seconds)

        # `invoke` replaces the deprecated `get_relevant_documents` and matches
        # how retrievers are called elsewhere in this notebook.
        documents = retriever.invoke(query)

        contexts.append([doc.page_content for doc in documents])
        context_text = format_docs_to_text(documents)
        answers.append(generate_answer(query, context_text, llm=llm))

    dataset = Dataset.from_dict(
        {
            "question": questions,
            "answer": answers,
            "contexts": contexts,
            "ground_truth": ground_truths,
        }
    )
    return evaluate_metrics(dataset)



In [12]:
# from src.ragas_pipeline import run_evaluation

# Evaluate the current `retriever` / `llm` pair on the Robinson Q&A set and
# preview the first rows of the resulting scores.
results = run_evaluation(
    retriever=retriever,
    file_path="data/evaluation_sets/Robinson_Q&A.docx",
    llm=llm,
)

results.head()

  warn_deprecated(
2024-07-10 08:04:23 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-10 08:04:25 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:04:27 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-10 08:04:30 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:04:32 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-10 08:04:35 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:04:37 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-10 08:04:39 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:04:41 - httpx - INFO - HTTP Request: POST https://api.o

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

2024-07-10 08:05:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:05:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:05:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:05:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:05:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:05:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:05:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:05:05 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-10 08:05:05 - httpx - INFO - HTTP Request: POST https://

Unnamed: 0,question,answer,contexts,ground_truth,answer_correctness,faithfulness,context_precision,context_recall
0,Who are the parties to the Agreement and what ...,The parties to the Agreement are Cloud Investm...,"[IN WITNESS WHEREOF, the undersigned has execu...",Cloud Investments Ltd. (“Company”) and Jack Ro...,0.732784,0.25,1.0,1.0
1,What is the termination notice?,Either party may terminate the Agreement at an...,[NOW THEREFORE THE PARTIES AGREE AS FOLLOWS:\n...,According to section 4:14 days for convenience...,0.560186,1.0,0.804167,0.5
2,What are the payments to the Advisor under the...,The Advisor shall receive hourly fees at a rat...,[NOW THEREFORE THE PARTIES AGREE AS FOLLOWS:\n...,According to section 6: 1. Fees of $9 per hour...,0.224562,1.0,0.95,1.0
3,Can the Agreement or any of its obligations be...,"The Agreement may not be assigned, sold, deleg...",[NOW THEREFORE THE PARTIES AGREE AS FOLLOWS:\n...,1. Under section 1.1 the Advisor can’t assign ...,0.499994,0.5,1.0,1.0
4,Who owns the IP?,The Company fully and exclusively owns any Wor...,[Cloud Investments Ltd. Advisor \n\nBy: ______...,According to section 4 of the Undertaking (App...,0.830365,1.0,1.0,1.0


In [10]:
# Plain-text dump of the evaluation DataFrame.
print(results)

                                            question  \
0  Who are the parties to the Agreement and what ...   
1                    What is the termination notice?   
2  What are the payments to the Advisor under the...   
3  Can the Agreement or any of its obligations be...   
4                                  Who owns the IP?    
5  Is there a non-compete obligation to the Advisor?   
6              Can the Advisor charge for meal time?   
7             In which street does the Advisor live?   
8       Is the Advisor entitled to social benefits?    
9  What happens if the Advisor claims compensatio...   

                                              answer  \
0  The parties to the Agreement are Cloud Investm...   
1  Either party may terminate the Agreement upon ...   
2  The payments to the Advisor under the Agreemen...   
3  The Agreement may not be assigned, sold, deleg...   
4  The Company fully and exclusively owns the IP,...   
5  Yes, during the term of engagement with the 