In [1]:
import os
os.chdir("../")

from src.chroma_store import initialize_vectorstore

In [2]:
vectorstore = initialize_vectorstore()

2024-07-03 11:32:02 - src.chroma_store - INFO - Loading documents from data/content
2024-07-03 11:32:09 - src.chroma_store - INFO - Clearing out the chroma database.
2024-07-03 11:32:09 - src.chroma_store - INFO - Creating a new chroma database.


Split 1 documents into 1999 chunks.


2024-07-03 11:32:13 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-07-03 11:32:18 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-03 11:32:26 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [3]:
similarity_threshold = 0.7
similarity_count = 5
retriever = vectorstore.as_retriever(search_type="similarity_score_threshold",
                                      search_kwargs={'score_threshold': similarity_threshold,
                                                      "k": similarity_count})

### Prompt

In [4]:
question = " Whose consent is required for the assignment of the Agreement by the Buyer?"


In [5]:
from src.rag_pipeline import create_rank_fusion_chain, generate_answer
from langchain.prompts import ChatPromptTemplate
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnableMap
from src.utils import format_tuple_docs_to_text



In [6]:
llm = ChatOpenAI(temperature=0)

retrieval_chain = create_rank_fusion_chain(question, llm, retriever)
docs = retrieval_chain.invoke({"question": question})

context_text = format_tuple_docs_to_text(docs)

answer = generate_answer(question, context_text, llm=llm)

2024-07-03 11:32:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-03 11:32:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-03 11:32:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-03 11:32:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-03 11:32:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-03 11:32:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
  warn_beta(
2024-07-03 11:32:44 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [7]:
answer

'Yes, the consent of the Seller is required for the assignment of the Agreement by the Buyer.'

### Ragas for testing

In [8]:
from src.utils import extract_qa_pairs_to_df

file_path = "data/evaluation_sets/Robinson Q&A.docx"  # Replace with your actual file path
df = extract_qa_pairs_to_df(file_path)

In [9]:
df.head()

Unnamed: 0,question,ground_truths
0,Who are the parties to the Agreement and what ...,Cloud Investments Ltd. (“Company”) and Jack Ro...
1,What is the termination notice?,According to section 4:14 days for convenience...
2,What are the payments to the Advisor under the...,According to section 6: 1. Fees of $9 per hour...
3,Can the Agreement or any of its obligations be...,1. Under section 1.1 the Advisor can’t assign ...
4,Who owns the IP?,According to section 4 of the Undertaking (App...


##### Change the df columns to list

In [15]:
questions = df["question"].tolist()
ground_truths = df["ground_truths"].tolist()

In [18]:
from datasets import Dataset
answers = []
contexts = []

# Inference
for query in questions:
    contexts.append(
        [docs.page_content for docs in retriever.get_relevant_documents(query)]
    )
    context_text = format_tuple_docs_to_text(docs)

    answers.append(generate_answer(query, context_text, llm=llm))

  warn_deprecated(
2024-07-03 11:41:28 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-03 11:41:29 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-03 11:41:30 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-03 11:41:31 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-03 11:41:31 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-03 11:41:33 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-03 11:41:33 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-03 11:41:34 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-03 11:41:34 - httpx - INFO - HTTP Request: POST https://api.o

In [22]:
# To dict
data = {
    "question": questions,
    "answer": answers,
    "contexts": contexts,
    "ground_truth": ground_truths,
}

# Convert dict to dataset
dataset = Dataset.from_dict(data)

In [23]:
from ragas import evaluate
from ragas.metrics import (
    answer_correctness,
    faithfulness,
    answer_similarity,
    context_precision,
    context_utilization,
    context_recall,
    context_relevancy,
    answer_relevancy,
    context_entity_recall,
)


# evaluating dataest on listed metrics
result = evaluate(
    dataset=dataset,
    metrics=[
        answer_correctness,
        faithfulness,
        answer_similarity,
        context_precision,
        context_utilization,
        context_recall,
        context_relevancy,
        answer_relevancy,
        context_entity_recall,
    ],
)


df = result.to_pandas()

Evaluating:   0%|          | 0/90 [00:00<?, ?it/s]2024-07-03 11:46:46 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-03 11:46:46 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-03 11:46:46 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-03 11:46:46 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-03 11:46:46 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-03 11:46:46 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-03 11:46:46 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-07-03 11:46:47 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2024-07-03 11:46:47 - http

In [24]:
df.head()

Unnamed: 0,question,answer,contexts,ground_truth,answer_correctness,faithfulness,answer_similarity,context_precision,context_utilization,context_recall,context_relevancy,answer_relevancy,context_entity_recall
0,Who are the parties to the Agreement and what ...,The parties to the Agreement are the Sellers a...,[The following terms have the meaning set fort...,Cloud Investments Ltd. (“Company”) and Jack Ro...,0.187253,0.333333,0.749011,0.95,0.95,1.0,0.2,0.0,0.0
1,What is the termination notice?,The termination notice is not mentioned in the...,"[notice or lapse of time or both, would consti...",According to section 4:14 days for convenience...,0.200053,0.0,0.800212,0.325,0.325,0.0,0.2,0.0,0.142857
2,What are the payments to the Advisor under the...,The payments to the Advisor under the Agreemen...,[Payments; (b) any fees and expenses owing to ...,According to section 6: 1. Fees of $9 per hour...,0.182553,0.0,0.730111,0.755556,0.755556,0.0,0.125,0.986157,0.0
3,Can the Agreement or any of its obligations be...,Yes,[or obligations hereunder and (b) the Buyer ma...,1. Under section 1.1 the Advisor can’t assign ...,0.177694,,0.710776,1.0,1.0,0.0,0.4,0.911406,0.25
4,Who owns the IP?,The Buyer owns the IP.,"[Tax Ownership of Intellectual Property., Inte...",According to section 4 of the Undertaking (App...,0.20023,0.0,0.80092,0.7,0.7,0.0,0.2,0.991172,0.0
