In [1]:
import os

# Move up to the project root only when we are not already there. A bare
# os.chdir("../") climbs one more level on every re-run of this cell, after
# which the `src` imports and the relative "data/content" path stop resolving.
if not os.path.isdir("src"):
    os.chdir("../")

from src.chroma_store import initialize_vectorstore, load_documents_from_dir
from src.chunking_strategies import chunk_by_semantic, chunk_by_recursive_split

In [2]:

from langchain_cohere import CohereEmbeddings

# Cohere embedding client; it is handed to initialize_vectorstore when the store is built.
embeddings = CohereEmbeddings(model="embed-english-v3.0")

In [3]:
# Load the source documents from the data directory (path is relative to the
# project root set up in the first cell).
documents = load_documents_from_dir("data/content")

# Split the documents with the recursive splitter, chunk_size=400.
chunks = chunk_by_recursive_split(documents, chunk_size=400)
# Alternative chunking strategy, kept for experimentation:
# chunks = chunk_by_semantic(documents)

# Build the vector store from the chunks and the embedding client.
# NOTE(review): the log output shows the existing Chroma database being cleared
# and recreated on this call, so re-running this cell re-embeds every chunk.
vectorstore = initialize_vectorstore(chunks, embeddings)

--INFO-- Loading documents from data/content


2024-07-04 09:31:57 - src.chroma_store - INFO - Clearing out the chroma database.
2024-07-04 09:31:57 - src.chroma_store - INFO - Creating a new chroma database.


--INFO-- Loaded 1 documents
Split 1 documents into 51 chunks.
Time Tracking: Advisor shall provide the Company with a written report, in a format acceptable by the Company, setting forth the number of hours in which he provided the Services, on a daily basis, as well as an aggregated monthly report at the last day of each calendar month.
{'source': 'data/content/Robinson Advisory.docx', 'start_index': 2311}


2024-07-04 09:31:59 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2024-07-04 09:32:00 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/embed "HTTP/1.1 200 OK"


### Initialize retriever

In [11]:
# Threshold-based similarity search: `k` caps the number of hits and
# `score_threshold` sets the minimum relevance score a chunk must reach.
similarity_threshold = 0.5
similarity_count = 20
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        "score_threshold": similarity_threshold,
        "k": similarity_count,
    },
)

### Retrieval of documents

In [4]:
from src.rag_pipeline import create_rank_fusion_chain, generate_answer
from src.utils import format_tuple_docs_to_text, format_docs_to_text

In [60]:
# Query used by the retrieval and reranking cells that follow.
question = "Who owns the IP?"

In [61]:
from langchain_openai import ChatOpenAI

# Chat model (temperature=0) handed to the rank-fusion chain.
llm = ChatOpenAI(temperature=0)
retrieval_chain = create_rank_fusion_chain(question, llm, retriever)

# Rank-fusion retrieval path, currently disabled:
# docs = retrieval_chain.invoke({"question": question})
# context_text = format_tuple_docs_to_text(docs)

# Plain retrieval. `invoke` is the supported Runnable entry point; the older
# `get_relevant_documents` is deprecated and triggers a deprecation warning.
docs = retriever.invoke(question)
context_text = format_docs_to_text(docs)

print(context_text)

  warn_deprecated(


IP: Any Work Product, upon creation, shall be fully and exclusively owned by the Company. The Advisor, immediately upon Company’s request, shall sign any document and/or perform any action needed to formalize such ownership. The Advisor shall not obtain any rights in the Work Product, including moral rights and/or rights for royalties or other consideration under any applicable law (including

------------

Confidentiality, Non-Competition and IP Ownership Undertaking: In connection with the performance of Advisor’s obligations under this Agreement, the Advisor shall execute a Confidentiality, Non-Competition and IP Ownership Undertaking in the form attached hereto as Exhibit A.

------------

Company.

------------

By: ________________________		By:________________________

Name:	Silvan Joseph				Name:	Jack Robinson		

Title: CEO					



Confidentiality, None Compete and IP Ownership Undertaking

Appendix A to Advisory Service Agreement as of June 15th, 2023

------------

any proprie

In [62]:
# Show the text of the first chunk as a quick check of the splitter output.
print(chunks[0].page_content)

- 2-

ADVISORY SERVICES AGREEMENT

This Advisory Services Agreement is entered into as of June 15th, 2023 (the “Effective Date”), by and between Cloud Investments Ltd., ID 51-426526-3, an Israeli company (the "Company"), and Mr. Jack Robinson, Passport Number 780055578, residing at 1 Rabin st, Tel Aviv, Israel, Email: jackrobinson@gmail.com ("Advisor").


#### Reranking with Cohere

In [63]:
# Helper function for printing docs


def pretty_print_docs(docs):
    """Print each document's text under a numbered heading.

    Args:
        docs: Iterable of objects exposing a string ``page_content`` attribute.

    Output is written to stdout: every entry is rendered as
    ``Document <n>:`` (1-based), a blank line, then the document text, and
    consecutive entries are separated by a line of 100 dashes.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        heading = f"Document {number}:\n\n"
        sections.append(heading + doc.page_content)
    print(divider.join(sections))

In [64]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.llms import Cohere
from langchain_cohere import ChatCohere


# No chat model is created here: nothing in this cell passes an LLM to the
# retriever or the reranker, so building one would only rebind the global `llm`.

# First stage: plain similarity search returning the top 5 chunks.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

# Second stage: Cohere reranker wrapped around the first-stage retriever.
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

# Run the two-stage retrieval for the current question and show the result.
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)

2024-07-04 09:52:58 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK"


Document 1:

IP: Any Work Product, upon creation, shall be fully and exclusively owned by the Company. The Advisor, immediately upon Company’s request, shall sign any document and/or perform any action needed to formalize such ownership. The Advisor shall not obtain any rights in the Work Product, including moral rights and/or rights for royalties or other consideration under any applicable law (including
----------------------------------------------------------------------------------------------------
Document 2:

By: ________________________		By:________________________

Name:	Silvan Joseph				Name:	Jack Robinson		

Title: CEO					



Confidentiality, None Compete and IP Ownership Undertaking

Appendix A to Advisory Service Agreement as of June 15th, 2023
----------------------------------------------------------------------------------------------------
Document 3:

Confidentiality, Non-Competition and IP Ownership Undertaking: In connection with the performance of Advisor’s oblig

#### Answer generation

In [105]:
# Prompt text for the answer-generation chain. It contains two placeholders:
#   {context}  - the retrieved text the answer must be derived from
#   {question} - the query being answered
# NOTE(review): guideline 4 reads "don't starting with", and the "Answer:" cue
# sits before the closing reminder block. The text is part of the prompt given
# to the model, so confirm the intended wording/order before editing it.
prompt_template = """
You are an experienced Legal Assistant who analyzes legal documents. Your expertise includes extracting facts and integrating information from multiple sources to provide well-supported answers. 

Guidelines:

1. Derive your answer strictly from the provided context. Do not introduce any new information.

2. Ensure complete contextuality: Address all aspects of the query, linking back to specific details in the context.

3. Avoid phrases like "In the context provided" or "According to my knowledge."

4. Be concise and to the point, don't starting with phrases like, "The parties are ..."

5. Write in a professional and legally appropriate manner.

6. Avoid statements like "Let me know if you need more information" or "I hope this helps."

Previous Q & A examples include:

  *   **Q:** Who owns the Intellectual Property (IP)?
      *   **A:** According to Section 4 of the Undertaking (Appendix A), any Work Product, upon creation, shall be fully and exclusively owned by the Company.
  *   **Q:** Is there a non-compete obligation for the Advisor?
      *   **A:** Yes, during the term of engagement with the Company and for a period of 12 months thereafter.
  *   **Q:** Can the Advisor charge for meal time?
      *   **A:** No. Section 6.1 specifies that billable hours do not include meals or travel time.


Given the guidelines and examples, please answer the question based on the following context.
Context: {context}

Question: {question}

Answer:

REMEMBER YOU SHOULD BE CONCISE AND STRAIGHT TO THE POINT. USE LEGAL TERMINOLOGY WHERE APPROPRIATE.
- MAKE SURE TO REFER TO AND CITE SPECIFIC SECTIONS OF THE DOCUMENTS IN YOUR ANSWER, SUCH AS "ACCORDING TO SECTION 5.1 of ..."

"""

In [106]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.llms import Cohere
from langchain_cohere import ChatCohere


# Two-stage retrieval: top-5 similarity search followed by Cohere reranking.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=retriever
)

# Preview the reranked chunks for the current `question`.
compressed_docs = compression_retriever.invoke(question)
pretty_print_docs(compressed_docs)

# Bind the prompt text to the two variables it contains.
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)
chain_type_kwargs = {"prompt": PROMPT, "verbose": False}

# Single answer-generation model for the chain (temperature=0).
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# "stuff" chain over the reranking retriever; source documents are returned
# alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
    return_source_documents=True,
    verbose=False,
    chain_type_kwargs=chain_type_kwargs,
)

# The chain is only built here. It is invoked in the following cell, so the
# query is sent to the rerank and chat APIs once rather than twice.

In [107]:
# Run the QA chain for one query. The response is indexed by "result" to get
# the generated answer text (the chain was built with
# return_source_documents=True, so the response carries more than the answer).
query = "Can the Advisor charge for meal time?"
response = qa.invoke({"query": query})
result = response["result"]

2024-07-04 09:59:40 - httpx - INFO - HTTP Request: POST https://api.cohere.com/v1/rerank "HTTP/1.1 200 OK"
2024-07-04 09:59:42 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [108]:
# Answer text produced by the QA chain, which uses the OpenAI chat model.
print(result)

No. Billable hours do not include meals, as specified in the definition of "Billable Hour."
