# Analysis

In [None]:
! pip install langchain
! pip install openai
! pip install tiktoken
! pip install tqdm
! pip install chromadb
! pip install langchain_experimental
! pip install langchain_openai

In [2]:
import os
from typing import List, Union

import chromadb

from langchain.pydantic_v1 import Field, BaseModel
from langchain.chains.openai_functions import create_structured_output_chain
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

In [3]:
def configure_openai_api_key(api_key: str = "") -> None:
    """Expose the OpenAI API key through the OPENAI_API_KEY environment variable.

    Args:
        api_key: The key to use. When empty, a key that is already exported in
            the environment is left untouched; if none is exported the variable
            is set to the empty string.
    """
    # Never overwrite an already-exported key with an empty placeholder.
    if api_key or "OPENAI_API_KEY" not in os.environ:
        os.environ["OPENAI_API_KEY"] = api_key


# Add your API key here, or export OPENAI_API_KEY before starting the kernel.
# Do not commit a real key with the notebook.
configure_openai_api_key("")

# Populate the Database using the Knowledge Base

In [4]:
# Build the Chroma client with resets enabled, then wipe it so that re-running
# this cell always starts from an empty database.
chroma_settings = chromadb.Settings(allow_reset=True)
client = chromadb.Client(chroma_settings)
client.reset()

# Collection that stores the chunks of the GitHub agreement documents.
collection = client.get_or_create_collection("github-customer-agreement")

In [5]:
# (header marker, label) pairs for the three Markdown header levels:
# ("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3").
headers_to_split_on = [("#" * depth, f"Header {depth}") for depth in range(1, 4)]

# Splitters shared by the chunking helpers defined in this cell.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
semantic_chunker = SemanticChunker(OpenAIEmbeddings())

def chunk_markdown(document: str):
    """Split a document based on its Markdown header structure.

    Args:
        document: Raw Markdown text.

    Returns:
        The chunks produced by ``markdown_splitter.split_text``.
    """
    # Reuse the module-level splitter (same configuration) instead of building
    # a new one on every call, mirroring how chunk_semanticly uses its chunker.
    return markdown_splitter.split_text(document)

def chunk_semanticly(document: str):
    """Split a document by grouping sentences that are semantically close.

    Args:
        document: Raw text to split.

    Returns:
        The documents created by the shared ``semantic_chunker``.
    """
    texts = [document]
    return semantic_chunker.create_documents(texts)

def read_and_chunk(path: str, chunking_function=chunk_semanticly):
    """Read a text file and chunk its content.

    Args:
        path: Path of the file to read.
        chunking_function: Callable that receives the file content as a single
            string and returns the chunks. Defaults to ``chunk_semanticly``.

    Returns:
        Whatever ``chunking_function`` returns (a list of documents for the
        chunkers defined in this notebook).
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        document = f.read()
    return chunking_function(document)

def upsert_documents(document_id, documents):
    """Insert or update a list of chunk documents in the collection.

    Each chunk is stored under the id ``"<document_id>-<index>"``, where index
    is the chunk's position in ``documents``.

    Args:
        document_id: Prefix used to build the chunk ids.
        documents: Sequence of objects exposing ``page_content`` and ``metadata``.
    """
    # Nothing to store: skip the call instead of sending empty lists.
    if not documents:
        return

    collection.upsert(
        documents=[document.page_content for document in documents],
        metadatas=[document.metadata for document in documents],
        ids=[f"{document_id}-{i}" for i in range(len(documents))],
    )

In [6]:
# Markdown files that make up the knowledge base.
knoledge_base = [
    "github-general-terms.md",
    "github-data-protection.md",
]

# Chunk every file along its Markdown headers and store the chunks, using the
# file path as the id prefix.
for document_path in knoledge_base:
    documents = read_and_chunk(document_path, chunking_function=chunk_markdown)
    upsert_documents(document_path, documents)

/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:01<00:00, 46.7MiB/s]


In [7]:
# Sanity check: display how many chunks the collection holds after ingestion.
collection.count()

44

# Chain of Thought QA using Vector Database with Knowledge Base

In [8]:
# Structured-output model for the background section of a chain of thought.
# NOTE(review): the docstring and Field descriptions presumably end up in the
# schema sent to the model — confirm before rewording them.
class Background(BaseModel):
    """Background explaining user question"""
    # Required free-text background.
    background: str = Field(..., description="Background explaining user questions")

# Structured-output model for one reasoning step, with two self-assessment flags.
# NOTE(review): the docstring and Field descriptions presumably end up in the
# schema sent to the model — confirm before rewording them.
class Thought(BaseModel):
    """A thought about user question"""
    # Required text of the thought.
    thought: str = Field(..., description="A thought about user question")
    # Required flag: the thought is flawed or misleading.
    flawed: bool = Field(..., description="Whether or not the thought is flawed or misleading")
    # Required flag: the thought helps to solve the question.
    helpful: bool = Field(..., description="Whether or not the thought is helpful to solve user question")

# Structured-output model for the final answer and its self-assessed score.
class Answer(BaseModel):
    """The answer to user question"""
    # The misspelled field name is kept on purpose: cot_to_markdown reads
    # `cot.answer.awnser`, so renaming it here alone would break that caller.
    awnser: str = Field(..., description="Answer to user question")
    # `ge`/`le` are the pydantic numeric bounds; the previous `min_value` /
    # `max_value` keywords are not Field constraints and were never enforced.
    score: int = Field(..., description="Score from 1 to 10 on how correct the anwser is", ge=1, le=10)

# Top-level structured-output model: background, list of thoughts, final answer.
# This is the model handed to create_structured_output_chain by
# ChainOfThoughtBuilder.
# NOTE(review): the docstring and Field descriptions presumably end up in the
# schema sent to the model — confirm before rewording them.
class ChainOfThought(BaseModel):
    """A chain of thoughts to answer user question"""
    # Required background section.
    background: Background = Field(..., description="Background explaining user questions")
    # Required list of reasoning steps.
    thoughts: List[Thought] = Field(..., description="List of thoughts about user question")
    # Required final answer with its score.
    answer: Answer = Field(..., description="The answer to user question")

class ChainOfThoughtBuilder():
    """Build a chain of thought (CoT) to answer a question using an LLM.

    The builder combines a fixed chat prompt (system rules, optional context,
    the user question) with a structured-output chain whose output model is
    ``ChainOfThought``.
    """

    def __init__(self, llm, verbose=False):
        """Create the prompt and the structured-output chain.

        Args:
            llm: Chat model passed to ``create_structured_output_chain``.
            verbose: Forwarded to ``create_structured_output_chain``.
        """
        # Join the lines with explicit newlines. Adjacent string literals are
        # concatenated without any separator, which glued the rules together
        # ("...world.You will receive...rules:- First, ...").
        # NOTE(review): the second rule asks for "relevant" while the Thought
        # model exposes a `flawed` flag — confirm the intended wording.
        system_rules = "\n".join([
            "You are the most intelligent person in the world.",
            "",
            "You will receive a prize if you follow ALL these rules:",
            "- First, establish a detailed background useful to anwser the user question.",
            "- Each thought must include whether it is relevant and whether it is helpful.",
            "- Continue to add thoughts until you can confidently answer the question.",
            "- The anwser must be scored accurately and honestly.",
        ])
        self.prompt = ChatPromptTemplate.from_messages([
            ("system", system_rules),
            ("human", 'Useful context: {context}'),
            ("human", 'User question: """{question}"""'),
        ])
        self.chain = create_structured_output_chain(ChainOfThought, llm, self.prompt, verbose=verbose)

    def build(self, question: str, context: Union[str, None] = None):
        """Run the chain for one question.

        Args:
            question: The user question.
            context: Optional supporting text inserted into the prompt. When
                omitted, an empty context is sent instead of the literal
                string "None".

        Returns:
            The result of ``self.chain.run`` for the given inputs.
        """
        prompt_context = context if context is not None else ""
        return self.chain.run(context=prompt_context, question=question)

In [9]:
def cot_to_markdown(cot: ChainOfThought, question: str, top_documents: List[str], metadatas: List[dict]):
    """Render retrieved documents and a ChainOfThought as a Markdown report.

    Args:
        cot: The chain of thought to render (background, thoughts, answer).
        question: The user question, used as a top-level heading.
        top_documents: Texts of the retrieved documents.
        metadatas: Metadata for each retrieved document, paired with
            ``top_documents`` by position. An entry may be ``None``.

    Returns:
        The Markdown text.
    """
    parts = ["# Top documents\n"]
    for i, (document, metadata) in enumerate(zip(top_documents, metadatas)):
        # Metadata values are not guaranteed to be strings and the metadata
        # itself may be missing; str.join would raise on either.
        label = " - ".join(str(value) for value in (metadata or {}).values())
        parts.append(f"## Document {i}: {label}\n")
        parts.append(f"{document}\n\n")

    parts.append(f"# {question}\n")
    parts.append("## Background\n")
    parts.append(f"{cot.background.background}\n\n")

    parts.append("## Thoughts\n")
    for thought in cot.thoughts:
        parts.append(f"- {thought.thought}\n")
        parts.append(f"  - Flawed: {thought.flawed}\n")
        parts.append(f"  - Helpful: {thought.helpful}\n\n")

    parts.append("## Answer\n")
    parts.append(f"- {cot.answer.awnser}\n")
    parts.append(f"  - Score: {cot.answer.score}\n\n")

    return "".join(parts)

def search_topn_documents(query, n_results=5):
    """Query the collection and return its top ``n_results`` matches for ``query``.

    Args:
        query: Query text passed to the collection as ``query_texts``.
        n_results: Number of results to request.

    Returns:
        The raw result of ``collection.query``.
    """
    results = collection.query(query_texts=query, n_results=n_results)
    return results

# Customer Agreement relevant questions

In [10]:
# Questions that are asked about the agreement as written.
_general_questions = [
    "As a customer, can I share the product or service with third parties?",
    "As a customer, can I modify the source code of the product or service to my liking?",
    "Can the supplier make modifications to the products or services?",
    "Does the contract provide any protection for confidential customer information?",
    "Does the customer retain ownership of the data he provides to the supplier?",
    "Are the rights of use the supplier receives over the customer's data limited to what is strictly necessary?",
    "Does the provider commit to any security standards or practices regarding customer content?",
    "Are the licenses received by the supplier on the customer's intellectual property limited?",
    "Does the supplier have to delete personal information after the end of the contract?",
    "Does the supplier indemnify the customer for infringement of third party intellectual property?",
    "Are the customer's indemnification obligations limited in third party claims?",
    "Is the customer's ability to confront the supplier or any other party limited?",
    "Does the supplier have a liability limit of 12 months' quota or higher?",
    "What limits of liability against consequential damages does the supplier have?",
    "If the supplier indemnifies for infringement of third party intellectual property, is it exempt from the limit of liability?",
    "Is the customer's liability limited?",
    "Does the customer have any right to terminate the agreement?",
    "If the contract is self-renewing can the customer opt out at that time?",
    "Does the customer have any liability to pay taxes?",
    "What rights does the customer have regarding data migration?",
    "Is the contract renewal automatic or does it need to be initiated by the customer?",
    "What is the mandatory governing law?",
    "Is the mandatory headquarters located within the United States?",
    "Is the customer's ability to develop or procure similar products or services from other suppliers limited?",
    "Who is responsible for ensuring that the services function properly?",
    "Can the customer notify the supplier via email?",
]

# Preamble shared by every question about the simulated customer scenario.
_scenario_preamble = "To answer the following questions we must simulate a real context. In this case, we consider a customer with an annual subscription, enjoys since January 1, 2024 of the provider's services and the total amount of the fees is 528$."

# Scenario questions; each one is prefixed with the preamble above.
_scenario_questions = [
    "What is the validity period of the contract?",
    "When must the customer terminate the contract in order not to renew the license or subscription to the services?",
    "When will the customer no longer be able to use the service if the contract is terminated?",
    "The customer did not pay the previous fee. How much should the customer pay this month?",
    "An incident has occurred, what is the provider's responsibility?",
    "Until what day can the customer request data migration if the contract has been terminated?",
    "For how long are the pricing conditions of this contract maintained?",
]

# Full list, general questions first, then the scenario questions.
questions = _general_questions + [
    f"{_scenario_preamble} {scenario_question}" for scenario_question in _scenario_questions
]

In [None]:
llm = ChatOpenAI(model="gpt-4-1106-preview", temperature=0)
cot_builder = ChainOfThoughtBuilder(llm, verbose=True)

# Every stored chunk is handed to the model as context. Ask the collection for
# its size instead of hard-coding it, so the cell keeps working when the
# knowledge base changes.
n_chunks = collection.count()

for i, question in enumerate(questions):
    top_documents = search_topn_documents(question, n_results=n_chunks)
    # Index 0 selects the results of the single query that was sent
    # (presumably results are grouped per query text — confirm against chromadb).
    retrieved_texts = top_documents['documents'][0]
    retrieved_metadatas = top_documents['metadatas'][0]

    cot = cot_builder.build(question, context='\n'.join(retrieved_texts))
    markdown = cot_to_markdown(cot, question, retrieved_texts, retrieved_metadatas)

    # Write one report per question; explicit UTF-8 keeps the output
    # independent of the platform's default encoding.
    with open(f"question_{i}.md", "w", encoding="utf-8") as f:
        f.write(markdown)