In [5]:
import json
import os

import pandas as pd
from langchain.chains import LLMChain
from langchain.embeddings import OllamaEmbeddings
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_community.llms import Ollama

csv_file_path = 'data/CUAD_v1/master_clauses.csv'
data = pd.read_csv(csv_file_path)

llama_model = Ollama(model="llama3")
embedding_model = OllamaEmbeddings(model="llama3")

documents_path = 'data/CUAD_v1/full_contract_txt/'
vectorstore = Chroma("vdbase", embedding_model)

In [6]:
def clear_vectorstore(chroma_client):
    try:
        collection = chroma_client._collection
        if collection:
            all_docs = collection.get()
            ids_to_delete = all_docs['ids']

            if ids_to_delete:
                collection.delete(ids=ids_to_delete)
                print(f"Deleted {len(ids_to_delete)} documents from the vector store.")
            else:
                print("No documents to delete.")
        else:
            print("No collection found.")
    except Exception as e:
        print(f"Error clearing vector store: {e}")


def ask_questions(document_name, retriever):
    try:
        columns = [
            'Document Name', 'Parties', 'Agreement Date', 'Effective Date', 'Expiration Date',
            'Renewal Term', 'Notice Period To Terminate Renewal', 'Governing Law',
            'Most Favored Nation', 'Competitive Restriction Exception', 'Non-Compete',
            'Exclusivity', 'No-Solicit Of Customers', 'No-Solicit Of Employees',
            'Non-Disparagement', 'Termination For Convenience', 'Rofr/Rofo/Rofn',
            'Change Of Control', 'Anti-Assignment', 'Revenue/Profit Sharing', 'Price Restrictions',
            'Minimum Commitment', 'Volume Restriction', 'Ip Ownership Assignment',
            'Joint Ip Ownership', 'License Grant', 'Non-Transferable License',
            'Affiliate License-Licensor', 'Affiliate License-Licensee',
            'Unlimited/All-You-Can-Eat-License', 'Irrevocable Or Perpetual License',
            'Source Code Escrow', 'Post-Termination Services', 'Audit Rights',
            'Uncapped Liability', 'Cap On Liability', 'Liquidated Damages',
            'Warranty Duration', 'Insurance', 'Covenant Not To Sue', 'Third Party Beneficiary'
        ]

        answers = {}

        for column in columns:
            question = f"What is the {column.lower().replace('_', ' ')} of the document '{document_name}'?"

            docs = retriever.get_relevant_documents(question)
            context = "\n\n".join([doc.page_content for doc in docs])

            prompt_template = PromptTemplate(template="""
            You're a helpful assistant. Your job is to answer question based on context. Keep your answers short and concise. If there is no answer in provided context just write "No answer" with nothing else.
            Context: {context}\n\nQuestion: {question}
            """, input_variables=["context", "question"])
            chain = LLMChain(llm=llama_model, prompt=prompt_template)

            answer = chain.run({"context": context, "question": question})
            print(answer)
            answers[question] = answer

        return answers
    except Exception as e:
        print(f"Error asking questions for document {document_name}: {e}")
        return {}


results = {}
document_files = [f for f in os.listdir(documents_path) if f.endswith(".txt")]

counter = 0

processed_files = set()
results_files = [f for f in os.listdir() if f.startswith('results_') and f.endswith('.csv')]
for rf in results_files:
    df = pd.read_csv(rf, index_col=0)
    processed_files.update(df.index)

for filename in document_files:
    doc_name = os.path.splitext(filename)[0]
    if doc_name in processed_files:
        print(f"Skipping already processed document: {filename}")
        continue

    try:
        print(f"Processing document: {filename}")
        filepath = os.path.join(documents_path, filename)

        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        splits = text_splitter.split_text(content)
        print(f"Document split into chunks. len: {len(splits)}")

        clear_vectorstore(vectorstore)
        print("Vector store cleared.")

        vectorstore.add_texts(splits)
        print("Document chunks added to vector store.")

        retriever = vectorstore.as_retriever(search_kwargs={"k": min(len(splits), 4)})

        doc_results = ask_questions(doc_name, retriever)
        results[doc_name] = doc_results
        counter += 1

        if counter % 3 == 0:
            df = pd.DataFrame(results).T
            df.to_csv(f'results_{counter // 3}.csv', index=True)
            print(f"Results saved to results_{counter // 3}.csv")

    except Exception as e:
        print(f"Error processing document {filename}: {e}")

if counter % 3 != 0:
    df = pd.DataFrame(results).T
    df.to_csv(f'results_{counter // 3 + 1}.csv', index=True)
    print(f"Results saved to results_{counter // 3 + 1}.csv")

for doc_name, answers in results.items():
    print(f"Results for {doc_name}:")
    for question, answer in answers.items():
        print(f"{question}: {answer}")
    print("\n")

results_file = 'results.json'
with open(results_file, 'w') as f:
    json.dump(results, f, indent=4)

Skipping already processed document: LIMEENERGYCO_09_09_1999-EX-10-DISTRIBUTOR AGREEMENT.txt
Skipping already processed document: WHITESMOKE,INC_11_08_2011-EX-10.26-PROMOTION AND DISTRIBUTION AGREEMENT.txt
Skipping already processed document: LohaCompanyltd_20191209_F-1_EX-10.16_11917878_EX-10.16_Supply Agreement.txt
Skipping already processed document: CENTRACKINTERNATIONALINC_10_29_1999-EX-10.3-WEB SITE HOSTING AGREEMENT.txt
Skipping already processed document: NELNETINC_04_08_2020-EX-1-JOINT FILING AGREEMENT.txt
Skipping already processed document: ADAMSGOLFINC_03_21_2005-EX-10.17-ENDORSEMENT AGREEMENT.txt
Skipping already processed document: KIROMICBIOPHARMA,INC_05_11_2020-EX-10.23-CONSULTING AGREEMENT.txt
Skipping already processed document: VEONEER,INC_02_21_2020-EX-10.11-JOINT VENTURE AGREEMENT.txt
Skipping already processed document: DovaPharmaceuticalsInc_20181108_10-Q_EX-10.2_11414857_EX-10.2_Promotion Agreement.txt
Skipping already processed document: PACIRA PHARMACEUTICALS,

  warn_deprecated(
  warn_deprecated(
  warn_deprecated(


Third Addendum to the 2010 Agreement (2020 Agreement)
The parties to the document are Marv and Premier.
No answer
No answer. The context does not mention an "Effective Date" for the agreement. It only mentions that the agreement is executed on a certain date, but does not specify what that date is.
No answer
No answer
No answer. The provided context does not mention a notice period for terminating or renewing the agreement.
No answer. The provided context does not mention a governing law for the intellectual property agreement.
No answer
There is no competitive restriction exception mentioned in this document.
No answer. The provided context does not contain information about a non-compete clause or any restrictions on competition.
No answer. The provided context is from an intellectual property agreement and does not mention exclusivity.
No answer
No answer
No answer. The provided context does not contain any mention of a document called "PREMIERBIOMEDICALINC_05_14_2020-EX-10.2-INTELL

KeyboardInterrupt: 