In [3]:
import json
import os
import uuid

import networkx as nx
import numpy as np

import pandas as pd
import requests
from langchain.chains.llm import LLMChain
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.llms.ollama import Ollama
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import RecursiveCharacterTextSplitter


def generate(ollama_url, model_name, prompt, system=None, template=None, context=None, options=None, callback=None):
    """Stream a completion from the ``/api/generate`` endpoint under ``ollama_url``.

    Request fields whose value is ``None`` are left out of the JSON payload.
    Every non-empty line of the streamed reply is parsed as one JSON chunk.
    When ``callback`` is truthy each chunk is passed to it and no text is
    collected; otherwise the ``response`` piece of every chunk that is not
    flagged ``done`` is echoed to stdout and appended to the returned text.

    Returns:
        A ``(text, context)`` tuple: the collected text (``""`` when a callback
        consumed the chunks) and the ``context`` entry of the last chunk flagged
        ``done`` (``None`` when no such chunk was seen).

    Raises:
        requests.HTTPError: via ``raise_for_status`` on an error status code.
    """
    endpoint = f"{ollama_url}/api/generate"
    candidate_fields = {
        "model": model_name,
        "prompt": prompt,
        "system": system,
        "template": template,
        "context": context,
        "options": options
    }
    request_body = {
        name: value for name, value in candidate_fields.items() if value is not None
    }

    collected_text = ""
    last_context = None
    with requests.post(endpoint, json=request_body, stream=True) as response:
        response.raise_for_status()

        for raw_line in response.iter_lines():
            if not raw_line:
                # Blank lines between streamed chunks carry no payload.
                continue
            chunk = json.loads(raw_line)
            if callback:
                callback(chunk)
            elif not chunk.get("done"):
                piece = chunk.get("response", "")
                collected_text = collected_text + piece
                print(piece, end="", flush=True)
            # Checked after the callback ran, exactly like the branch above.
            if chunk.get("done"):
                last_context = chunk.get("context")

    return collected_text, last_context


def graphPrompt(ollama_url, input: str, metadata, model):
    """Ask the model for term/relation pairs contained in one text chunk.

    The chunk is wrapped in triple backticks and sent together with a fixed
    system prompt through ``generate``. The reply is expected to be a JSON
    list of objects; every object is merged with ``metadata`` (metadata keys
    win on conflict).

    Args:
        ollama_url: Base URL forwarded to ``generate``.
        input: Text of the chunk to analyse.
        metadata: Mapping merged into every extracted item.
        model: Model name forwarded to ``generate``.

    Returns:
        A list of dicts, or ``None`` when the reply is not valid JSON or is
        not a JSON list, so that one bad reply does not abort a whole run.
        Items of the list that are not JSON objects are skipped.
    """
    SYS_PROMPT = (
        """You are a network graph maker with legal expertise who extracts terms and their relations from a given context. Your answer should help answer questions about one of the following themes:
            You are provided with a context chunk (delimited by ```) Your task is to extract the ontology 
            of terms mentioned in the given context. These terms should help answer questions on following themes:
                Document Name', 'Parties', 'Agreement Date', 'Effective Date', 'Expiration Date',
                Renewal Term', 'Notice Period To Terminate Renewal', 'Governing Law',
                Most Favored Nation', 'Competitive Restriction Exception', 'Non-Compete',
                Exclusivity', 'No-Solicit Of Customers', 'No-Solicit Of Employees',
                Non-Disparagement', 'Termination For Convenience', 'Rofr/Rofo/Rofn',
                Change Of Control', 'Anti-Assignment', 'Revenue/Profit Sharing', 'Price Restrictions',
                Minimum Commitment', 'Volume Restriction', 'Ip Ownership Assignment',
                Joint Ip Ownership', 'License Grant', 'Non-Transferable License',
                Affiliate License-Licensor', 'Affiliate License-Licensee',
                Unlimited/All-You-Can-Eat-License', 'Irrevocable Or Perpetual License',
                Source Code Escrow', 'Post-Termination Services', 'Audit Rights',
                Uncapped Liability', 'Cap On Liability', 'Liquidated Damages',
                Warranty Duration', 'Insurance', 'Covenant Not To Sue', 'Third Party Beneficiary'
            Thought 1: While traversing through each sentence, Think about the key terms mentioned in it.
                Terms may include object, entity, location, organization, person, 
                condition, acronym, documents, service, concept, etc.
                Terms should be as atomistic as possible
            Thought 2: Think about how these terms can have one on one relation with other terms.
                Terms that are mentioned in the same sentence or the same paragraph are typically related to each other.
                Terms can be related to many other terms
            Thought 3: Find out the relation between each such related pair of terms. 
                    "Format your output as a list of json. Each element of the list contains a pair of terms"
                    "Don't write anything else except for json file"
                    "and the relation between them, like the follwing: \n"
                    "[\n"
                    "   {\n"
                    '       "node_1": "A concept from extracted ontology",\n'
                    '       "node_2": "A related concept from extracted ontology",\n'
                    '       "edge": "relationship between the two concepts, node_1 and node_2 in one or two sentences"\n'
                    "   }, {...}\n"
                    "]"
                """
    )
    # NOTE(review): the prompt text above is kept byte-for-byte. It contains
    # stray quote characters and unbalanced quotes in the theme list, which
    # look like leftovers from a string-concatenation version; confirm before
    # cleaning up, since any change alters what the model is asked.
    USER_PROMPT = f"context: ```{input}``` \n\n output: "
    response, _ = generate(ollama_url=ollama_url, model_name=model, system=SYS_PROMPT, prompt=USER_PROMPT)

    try:
        parsed = json.loads(response)
    except json.JSONDecodeError as exc:
        # LLM output is not guaranteed to be well-formed JSON; skip the chunk
        # instead of losing the work already done for earlier chunks.
        print(f"\nCould not parse model output as JSON ({exc}); skipping chunk {metadata}")
        return None

    if not isinstance(parsed, list):
        print(f"\nModel output is not a JSON list; skipping chunk {metadata}")
        return None

    return [dict(item, **metadata) for item in parsed if isinstance(item, dict)]


def documents2Dataframe(documents) -> pd.DataFrame:
    """Turn document chunks into a DataFrame with one row per chunk.

    Each row holds the chunk's ``page_content`` under ``text``, every entry
    of the chunk's ``metadata`` mapping, and a fresh random ``chunk_id``
    (``uuid4`` hex). Key order matters: metadata may override ``text`` and
    ``chunk_id`` always overrides a metadata entry of the same name.

    Args:
        documents: Iterable of objects exposing ``page_content`` and
            ``metadata``.

    Returns:
        The assembled DataFrame (empty when ``documents`` is empty).
    """
    # A comprehension builds the list in one pass; the previous
    # ``rows = rows + [row]`` copied the whole list on every iteration.
    rows = [
        {
            "text": chunk.page_content,
            **chunk.metadata,
            "chunk_id": uuid.uuid4().hex,
        }
        for chunk in documents
    ]
    return pd.DataFrame(rows)


def df2Graph(ollama_url, dataframe: pd.DataFrame, model=None) -> list:
    """Run ``graphPrompt`` on every chunk and flatten the results.

    Args:
        ollama_url: Base URL forwarded to ``graphPrompt``.
        dataframe: Frame with a ``text`` and a ``chunk_id`` column; one
            prompt is issued per row.
        model: Model name forwarded to ``graphPrompt``.

    Returns:
        A flat list with the items returned for all chunks, in row order.
        Chunks for which ``graphPrompt`` returned ``None`` contribute
        nothing. An empty frame yields ``[]``.
    """
    concept_list = []
    # Iterating the two columns directly (instead of ``apply`` followed by
    # ``np.concatenate``) keeps the empty case well defined: concatenating
    # zero arrays raises ValueError.
    for text, chunk_id in zip(dataframe["text"], dataframe["chunk_id"]):
        chunk_concepts = graphPrompt(ollama_url, text, {"chunk_id": chunk_id}, model)
        if chunk_concepts is None:
            continue
        concept_list.extend(chunk_concepts)
    return concept_list


def graph2Df(nodes_list) -> pd.DataFrame:
    """Build the edge table from the raw list of extracted relations.

    Blank or whitespace-only cells are treated as missing, rows without a
    ``node_1`` or ``node_2`` value are dropped, and both node columns are
    lower-cased.

    Args:
        nodes_list: List of dicts, normally carrying ``node_1``, ``node_2``,
            ``edge`` and ``chunk_id`` keys.

    Returns:
        The cleaned DataFrame. The four columns named above are always
        present, even for empty input, so callers can select them safely.
    """
    graph_dataframe = pd.DataFrame(nodes_list)
    # An empty list (or items lacking a key) would otherwise make the
    # dropna/lower steps below fail with KeyError.
    for column in ("node_1", "node_2", "edge", "chunk_id"):
        if column not in graph_dataframe.columns:
            graph_dataframe[column] = np.nan

    graph_dataframe = graph_dataframe.replace(r"^\s*$", np.nan, regex=True)
    graph_dataframe = graph_dataframe.dropna(subset=["node_1", "node_2"])
    for column in ("node_1", "node_2"):
        # The model occasionally returns non-string nodes (e.g. numbers);
        # coerce to str so that lower-casing cannot raise AttributeError.
        graph_dataframe[column] = graph_dataframe[column].astype(str).str.lower()
    return graph_dataframe


def _contextual_proximity(df: pd.DataFrame) -> pd.DataFrame:
    """Derive "contextual proximity" edges between nodes sharing a chunk."""
    dfg_long = pd.melt(
        df, id_vars=["chunk_id"], value_vars=["node_1", "node_2"], value_name="node"
    )
    dfg_long = dfg_long.drop(columns=["variable"])
    # Self-join on chunk_id: every pair of nodes mentioned in the same chunk.
    dfg_wide = pd.merge(dfg_long, dfg_long, on="chunk_id", suffixes=("_1", "_2"))
    not_self_loop = dfg_wide["node_1"] != dfg_wide["node_2"]
    dfg2 = dfg_wide[not_self_loop].reset_index(drop=True)
    dfg2 = (
        dfg2.groupby(["node_1", "node_2"])
        .agg({"chunk_id": [",".join, "count"]})
        .reset_index()
    )
    dfg2.columns = ["node_1", "node_2", "chunk_id", "count"]
    dfg2 = dfg2.replace("", np.nan)
    dfg2 = dfg2.dropna(subset=["node_1", "node_2"])
    # Pairs seen only once are dropped. ``copy`` makes the result an
    # independent frame so the column assignment below is unambiguous.
    dfg2 = dfg2[dfg2["count"] != 1].copy()
    dfg2["edge"] = "contextual proximity"
    return dfg2


def generate_graph(ollama_url, filename, outputdirectory, regenerate=True, model='llama3:latest'):
    """Build a term graph for one text file.

    The file is loaded, split into overlapping chunks (1500 characters,
    150 overlap) and each chunk is sent to the model for relation extraction.
    The extracted edges are combined with contextual-proximity edges and
    loaded into an undirected ``networkx`` graph.

    Args:
        ollama_url: Base URL forwarded to ``df2Graph``.
        filename: Path of the text file to process.
        outputdirectory: Directory receiving ``graph.csv`` and ``chunks.csv``
            (created if missing).
        regenerate: When true (default, the previous hard-coded behaviour)
            the edges are extracted with the model and the CSV files are
            rewritten; when false the edges are read back from ``graph.csv``.
        model: Model name used for extraction.

    Returns:
        ``networkx.Graph`` whose edges carry ``title`` (joined edge
        descriptions) and ``weight`` (summed count divided by 4).
    """
    loader = TextLoader(filename)
    documents = loader.load()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1500,
        chunk_overlap=150,
        length_function=len,
        is_separator_regex=False,
    )

    pages = splitter.split_documents(documents)
    print("Number of chunks = ", len(pages))
    df = documents2Dataframe(pages)

    graph_csv = os.path.join(outputdirectory, "graph.csv")
    if regenerate:
        concepts_list = df2Graph(ollama_url=ollama_url, dataframe=df, model=model)
        dfg1 = graph2Df(concepts_list)
        os.makedirs(outputdirectory, exist_ok=True)

        dfg1.to_csv(graph_csv, sep="|", index=False)
        df.to_csv(os.path.join(outputdirectory, "chunks.csv"), sep="|", index=False)
    else:
        # ``outputdirectory`` is a plain string, so the former
        # ``outputdirectory / "graph.csv"`` raised TypeError. chunk_id is read
        # as text because it is later concatenated with ``",".join``.
        dfg1 = pd.read_csv(graph_csv, sep="|", dtype={"chunk_id": str})

    dfg1 = dfg1.replace("", np.nan)
    dfg1 = dfg1.dropna(subset=["node_1", "node_2", 'edge'])
    # Model-extracted edges get a base count of 4, i.e. weight 1 after the
    # division applied when edges are added below.
    dfg1['count'] = 4

    dfg2 = _contextual_proximity(dfg1)

    dfg = pd.concat([dfg1, dfg2], axis=0)
    dfg = (
        dfg.groupby(["node_1", "node_2"])
        .agg({"chunk_id": ",".join, "edge": ','.join, 'count': 'sum'})
        .reset_index()
    )

    G = nx.Graph()

    nodes = set(dfg['node_1']).union(set(dfg['node_2']))
    for node in nodes:
        G.add_node(str(node))

    # NOTE(review): the graph is undirected, so a row (a, b) and a row (b, a)
    # address the same edge and the later one overwrites the attributes of
    # the earlier one. Confirm whether merging was intended.
    for node_1, node_2, title, count in zip(
        dfg["node_1"], dfg["node_2"], dfg["edge"], dfg["count"]
    ):
        G.add_edge(
            str(node_1),
            str(node_2),
            title=title,
            weight=float(count) / 4
        )
    return G


def clear_vectorstore(chroma_client):
    """Delete every stored document from the client's collection.

    Reads the collection from ``chroma_client._collection``, fetches all of
    its ids and deletes them. A message describing the outcome is printed
    in every case. Any exception is caught and reported on stdout rather than
    propagated, so the function always returns ``None``.
    """
    try:
        collection = chroma_client._collection
        if not collection:
            print("No collection found.")
            return

        stored_ids = collection.get()['ids']
        if not stored_ids:
            print("No documents to delete.")
            return

        collection.delete(ids=stored_ids)
        print(f"Deleted {len(stored_ids)} documents from the vector store.")
    except Exception as e:
        print(f"Error clearing vector store: {e}")


def add_edges_to_vectorstore(graph, vectorstore):
    """Index every graph edge as a text in the vector store.

    Each edge becomes the string ``"<u>: <title>: <v>"``, where ``title`` is
    the edge's ``title`` attribute (``'No title provided'`` when absent).

    Args:
        graph: Graph exposing ``edges(data=True)`` and ``number_of_edges()``.
        vectorstore: Store exposing ``add_texts``.

    Returns:
        The number of edges in ``graph``.
    """
    edge_texts = [
        f"{u}: {data.get('title', 'No title provided')}: {v}"
        for u, v, data in graph.edges(data=True)
    ]
    # Nothing to index for an edgeless graph; avoid handing the store an
    # empty batch.
    if edge_texts:
        vectorstore.add_texts(edge_texts)
    return graph.number_of_edges()


def retrieve_and_traverse_edges(question, retriever, graph):
    """Collect retrieved edge texts plus their two-hop graph neighbourhood.

    The retriever returns edge texts of the form ``"<u>: <title>: <v>"``.
    For each of the first 20 texts, the target node ``v`` is looked up in
    ``graph``; edges from ``v`` to up to 20 neighbours, and from each of those
    to up to 20 of their neighbours, are added as
    ``"<a> -> <b>: <edge data>"`` lines.

    Args:
        question: Query passed to ``retriever.get_relevant_documents``.
        retriever: Object returning documents with a ``page_content`` field.
        graph: Graph supporting ``in``, ``neighbors`` and ``get_edge_data``.

    Returns:
        All distinct lines joined by blank lines, in first-seen order
        (retrieved texts first), so the same inputs always give the same
        context string.
    """
    docs = retriever.get_relevant_documents(question)
    edge_contexts = [doc.page_content for doc in docs]

    max_edge_contexts = 20

    # A dict serves as an insertion-ordered set; joining a plain ``set``
    # made the context order vary from run to run.
    all_traversed_edges = dict.fromkeys(edge_contexts)

    for edge_context in edge_contexts[:max_edge_contexts]:
        parts = edge_context.split(': ')
        # Titles are free text and may themselves contain ": ", so accept
        # more than three parts and take the last one as the target node.
        if len(parts) < 3:
            continue
        v = parts[-1]
        if v not in graph:
            continue
        for neighbour in list(graph.neighbors(v))[:max_edge_contexts]:
            edge_info = graph.get_edge_data(v, neighbour, default="No data")
            all_traversed_edges[f"{v} -> {neighbour}: {edge_info}"] = None
            # Second level: neighbours of the neighbour.
            for second_neighbour in list(graph.neighbors(neighbour))[:max_edge_contexts]:
                second_edge_info = graph.get_edge_data(neighbour, second_neighbour, default="No data")
                all_traversed_edges[f"{neighbour} -> {second_neighbour}: {second_edge_info}"] = None

    return "\n\n".join(all_traversed_edges)


def ask_questions(llama_model, document_name, graph, retriever):
    """Ask one question per contract clause category about a document.

    For every category a question is formed, context is gathered with
    ``retrieve_and_traverse_edges`` and the model is queried through an
    ``LLMChain``. Each answer is printed as soon as it is available.

    Args:
        llama_model: LLM handed to ``LLMChain``.
        document_name: Name inserted into every question.
        graph: Graph used for context traversal.
        retriever: Retriever used for context lookup.

    Returns:
        Dict mapping each question to the model's answer. If an error occurs
        it is reported on stdout and the answers collected up to that point
        are returned (possibly an empty dict).
    """
    columns = [
        'Document Name', 'Parties', 'Agreement Date', 'Effective Date', 'Expiration Date',
        'Renewal Term', 'Notice Period To Terminate Renewal', 'Governing Law',
        'Most Favored Nation', 'Competitive Restriction Exception', 'Non-Compete',
        'Exclusivity', 'No-Solicit Of Customers', 'No-Solicit Of Employees',
        'Non-Disparagement', 'Termination For Convenience', 'Rofr/Rofo/Rofn',
        'Change Of Control', 'Anti-Assignment', 'Revenue/Profit Sharing', 'Price Restrictions',
        'Minimum Commitment', 'Volume Restriction', 'Ip Ownership Assignment',
        'Joint Ip Ownership', 'License Grant', 'Non-Transferable License',
        'Affiliate License-Licensor', 'Affiliate License-Licensee',
        'Unlimited/All-You-Can-Eat-License', 'Irrevocable Or Perpetual License',
        'Source Code Escrow', 'Post-Termination Services', 'Audit Rights',
        'Uncapped Liability', 'Cap On Liability', 'Liquidated Damages',
        'Warranty Duration', 'Insurance', 'Covenant Not To Sue', 'Third Party Beneficiary'
    ]

    answers = {}
    try:
        # The prompt and chain do not depend on the question, so they are
        # built once instead of once per category.
        prompt_template = PromptTemplate(template="""
            You're a helpful assistant. Your job is to answer questions based on context. Keep your answers short and concise. If there is no answer in the provided context just write "No answer" with nothing else.
            Context: {context}\n\nQuestion: {question}
            """, input_variables=["context", "question"])
        chain = LLMChain(llm=llama_model, prompt=prompt_template)

        for column in columns:
            question = f"What is the {column.lower().replace('_', ' ')} of the document '{document_name}'?"

            context = retrieve_and_traverse_edges(question, retriever, graph)
            answer = chain.run({"context": context, "question": question})
            print(answer)
            answers[question] = answer
    except Exception as e:
        # Best effort: report the failure but keep the answers already paid
        # for with model calls instead of discarding them.
        print(f"Error asking questions for document {document_name}: {e}")
    return answers


def _save_results(results, results_graph_path, counter):
    """Write the accumulated answers to ``results_graph_<counter>.csv``."""
    df = pd.DataFrame(results).T
    df.to_csv(f'{results_graph_path}/results_graph_{counter}.csv', index=True)
    print(f"Results saved to {results_graph_path}/results_graph_{counter}.csv")


def process_documents(ollama_url, documents_path, results_graph_path, graphs_path):
    """Run the graph-building and question-answering pipeline over a folder.

    For every ``.txt`` file in ``documents_path`` (sorted by name) that does
    not already appear in a ``results_graph_*.csv`` file, a graph is built,
    its edges are indexed in the vector store, the clause questions are asked
    and the graph is saved as ``<doc_name>.gml`` in ``graphs_path``.

    Answers of the current run are written to
    ``results_graph_<n>.csv`` (``n`` = number of documents processed so far,
    including earlier runs) after every third document and once more at the
    end, so no trailing document is lost.

    Args:
        ollama_url: Base URL of the Ollama server, used for extraction,
            embeddings and answering alike.
        documents_path: Directory containing the ``.txt`` documents.
        results_graph_path: Directory for the result CSV files (created if
            missing).
        graphs_path: Directory for graph artefacts (created if missing).
    """
    llama_model = Ollama(model="llama3", base_url=ollama_url)
    embedding_model = OllamaEmbeddings(model="llama3", base_url=ollama_url)
    vectorstore = Chroma("vdbase", embedding_model)

    os.makedirs(results_graph_path, exist_ok=True)
    os.makedirs(graphs_path, exist_ok=True)

    results = {}
    document_files = sorted(f for f in os.listdir(documents_path) if f.endswith(".txt"))

    processed_files = set()
    results_files = [f for f in os.listdir(results_graph_path) if f.startswith('results_graph_') and f.endswith('.csv')]
    for rf in results_files:
        df = pd.read_csv(os.path.join(results_graph_path, rf), index_col=0)
        # Compare as text: a purely numeric document name would otherwise be
        # parsed as a number and never match ``doc_name``.
        processed_files.update(df.index.astype(str))

    # Counting documents rather than result files keeps file names unique
    # across resumed runs; seeding from the file count could reuse (and
    # overwrite) the name of an earlier result file.
    counter = len(processed_files)
    unsaved = False

    for filename in document_files:
        doc_name = os.path.splitext(filename)[0]
        if doc_name in processed_files:
            print(f"Skipping already processed document: {filename}")
            continue

        print(f"Processing document: {filename}")
        filepath = os.path.join(documents_path, filename)
        print(filepath)

        graph = generate_graph(ollama_url, filepath, graphs_path)

        clear_vectorstore(vectorstore)
        print("Vector store cleared.")

        edges = add_edges_to_vectorstore(graph, vectorstore)
        print("Document edges added to vector store.")

        if edges:
            retriever = vectorstore.as_retriever(search_kwargs={"k": min(edges, 15)})
            doc_results = ask_questions(llama_model, doc_name, graph, retriever)
        else:
            # A retriever asked for k=0 results cannot answer anything.
            print(f"No edges extracted for document: {doc_name}")
            doc_results = {}
        results[doc_name] = doc_results

        counter += 1
        unsaved = True

        if counter % 3 == 0:
            _save_results(results, results_graph_path, counter)
            unsaved = False

        # Save the graph with the document name
        nx.write_gml(graph, os.path.join(graphs_path, f'{doc_name}.gml'))
        print(f"Graph saved for document: {doc_name}")

    # Flush documents processed since the last periodic save.
    if unsaved:
        _save_results(results, results_graph_path, counter)

In [4]:
# Driver cell: configure the run and process every document in the folder.
# Base URL of the Ollama server passed to the pipeline.
ollama_url = 'http://localhost:11434'
# Directory scanned for the ``.txt`` documents to process.
documents_path = '/Users/jalal/Documents/projects/KGRAG/data/data_half'
# Directory holding the ``results_graph_*.csv`` answer files (read for resume, written during the run).
results_graph_path = '/Users/jalal/Documents/projects/KGRAG/data/results_graph'
# Directory receiving graph.csv, chunks.csv and the per-document ``.gml`` graphs.
graphs_path = '/Users/jalal/Documents/projects/KGRAG/data/graphs'
process_documents(ollama_url, documents_path, results_graph_path, graphs_path)

Processing document: LOYALTYPOINTINC_11_16_2004-EX-10.2-RESELLER AGREEMENT.txt
/Users/jalal/Documents/projects/KGRAG/data/data_half/LOYALTYPOINTINC_11_16_2004-EX-10.2-RESELLER AGREEMENT.txt
Number of chunks =  107
[
    {
        "node_1": "American Express",
        "node_2": "Incentive Services",
        "edge": "The two entities are part of a joint venture"
    },
    {
        "node_1": "Schoolpop, Inc.",
        "node_2": "Reseller",
        "edge": "Schoolpop is the reseller in this agreement"
    },
    {
        "node_1": "American Express Incentive Services, L.L.C.",
        "node_2": "AEIS",
        "edge": "AEIS is a Missouri limited liability company and one of the parties to the agreement"
    },
    {
        "node_1": "Schoolpop, Inc.",
        "node_2": "Delaware corporation",
        "edge": "Schoolpop is a Delaware corporation and one of the parties to the agreement"
    },
    {
        "node_1": "Agreement",
        "node_2": "Distribution Agreement",
        "edge"

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



",
       rietary Information",
        "node_2": "Confidentiality requirements",
        "edge": "The Contractor will provide Manufacturing Outsourcing Services in a manner that complies with the Confidentiality requirements of APPENDIX E."
    },
    {
        "node_1": "Contractor",
        "node_2": "Third party",
        "edge": "The Contractor shall not disclose, to any third party, except to suppliers/manufacturers of components and/or sub-assemblies that require specifications for their supply."
    },
    {
        "node_1": "NICE Software",
        "node_2": "Reverse engineering",
        "edge": "The Contractor shall not, in any way or manner, directly or indirectly, engineer, reverse engineer, compile, decompile or reverse assemble the NICE Software"
    }
][
    {
        "node_1": "NICE Proprietary Information",
        "node_2": "Personnel",
        "edge": "The NICE Proprietary Information shall not be disclosed to Personnel, except on a need to know basis."
    },
    

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



Document edges added to vector store.
Initial Edges: ['insurance company: contextual proximity: waiver of subrogation clause', 'parties: The Parties mutually agree on the required changes in APPENDIX C regarding the end of life of a component.: mutual agreement', 'services: Contractor provides services to help NICE obtain components for its internal requirements.: components for nice internal requirements', 'engineering system: The PDM system is an engineering system that Contractor shall be granted access to for the purpose of performance of this Agreement.: pdm system', 'products: contextual proximity: e- mail, computerized systems, postal delivery, courier delivery, facsimile transmission', "po's: The concept of cancellation is related to Purchase Orders (PO's) as NICE may, at its discretion, cancel, in whole or in part, PO's of Products issued.: cancellation of purchase orders", "facility: The facility at Contractor's premises designated for the Manufacturing Outsourcing Services i