In [None]:
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.vectorstores import Neo4jVector
from langchain_community.chat_models import ChatOllama

from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter, NLTKTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.memory import ConversationBufferMemory


from neo4j import GraphDatabase
from neo4j import exceptions

from pathlib import Path
import os
import sys


In [None]:
# Connection settings for the local Neo4j sandbox.
# `setdefault` keeps any value already exported in the environment, so real
# credentials can be supplied from outside the notebook; the literals below are
# only throwaway defaults for a local container.
NEO4J_DEFAULTS = {
    'NEO4J_DRIV_PORT': '7687',  # driver port, later used to build the neo4j:// URI
    'NEO4J_HTTP_PORT': '7474',  # HTTP port published by the container
    'NEO4J_USERNAME': 'neo4j',
    'NEO4J_PASSWORD': 'temp_password123',
}

for _name, _default in NEO4J_DEFAULTS.items():
    os.environ.setdefault(_name, _default)

# Show the effective settings. The password is masked so it is not persisted in
# the notebook's saved cell output.
for _name in NEO4J_DEFAULTS:
    _shown = '********' if _name == 'NEO4J_PASSWORD' else os.environ[_name]
    print(f"{_name}={_shown}")

In [None]:
# (Re)create the Neo4j container used by this notebook.
# The ${NEO4J_*} placeholders are the names assigned on os.environ in the
# configuration cell; they are expected to be expanded by the shell.
#
# WARNING: destructive. Force-removes EVERY container whose `docker ps -a` line
# matches "neo4j" (case-insensitive), not only ones started from this notebook.
!docker ps -a | grep -ie neo4j | awk '{print $1}' | xargs -I{} docker rm {} -f
# List matching containers again to confirm the removal.
!docker ps -a | grep -ie neo4j
# Start a detached container from neo4j:latest with restart policy "always",
# publishing the HTTP and driver ports and setting NEO4J_AUTH to username/password.
!docker run -d --restart always --publish=${NEO4J_HTTP_PORT}:${NEO4J_HTTP_PORT} --publish=${NEO4J_DRIV_PORT}:${NEO4J_DRIV_PORT} --env NEO4J_AUTH=${NEO4J_USERNAME}/${NEO4J_PASSWORD} neo4j:latest
# List matching containers once more to show the newly started one.
!docker ps -a | grep -ie neo4j

In [None]:
# Embedding model shared by the vector-store cells; uses the "llama3:latest" Ollama model.
ollama_emb = OllamaEmbeddings(model="llama3:latest")

In [None]:
# Source files available to this sandbox, keyed by a short alias.
documents_dictionary_struct = dict(
    smartbear=Path("/Volumes/stuff/graphRagSandbox/assets/SmartBear_TOU-10FEB2023.docx"),
    shrekmovie=Path("/Volumes/stuff/general_playground/assets/the_entire_shrek_script.txt"),
    dantesinferno=Path("/Volumes/stuff/general_playground/assets/dantes_inferno_all_chp.txt"),
)

# Load the SmartBear .docx into LangChain documents.
docx_loader = Docx2txtLoader(documents_dictionary_struct["smartbear"])
documents = docx_loader.load()

# Chunking 1: NLTK splitter with large chunks (1500 chars, 20 overlap).
text_splitter = NLTKTextSplitter(separator=". ", chunk_size=1500, chunk_overlap=20)
nltk_docs = text_splitter.split_documents(documents)

# Chunking 2: recursive character splitter with small chunks (120 chars, 20 overlap),
# trying paragraph breaks, then line breaks, then full stops.
recursive_splitter = RecursiveCharacterTextSplitter(
    separators=['\n\n', "\n", "."],
    chunk_size=120,
    chunk_overlap=20,
    length_function=len,
)
rec_docs = recursive_splitter.split_documents(documents)

# Display the recursive-splitter chunks as the cell output.
rec_docs

In [None]:
# File name of the SmartBear document with its extension stripped.
doc_name_only = documents_dictionary_struct["smartbear"].stem

# Peek at the first chunk: iterating it yields two-item pairs; print only the second item of each.
for _field, field_value in rec_docs[0]:
    print(field_value)

In [None]:
# Neo4j connection parameters, read from the environment.
# The f-string uses double quotes outside and single quotes inside: reusing the
# same quote character inside a replacement field is a SyntaxError before Python 3.12.
URI = f"neo4j://localhost:{os.environ['NEO4J_DRIV_PORT']}"
USERNAME = os.environ['NEO4J_USERNAME']
PASSWORD = os.environ['NEO4J_PASSWORD']

In [None]:
# Direct driver handle to the database, authenticated with the configured credentials.
_credentials = (USERNAME, PASSWORD)
driver = GraphDatabase.driver(URI, auth=_credentials)

# Build a Neo4j vector store from the recursive-splitter chunks using the Ollama embeddings.
db = Neo4jVector.from_documents(
    rec_docs,
    ollama_emb,
    url=URI,
    username=USERNAME,
    password=PASSWORD,
)

In [None]:
# Records returned by the probe query against the full-text index (if it exists).
query_count: list = []

# Idempotent DDL: creates the full-text index over Chunk.text only when missing.
graph_db_index_create: str = (
    "CREATE FULLTEXT INDEX text_index IF NOT EXISTS "
    "FOR (n:Chunk) ON EACH[n.text]"
)

_probe_query = 'CALL db.index.fulltext.queryNodes("text_index", ".") YIELD node RETURN node.id'

with driver.session() as session:
    try:
        # Probe the index first; a ClientError here is reported as "not present".
        result = session.run(_probe_query)
        query_count.extend(result)
    except exceptions.ClientError as err:
        print("FULLTEXT INDEX - Not Present")
        print(err)
    finally:
        # Always issue the CREATE ... IF NOT EXISTS statement, whether or not the probe succeeded.
        session.run(graph_db_index_create)
        print("text_index - Created")
    

In [None]:
# Names of the vector index and the keyword (full-text) index, and the search mode.
index_name, keyword_index_name, search_type = "vector", "text_index", "hybrid"

# Open the existing indexes as a vector store; connection details and index
# options are gathered first and passed as keyword arguments.
_store_options = dict(
    url=URI,
    username=USERNAME,
    password=PASSWORD,
    index_name=index_name,
    keyword_index_name=keyword_index_name,
    search_type=search_type,
)
store = Neo4jVector.from_existing_index(ollama_emb, **_store_options)

# Retriever view over the store, consumed by the QA chain.
retriever = store.as_retriever()

In [None]:
# Prompt for the retrieval QA chain.
# Placeholders:
#   {context}   - name of the document being queried
#   {summaries} - retrieved text inserted by the chain
#   {question}  - the bracketed list of phrases to look for
# The retrieved text is placed BEFORE the question and the prompt ends on
# "Answer:", so the model's completion starts at the answer rather than after
# a trailing block of retrieved text.
prompt_template = """You will be given a list of phrases to search for within the context provided. Strictly follow these outlined rules when producing answers:
1. If you don't know the answer, don't try to make up an answer. Just answer with NULL.
2. If you find the answer, only respond with a list of complete sentences that pertain to the provided phrases in the following format: [sentence1, sentence2, ...]

This document pertains to: {context}

Context:
{summaries}

Question: {question}

Answer:"""

# Wrap the raw template text, declaring the three placeholders it expects.
PROMPT = PromptTemplate(
    input_variables=["context", "question", "summaries"],
    template=prompt_template,
)

In [None]:
# Value passed to the chain under the 'context' key: the source document's
# file name without its extension.
context: str = doc_name_only
# Disabled interactive alternative (would prompt the user for a free-form question):
# question: str = input(f"What is your question about {context}")

# Phrases to look up in the document; sent to the chain as a single question.
# NOTE(review): 'apart of' in the first phrase probably means 'a part of' - confirm before changing.
raw_question_list: list = [
    'customer name apart of the contract',
    'contract effective date',
    'required notice time before non-renewal',
]

# Render the phrases as "[a, b, c]". `join` also yields a well-formed "[]" for
# an empty list, where an index-based loop would leave an unterminated "[".
formatted_question_list: str = "[" + ", ".join(str(phrase) for phrase in raw_question_list) + "]"

# Conversation buffer stored under "history"; reads the "question" input and the "answer" output.
memory = ConversationBufferMemory(
    memory_key="history",
    input_key="question",
    output_key='answer',
    return_messages=True,
)

# Chat model: temperature 0.0, JSON output format requested.
llm = ChatOllama(model="llama3:latest", temperature=0.0, format="json")

# "stuff" retrieval QA chain wired to the retriever, the custom prompt and the memory.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    memory=memory,
)

# Run the chain once with the document name and the bracketed phrase list.
_chain_inputs = {'context': context, 'question': formatted_question_list}
response = chain.invoke(_chain_inputs)

_asked, _answered = response['question'], response['answer']
print(f"Query list: {_asked}\nAnswer list: {_answered}")