In [None]:
import json
# Install some packages
%pip install -r requirements.txt

In [None]:
import ast
import os
import secrets
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Callable, List

from langchain.chains import RetrievalQAWithSourcesChain
from langchain.memory import ConversationBufferMemory
from langchain_community.chat_models import ChatOllama
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Neo4jVector
from langchain_core.prompts import PromptTemplate
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter, NLTKTextSplitter
from neo4j import GraphDatabase
from neo4j import exceptions

# from tqdm.notebook import tqdm
# from sentence_transformers import SentenceTransformer


In [None]:
### Setup container image env vars for auth - neo4j graph db

# Generate a random one-time password for the throwaway graph DB.
one_time_password: str = secrets.token_urlsafe(23)

# Exported through os.environ so that shell commands launched from this kernel
# (the `!docker ...` lines) can read them as ordinary environment variables.
os.environ['NEO4J_DRIV_PORT'] = '7687'
os.environ['NEO4J_HTTP_PORT'] = '7474'
os.environ['NEO4J_USERNAME'] = 'neo4j'
os.environ['NEO4J_PASSWORD'] = one_time_password

# Show the non-secret settings for a quick visual check.
for env_name in ('NEO4J_DRIV_PORT', 'NEO4J_HTTP_PORT', 'NEO4J_USERNAME'):
    print(os.environ[env_name])

# Never print the password itself: cell output is saved with the notebook and
# would leak the credential to anyone the file is shared with.
print(f"NEO4J_PASSWORD is set ({len(os.environ['NEO4J_PASSWORD'])} characters, value hidden)")

In [None]:
### Drop existing graph db completely (rm container) and rebuild from fresh

# Pull latest image if none exist - continue without pulling if already present
!docker images neo4j:latest | awk 'NR>1{print $1}' | if [[ $_ != "neo4j:latest" ]]; then docker pull neo4j:latest; else printf "NEO4J image present.\n"; fi

!docker ps -a | grep -iE 'neo4j|tini -g' | awk '{print $1}' | xargs -I{} docker rm {} -f
!docker ps -a | grep -ie neo4j
!docker run -d --restart always --publish=${NEO4J_HTTP_PORT}:${NEO4J_HTTP_PORT} --publish=${NEO4J_DRIV_PORT}:${NEO4J_DRIV_PORT} --env NEO4J_AUTH=${NEO4J_USERNAME}/${NEO4J_PASSWORD} neo4j:latest
!docker ps -a | grep -ie neo4j

In [None]:
# TODO: also try msmarco-MiniLM-L-12-v3 or all-mpnet-base-v2 through
# sentence-transformers running locally.
# Registry of Ollama model tags keyed by a short name - keeps track of what has
# been tried and what is present, so cells never have to repeat a full tag.
ollama_model_library = dict([
    ("smollm", "smollm:1.7b"),
    ("llama3.1", "llama3.1:8b"),
    ("all-minilm", "all-minilm:l6-v2"),
    ("llama3", "llama3:latest"),
    ("mxbai-embed-large", "mxbai-embed-large:latest"),
    ("codellama", "codellama:13b"),
])

# Embedding client backed by the mxbai-embed-large entry of the registry.
ollama_emb = OllamaEmbeddings(model=ollama_model_library["mxbai-embed-large"])

In [None]:
# Original test documents, keyed by a short nickname.
# NOTE(review): these are absolute paths on a local volume - anyone else running
# the notebook has to point them at their own copies.
documents_dictionary_struct = {
    nickname: Path(location)
    for nickname, location in (
        ("smartbear", "/Volumes/stuff/graphRagSandbox/assets/SmartBear_TOU-10FEB2023.docx"),
        ("shrekmovie", "/Volumes/stuff/general_playground/assets/the_entire_shrek_script.txt"),
        ("dantesinferno", "/Volumes/stuff/general_playground/assets/dantes_inferno_all_chp.txt"),
    )
}

# Only the SmartBear .docx is loaded; the two .txt entries are not read by this cell.
docx_loader = Docx2txtLoader(documents_dictionary_struct["smartbear"])
documents = docx_loader.load()

In [None]:
# One-stop shop for the splitter settings, so they can be changed in a single place during testing.
@dataclass
class RecTxtDataClass:
    """Settings handed to the recursive character text splitter.

    Attributes:
        c_size: Value passed as the splitter's ``chunk_size``.
        c_overlap: Value passed as the splitter's ``chunk_overlap``.
        c_separators: Separator strings passed as ``separators``; built by a
            factory so every instance gets its own list.
        c_len_fun: Callable passed as ``length_function``; defaults to the
            built-in ``len``.
    """

    c_size: int = 120
    c_overlap: int = 20
    c_separators: List[str] = field(default_factory=lambda: ["\n\n", "\n", "."])
    # `len` itself is the default; the old annotation used the function `len`
    # as if it were a type and wrapped it in a redundant lambda.
    c_len_fun: Callable[[str], int] = len

In [None]:
RecTxtObj = RecTxtDataClass()

# A recursive splitter is used so chunks break on the configured separators in
# order (by default paragraph breaks, then line breaks, then full stops). There
# is no whitespace separator in the defaults, so it does not split on single
# words. This worked well for legal docs.
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=RecTxtObj.c_size,
    chunk_overlap=RecTxtObj.c_overlap,
    separators=RecTxtObj.c_separators,
    length_function=RecTxtObj.c_len_fun,
)

rec_docs = recursive_splitter.split_documents(documents)

# Show the chunk count and a short preview rather than dumping every chunk
# into the cell output.
print(f"{len(rec_docs)} chunks produced")
rec_docs[:5]

In [None]:
# File name of the SmartBear document with its extension removed.
smartbear_path = documents_dictionary_struct["smartbear"]
doc_name_only = smartbear_path.stem

In [None]:
# Connection settings, read back from the env vars exported by the setup cell.
# The f-string is double-quoted because reusing the same quote character
# inside the braces is a SyntaxError on Python versions before 3.12.
URI = f"neo4j://localhost:{os.environ['NEO4J_DRIV_PORT']}"
USERNAME = os.environ['NEO4J_USERNAME']
PASSWORD = os.environ['NEO4J_PASSWORD']

In [None]:
driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

# The container is started detached, so on a straight "Run All" the server may
# still be booting when this cell runs. Poll until the driver can connect,
# re-raising the last connectivity error once the attempts are used up.
NEO4J_STARTUP_ATTEMPTS = 30
NEO4J_STARTUP_DELAY_S = 2
for startup_attempt in range(1, NEO4J_STARTUP_ATTEMPTS + 1):
    try:
        driver.verify_connectivity()
        break
    except (exceptions.ServiceUnavailable, exceptions.SessionExpired):
        if startup_attempt == NEO4J_STARTUP_ATTEMPTS:
            raise
        time.sleep(NEO4J_STARTUP_DELAY_S)

# Embed every chunk with the Ollama embedding model and store it in Neo4j.
db = Neo4jVector.from_documents(
    rec_docs, ollama_emb, url=URI, username=USERNAME, password=PASSWORD
)

In [None]:
# Cypher that creates the full-text (keyword) index over Chunk.text; the
# IF NOT EXISTS clause makes re-running the cell harmless.
graph_db_index_create: str = "CREATE FULLTEXT INDEX text_index IF NOT EXISTS FOR (n:Chunk) ON EACH[n.text]"
# Smoke query against that index; its rows are not used.
graph_db_fulltext_label_query: str = 'CALL db.index.fulltext.queryNodes("text_index", ".") YIELD node RETURN node.id'

with driver.session() as session:
    try:
        # consume() drains the result so a server-side failure is raised here,
        # inside the try, instead of being discarded when the session closes.
        session.run(graph_db_index_create).consume()
    except exceptions.ClientError as e:
        print("FULLTEXT INDEX - Not Present")
        print(e)
    else:
        # Runs only after a successful create. The old `finally` branch printed
        # "Created" and queried the index even when creation had failed.
        session.run(graph_db_fulltext_label_query).consume()
        print("text_index - Created")
    

In [None]:
# Vector index name, keyword (full-text) index name, and the search mode.
# NOTE(review): "vector" is presumably the index created by the earlier
# from_documents call - confirm against the database.
index_name, keyword_index_name, search_type = "vector", "text_index", "hybrid"

# Connection details grouped once and unpacked into the call below.
neo4j_connection = dict(url=URI, username=USERNAME, password=PASSWORD)

# Look at this article on streamlining the graphDB construction and index ingestion into a simpler method:
# https://medium.com/neo4j/using-langchain-in-combination-with-neo4j-to-process-youtube-playlists-and-perform-q-a-flow-5d245d51a735
store = Neo4jVector.from_existing_index(
    ollama_emb,
    index_name=index_name,
    search_type=search_type,
    keyword_index_name=keyword_index_name,
    **neo4j_connection,
)

retriever = store.as_retriever()

In [None]:
# Prompt for batch lookups: a list of phrases goes in, a list of verified
# matching sentences is expected back. It worked well for that purpose.
# Placeholders: {context}, {question} and {summaries}.
prompt_template = """You will be give a list of phrases to search for within the context provided. Strictly follow these outlined rules when producing answers:
1. Only answer with information found within the context provided.
2. If you don't know the answer, don't try to make up an answer. Just answer with NONE.
3. If you find the answer, only respond with a list of complete sentences that pertain to the provided phrases in the following format: [sentence1, sentence2, ...]
4. If you do not find the answer at first, attempt again but only once.
5. Do not append newlines to end of response.

This document pertains to: {context}

Question: {question}

Answer:

{summaries}"""

# Declare the three placeholders explicitly so a missing one is caught early.
PROMPT = PromptTemplate(
    input_variables=["context", "question", "summaries"],
    template=prompt_template,
)

In [None]:
# The document's base name is passed to the chain as the 'context' input,
# which fills the {context} slot of the prompt.
context: str = doc_name_only
# Interactive alternative (disabled): ask for a single question instead of
# using the fixed list of lookup phrases below.
# question: str = input(f"What is your question about {context}")

# Lookup phrases that are submitted to the chain together as one question.
raw_question_list: list = [
    'customer name apart of the contract',
    'effective date of document',
    'required notice time before non-renewal',
    'what countries are excluded due to sanctions',
    'who is not liable for delay of duties',
    'how long is given to not pay before suspense',
    'time for notice of non-renewal',
    'how much time is given from invoice date to pay fees',
    'any phrase present that discusses entity purchasing',
    'paragraph that outline acknowledgments for binding a user to agreement of conditions',
]

# Drop empty and whitespace-only entries so they do not become blank items.
sanitized_question_list = [item for item in raw_question_list if item.strip()]

# Render as "[phrase1, phrase2, ...]". str.join yields a well-formed "[]" for
# an empty list, where the old index loop left an unterminated "[".
formatted_question_list: str = "[" + ", ".join(str(item) for item in sanitized_question_list) + "]"

# Conversation memory: stored under "history", keyed on the chain's
# "question" input and "answer" output.
memory = ConversationBufferMemory(
    memory_key="history",
    input_key="question",
    output_key='answer',
    return_messages=True,
)

# The model tag comes from the shared registry instead of a second hard-coded
# copy of the same string. JSON output mode is requested because the answer is
# parsed programmatically after the chain runs.
llm = ChatOllama(
    model=ollama_model_library["llama3"],
    temperature=0.2,
    format="json",
)

# Retrieval QA chain over the hybrid retriever, using the custom prompt.
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": PROMPT},
    memory=memory,
)

# Run the chain once with the document name and the bracketed phrase list.
raw_answer = chain.invoke({'context': context, 'question': formatted_question_list})['answer']

# The LLM is asked for JSON, so parse it as JSON first: ast.literal_eval
# rejects the JSON literals true/false/null. literal_eval stays as a fallback
# for Python-literal style output, which is all the old code could parse.
try:
    response = json.loads(raw_answer)
except json.JSONDecodeError:
    response = ast.literal_eval(raw_answer)

if isinstance(response, dict):
    for question, answer in response.items():
        print(f"Question: {question}\nAnswer: {answer}\n")
else:
    # The prompt asks for a list, so a non-dict payload is a legitimate reply;
    # print it as-is instead of failing on the missing .items().
    print(f"Answer list: {response}")