In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Neo4jVector
from unstructured.partition.pdf import partition_pdf
from unstructured.chunking.title import chunk_by_title
from langchain.docstore.document import Document
from tqdm import tqdm
from dotenv import load_dotenv
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from neo4j import GraphDatabase


# Load settings from the local .env file. override=True asks python-dotenv to
# let values from the file win over variables already set in the process.
load_dotenv('.env', override=True)


# Connection settings for Neo4j plus the Gemini API key, all read from the
# environment. A missing variable yields None, except NEO4J_DATABASE, which
# falls back to 'neo4j' when it is unset or empty.
# Note: NEO4J_URL is populated from the variable named NEO4J_URI.
NEO4J_URL = os.environ.get('NEO4J_URI')
NEO4J_USER = os.environ.get('NEO4J_USERNAME')
NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD')
NEO4J_DATABASE = os.environ.get('NEO4J_DATABASE') or 'neo4j'
GEMINI_API_KEY = os.environ.get('GEMINI_API_KEY')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
%%time

# Path of the PDF that gets ingested.
pdf_file = 'data/EU AI ACT.pdf'

# Splitter applied to the text of each title-based chunk: pieces of at most
# 2000 units with a 200-unit overlap, where length is measured with len()
# (i.e. characters). Separators are treated as literal strings, not regexes.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

def split_pdf_data_from_file(pdf_file, min_chars=5):
    """Partition a PDF and return its text as a list of ``Document`` chunks.

    The file is partitioned with ``partition_pdf``, grouped with
    ``chunk_by_title``, and the text of every group is split further with the
    module-level ``text_splitter``.

    Args:
        pdf_file: Path of the PDF file to read.
        min_chars: Elements whose text is shorter than this many characters
            are skipped. Defaults to 5, the threshold previously hard-coded.

    Returns:
        A list of ``Document`` objects. Each carries the chunk text as
        ``page_content`` and a metadata dict with ``source`` (always
        ``"local"``), ``chunk_seq_id`` (a running counter starting at 0 that
        spans the whole file) and ``page_number`` (``None`` when the element
        metadata has no such entry).
    """
    chunks_with_metadata = []  # accumulate chunk records

    pdf_elements = partition_pdf(pdf_file)
    elements = chunk_by_title(pdf_elements)

    chunk_seq_id = 0
    for element in tqdm(elements):
        if len(element.text) < min_chars:
            continue

        # Resolve the page number once per element rather than once per chunk.
        # .get() avoids a KeyError when the metadata dict has no 'page_number'
        # entry (presumably elements without page information -- verify
        # against the unstructured version in use).
        page_number = element.metadata.to_dict().get('page_number')

        for chunk in text_splitter.split_text(element.text):
            chunks_with_metadata.append(Document(
                page_content=chunk,
                metadata={
                    "source": "local",
                    "chunk_seq_id": chunk_seq_id,
                    "page_number": page_number,
                },
            ))
            chunk_seq_id += 1

    return chunks_with_metadata


# Turn the PDF into chunk-sized Documents.
docs = split_pdf_data_from_file(pdf_file)

# Initialize HuggingFace embeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=model_name)


# Create a Neo4jVectorStore.
# The database is passed explicitly so the embedded chunks are written to the
# same database (NEO4J_DATABASE) that the rest of the notebook connects to,
# instead of whatever Neo4jVector would pick by default.
vector_store = Neo4jVector.from_documents(
    docs,
    embeddings,
    url=NEO4J_URL,
    username=NEO4J_USER,
    password=NEO4J_PASSWORD,
    database=NEO4J_DATABASE,
)

# Perform a similarity search as a quick sanity check of the index
query = "What is the EU AI Act?"
results = vector_store.similarity_search(query)

# Print the search results
for result in results:
    print(result.page_content)

100%|██████████| 1526/1526 [00:01<00:00, 1482.82it/s]


Given the major impact that AI can have on society and the need to build trust, it is vital

for AI and its regulatory framework to be developed in accordance with Union values as

enshrined in Article 2 of the Treaty on European Union (TEU), the fundamental rights

and freedoms enshrined in the Treaties and, pursuant to Article 6 TEU, the Charter. As

a pre-requisite, AI should be a human-centric technology. It should serve as a tool for
Parliament and of the Council16.

Directive 2000/31/EC of the European Parliament and of the Council of 8 June 2000 on certain legal aspects of information society services, in particular electronic commerce, in the Internal Market ('Directive on electronic commerce') (OJ L 178, 17.7.2000, p. 1).

(12)

The notion of ‘AI system’ in this Regulation should be clearly defined and should be

closely aligned with the work of international organisations working on AI to ensure
The Commission shall, after consulting the European Artificial Intelligence Board

In [8]:

# Mirror every chunk into the graph as a :Document node and connect each chunk
# to its nearest neighbours from the vector index with :SIMILAR_TO.
#
# The work is done in two passes. Relationships are created with a MATCH on
# both endpoints, so every :Document node has to exist before the first
# relationship is attempted; doing both in a single pass silently drops every
# edge that points at a chunk which has not been written yet.
#
# MERGE (instead of CREATE) keeps the cell idempotent: re-running it updates
# the existing nodes and does not duplicate nodes or relationships.
#
# Both the driver and the session are context managers, so they are closed
# even when a statement raises part-way through.
with GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USER, NEO4J_PASSWORD)) as driver:
    with driver.session(database=NEO4J_DATABASE) as session:
        # Pass 1: one :Document node per chunk, keyed by chunk_seq_id.
        for doc in tqdm(docs, desc="Document nodes"):
            session.run(
                "MERGE (d:Document {id: $id}) "
                "SET d.content = $content, d.page_number = $page_number",
                id=doc.metadata["chunk_seq_id"],
                content=doc.page_content,
                page_number=doc.metadata["page_number"],
            )

        # Pass 2: create relationships between documents based on similarity.
        for doc in tqdm(docs, desc="SIMILAR_TO relationships"):
            doc_id = doc.metadata["chunk_seq_id"]
            similar_docs = vector_store.similarity_search(doc.page_content, k=5)
            for similar_doc in similar_docs:
                similar_id = similar_doc.metadata["chunk_seq_id"]
                # The chunk itself is normally among its own nearest
                # neighbours; skip it so no self-loop is created.
                if similar_id != doc_id:
                    session.run(
                        "MATCH (d1:Document {id: $id1}), (d2:Document {id: $id2}) "
                        "MERGE (d1)-[:SIMILAR_TO]->(d2)",
                        id1=doc_id,
                        id2=similar_id,
                    )

100%|██████████| 1526/1526 [01:19<00:00, 19.10it/s]


In [9]:
# Prompt template to query Gemini. The {question} and {context} placeholders
# are filled in by the chain that consumes this template.
# NOTE(review): "five sentences minimum" and "keep the answer concise" pull in
# opposite directions -- confirm which of the two is intended.
llm_prompt_template = """You are an assistant for question-answering tasks.
Use the following context to answer the question.
Use five sentences minimum and keep the answer concise.\n
Question: {question} \nContext: {context} \nAnswer:"""
llm_prompt = PromptTemplate.from_template(llm_prompt_template)

# Initialize the Gemini model
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    google_api_key=GEMINI_API_KEY,
)

# Retriever over the vector store, asking for k=5 results per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 5})

# Combine data from documents to readable string format.
def format_docs(docs):
    """Join the ``page_content`` of each item in ``docs`` into one string.

    Consecutive contents are separated by a blank line ("\\n\\n"). An empty
    input produces an empty string.
    """
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

# Create stuff documents chain using LCEL, built up stage by stage.

# Stage 1: retrieve documents for the incoming value and flatten them to text.
context_pipeline = retriever | format_docs

# Stage 2: map the incoming value to the two prompt variables. The question is
# passed through unchanged; the context comes from the retrieval stage.
chain_inputs = {
    "context": context_pipeline,
    "question": RunnablePassthrough(),
}

# Stage 3: fill the prompt, call the model and parse its output to a string.
rag_chain = chain_inputs | llm_prompt | llm | StrOutputParser()

In [10]:
# Ask the chain a sample question; the answer is shown as the cell output.
question = "What is the importance of AI for new businesses?"
rag_chain.invoke(question)

'AI plays a crucial role for new businesses by enhancing prediction capabilities, optimizing operations, and personalizing digital solutions. It provides competitive advantages by improving resource allocation, leading to more efficient and effective business practices. AI empowers new businesses to leverage data and technology to gain insights, automate tasks, and deliver personalized experiences to customers. By embracing AI, startups and SMEs can accelerate their growth, compete more effectively, and drive innovation in their respective industries.'

## Show similarities between two documents