RAG Model 2 - Advanced RAG - Knowledge Graphs


#### Initialisation
Basic API keys


In [2]:
import os
from dotenv import load_dotenv
load_dotenv()
#open ai api key:
key = os.environ["OPENAI_3_5_KEY"] #Create a free openai 3.5 turbo key


#### Initialise Azure


In [2]:
import os
import openai
import dotenv
def initialise():
    dotenv.load_dotenv()
    use_azure_active_directory = False  # Set this flag to True if you are using Azure Active Directory (Code been removed)
    if not use_azure_active_directory:
        endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
        api_key = os.environ["AZURE_OPENAI_API_KEY"]

        client = openai.AzureOpenAI(
            azure_endpoint=endpoint,
            api_key=api_key,
            api_version="2023-09-01-preview"
        )
        return client
client = initialise()

deployment = "text-embedding-ada-002-wus" # Fill in the deployment name from the portal here
embeddings = client.embeddings.create(
    model=deployment,
    input="The food was delicious and the waiter..."
)
#Test that embeddings are working/ connection active.              
#print(embeddings)

CreateEmbeddingResponse(data=[Embedding(embedding=[0.0023481971584260464, -0.00928489863872528, 0.015701189637184143, -0.007710971869528294, -0.0047217803075909615, 0.014863453805446625, -0.009849735535681248, -0.038281962275505066, -0.006930354982614517, -0.028660697862505913, 0.025182828307151794, 0.018112851306796074, -0.0036047999747097492, -0.025576310232281685, 0.000499785237479955, -0.01636122167110443, 0.02827990986406803, 0.005350081715732813, 0.009652994573116302, -0.0163866076618433, -0.015396557748317719, 0.004258487373590469, 0.007006512489169836, -0.007241332437843084, -0.0038491394370794296, 0.018506333231925964, 0.00870736874639988, -0.02268231473863125, 0.011474433355033398, 0.023926224559545517, 0.015625031664967537, -0.0034905634820461273, -0.03493101894855499, -0.004160116892307997, -0.026147492229938507, -0.021539948880672455, -0.005632500164210796, 0.011785411275923252, 0.008453509770333767, 0.004122037906199694, 0.01920444518327713, -0.014444585889577866, 0.00897

#### Function that reads a PDF, creates a knowledge graph in neo4j, and adds embeddings.


In [3]:
##Initialise Neo4j
#Ensures that connection is successful 
from neo4j import GraphDatabase

# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = "bolt://localhost:7687"
AUTH = ("neo4j", "m0del2_pass")

with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()
    print("Connection established.")

Connection established.


A lot of the following code was reused from the following repository:<br>
https://github.com/Joshua-Yu/graph-rag/blob/main/openai%2Bllamaparse/demo_neo4j_vectordb.ipynb<br><br>


In [38]:
from llama_parse import LlamaParse
from llama_index.core.node_parser import MarkdownElementNodeParser
import nest_asyncio; 
nest_asyncio.apply()
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

os.environ["OPENAI_API_KEY"] = key

EMBEDDING_MODEL  = "text-embedding-ada-002"
GENERATION_MODEL = "gpt-3.5-turbo"

llm = OpenAI(model=GENERATION_MODEL) #Using the free 3.5 model for now

Settings.llm = llm

## As this only gets preformed once, I decided to change the file name manually. 
## If in the future, you may need to change to include a for loop to iterate through the file.
## An example can be found in the Model 1 jupyter notebook.
pdf_file_name = './Vector_docs/ECB_Doc.pdf'

LlamaParse_api_key = os.environ["Llama_parse_API_key"]
documents = LlamaParse(api_key=LlamaParse_api_key,result_type="markdown").load_data(pdf_file_name) #You will be required to create your own LlamaParse key to parse documents.

print(f"Number of documents: {len(documents)}")
# Parse the documents using MarkdownElementNodeParser
node_parser = MarkdownElementNodeParser(llm=llm, num_workers=8)

# Retrieve nodes (text) and objects (table)
nodes = node_parser.get_nodes_from_documents(documents)

#convert nodes to documents
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

print(f"Number of nodes: {len(base_nodes)}")



Started parsing the file under job_id e73977dd-ce34-42c0-9de6-85d974f5556f
.Number of documents: 285


1it [00:00, ?it/s]
100%|██████████| 1/1 [00:01<00:00,  1.54s/it]
1it [00:00, ?it/s]
100%|██████████| 1/1 [00:03<00:00,  3.07s/it]
3it [00:00, ?it/s]
100%|██████████| 3/3 [00:02<00:00,  1.01it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, 87.67it/s]
100%|██████████| 1/1 [00:03<00:00,  3.24s/it]
2it [00:00, ?it/s]
100%|██████████| 2/2 [00:01<00:00,  1.39it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
3it [00:00, 3240.51it/s]
100%|██████████| 3/3 [00:01<00:00,  1.57it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
100%|██████████| 1/1 [00:02<00:00,  2.49s/it]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
1it [00:00, ?it/s]
100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
1it [00:00, ?i

Number of nodes: 389





In [4]:
from neo4j import GraphDatabase

# Local Neo4j instance
# NEO4J_URL = "bolt://localhost:7687"
# Remote Neo4j instance on AuraDB
NEO4J_URL = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "m0del2_pass"
NEO4J_DATABASE = "neo4j"

#We shall now create our knowledge graph's basic format. We create a section, chunk and document.
def initialiseNeo4jSchema():
    cypher_schema = [
        "CREATE CONSTRAINT sectionKey IF NOT EXISTS FOR (c:Section) REQUIRE (c.key) IS UNIQUE;",
        "CREATE CONSTRAINT chunkKey IF NOT EXISTS FOR (c:Chunk) REQUIRE (c.key) IS UNIQUE;",
        "CREATE CONSTRAINT documentKey IF NOT EXISTS FOR (c:Document) REQUIRE (c.url_hash) IS UNIQUE;",
        "CREATE VECTOR INDEX `chunkVectorIndex` IF NOT EXISTS FOR (e:Embedding) ON (e.value) OPTIONS { indexConfig: {`vector.dimensions`: 1536, `vector.similarity_function`: 'cosine'}};"
    ]

    driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))

    with driver.session() as session:
        for cypher in cypher_schema:
            session.run(cypher)
    driver.close()

# create constraints and indexes
initialiseNeo4jSchema()

In [None]:
import json
TABLE_REF_SUFFIX = '_table_ref'
TABLE_ID_SUFFIX  = '_table'

# Check parsed objects 

print(f"Number of objects: {len(objects)}")

#Iterate through each node in the list created, printing ids and other basic info
for node in objects: 
    print(f"id:{node.node_id}")
    print(f"hash:{node.hash}")
    print(f"parent:{node.parent_node}")
    print(f"prev:{node.prev_node}")
    print(f"next:{node.next_node}")

    # Check to see if an object is a table
    if node.node_id[-1 * len(TABLE_REF_SUFFIX):] == TABLE_REF_SUFFIX:

        if node.next_node is not None:
            next_node = node.next_node
        
            print(f"next_node metadata:{next_node.metadata}")
            print(f"next_next_node:{next_next_nod_id}")

            obj_metadata = json.loads(str(next_node.json()))

            print(str(obj_metadata))

            print(f"def:{obj_metadata['metadata']['table_df']}")
            print(f"summary:{obj_metadata['metadata']['table_summary']}")

    #print additional information about current node
    print(f"next:{node.next_node}")
    print(f"type:{node.get_type()}")
    print(f"class:{node.class_name()}")
    print(f"content:{node.get_content()[:200]}")
    print(f"metadata:{node.metadata}")
    print(f"extra:{node.extra_info}")
    
    node_json = json.loads(node.json())

    print(f"start_idx:{node_json.get('start_char_idx')}")
    print(f"end_idx:{node_json['end_char_idx']}")

    if 'table_summary' in node_json: 
        print(f"summary:{node_json['table_summary']}")

    print("=====================================")   

In [41]:
driver = GraphDatabase.driver(NEO4J_URL, database=NEO4J_DATABASE, auth=(NEO4J_USER, NEO4J_PASSWORD))

# ================================================
# 1) Save documents

print("Start saving documents to Neo4j...")
i = 0
with driver.session() as session:
    for doc in documents:
        cypher = "MERGE (d:Document {url_hash: $doc_id}) ON CREATE SET d.url=$url;"
        session.run(cypher, doc_id=doc.doc_id, url=doc.doc_id)
        i = i + 1
    session.close()

print(f"{i} documents saved.")

# ================================================
# 2) Save nodes

print("Start saving nodes to Neo4j...")

i = 0
with driver.session() as session:
    for node in base_nodes: 

        # >>1 Create Section node
        cypher  = "MERGE (c:Section {key: $node_id})\n"
        cypher += " FOREACH (ignoreMe IN CASE WHEN c.type IS NULL THEN [1] ELSE [] END |\n"
        cypher += "     SET c.hash = $hash, c.text=$content, c.type=$type, c.class=$class_name, c.start_idx=$start_idx, c.end_idx=$end_idx )\n"
        cypher += " WITH c\n"
        cypher += " MATCH (d:Document {url_hash: $doc_id})\n"
        cypher += " MERGE (d)<-[:HAS_DOCUMENT]-(c);"

        node_json = json.loads(node.json())

        session.run(cypher, node_id=node.node_id, hash=node.hash, content=node.get_content(), type='TEXT', class_name=node.class_name()
                          , start_idx=node_json['start_char_idx'], end_idx=node_json['end_char_idx'], doc_id=node.ref_doc_id)

        # >>2 Link node using NEXT relationship

        if node.next_node is not None: # and node.next_node.node_id[-1*len(TABLE_REF_SUFFIX):] != TABLE_REF_SUFFIX:
            cypher  = "MATCH (c:Section {key: $node_id})\n"    # current node should exist
            cypher += "MERGE (p:Section {key: $next_id})\n"    # previous node may not exist
            cypher += "MERGE (p)<-[:NEXT]-(c);"

            session.run(cypher, node_id=node.node_id, next_id=node.next_node.node_id)

        if node.prev_node is not None:  # Because tables are in objects list, so we need to link from the opposite direction
            cypher  = "MATCH (c:Section {key: $node_id})\n"    # current node should exist
            cypher += "MERGE (p:Section {key: $prev_id})\n"    # previous node may not exist
            cypher += "MERGE (p)-[:NEXT]->(c);"

            if node.prev_node.node_id[-1 * len(TABLE_ID_SUFFIX):] == TABLE_ID_SUFFIX:
                prev_id = node.prev_node.node_id + '_ref'
            else:
                prev_id = node.prev_node.node_id

            session.run(cypher, node_id=node.node_id, prev_id=prev_id)

        i = i + 1
    session.close()

print(f"{i} nodes saved.")

# ================================================
# 3) Save objects

print("Start saving objects to Neo4j...")

i = 0
with driver.session() as session:
    for node in objects:               
        node_json = json.loads(node.json())

        # Object is a Table, then the ????_ref_table object is created as a Section, and the table object is Chunk
        if node.node_id[-1 * len(TABLE_REF_SUFFIX):] == TABLE_REF_SUFFIX:
            if node.next_node is not None:  # here is where actual table object is loaded
                next_node = node.next_node

                obj_metadata = json.loads(str(next_node.json()))

                cypher  = "MERGE (s:Section {key: $node_id})\n"
                cypher += "WITH s MERGE (c:Chunk {key: $table_id})\n"
                cypher += " FOREACH (ignoreMe IN CASE WHEN c.type IS NULL THEN [1] ELSE [] END |\n"
                cypher += "     SET c.hash = $hash, c.definition=$content, c.text=$table_summary, c.type=$type, c.start_idx=$start_idx, c.end_idx=$end_idx )\n"
                cypher += " WITH s, c\n"
                cypher += " MERGE (s) <-[:UNDER_SECTION]- (c)\n"
                cypher += " WITH s MATCH (d:Document {url_hash: $doc_id})\n"
                cypher += " MERGE (d)<-[:HAS_DOCUMENT]-(s);"

                session.run(cypher, node_id=node.node_id, hash=next_node.hash, content=obj_metadata['metadata']['table_df'], type='TABLE'
                                  , start_idx=node_json['start_char_idx'], end_idx=node_json['end_char_idx']
                                  , doc_id=node.ref_doc_id, table_summary=obj_metadata['metadata']['table_summary'], table_id=next_node.node_id)
                
            if node.prev_node is not None:
                cypher  = "MATCH (c:Section {key: $node_id})\n"    # current node should exist
                cypher += "MERGE (p:Section {key: $prev_id})\n"    # previous node may not exist
                cypher += "MERGE (p)-[:NEXT]->(c);"

                if node.prev_node.node_id[-1 * len(TABLE_ID_SUFFIX):] == TABLE_ID_SUFFIX:
                    prev_id = node.prev_node.node_id + '_ref'
                else:
                    prev_id = node.prev_node.node_id
                
                session.run(cypher, node_id=node.node_id, prev_id=prev_id)
                
        i = i + 1
    session.close()

# ================================================
# 4) Create Chunks for each Section object of type TEXT
# If there are changes to the content of TEXT section, the Section node needs to be recreated

print("Start creating chunks for each TEXT Section...")

with driver.session() as session:

    cypher  = "MATCH (s:Section) WHERE s.type='TEXT' \n"
    cypher += "WITH s CALL {\n"
    cypher += "WITH s WITH s, split(s.text, '\n') AS para\n"
    cypher += "WITH s, para, range(0, size(para)-1) AS iterator\n"
    cypher += "UNWIND iterator AS i WITH s, trim(para[i]) AS chunk, i WHERE size(chunk) > 0\n"
    cypher += "MERGE (c:Chunk {key: s.key + '_' + i}) SET c.type='TEXT', c.text = chunk, c.seq = i \n"
    cypher += "MERGE (s) <-[:UNDER_SECTION]-(c) } IN TRANSACTIONS OF 500 ROWS ;"
    
    session.run(cypher)
    
    session.close()


print(f"{i} objects saved.")

print("=================DONE====================")

driver.close()

Start saving documents to Neo4j...
285 documents saved.
Start saving nodes to Neo4j...
389 nodes saved.
Start saving objects to Neo4j...
Start creating chunks for each TEXT Section...
113 objects saved.


In [12]:
from openai import OpenAI
import os
NEO4J_URL = "bolt://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "m0del2_pass"
NEO4J_DATABASE = "neo4j"

def get_embedding(client, text, model):
    response = client.embeddings.create(
                    input=text,
                    model=model,
                )
    return response.data[0].embedding

def LoadEmbedding(label, property):
    driver = GraphDatabase.driver(NEO4J_URL, auth=(NEO4J_USER, NEO4J_PASSWORD), database=NEO4J_DATABASE)
    openai_client = OpenAI (api_key = key)

    with driver.session() as session:
        # get chunks in document, together with their section titles
        result = session.run(f"MATCH (ch:{label}) RETURN id(ch) AS id, ch.{property} AS text")
        # call OpenAI embedding API to generate embeddings for each proporty of node
        # for each node, update the embedding property
        count = 0
        for record in result:
            id = record["id"]
            text = record["text"]
            
            # For better performance, text can be batched
            embedding = get_embedding(openai_client, text, EMBEDDING_MODEL)
            
            # key property of Embedding node differentiates different embeddings
            cypher = "CREATE (e:Embedding) SET e.key=$key, e.value=$embedding, e.model=$model"
            cypher = cypher + " WITH e MATCH (n) WHERE id(n) = $id CREATE (n) -[:HAS_EMBEDDING]-> (e)"
            session.run(cypher,key=property, embedding=embedding, id=id, model=EMBEDDING_MODEL) 
            count = count + 1

        session.close()
        
        print("Processed " + str(count) + " " + label + " nodes for property @" + property + ".")
        return count

__Please note that the creation of embeddings may take up to an hour__


In [None]:
EMBEDDING_MODEL  = "text-embedding-ada-002"
# LoadEmbedding("Chunk", "text")

Now the knowledge graph has been created, we do not need to run the code anymore.

#### Retrieval
We will now create the embeddings if they do not already exist<br>
Then we will traverse the knowledge graph to generate context

In [18]:
import os
from langchain.vectorstores.neo4j_vector import Neo4jVector
from langchain.embeddings.openai import OpenAIEmbeddings
import openai
openai.api_key = key
os.environ["OPENAI_API_KEY"]=key
database_pass = os.environ["model_2_pass"]

# We now need to create an a vector index in Neo4j. This is essentially like a vector database.
# Once created we can run the following code to access it. I have named mine index_1.
vector_index = Neo4jVector.from_existing_graph(
    OpenAIEmbeddings(),
    url="bolt://localhost:7687",
    username="neo4j",
    password=database_pass,
    database="neo4j",
    index_name='index_1',
    node_label="Chunk",
    text_node_properties=['key'],
    embedding_node_property='embedding',
)

In [20]:
response = vector_index.similarity_search(
    "The institution shall calculate the fully adjusted value of the exposure"
)
print(response[3].page_content)
print(response)

#This just demonstrates that the knowledge index can be accessed. 


key: f64b792f-a77d-4cde-8e4d-4ed2aa2fa12c_10
[Document(metadata={'text': 'or being omitted from the risk engines.', 'seq': 12, 'type': 'TEXT'}, page_content='\nkey: f64b792f-a77d-4cde-8e4d-4ed2aa2fa12c_12'), Document(metadata={'text': 'Monitoring RNIME', 'seq': 20, 'type': 'TEXT'}, page_content='\nkey: f64b792f-a77d-4cde-8e4d-4ed2aa2fa12c_20'), Document(metadata={'text': 'This is a draft Instrument to accompany CP16/22 ‘Implementation of the Basel 3.1 standards’.', 'seq': 0, 'type': 'TEXT'}, page_content='\nkey: f78b3d28-81a5-4d80-a1e5-21d2e848f559_0'), Document(metadata={'text': 'Other Factors Leading to Risks Not Being Captured Accurately', 'seq': 10, 'type': 'TEXT'}, page_content='\nkey: f64b792f-a77d-4cde-8e4d-4ed2aa2fa12c_10')]


In [16]:
from neo4j import GraphDatabase

# Connect to Neo4j
uri = "bolt://localhost:7687"
username = "neo4j"
password = "m0del2_pass"
driver = GraphDatabase.driver(uri, auth=(username, password))

# Define the function to find connected nodes
def find_connected_nodes(tx, initial_node_id):
    query = (
        """
        MATCH (n)-[r]->(m)
        WHERE n.key = $initial_node_id
        RETURN n, r, m
        """
    ) # This runs a cypher query that allows for us to retrieved all connected nodes to our 'initial node'
    result = tx.run(query, initial_node_id=initial_node_id)
    return [(record["n"], record["r"], record["m"]) for record in result] #returns relevant information

def get_connected_nodes(initial_node_id):
    with driver.session() as session:
        result = session.execute_read(find_connected_nodes, initial_node_id)
        return result

## Example retrieval:
# Fetch connected nodes for the initial node
initial_node_id = "e18d8566-ced0-4f03-afe1-07b1a7e45b84_2"  # Replace with your actual node ID
connected_nodes = get_connected_nodes(initial_node_id)

#Fetch context from connected node
def generate_context(connected_nodes):
    for n, r, m in connected_nodes:
        if m['text'] != 'None' and r.type == 'UNDER_SECTION':
            #print("Context found:",m['text'])
            return m['text']

# generate_context(connected_nodes)

# Don't forget to close the driver connection when done
driver.close()


  result = session.read_transaction(find_connected_nodes, initial_node_id)


#### Generating our response from full context

In [17]:
from langchain_openai import AzureChatOpenAI
import openai
from langchain.prompts import ChatPromptTemplate
from langchain.docstore.document import Document

# Function to generate response
def generate_response1(question, vector_index):
    # Initialize the AzureChatOpenAI instance
    llm = AzureChatOpenAI(
        azure_deployment="gpt-4o-gs-v1",
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
        openai_api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        api_version="2023-05-15",
        verbose=False,
        temperature=0,
    )
 
    # Create a prompt template
    template = """
    You are a regulatory assistant called Reggie, answer the question based on the documents provided. If you can't answer
    the question, reply "This question is outside my dataset. Please try again or see 'help' for advice on how to structure a question efficiently.".
        
    Context: {context}

    Question: {question}
    """
    prompt_template = ChatPromptTemplate.from_template(template)

    # Define the context generator function
    def context_generator(question):
        response_key = vector_index.similarity_search(question) #Uses similarity search to return the nearest embedding's Neo4j key. Returns 4 keys by default.
        list_of_keys = []
        context_document = []
        for i in range(0, len(response_key)):
            new_response_key = response_key[i].page_content
            list_of_keys.append(new_response_key[6:]) #removes the word 'key:  ' from the start of each key.
        #print(list_of_keys)

        for i in range(0, len(list_of_keys)):
            initial_node_id = list_of_keys[i]
            connected_nodes = get_connected_nodes(initial_node_id) #Retrieves the text from the Section linked to each chunk. (This provides full context for that chunk)
            context_document.append(Document(page_content =generate_context(connected_nodes) )) # Then adds all of the context into a list ## need to add meta data as well??
        print(context_document[3])
        #Need to ensure each document is unique:
        unique_documents = []
        seen_ids = set()
        for doc in context_document:
            if doc.id not in seen_ids:
                unique_documents.append(doc)
                seen_ids.add(doc.id)

        return unique_documents

    context_document = context_generator(question)
    # Format the documents
    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    # Generate context and format the prompt
    context_text = format_docs(context_document)

    formatted_prompt = prompt_template.format(context=context_text, question=question) # Adds context to prompt
    #print(formatted_prompt)
    
    # Create the RAG response
    response = llm.invoke(formatted_prompt) 
    print(response.content)
    
    return response.content

generate_response1("what is VaR?",vector_index)

  with driver.session() as session:
  result = session.read_transaction(find_connected_nodes, initial_node_id)


page_content='Bank of England

Page 119

changes to the IRB framework that would need to be in place by the PRA’s proposed
implementation date (see Chapter 1 – Overview);

measures that would need to be in place where firms’ IRB models are not fully compliant
by the PRA’s proposed implementation date; and

timescales for IRB model submissions.

 Changes required by the PRA’s proposed implementation date

4.17 The PRA proposes that firms implement non-modelling related changes and input floors
to model parameters by the PRA’s proposed implementation date of 1 January 2025. The
following changes would apply from this date:

all restrictions on the scope of IRB models (eg restrictions on modelling EAD and mandatory use of the SA or FIRB approach, subject to the transitional arrangements for equity exposures);all changes to LGD and EAD under the FIRB approach and all changes to the maturity calculation; andall IRB input floors.

4.18 The PRA notes that the above non-modelling related chang

"This question is outside my dataset. Please try again or see 'help' for advice on how to structure a question efficiently."