# GraphRAG with GraphDB and LangChain
This is the notebook that accompanies my blogpost about doing GraphRAG with GraphDB and LangChain.

In [1]:
import pandas as pd
import os
from typing import Dict, Any, List
from openai import OpenAI
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.graphs import OntotextGraphDBGraph
from langchain_community.chains.graph_qa.prompts import GRAPHDB_QA_TEMPLATE
from elasticsearch import Elasticsearch
import json
from tqdm import tqdm

In [2]:
# SPARQL endpoint of the GraphDB repository; passed as `query_endpoint` when the
# OntotextGraphDBGraph is created further down.
graphdb_endpoint = "http://localhost:7200/repositories/msft-graphrag-300"

In [23]:
# Elasticsearch credentials. They are read from the environment so the password
# never has to be saved inside the notebook; the fallbacks reproduce the values
# that used to be hard-coded here.
es_username = os.environ.get("ES_USERNAME", "elastic")
es_password = os.environ.get("ES_PASSWORD", "")  # export ES_PASSWORD before starting the kernel

## Setup the question
The question we will ask will be:
```
What is the relationship between Bob Cratchit and Belinda Cratchit?
```

In [4]:
# Natural-language question: embedded for the entity search and sent to both LLM chains.
question_text = "What is the relationship between Bob Cratchit and Belinda Cratchit?"

## Convert the question text
The first step is to convert the question into an embedding vector. To do this we will use our local LM Studio instance and call its OpenAI-compatible embeddings endpoint.

In [5]:
def get_embedding(text: str, client: Any, model: str="CompendiumLabs/bge-large-en-v1.5-gguf"):
    """Embed a piece of text with the given embedding model.

    Newlines in the text are replaced with spaces before it is sent to the
    client's embeddings endpoint.

    :param text: text to be converted to an embedding vector
    :param client: OpenAI client
    :param model: name of the model to use for encoding
    :return: the ``embedding`` of the first item in the response's ``data``
    """
    # Splitting on "\n" and re-joining with a space swaps every newline for a space.
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [6]:
# OpenAI client pointed at a local server on port 1234; it is handed to get_embedding.
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

In [7]:
# Embed the question using get_embedding's default model.
embedding_vector = get_embedding(question_text, client=client)

Set the limits for our searches

In [8]:
# Result-size limits. The first four are passed to get_multiple, where they become
# SPARQL LIMITs; top_entities is the `k` of the Elasticsearch kNN search.
top_chunks = 3
top_communities = 3
top_outside_relationships = 10
top_inside_relationships = 10
top_entities = 10

## Find nearest Entities
Now we need to use our Elasticsearch index to do a k-nearest neighbour search for our **embedding_vector** to find the 10 nearest `Entity` instances.

In [9]:
# Elasticsearch index that the kNN search below is run against.
index_name = "entity_graph_index" 

In [10]:
# Elasticsearch client authenticated with the username/password defined earlier.
# NOTE(review): the URL is plain http, so verify_certs=False presumably has no effect — confirm.
es = Elasticsearch("http://localhost:9200", 
                   basic_auth=(es_username, es_password), 
                   verify_certs=False)

In [11]:
# kNN search parameters: compare the question embedding against the
# `description_embedding` field and ask for the `top_entities` nearest hits.
query = {
    "field" : "description_embedding" ,
    "query_vector" : embedding_vector,
    "k" : top_entities,
    "num_candidates" : 100 ,
}
res = es.search(index=index_name, knn=query, source=["id"])
search_results = res["hits"]["hits"]
# convert our results into a list of Entities (each hit's `_id` is used as the entity id)
# This list will be ordered by match score descending (i.e the more likely matches will be at the beginning)
entity_list = [x['_id'] for x in search_results]
entity_list

['dde131ab575d44dbb55289a6972be18f',
 '68105770b523412388424d984e711917',
 '40e4ef7dbc98473ba311bd837859a62a',
 '254770028d7a4fa9877da4ba0ad5ad21',
 'da1684437ab04f23adac28ff70bd8429',
 'd91a266f766b4737a06b0fda588ba40b',
 'bc0e3f075a4c4ebbb7c7b152b65a5625',
 '23becf8c6fca4f47a53ec4883d4bf63f',
 '496f17c2f74244c681db1b23c7a39c0c',
 '3d0dcbc8971b415ea18065edc4d8c8ef']

## Using OntotextGraphDBGraph
We'll use the `OntotextGraphDBGraph` class in LangChain to define our interface to the GraphDB instance. One of the things that it needs is an Ontology, which is included in this repository: [msft-graphrag.owl](../data/msft-graphrag.owl)

In [12]:
# The ontology ships with this repository (../data/msft-graphrag.owl), so default
# to that relative path instead of a machine-specific absolute one. Set
# GRAPHRAG_ONTOLOGY_PATH to point at a copy stored elsewhere.
ontology_path = os.environ.get("GRAPHRAG_ONTOLOGY_PATH", "../data/msft-graphrag.owl")
graph = OntotextGraphDBGraph(
    query_endpoint=graphdb_endpoint,
    local_file=ontology_path,
)

### Create an Entity Filter
Create an `Entity` filter in SPARQL that we'll use to limit our searches later on.

In [13]:
# Build one SPARQL FILTER that matches ?id against every entity id returned by
# the nearest-neighbour search, OR-ing the individual comparisons together.
id_clauses = [f'?id = "{entity_id}" ' for entity_id in entity_list]
if id_clauses:
    entity_id_filter = "FILTER(" + " || ".join(id_clauses) + ")"
else:
    # No hits: a hand-rolled loop would leave a lone ")" here and break the
    # query. An always-false filter keeps the SPARQL valid and matches nothing.
    entity_id_filter = "FILTER(false)"
entity_id_filter

'FILTER(?id = "dde131ab575d44dbb55289a6972be18f"  || ?id = "68105770b523412388424d984e711917"  || ?id = "40e4ef7dbc98473ba311bd837859a62a"  || ?id = "254770028d7a4fa9877da4ba0ad5ad21"  || ?id = "da1684437ab04f23adac28ff70bd8429"  || ?id = "d91a266f766b4737a06b0fda588ba40b"  || ?id = "bc0e3f075a4c4ebbb7c7b152b65a5625"  || ?id = "23becf8c6fca4f47a53ec4883d4bf63f"  || ?id = "496f17c2f74244c681db1b23c7a39c0c"  || ?id = "3d0dcbc8971b415ea18065edc4d8c8ef" )'

### Create our Main SPARQL query
Create our main SPARQL query using our **entity_id_filter** and the information in our Knowledge Graph.

In [14]:
def get_multiple(entity_id_filter: str, 
                 limit_chunks: int = 3, 
                 limit_communities: int = 3,
                 limit_inside_relationships: int = 10,
                 limit_outside_relationships: int = 10) -> str:
    """Combine multiple SPARQL queries into one

    The result is a single SELECT whose body is a UNION of five sub-selects:
    entities, chunks, communities, outside relationships and inside
    relationships. The ``entity_id_filter`` text is inserted verbatim into
    every sub-select, and each limited sub-select gets its own LIMIT.

    :param entity_id_filter: SPARQL FILTER text for Entity IDs
    :param limit_chunks: maximum number of Chunk records to fetch
    :param limit_communities: maximum number of Community records to fetch
    :param limit_inside_relationships: maximum number of inside related_to records to fetch
    :param limit_outside_relationships: maximum number of outside related_to records to fetch
    :return: the complete SPARQL query text
    """
    query = """
PREFIX gr: <http://ormynet.com/ns/msft-graphrag#>

SELECT * WHERE
{ 
  {
    #-- Entities -->
    SELECT ?description
    WHERE
    {
        ?entity_uri a gr:Entity;
        gr:id ?id;
        gr:description ?entity_desc .
        BIND(REPLACE(?entity_desc, "\\r\\n", " ", "i") AS ?description)
    """
    query += entity_id_filter
    query += """
    }
  }
  UNION
  {
    #-- Chunks -->
    SELECT 
    ?chunkText 
    (COUNT(?entity_uri) AS ?freq)
    WHERE {
        ?chunk_uri gr:has_entity ?entity_uri;
        gr:text ?chunk_text .
        ?entity_uri a gr:Entity;
            gr:id ?id .
    """
    query += entity_id_filter
    query += """
        BIND(REPLACE(?chunk_text, "\\r\\n", " ") as ?chunkText)
    }
    GROUP BY ?chunk_uri ?chunkText
    ORDER BY DESC(?freq)
    """
    query += f" LIMIT {limit_chunks} "
    query += """
  }
  UNION
  {
    #-- Communities -->
    SELECT ?summary
    WHERE
    {
        ?community_uri a gr:Community;
          gr:rank ?rank;
          gr:weight ?weight;
          gr:summary ?community_summary .
        BIND(REPLACE(?community_summary, "\\r\\n", " ", "i") AS ?summary)
        ?entity_uri gr:in_community ?community_uri;
            gr:id ?id .
    """
    query += entity_id_filter
    query += """
    }
    GROUP BY ?rank ?weight ?community_uri ?summary
    ORDER BY DESC(?rank) DESC(?weight)
    """
    query += f" LIMIT {limit_communities} "
    query += """
  }
  UNION
  {
    #-- Outside Relationships -->
    SELECT ?description
    WHERE {
        ?related_to_uri a gr:related_to;
            gr:id ?related_id;
            gr:rank ?rank;
            gr:description ?desc;
            gr:weight ?weight .
        BIND(REPLACE(?desc, "\\r\\n", "") as ?description)
        ?entity_from_uri ?related_to_uri ?entity_to_uri .
        ?entity_from_uri gr:id ?entity_from_id .
        ?entity_to_uri gr:id ?id .
    """
    query += entity_id_filter
    query += """
    }
    ORDER BY DESC(?rank) DESC(?weight)
    """
    # The "Outside Relationships" sub-select is capped by the *outside* limit
    # (the two relationship limits used to be applied the wrong way round).
    query += f" LIMIT {limit_outside_relationships} "
    query += """
  }
  UNION
  {
    #-- Inside Relationships -->
        SELECT ?description
        WHERE {
            ?related_to_uri a gr:related_to;
                gr:id ?related_id;
                gr:rank ?rank;
                gr:description ?desc;
                gr:weight ?weight .
            BIND(REPLACE(?desc, "\\r\\n", "") as ?description)
            ?entity_from_uri ?related_to_uri ?entity_to_uri .
            ?entity_from_uri gr:id ?id .
            ?entity_to_uri gr:id ?entity_to_id .
    """
    query += entity_id_filter
    query += """
    }
    ORDER BY DESC(?rank) DESC(?weight)
    """
    # The "Inside Relationships" sub-select is capped by the *inside* limit.
    query += f" LIMIT {limit_inside_relationships} "
    query += """
  }
}
    """
    return query

In [15]:
# Build the combined query. The limits are passed by keyword so each one is
# visibly bound to the matching parameter of get_multiple.
query_sparql = get_multiple(
    entity_id_filter,
    limit_chunks=top_chunks,
    limit_communities=top_communities,
    limit_inside_relationships=top_inside_relationships,
    limit_outside_relationships=top_outside_relationships,
)

### Use the GraphDB interface to do the SPARQL query
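We will now use our LangChain GraphDB graph to perform the SPARQL query. Because this is a `SELECT` query, what is returned is a list of result rows (the variable bindings of each match) rather than raw RDF triples.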

In [16]:
# Run the combined SPARQL query against GraphDB; the result is used as the prompt context below.
query_results = graph.query(query_sparql)

### Inject the RDF Triples into the Prompt
Within LangChain there is a prompt template that we can use that can cope with RDF triples. It's called `GRAPHDB_QA_TEMPLATE`. This is what it says:

```
Task: Generate a natural language response from the results of a SPARQL query.
You are an assistant that creates well-written and human understandable answers.
The information part contains the information provided, which you can use to construct an answer.
The information provided is authoritative, you must never doubt it or try to use your internal knowledge to correct it.
Make your response sound like the information is coming from an AI assistant, but don't add any information.
Don't use internal knowledge to answer the question, just say you don't know if no information is available.
Information:
{context}

Question: {prompt}
Helpful Answer:
```

In [17]:
# Chat model reached through the OpenAI-compatible server at localhost:1234;
# this one `llm` object is shared by both chains built below.
llm = ChatOpenAI(
    model="lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key="lm-server",
    base_url="http://localhost:1234/v1"
)

Create our PromptTemplate using the `GRAPHDB_QA_TEMPLATE`

In [18]:
# Prompt with two variables: `context` (the query results) and `prompt` (the question).
qa_prompt = PromptTemplate(
    input_variables=["context", "prompt"], 
    template=GRAPHDB_QA_TEMPLATE
)

Create our LangChain chain

In [20]:
# Pipeline: fill the prompt -> call the chat model -> parse the reply into a plain string.
qa_chain = qa_prompt | llm | StrOutputParser()

Now we can use the chain to ask our question and get our results.

In [21]:
# The template renders {prompt} verbatim after "Question:", so pass the question
# text itself. Wrapping it in a dict would put the dict's repr into the prompt.
qa_chain.invoke(
    {
        "prompt": question_text, 
        "context": query_results
    }
)

'According to the information provided, Belinda Cratchit is a daughter of Bob Cratchit. Additionally, it is mentioned that Mrs. Cratchit and Miss Belinda collaborate on food preparation for the family gathering, suggesting a close relationship between them as part of the same family unit.'

## Prompt with no context
To show how much better this answer is than what we get by asking the same LLM the question directly, without any context from the Knowledge Graph, here is what happens when we do that.

In [22]:
# Baseline: a generic system message plus the bare question, with no retrieved context.
prompt_no_context = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that answers questions about a book.",
        ),
        ("human", "{input}"),
    ]
)
# Same `llm` and output parser as qa_chain, so only the prompt differs.
simple_chain = prompt_no_context | llm | StrOutputParser()
simple_chain.invoke(
    {
        "input": question_text,
    }
)

'In Charles Dickens\' novel "A Christmas Carol", there is no character named Belinda Cratchit. However, I believe you may be thinking of Mrs. Cratchit, who is the wife of Bob Cratchit.\n\nMrs. Cratchit (also known as Emily) is a kind and hardworking woman who manages to keep her family together despite their poverty. She is often seen helping her husband with his work and taking care of their children.'