In [None]:
import pandas as pd
import os
import ast
import os
from tqdm import tqdm
import numpy as np
from elasticsearch import Elasticsearch
import json
import urllib3
urllib3.disable_warnings()

In [None]:
# Read the Elasticsearch credentials from the local config file so they are
# not written into the notebook itself.
with open('config.json') as config_file:
    data = json.load(config_file)
es_username, es_password = data['es_username'], data['es_password']

## Microsoft GraphRAG Ontology
Ontologies enable us to define and also validate our Knowledge Graph. They have been around for a while and I've built one for the Knowledge Graph we'll be using. I used a cool product called [Metaphactory](https://metaphacts.com/) to create this and this is what it looks like:

![](../images/msft_graphrag_ontology.png)

In [None]:
# Folder that receives the generated Turtle file.
output_folder = 'D:/Data/RDF'

# Active run: 1200 chunk size and covariate records
graphrag_folder = 'D:/Logs/ragtest/output/20240804-161103/artifacts/'
ttl_path = os.path.join(output_folder, 'msft-graphrag-1200-chunk-size-plus-covariates.ttl')
# The handle stays open across the following cells and is closed at the end of the import.
f_ttl = open(ttl_path, mode='w', encoding='utf-8')

# Alternative run: 300 chunk size and no covariates
#graphrag_folder = 'D:/Logs/ragtest/output/20240806-182605/artifacts/'  # 300 chunk size and no covariates
#f_ttl = open(os.path.join(output_folder, 'msft-graphrag-300-chunk-size.ttl'), 'w', encoding='utf-8')

In [None]:
# Turtle header: declare the two namespace prefixes used by the statements written below.
for prefix_line in (
    "@prefix gr: <http://ormynet.com/ns/msft-graphrag#> .\n",  # graph definition
    "@prefix d: <http://ormynet.com/ns/data#> .\n",            # instance data
):
    f_ttl.write(prefix_line)

## Import Document
Import the Document data from the final documents parquet file. We'll add instances of the `Document` class for each record we read there. We'll only need the **id** and **title** fields as the other fields can be obtained through the relationships that will be created.

In [None]:
# Load the final documents table from the GraphRAG artifacts folder and preview it.
documents_parquet = os.path.join(graphrag_folder, 'create_final_documents.parquet')
doc_df = pd.read_parquet(documents_parquet)
doc_df

In [None]:
def write_document_ttl(df: pd.DataFrame, recnum: int):
    """Add a Document to the turtle file

    Writes one ``gr:Document`` instance (``gr:id`` and ``gr:title``) to the
    module-level ``f_ttl`` file handle. The title is escaped so that quotes,
    backslashes or line breaks in it cannot produce an invalid Turtle literal.

    :param df: Document dataframe (needs the ``id`` and ``title`` columns)
    :param recnum: record number we are looking at (positional, as used by ``iloc``)
    """
    def _escape_literal(value) -> str:
        # Characters that must not appear raw inside a "..." Turtle literal.
        # Backslash goes first so the escapes added afterwards are not doubled.
        return (str(value)
                .replace('\\', '\\\\')
                .replace('"', '\\"')
                .replace('\n', '\\n')
                .replace('\r', '\\r'))

    doc_id = df['id'].iloc[recnum]
    title = df['title'].iloc[recnum]
    f_ttl.write(f'\nd:Document_{doc_id} a gr:Document ;\n')
    f_ttl.write(f'    gr:id "{doc_id}";\n')
    f_ttl.write(f'    gr:title "{_escape_literal(title)}" .\n')

In [None]:
for i in tqdm(range(0, len(doc_df))):
    write_document_ttl(doc_df, i)

## Import Text Units
Import the text units data from the final text units parquet file. We'll add instances of the `Chunk` class for each record we read. We'll also link each `Chunk` back to its `Document` using the `part_of` relationship.

In [None]:
# Load the final text units table from the GraphRAG artifacts folder and preview it.
text_units_parquet = os.path.join(graphrag_folder, 'create_final_text_units.parquet')
text_df = pd.read_parquet(text_units_parquet)
text_df

In [None]:
def add_chunk_with_ttl(df: pd.DataFrame, recnum: int):
    """Add a Chunk to the turtle file

    Writes one ``gr:Chunk`` instance with its ``gr:id``, optional ``gr:text``
    and ``gr:n_tokens`` to the module-level ``f_ttl`` file handle. The chunk
    text is escaped so that backslashes or runs of double quotes inside it
    cannot terminate or corrupt the triple-quoted Turtle literal.

    :param df: text units dataframe (needs the ``id``, ``text`` and ``n_tokens`` columns)
    :param recnum: record number we are looking at (positional, as used by ``iloc``)
    """
    def _escape_long_literal(value) -> str:
        # Inside a """...""" literal raw line breaks are allowed; only
        # backslashes and double quotes have to be escaped.
        return str(value).replace('\\', '\\\\').replace('"', '\\"')

    chunk_id = df['id'].iloc[recnum]
    text = df['text'].iloc[recnum]
    n_tokens = df['n_tokens'].iloc[recnum]
    f_ttl.write(f'\nd:Chunk_{chunk_id} a gr:Chunk ;\n')
    f_ttl.write(f'    gr:id "{chunk_id}";\n')
    if text is not None:
        # The text is wrapped in its own leading/trailing newline, as before.
        f_ttl.write('    gr:text """\n')
        f_ttl.write(_escape_long_literal(text))
        f_ttl.write('\n""";\n')
    f_ttl.write(f'    gr:n_tokens {n_tokens} .\n')

In [None]:
def add_chunk_part_of_with_ttl(chunk_id: str, document_id: str):
    """Write the ``gr:part_of`` statement that ties a Chunk to its Document

    :param chunk_id: id of the Chunk (subject of the statement)
    :param document_id: id of the Document (object of the statement)
    """
    statement = f'd:Chunk_{chunk_id} gr:part_of d:Document_{document_id} .'
    f_ttl.write('\n' + statement + '\n')

In [None]:
# Write every Chunk, then link it to each Document listed in its document_ids.
for row_idx in tqdm(range(len(text_df))):
    add_chunk_with_ttl(text_df, row_idx)
    chunk_id = text_df['id'].iat[row_idx]
    for document_id in text_df['document_ids'].iat[row_idx].tolist():
        add_chunk_part_of_with_ttl(chunk_id, document_id)

## Import Entities
Import the entities data from the final entities parquet file. We'll add instances of the `Entity` class for each record we read. We'll also link each `Chunk` instance that contains this `Entity` using the `has_entity` relationship.

In [None]:
# Load the final entities table from the GraphRAG artifacts folder and preview it.
entities_parquet = os.path.join(graphrag_folder, 'create_final_entities.parquet')
entity_df = pd.read_parquet(entities_parquet)
entity_df

In [None]:
def remove_leading_trailing_double_quotes(x) -> str:
    """Remove double-quotes from a string only if they are at the start and end of a string

    Values that are not strings (for example ``None`` coming from a nullable
    column) are returned unchanged instead of raising, so callers can apply
    this helper before doing their own ``is not None`` check.

    :param x: string to be parsed
    :returns: trimmed string, or ``x`` itself when nothing had to be removed
    """
    if not isinstance(x, str):
        return x
    # At least two characters are needed for a distinct opening and closing quote
    if len(x) > 1 and x.startswith('"') and x.endswith('"'):
        return x[1:-1]
    return x

In [None]:
def add_has_entity_with_ttl(chunk_id: str, entity_id: str):
    """Write the ``gr:has_entity`` statement linking a Chunk to an Entity

    :param chunk_id: id of the Chunk (subject of the statement)
    :param entity_id: id of the Entity (object of the statement)
    """
    chunk_node = f'd:Chunk_{chunk_id}'
    entity_node = f'd:Entity_{entity_id}'
    f_ttl.write(f'\n{chunk_node}  gr:has_entity {entity_node} . \n')

In [None]:
def add_entity_with_ttl(df: pd.DataFrame, recnum: int):
    """Add an Entity instance to the turtle file

    Writes ``gr:id``, the optional ``gr:name``, ``gr:type`` and
    ``gr:description``, and ``gr:human_readable_id`` to the module-level
    ``f_ttl`` file handle. Wrapping double quotes are stripped from name, type
    and description, and the remaining text is escaped so that embedded
    quotes, backslashes or line breaks cannot produce invalid Turtle.

    :param df: dataframe containing the Entity data
    :param recnum: record number we are looking at (positional, as used by ``iloc``)
    """
    def _escape_literal(value, multiline: bool = False) -> str:
        # Backslash first so the escapes added afterwards are not doubled.
        escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
        if not multiline:
            # A "..." literal may not contain raw line breaks; a """...""" one may.
            escaped = escaped.replace('\n', '\\n').replace('\r', '\\r')
        return escaped

    entity_id = df['id'].iloc[recnum]
    name = df['name'].iloc[recnum]
    entity_type = df['type'].iloc[recnum]
    description = df['description'].iloc[recnum]
    human_readable_id = df['human_readable_id'].iloc[recnum]
    f_ttl.write(f'\nd:Entity_{entity_id} a gr:Entity;\n')
    f_ttl.write(f'    gr:id "{entity_id}";\n')
    if name is not None:
        name = remove_leading_trailing_double_quotes(name)
        f_ttl.write(f'    gr:name "{_escape_literal(name)}";\n')
    if entity_type is not None:
        entity_type = remove_leading_trailing_double_quotes(entity_type)
        f_ttl.write(f'    gr:type "{_escape_literal(entity_type)}";\n')
    if description is not None:
        description = remove_leading_trailing_double_quotes(description)
        f_ttl.write('    gr:description """\n')
        f_ttl.write(_escape_literal(description, multiline=True))
        f_ttl.write('\n""";\n')
    f_ttl.write(f'    gr:human_readable_id "{human_readable_id}" .\n')

Add all the `Entity` instances. We also create an **entity_lookup** table that we will use when setting up relationships later on.

In [None]:
# entity name -> entity id; consulted later when the relationships are written
entity_lookup = dict()
for row_idx in tqdm(range(len(entity_df))):
    add_entity_with_ttl(entity_df, row_idx)
    entity_id = entity_df['id'].iat[row_idx]
    entity_name = remove_leading_trailing_double_quotes(entity_df['name'].iat[row_idx])
    entity_lookup[entity_name] = entity_id
    # Link every Chunk that mentions this Entity back to it
    for text_unit_id in entity_df['text_unit_ids'].iat[row_idx].tolist():
        add_has_entity_with_ttl(text_unit_id, entity_id)

## Import Relationships
Import the relationships data from the final relationships parquet file. For the relationships we use the **entity_lookup** table we created to find the actual **id** of the source and target `Entity`.

In [None]:
# Load the final relationships table from the GraphRAG artifacts folder and preview it.
relationships_parquet = os.path.join(graphrag_folder, 'create_final_relationships.parquet')
rel_df = pd.read_parquet(relationships_parquet)
rel_df

In [None]:
def add_relationship_with_ttl(df: pd.DataFrame, recnum: int):
    """Add related_to to the turtle file

    Resolves the source and target entity names through the module-level
    ``entity_lookup`` table, records ``(source_id, target_id)`` in the
    module-level ``relationships_lookup`` under the relationship id, and writes
    the ``gr:related_to`` instance plus the statement connecting the two
    entities to ``f_ttl``.

    :param df: dataframe containing the relationship data
    :param recnum: record number we are looking at (positional, as used by ``iloc``)
    :raises KeyError: if the source or target name is not in ``entity_lookup``
    """
    def _escape_long_literal(value) -> str:
        # Inside a """...""" literal only backslashes and double quotes need escaping.
        return str(value).replace('\\', '\\\\').replace('"', '\\"')

    source_id = entity_lookup[remove_leading_trailing_double_quotes(df['source'].iloc[recnum])]
    target_id = entity_lookup[remove_leading_trailing_double_quotes(df['target'].iloc[recnum])]
    rel_id = df['id'].iloc[recnum]
    rank = df['rank'].iloc[recnum]
    weight = df['weight'].iloc[recnum]
    description = df['description'].iloc[recnum]
    human_readable_id = df['human_readable_id'].iloc[recnum]
    # Remembered so the community import can map a relationship id back to its entities
    relationships_lookup[rel_id] = (source_id, target_id)
    f_ttl.write(f'\nd:related_to_{rel_id} a gr:related_to;\n')
    f_ttl.write(f'    gr:id "{rel_id}";\n')
    f_ttl.write(f'    gr:rank {rank};\n')
    f_ttl.write(f'    gr:weight {weight};\n')
    if description is not None:
        # Strip the wrapping quotes only once we know there is a description
        description = remove_leading_trailing_double_quotes(description)
        f_ttl.write('    gr:description """\n')
        f_ttl.write(_escape_long_literal(description))
        f_ttl.write('\n""";\n')
    f_ttl.write(f'    gr:human_readable_id "{human_readable_id}" .\n')
    f_ttl.write(f'\nd:Entity_{source_id} d:related_to_{rel_id} d:Entity_{target_id} .\n')

In [None]:
# relationship id -> (source entity id, target entity id); filled while writing
relationships_lookup = dict()
for rel_idx in tqdm(range(len(rel_df))):
    add_relationship_with_ttl(rel_df, rel_idx)

## Import Community Data
Import the community data from the final communities parquet file. We'll add instances of the `Community` class for each record we read. We'll also add the community relationships of `related_to` and `in_community`.

In [None]:
# Load the final communities table from the GraphRAG artifacts folder and preview it.
communities_parquet = os.path.join(graphrag_folder, 'create_final_communities.parquet')
community_df = pd.read_parquet(communities_parquet)
community_df

In [None]:
def add_community_with_ttl(df: pd.DataFrame, recnum: int):
    """Write one Community instance (id and level) to the turtle file

    :param df: dataframe holding the community records
    :param recnum: position of the record to write
    """
    community_id = df['id'].iat[recnum]
    level = df['level'].iat[recnum]
    statement_lines = (
        f'\nd:Community_{community_id} a gr:Community;\n',
        f'    gr:id "{community_id}";\n',
        f'    gr:level {level} .\n',
    )
    for line in statement_lines:
        f_ttl.write(line)

In [None]:
def add_community_relationships_with_ttl(community_id:str, source_id: str, target_id: str, related_id: str):
    """Write the statements that attach a relationship's entities to a Community

    Four statements are written: the related_to instance, the link between the
    source and target Entity, and an ``in_community`` statement for each of the
    two entities.

    :param community_id: id of the Community instance
    :param source_id: id of the source Entity instance
    :param target_id: id of the target Entity instance
    :param related_id: id of the related_to instance
    """
    relation = f'd:related_to_{related_id}'
    source = f'd:Entity_{source_id}'
    target = f'd:Entity_{target_id}'
    community = f'd:Community_{community_id}'
    statements = (
        f'{relation} a gr:related_to',
        f'{source} {relation} {target}',
        f'{source} gr:in_community {community}',
        f'{target} gr:in_community {community}',
    )
    for statement in statements:
        f_ttl.write(f'\n{statement} .\n')

In [None]:
# Write every Community, then attach the entities of each of its relationships.
for row_idx in tqdm(range(len(community_df))):
    add_community_with_ttl(community_df, row_idx)
    community_id = community_df['id'].iat[row_idx]
    for relationship_id in community_df['relationship_ids'].iat[row_idx].tolist():
        source_id, target_id = relationships_lookup[relationship_id]
        add_community_relationships_with_ttl(
            community_id=community_id,
            source_id=source_id,
            target_id=target_id,
            related_id=relationship_id,
        )

## Import Community Reports
Import the community reports data from the final community reports parquet file.

In [None]:
# Load the final community reports table from the GraphRAG artifacts folder and preview it.
community_reports_parquet = os.path.join(graphrag_folder, 'create_final_community_reports.parquet')
community_report_df = pd.read_parquet(community_reports_parquet)
community_report_df

In [None]:
def add_community_report_with_ttl(df: pd.DataFrame, recnum: int):
    """Add the Community report with turtle

    Adds rank, level, title, rank explanation, full content and summary to the
    ``d:Community_<community>`` node, then writes one ``gr:Finding`` instance
    per entry of the ``findings`` column and links it with ``gr:has_finding``.
    All free-text values are escaped so that quotes, backslashes or (for the
    title) line breaks cannot produce invalid Turtle.

    :param df: dataframe containing the community report records
    :param recnum: record number we are looking at (positional, as used by ``iloc``)
    """
    def _escape_literal(value, multiline: bool = False) -> str:
        # Backslash first so the escapes added afterwards are not doubled.
        escaped = str(value).replace('\\', '\\\\').replace('"', '\\"')
        if not multiline:
            # A "..." literal may not contain raw line breaks; a """...""" one may.
            escaped = escaped.replace('\n', '\\n').replace('\r', '\\r')
        return escaped

    community_id = df['community'].iloc[recnum]
    findings_id = df['id'].iloc[recnum]
    level = df['level'].iloc[recnum]
    title = df['title'].iloc[recnum]
    summary = df['summary'].iloc[recnum]
    findings = df['findings'].iloc[recnum]
    rank = df['rank'].iloc[recnum]
    rank_explanation = df['rank_explanation'].iloc[recnum]
    full_content = df['full_content'].iloc[recnum]
    f_ttl.write(f'\nd:Community_{community_id} gr:rank "{rank}";\n')
    f_ttl.write(f'    gr:level {level};\n')
    f_ttl.write(f'    gr:title "{_escape_literal(title)}";\n')
    f_ttl.write('    gr:rank_explanation """\n')
    f_ttl.write(_escape_literal(rank_explanation, multiline=True))
    f_ttl.write('\n""";\n')
    f_ttl.write('    gr:full_content """\n')
    f_ttl.write(_escape_literal(full_content, multiline=True))
    f_ttl.write('\n""";\n')
    f_ttl.write('    gr:summary """\n')
    f_ttl.write(_escape_literal(summary, multiline=True))
    f_ttl.write('\n""" .\n')
    # Findings: node names are built from the report id with the dashes removed
    finding_id_start = findings_id.replace('-', '')
    for finding_no, finding in enumerate(findings):
        f_ttl.write(f'\nd:Finding_{finding_id_start}_{finding_no} a gr:Finding;\n')
        f_ttl.write('    gr:finding """\n')
        # The finding is written as its string form, escaped like the other texts
        f_ttl.write(_escape_literal(finding, multiline=True))
        f_ttl.write('\n""" .\n')
        f_ttl.write(f'\nd:Community_{community_id} gr:has_finding d:Finding_{finding_id_start}_{finding_no} .\n')

In [None]:
# Write every community report together with its findings.
for report_idx in tqdm(range(len(community_report_df))):
    add_community_report_with_ttl(community_report_df, report_idx)

## Import Covariates
Read all the covariates from the final covariates parquet file. This file will only have been created if you set the flag to extract claims in the config file. I set it once and it added about a day to my run!

In [None]:
# Load the final covariates table from the GraphRAG artifacts folder and preview it.
covariates_parquet = os.path.join(graphrag_folder, 'create_final_covariates.parquet')
covariate_df = pd.read_parquet(covariates_parquet)
covariate_df

In [None]:
def add_covariate_with_ttl(df: pd.DataFrame, recnum: int):
    """Add the Covariate with turtle

    Writes one ``gr:Covariate`` instance and, when present, its
    ``gr:document_ids``, ``gr:n_tokens`` and ``gr:text_unit_id`` values to the
    module-level ``f_ttl`` file handle. The ``document_ids`` value is written
    in its string form; it is escaped because that string form can contain
    line breaks or quotes, which are not allowed raw inside a ``"..."`` literal.

    :param df: dataframe containing the covariate records
    :param recnum: record number we are looking at (positional, as used by ``iloc``)
    """
    def _escape_literal(value) -> str:
        # Backslash first so the escapes added afterwards are not doubled.
        return (str(value)
                .replace('\\', '\\\\')
                .replace('"', '\\"')
                .replace('\n', '\\n')
                .replace('\r', '\\r'))

    covariate_id = df['id'].iloc[recnum]
    text_unit_id = df['text_unit_id'].iloc[recnum]
    document_ids = df['document_ids'].iloc[recnum]
    n_tokens = df['n_tokens'].iloc[recnum]
    f_ttl.write(f'\nd:Covariate_{covariate_id} a gr:Covariate .\n')
    if document_ids is not None:
        f_ttl.write(f'd:Covariate_{covariate_id} gr:document_ids "{_escape_literal(document_ids)}" .\n')
    if n_tokens is not None:
        f_ttl.write(f'd:Covariate_{covariate_id} gr:n_tokens "{n_tokens}" .\n')
    if text_unit_id is not None:
        f_ttl.write(f'd:Covariate_{covariate_id} gr:text_unit_id "{text_unit_id}" .\n')
        # NOTE(review): the subject is built as d:Entity_<text_unit_id>, while text
        # unit ids are written elsewhere in this notebook as d:Chunk_<id>, and the
        # object is a plain string rather than d:Covariate_<id>. Left unchanged -
        # confirm against the ontology whether this should point at the Chunk/Covariate nodes.
        f_ttl.write(f'\nd:Entity_{text_unit_id} gr:has_covariate "{covariate_id}" .\n')

In [None]:
# Write every covariate record.
for covariate_idx in tqdm(range(len(covariate_df))):
    add_covariate_with_ttl(covariate_df, covariate_idx)

Close our turtle file.

In [None]:
# Closing the handle also flushes any buffered output to disk.
f_ttl.close()

## Create Elasticsearch Index
Create an Elasticsearch index using the `Entity` **description_embedding** data. In order to do this part your Elasticsearch server must be up and running. I've got mine running on my local machine. When it first starts it generates a random password for the server. Put this into the config.json file in this folder. The username is always `elastic`.

In [None]:
# Take the first entity's embedding as the sample that defines the vector size.
embedding_vector = entity_df['description_embedding'].iat[0]
embedding_dims = len(embedding_vector)
print(f'description_embedding - dimensions: {embedding_dims}')

In [None]:
# Connect to the local Elasticsearch server with the credentials read from config.json.
es_credentials = (es_username, es_password)
es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=es_credentials,
    verify_certs=False,  # certificate verification is switched off for this connection
)
es.info().body

We're going to configure the index mappings. We tell it the name of the column we'll be using as our index (in our case it's **description_embedding**) and then we add all the other columns we'd like to return with our search. We only need the **id** field. That will be enough to lookup our corresponding `Entity` record.

In [None]:
# Index mapping: the entity id as an exact-match keyword, plus the description
# embedding as an indexed dense vector compared by L2 distance. The vector
# size is taken from the sample embedding read earlier.
indexMapping = {
    "properties": {
        "id": {
            "type": "keyword"
        },
        "description_embedding": {
            "type": "dense_vector",
            "dims": len(embedding_vector),
            "index": True,
            "similarity": "l2_norm",
        },
    }
}

index_name = "entity_graph_index"
# Uncomment to rebuild the index from scratch:
#es.indices.delete(index=index_name)
# Creating an index that already exists raises an error, which would stop a
# re-run of the notebook here; only create it when it is missing.
if es.indices.exists(index=index_name):
    print(f'Index "{index_name}" already exists - skipping creation')
else:
    es.indices.create(index=index_name, mappings=indexMapping)

We need to convert our DataFrame to a dictionary as this is the format Elasticsearch expects when we're adding a record to the index.

In [None]:
# One dict per entity holding only the fields stored in the index.
indexed_columns = ['id', 'description_embedding']
record_list = entity_df[indexed_columns].to_dict(orient="records")

In [None]:
# Index each entity under its own id.
for entity_record in record_list:
    es.index(index=index_name, id=entity_record["id"], document=entity_record)

Verify that all our `Entity` records have been indexed.

In [None]:
# Newly indexed documents only become visible to search/count after a refresh,
# so force one first; otherwise the count taken right after indexing can be
# too low and report a false error.
es.indices.refresh(index=index_name)
index_count = es.count(index=index_name)['count']
if index_count == len(entity_df):
    print("Success! All the Entity records have been indexed")
else:
    print("**ERROR** Failed to index all the records")