# Analyse Knowledge Graph
Analyse the data in the Knowledge Graph using SPARQL. This is mostly a copy of Tomaz Bratanic's notebook, with all the Cypher queries rewritten in SPARQL.

Tomaz Bratanic's notebook is on GitHub: [https://github.com/tomasonjo/blogs/blob/master/msft_graphrag/ms_graphrag_retriever.ipynb](https://github.com/tomasonjo/blogs/blob/master/msft_graphrag/ms_graphrag_retriever.ipynb)

In [None]:
import pandas as pd
import os
import urllib.parse
import ast
from io import StringIO
from SPARQLWrapper import SPARQLWrapper, CSV, SELECT, POST, POSTDIRECTLY
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from typing import Dict, Any

In [None]:
# Adjust pandas display settings so rendered DataFrames are never truncated
pd.options.display.max_colwidth = None  # None = display the full column width
pd.options.display.max_columns = None  # None = no limit on displayed columns

In [None]:
# SPARQL endpoint of the GraphDB repository that every query in this notebook
# runs against. The commented-out line is an alternative repository; swap it
# with the active assignment to analyse that one instead.
#endpoint = "http://localhost:7200/repositories/msft-graphrag-1200"
endpoint = "http://localhost:7200/repositories/msft-graphrag-300"

In [None]:
def sparql_query(query: str) -> pd.DataFrame:
    """Run a SPARQL query on the shared connection and return it as a DataFrame.

    The query is sent through the module-level ``sparql_conn``; the response
    is requested in CSV format, decoded as UTF-8 and parsed with pandas.

    Args:
        query: Text of the SPARQL query to execute.

    Returns:
        DataFrame parsed from the comma-separated response.
    """
    sparql_conn.setQuery(query)
    sparql_conn.setReturnFormat(CSV)
    raw_csv = sparql_conn.query().convert()
    csv_text = raw_csv.decode("utf-8")
    return pd.read_csv(StringIO(csv_text), sep=",")

In [None]:
# Shared connection object that sparql_query() uses to send each query to the
# configured endpoint.
sparql_conn = SPARQLWrapper(endpoint)

What's the distribution of chunk sizes for this graph?

In [None]:
# Group the gr:Chunk instances by their gr:n_tokens value and count how many
# chunks share each value: one result row per distinct token count.
query = """
PREFIX gr: <http://ormynet.com/ns/msft-graphrag#>

SELECT (SAMPLE(?n_tokens) AS ?token_count) (COUNT(?n_tokens) as ?count)
WHERE {
    ?chunk_uri a gr:Chunk;
    gr:n_tokens ?n_tokens .
}
GROUP BY ?n_tokens
"""
sparql_query(query)

Let's look at an example `Entity` instance.

In [None]:
# Fetch the name and description of a single gr:Entity (LIMIT 1). The REPLACE
# call swaps every carriage-return/line-feed pair in the description for a
# space before the value is returned.
query = """
PREFIX gr: <http://ormynet.com/ns/msft-graphrag#>

SELECT ?name ?description
WHERE {
    ?entity_uri a gr:Entity;
    gr:name ?name;
    gr:description ?desc .
    BIND(REPLACE(?desc, "\\r\\n", " ", "i") AS ?description)
}
LIMIT 1
"""
sparql_query(query)

Let's look at a few example relationship descriptions.

In [None]:
# Fetch the descriptions of up to five resources typed gr:related_to (LIMIT 5),
# with every carriage-return/line-feed pair replaced by a space.
query = """
PREFIX gr: <http://ormynet.com/ns/msft-graphrag#>

SELECT ?description
WHERE {
    ?rel_uri a gr:related_to;
    gr:description ?desc .
    BIND(REPLACE(?desc, "\\r\\n", " ", "i") AS ?description)
}
LIMIT 5
"""
sparql_query(query)

Let's look at an example `Community` instance.

In [None]:
# Fetch the title, summary and full content of a single gr:Community (LIMIT 1).
# Carriage-return/line-feed pairs in the summary and the full content are
# replaced by spaces; the title is returned unchanged.
query = """
PREFIX gr: <http://ormynet.com/ns/msft-graphrag#>

SELECT ?title ?summary ?full_content
WHERE {
    ?community_uri a gr:Community;
    gr:title ?title;
    gr:summary ?summ;
    gr:full_content ?full_c .
    BIND(REPLACE(?summ, "\\r\\n", " ", "i") AS ?summary)
    BIND(REPLACE(?full_c, "\\r\\n", " ", "i") AS ?full_content)
}
LIMIT 1
"""
sparql_query(query)

Let's inspect the distribution of the number of entities extracted from each text chunk.

In [None]:
# Count the gr:has_entity links of every gr:Chunk: one result row per chunk.
# Chunks without any gr:has_entity link do not match the pattern and are
# therefore absent from the result.
query = """
PREFIX gr: <http://ormynet.com/ns/msft-graphrag#>
select
(COUNT(?n_entities) as ?entity_count)
where {
    ?chunk_uri a gr:Chunk;
     gr:has_entity ?n_entities .
}
GROUP BY ?chunk_uri
"""
entity_df = sparql_query(query)
entity_counts = entity_df["entity_count"]
mean_entities = entity_counts.mean()
median_entities = entity_counts.median()

# Plot distribution
plt.figure(figsize=(10, 6))
sns.histplot(entity_counts, kde=True, bins=15, color="skyblue")
# Label each reference line directly. Handing plt.legend() only a collection
# of label strings pairs them with the plotted artists in drawing order, so
# the KDE curve (drawn first) would take the "Mean" label and the mean line
# would be shown as "Median".
plt.axvline(
    mean_entities,
    color="red",
    linestyle="dashed",
    linewidth=1,
    label=f"Mean: {mean_entities:.2f}",
)
plt.axvline(
    median_entities,
    color="green",
    linestyle="dashed",
    linewidth=1,
    label=f"Median: {median_entities:.2f}",
)
plt.xlabel("Entity Count", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.title("Distribution of Entity Count", fontsize=15)
# With no arguments the legend lists only the explicitly labelled artists
plt.legend()
plt.show()

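Let's evaluate the node degree distribution. The degree of a node is the number of relationships it has; the query below counts the relationships in which an entity appears as the subject, i.e. its outgoing relationships.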

In [None]:
# Count, for every subject entity, the objects it is linked to through a
# predicate typed gr:related_to.
query = """
PREFIX gr: <http://ormynet.com/ns/msft-graphrag#>
select
?from_entity_uri (COUNT(?to_entity_uri) as ?node_degree)
where {
    ?rel_uri a gr:related_to .
    ?from_entity_uri ?rel_uri ?to_entity_uri .
}
GROUP BY ?from_entity_uri
"""
degree_dist_df = sparql_query(query)
node_degrees = degree_dist_df["node_degree"]

# Summary statistics: the mean plus the 25th, 50th, 75th and 90th percentiles
mean_degree = np.mean(node_degrees)
percentiles = np.percentile(node_degrees, [25, 50, 75, 90])

# Histogram of the degrees, with the counts on a logarithmic y-axis
plt.figure(figsize=(12, 6))
sns.histplot(node_degrees, bins=50, kde=False, color="blue")
plt.yscale("log")
plt.xlabel("Node Degree")
plt.ylabel("Count (log scale)")
plt.title("Node Degree Distribution")

# Dashed reference lines as (x position, colour, legend label), listed in the
# order in which they are drawn and appear in the legend
reference_lines = [
    (mean_degree, "red", f"Mean: {mean_degree:.2f}"),
    (percentiles[0], "purple", f"25th Percentile: {percentiles[0]:.2f}"),
    (percentiles[1], "orange", f"50th Percentile: {percentiles[1]:.2f}"),
    (percentiles[2], "yellow", f"75th Percentile: {percentiles[2]:.2f}"),
    (percentiles[3], "brown", f"90th Percentile: {percentiles[3]:.2f}"),
]
for line_position, line_color, line_label in reference_lines:
    plt.axvline(
        line_position,
        color=line_color,
        linestyle="dashed",
        linewidth=1,
        label=line_label,
    )

plt.legend()
plt.show()

Most real-world networks follow a power-law node degree distribution, with most nodes having relatively small degrees and some important nodes having a lot. While our graph is small, the node degree follows the power law. However, there's an outlier over on the right-hand side. Let's see which one it is:

In [None]:
# For each entity name, count the objects the entity is linked to through a
# predicate typed gr:related_to, then return the five names with the highest
# count in descending order.
query = """
PREFIX gr: <http://ormynet.com/ns/msft-graphrag#>
select
?name (COUNT(?to_entity_uri) as ?degree)
where {
    ?rel_uri a gr:related_to .
    ?from_entity_uri ?rel_uri ?to_entity_uri;
        gr:name ?name .
}
GROUP BY ?name
ORDER BY DESC(?degree)
LIMIT 5
"""
sparql_query(query)

It's no surprise that `SCROOGE` would have the highest degree! He's the main character of the book.

Finally, let's inspect the distribution of community size per hierarchical level:

In [None]:
# Count the entities linked to each community through gr:in_community and
# return that count together with the community's gr:level: one result row
# per community.
query = """
PREFIX gr: <http://ormynet.com/ns/msft-graphrag#>
select
?level
(COUNT(?community_uri) as ?members)
where {
    ?entity_uri gr:in_community ?community_uri .
    ?community_uri gr:level ?level .
}
GROUP BY ?level ?community_uri
"""
community_data = sparql_query(query)

# Per-level summary of the community sizes (groupby sorts the levels)
stats = (
    community_data.groupby("level")
    .agg(
        min_members=("members", "min"),
        max_members=("members", "max"),
        median_members=("members", "median"),
        avg_members=("members", "mean"),
        num_communities=("members", "count"),
        total_members=("members", "sum"),
    )
    .reset_index()
)

# Create box plot
plt.figure(figsize=(10, 6))
sns.boxplot(x="level", y="members", data=community_data, palette="viridis", hue="level", legend=False)
plt.xlabel("Level")
plt.ylabel("Members")

# Add statistical annotations
if not stats.empty:
    # Derive the annotation height from the data instead of a fixed y value,
    # and extend the y-axis so the text block sits above the tallest box.
    largest_community = community_data["members"].max()
    annotation_y = largest_community * 1.05
    plt.ylim(top=largest_community * 1.45)
    # The boxes are drawn on a categorical axis at positions 0, 1, 2, ... in
    # sorted level order, so anchor each text on the row position rather than
    # on the level value itself.
    for box_position, row in enumerate(stats.itertuples(index=False)):
        text = (
            f"num: {row.num_communities}\n"
            f"all_members: {row.total_members}\n"
            f"min: {row.min_members}\n"
            f"max: {row.max_members}\n"
            f"med: {row.median_members}\n"
            f"avg: {row.avg_members:.2f}"
        )
        plt.text(
            box_position,
            annotation_y,
            text,
            horizontalalignment="center",
            verticalalignment="bottom",
            fontsize=9,
        )

plt.show()