Knowledge Graph Implementation (At article level)

In [17]:
import json
import os
from typing import List, Dict, Any
import numpy as np
import rdflib
from rdflib import Graph, Namespace, URIRef, Literal, RDF, RDFS, XSD
from sentence_transformers import SentenceTransformer

We specify a base URI and the number of top matches we need

In [None]:
EMBED_MODEL = "all-MiniLM-L6-v2"
TOP_N = 5
BASE_URI = "http://example.org/gdpr#"  # namespace for our demo graph

Also defined some helper functions for cosine similarity, loading the jsons and concatenating the article and section text in one entity as the article

In [19]:
def cosine(u: np.ndarray, v: np.ndarray) -> float:
    """Cosine similarity between two 1‑D numpy arrays."""
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-9))


def load_json(path: str) -> Any:
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def article_full_text(article: Dict[str, Any]) -> str:
    """Concatenate article title + clauses into a single string."""
    clauses = " ".join(
        " ".join(cl.values()) if isinstance(cl, dict) else str(cl)
        for cl in article.get("sections", [])
    )
    return f"Art. {article['article_number']} – {article['article_title']} {clauses}"

We load both the jsons here and also the model and then create the article embeddings

In [20]:
articles = load_json("gdpr_articles_baseline.json")
sections = load_json("ikea_privacy_policy.json")

model = SentenceTransformer(EMBED_MODEL)

In [21]:
article_embs = {}
for art in articles:
    emb = model.encode(article_full_text(art))
    article_embs[art["article_number"]] = {
        "embedding": emb,
        "title": art["article_title"],
        "uri": URIRef(f"{BASE_URI}Article{art['article_number']}")
    }
print(article_embs)

{1: {'embedding': array([-5.22496253e-02,  6.06772304e-02, -4.26751524e-02, -6.21294938e-02,
        5.40080033e-02,  6.60941750e-02,  6.36447743e-02, -7.64360130e-02,
       -1.96180586e-02,  2.26295460e-03,  5.29458262e-02,  3.18475552e-02,
        2.47883108e-02, -5.61292358e-02, -8.41624511e-04,  7.19902851e-03,
       -1.91918258e-02, -1.76476035e-02, -8.17015395e-02,  5.96973523e-02,
        5.37405424e-02, -1.46686090e-02, -4.33222428e-02,  2.00859271e-02,
       -5.23935668e-02,  3.05585153e-02, -6.08501444e-03, -5.21926917e-02,
        3.79003584e-02, -3.91187109e-02, -1.49977813e-02, -2.93638166e-02,
        2.45792810e-02,  9.00923163e-02, -2.79090777e-02, -6.62183315e-02,
        2.20055282e-02, -3.66508327e-02, -3.80271412e-02,  3.44505757e-02,
       -5.81527539e-02, -8.05751681e-02, -1.20519258e-01,  2.00337004e-02,
        2.18241476e-02,  1.07706673e-01,  2.16077529e-02, -1.41838391e-03,
       -9.26439166e-02,  2.32761167e-02, -2.69620605e-02,  2.14549880e-02,
       

Once we have the embeddings, we start creating the Graph for the articles

In [22]:
g = Graph()
EX = Namespace(BASE_URI)
g.bind("ex", EX)

# Add article nodes
for num, info in article_embs.items():
    g.add((info["uri"], RDF.type, EX.Article))
    g.add((info["uri"], RDFS.label, Literal(f"Article {num}: {info['title']}")))
print(g.serialize(format="turtle"))

@prefix ex: <http://example.org/gdpr#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

ex:Article1 a ex:Article ;
    rdfs:label "Article 1: Subject-matter and objectives" .

ex:Article10 a ex:Article ;
    rdfs:label "Article 10: Processing of personal data relating to criminal convictions and offences" .

ex:Article11 a ex:Article ;
    rdfs:label "Article 11: Processing which does not require identification" .

ex:Article12 a ex:Article ;
    rdfs:label "Article 12: Transparent information, communication and modalities for the exercise of the rights of the data subject" .

ex:Article13 a ex:Article ;
    rdfs:label "Article 13: Information to be provided where personal data are collected from the data subject" .

ex:Article14 a ex:Article ;
    rdfs:label "Article 14: Information to be provided where personal data have not been obtained from the data subject" .

ex:Article15 a ex:Article ;
    rdfs:label "Article 15: Right of access by the data subject" .

ex:Article16 a 

We finally analyze the privacy policy articles against the GDPR articles to find the similarites to all articles and add the similarity score and relatesTo in the Graph.

In [23]:
report = []
for idx, sec in enumerate(sections, start=1):
    title = sec.get("section_title", f"Section {idx}")
    text = sec.get("section_text", "")
    if not text.strip():
        continue

    # Section node
    sec_uri = URIRef(f"{BASE_URI}PolicySection{idx}")
    g.add((sec_uri, RDF.type, EX.PolicySection))
    g.add((sec_uri, RDFS.label, Literal(title)))

    # Embed section
    sec_emb = model.encode(text)

    # Compute similarities to all articles
    sims = []
    for num, ainfo in article_embs.items():
        sim = cosine(sec_emb, ainfo["embedding"])
        sims.append({
            "article": num,
            "title": ainfo["title"],
            "similarity": round(sim, 4),
            "uri": ainfo["uri"],
        })
    sims.sort(key=lambda x: x["similarity"], reverse=True)
    top_matches = sims[:TOP_N]

    # Add triples linking section → top articles with similarity score
    for m in top_matches:
        g.add((sec_uri, EX.relatesTo, m["uri"]))
        g.add((sec_uri, EX.similarityScore, Literal(m["similarity"], datatype=XSD.float)))

    # Append to JSON report
    report.append({
        "section_index": idx,
        "section_title": title,
        "top_matches": [{k: v for k, v in m.items() if k != "uri"} for m in top_matches]
    })


Finally, we save the graph in ttl format for visualization

In [None]:
g.serialize(destination="gdpr_policy_graph.ttl", format="turtle")
print(json.dumps(report, indent=2, ensure_ascii=False))

[
  {
    "section_index": 1,
    "section_title": "1. Who is the responsible controller for the data processing and whom you may contact?",
    "top_matches": [
      {
        "article": 30,
        "title": "Records of processing activities",
        "similarity": 0.3899
      },
      {
        "article": 14,
        "title": "Information to be provided where personal data have not been obtained from the data subject",
        "similarity": 0.3697
      },
      {
        "article": 13,
        "title": "Information to be provided where personal data are collected from the data subject",
        "similarity": 0.3528
      },
      {
        "article": 77,
        "title": "Right to lodge a complaint with a supervisory authority",
        "similarity": 0.3472
      },
      {
        "article": 24,
        "title": "Responsibility of the controller",
        "similarity": 0.3257
      }
    ]
  },
  {
    "section_index": 2,
    "section_title": "2. What data is being processed and 