Knowledge Graph Implementation (At section level)

In [1]:
import json
import os
from typing import List, Dict, Any
import numpy as np
import rdflib
from rdflib import Graph, Namespace, URIRef, Literal, RDF, RDFS, XSD
from sentence_transformers import SentenceTransformer

We specify the embedding model, the number of top matches to keep per policy section, and the base URI used as the namespace of the graph.

In [2]:
# Name of the sentence-transformers model handed to SentenceTransformer().
EMBED_MODEL = "all-MiniLM-L6-v2"
# How many of the highest-similarity clauses are kept for each policy section.
TOP_N = 5
# Prefix for every URI minted in this notebook; also bound as the "ex" namespace.
BASE_URI = "http://example.org/gdpr#"  # namespace for our demo graph

We also define helper functions for computing cosine similarity, loading JSON files, cleaning clause labels so they can be used in URIs, and building the text of a clause that gets embedded.

In [3]:
def cosine(u: np.ndarray, v: np.ndarray) -> float:
    """Return the cosine similarity of two 1-D numpy arrays as a Python float.

    A small constant (1e-9) is added to the product of the norms so that a
    zero-length vector yields 0.0 instead of a division by zero.
    """
    dot_product = np.dot(u, v)
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return float(dot_product / (norm_product + 1e-9))


def load_json(path: str) -> Any:
    """Read the UTF-8 encoded JSON file at ``path`` and return the parsed value."""
    with open(path, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

def clean_label(label: str) -> str:
    """Strip every non-alphanumeric character from ``label`` (e.g. "1." -> "1")."""
    return ''.join(filter(str.isalnum, label))

def clause_text(article_num: int, clause_label: str, text: str) -> str:
    """Prefix a clause's text with its article number and clause label.

    The result has the form ``Art. <article_num> (<clause_label>) <text>``.
    """
    reference = f'Art. {article_num} ({clause_label})'
    return f'{reference} {text}'

Here we load the GDPR JSON and the embedding model, then build the article and clause nodes of the graph and compute an embedding for every clause.

In [4]:
# Inputs: the GDPR articles and the sentence-embedding model.
articles = load_json('../gdpr_articles_baseline.json')
model = SentenceTransformer(EMBED_MODEL)

# Empty graph with the demo namespace bound to the "ex" prefix.
EX = Namespace(BASE_URI)
g = Graph()
g.bind('ex', EX)

# Create one node per article and per clause, embedding each clause as we go.
clause_embs = {}   # (article_num, clause_label) -> embedding + metadata
for art in articles:
    art_num = art['article_number']
    art_uri = URIRef(f'{BASE_URI}Article{art_num}')
    g.add((art_uri, RDF.type, EX.Article))
    art_title = art['article_title']
    g.add((art_uri, RDFS.label, Literal(f'Article {art_num}: {art_title}')))

    # Flatten the article's section dicts into (label, text) pairs, lazily.
    labelled_clauses = (
        pair
        for section in art.get('sections', [])
        for pair in section.items()
    )
    for cl_label, cl_text in labelled_clauses:
        # The URI uses the label reduced to alphanumerics; the embedded text
        # carries the article/clause reference in front of the clause body.
        cl_uri = URIRef(f'{BASE_URI}Article{art_num}_{clean_label(cl_label)}')
        emb = model.encode(clause_text(art_num, cl_label, cl_text))

        clause_embs[(art_num, cl_label)] = {
            'embedding': emb,
            'article_title': art_title,
            'clause_label': cl_label,
            'uri': cl_uri
        }

        # Clause node: its type, a readable label, and the link to its article.
        for triple in (
            (cl_uri, RDF.type, EX.Clause),
            (cl_uri, RDFS.label, Literal(f'Art. {art_num} {cl_label}')),
            (cl_uri, EX.partOf, art_uri),
        ):
            g.add(triple)

# Show the graph built so far in Turtle syntax.
turtle_dump = g.serialize(format="turtle")
print(turtle_dump)

@prefix ex: <http://example.org/gdpr#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

ex:Article10_1 a ex:Clause ;
    rdfs:label "Art. 10 1." ;
    ex:partOf ex:Article10 .

ex:Article11_1 a ex:Clause ;
    rdfs:label "Art. 11 1." ;
    ex:partOf ex:Article11 .

ex:Article11_2 a ex:Clause ;
    rdfs:label "Art. 11 2." ;
    ex:partOf ex:Article11 .

ex:Article12_1 a ex:Clause ;
    rdfs:label "Art. 12 1." ;
    ex:partOf ex:Article12 .

ex:Article12_10 a ex:Clause ;
    rdfs:label "Art. 12 10." ;
    ex:partOf ex:Article12 .

ex:Article12_2 a ex:Clause ;
    rdfs:label "Art. 12 2." ;
    ex:partOf ex:Article12 .

ex:Article12_3 a ex:Clause ;
    rdfs:label "Art. 12 3." ;
    ex:partOf ex:Article12 .

ex:Article12_4 a ex:Clause ;
    rdfs:label "Art. 12 4." ;
    ex:partOf ex:Article12 .

ex:Article12_5 a ex:Clause ;
    rdfs:label "Art. 12 5." ;
    ex:partOf ex:Article12 .

ex:Article12_6 a ex:Clause ;
    rdfs:label "Art. 12 6." ;
    ex:partOf ex:Article12 .

ex:Article

In [5]:
# Load the privacy policy to analyse. The matching loop further down iterates
# over this value and reads each entry's 'section_title' and 'section_text'
# keys, so it is presumably a list of dicts -- confirm against the JSON file.
sections = load_json('../ikea_privacy_policy.json')

Next, we compare each privacy policy section against every GDPR clause, rank the clauses by cosine similarity, and add the top matches to the graph as relatesToClause links together with their similarity scores.

In [6]:
# Compare every non-empty policy section with every GDPR clause, keep the
# TOP_N most similar clauses, and record them both in the graph and in `report`.
report = []
for idx, sec in enumerate(sections, start=1):
    # `or` also covers keys that are present but null in the JSON, which would
    # otherwise crash on `.strip()` / produce an unusable label.
    title = sec.get('section_title') or f'Section {idx}'
    text = sec.get('section_text') or ''
    if not text.strip():
        continue  # nothing to embed for an empty section

    sec_uri = URIRef(f'{BASE_URI}PolicySection{idx}')
    g.add((sec_uri, RDF.type, EX.PolicySection))
    g.add((sec_uri, RDFS.label, Literal(title)))

    # Score the section against all clause embeddings.
    sec_emb = model.encode(text)
    sims = []
    for (art_num, cl_label), cinfo in clause_embs.items():
        sim = cosine(sec_emb, cinfo['embedding'])
        sims.append({
            'article': art_num,
            'clause': cl_label,
            'similarity': round(sim, 4),
            'uri': cinfo['uri']
        })
    sims.sort(key=lambda x: x['similarity'], reverse=True)
    top_matches = sims[:TOP_N]

    for rank, m in enumerate(top_matches, start=1):
        score = Literal(m['similarity'], datatype=XSD.float)

        # Direct edge and section-level score, kept so existing queries and
        # visualisations still work. On their own they are ambiguous: all
        # scores hang off the section node, so a score cannot be attributed to
        # a clause, and two equal scores collapse into a single triple.
        g.add((sec_uri, EX.relatesToClause, m['uri']))
        g.add((sec_uri, EX.similarityScore, score))

        # One node per (section, clause) match so that the clause, its score
        # and its rank stay together.
        match_uri = URIRef(f'{BASE_URI}PolicySection{idx}_Match{rank}')
        g.add((match_uri, RDF.type, EX.ClauseMatch))
        g.add((sec_uri, EX.hasMatch, match_uri))
        g.add((match_uri, EX.matchedClause, m['uri']))
        g.add((match_uri, EX.similarityScore, score))
        g.add((match_uri, EX.rank, Literal(rank)))

    # The report omits the URI objects so it can be dumped as JSON.
    report.append({
        'section_index': idx,
        'section_title': title,
        'top_matches': [{k: v for k, v in m.items() if k != 'uri'} for m in top_matches]
    })

Finally, we save the graph in Turtle (.ttl) format for visualization and print the match report as JSON.

In [7]:
# Write the graph to disk as Turtle, then print the match report as
# pretty-printed JSON (non-ASCII characters are left unescaped).
g.serialize(destination='gdpr_policy_graph_v2.ttl', format='turtle')
report_json = json.dumps(report, indent=2, ensure_ascii=False)
print(report_json)

[
  {
    "section_index": 1,
    "section_title": "1. Who is the responsible controller for the data processing and whom you may contact?",
    "top_matches": [
      {
        "article": 28,
        "clause": "8.",
        "similarity": 0.3805
      },
      {
        "article": 30,
        "clause": "1.",
        "similarity": 0.3618
      },
      {
        "article": 58,
        "clause": "13.",
        "similarity": 0.3594
      },
      {
        "article": 31,
        "clause": "1.",
        "similarity": 0.3571
      },
      {
        "article": 12,
        "clause": "4.",
        "similarity": 0.355
      }
    ]
  },
  {
    "section_index": 2,
    "section_title": "2. What data is being processed and from which sources do these stem from?",
    "top_matches": [
      {
        "article": 17,
        "clause": "7.",
        "similarity": 0.4437
      },
      {
        "article": 5,
        "clause": "1.",
        "similarity": 0.4013
      },
      {
        "article": 14,