RAG with LLM Implementation

In [1]:
import json
import math
from typing import List, Dict, Any
from functools import lru_cache

from openai import OpenAI

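We embed each privacy-policy section and each GDPR article with an OpenAI embedding model, then use cosine similarity between the embeddings to find the best-matching article for every section.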

In [2]:
# Embedding model name passed to the OpenAI embeddings endpoint by get_embedding().
EMBED_MODEL = "text-embedding-3-small"
# Default number of best-matching GDPR articles reported per policy section.
TOP_N = 1
# Shared OpenAI client used for all embedding requests.
# NOTE(review): constructed without arguments, so it presumably picks up
# credentials from the environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()

Helper functions to load the JSON input files (GDPR articles and the privacy policy).

In [3]:
def load_gdpr_articles(path: str = "gdpr_articles_baseline.json") -> List[Dict[str, Any]]:
    """Read the UTF-8 JSON file at ``path`` and return its parsed content.

    Args:
        path: Location of the GDPR articles JSON file.

    Returns:
        The decoded JSON document (annotated as a list of article dicts).
    """
    with open(path, mode="r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

def load_privacy_policy(path: str = "ikea_privacy_policy.json") -> List[Dict[str, str]]:
    """Parse the privacy-policy JSON file stored at ``path``.

    Args:
        path: Location of the privacy policy JSON file (read as UTF-8).

    Returns:
        The decoded JSON document (annotated as a list of section dicts).
    """
    with open(path, "r", encoding="utf-8") as policy_file:
        policy_sections = json.load(policy_file)
    return policy_sections

Helper functions that compute the cosine similarity between two embedding vectors, fetch (and cache) embeddings, and concatenate an article's title and section texts into a single string for semantic comparison.

In [4]:
def cosine_similarity(v1: List[float], v2: List[float]) -> float:
    """Return the cosine similarity between two equally sized vectors.

    Args:
        v1: First vector.
        v2: Second vector; must have the same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2| + 1e-10)``. The small epsilon keeps a
        zero vector from causing a division by zero (the result is then 0.0).

    Raises:
        ValueError: If the vectors differ in length. Without this check
            ``zip`` would silently truncate the dot product while the norms
            still covered the full vectors, yielding a meaningless score.
    """
    if len(v1) != len(v2):
        raise ValueError(
            f"Vectors must have the same length, got {len(v1)} and {len(v2)}"
        )
    dot = sum(a * b for a, b in zip(v1, v2))
    norm1 = math.sqrt(sum(a * a for a in v1))
    norm2 = math.sqrt(sum(b * b for b in v2))
    return dot / (norm1 * norm2 + 1e-10)

@lru_cache(maxsize=1024)
def get_embedding(text: str) -> List[float]:
    """Fetch the embedding vector for ``text`` from the embeddings API.

    Results are memoised per distinct ``text`` (up to 1024 entries), so the
    same list object is handed back on repeated calls; callers should not
    mutate it.
    """
    # Newlines are flattened to spaces before the text is sent to the API.
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(model=EMBED_MODEL, input=single_line)
    first_item = response.data[0]
    return first_item.embedding

def prepare_article_text(article: Dict[str, Any]) -> str:
    """Concatenate an article's number, title and section texts into one string.

    Args:
        article: Mapping with ``article_number`` and ``article_title`` keys
            and an optional ``sections`` list. Each section may be a dict
            (its values are joined with spaces) or any other object (converted
            with ``str``).

    Returns:
        ``"Art. <number> – <title> <body>"`` where ``<body>`` is the
        space-joined section text (empty when there are no sections).

    Raises:
        KeyError: If ``article_number`` or ``article_title`` is missing.
    """
    # ``or []`` also covers an explicit ``"sections": null`` in the JSON.
    sections = article.get("sections") or []
    body = " ".join(
        # Coerce values with str() so non-string entries (e.g. numeric
        # paragraph numbers) do not make str.join raise TypeError.
        " ".join(str(value) for value in sec.values()) if isinstance(sec, dict) else str(sec)
        for sec in sections
    )
    return f"Art. {article['article_number']} – {article['article_title']} {body}"

Next we define the function that analyzes each section of the privacy policy: it compares the section's embedding with every GDPR article embedding and returns the top N most similar articles per section.

In [7]:
def audit_policy_similarity(policy_path: str = "../ikea_privacy_policy.json", articles_path: str = "../gdpr_articles_baseline.json", top_n: int = TOP_N) -> List[Dict[str, Any]]:
    """Match every privacy-policy section to its most similar GDPR articles.

    Each GDPR article (title plus section texts) and each non-empty policy
    section is embedded, and articles are ranked per section by cosine
    similarity of the embeddings.

    Args:
        policy_path: Path to the privacy policy JSON file.
        articles_path: Path to the GDPR articles JSON file.
        top_n: Number of best-matching articles to report per section.
            Values below 1 yield an empty ``top_matches`` list.

    Returns:
        One dict per non-empty section with ``section_index`` (1-based
        position in the policy file), ``section_title`` and ``top_matches``
        (dicts with ``article``, ``title`` and ``similarity`` rounded to
        4 decimals, best match first).
    """
    # Load data
    gdpr_articles = load_gdpr_articles(articles_path)
    sections = load_privacy_policy(policy_path)

    # A negative top_n would otherwise slice from the end of the ranking.
    limit = max(top_n, 0)

    # Pre-compute embeddings for all GDPR articles
    article_embeddings = {}
    for art in gdpr_articles:
        article_embeddings[art["article_number"]] = {
            "embedding": get_embedding(prepare_article_text(art)),
            "title": art["article_title"],
        }

    # Analyse each section
    report = []
    for idx, sec in enumerate(sections, start=1):
        sec_text = sec.get("section_text", "")
        if not sec_text.strip():
            continue  # skip empty sections

        sec_emb = get_embedding(sec_text)

        # Score every article with the unrounded similarity.
        scored = [
            (cosine_similarity(sec_emb, info["embedding"]), num, info["title"])
            for num, info in article_embeddings.items()
        ]
        # Rank on the raw score: rounding first would create artificial ties
        # and let insertion order decide between near-equal articles.
        scored.sort(key=lambda item: item[0], reverse=True)

        # Round only for presentation.
        top_matches = [
            {"article": num, "title": title, "similarity": round(sim, 4)}
            for sim, num, title in scored[:limit]
        ]

        report.append({
            "section_index": idx,
            "section_title": sec.get("section_title", f"Section {idx}"),
            "top_matches": top_matches,
        })

    return report


In [8]:
# Run the audit with the default file paths and TOP_N, then pretty-print the
# report; ensure_ascii=False keeps non-ASCII characters readable in the output.
result = audit_policy_similarity()
print(json.dumps(result, indent=2, ensure_ascii=False))

[
  {
    "section_index": 1,
    "section_title": "1. Who is the responsible controller for the data processing and whom you may contact?",
    "top_matches": [
      {
        "article": 28,
        "title": "Processor",
        "similarity": 0.4692
      }
    ]
  },
  {
    "section_index": 2,
    "section_title": "2. What data is being processed and from which sources do these stem from?",
    "top_matches": [
      {
        "article": 13,
        "title": "Information to be provided where personal data are collected from the data subject",
        "similarity": 0.4975
      }
    ]
  },
  {
    "section_index": 3,
    "section_title": "3. For which purpose and for how long is the data being processed?",
    "top_matches": [
      {
        "article": 13,
        "title": "Information to be provided where personal data are collected from the data subject",
        "similarity": 0.5906
      }
    ]
  },
  {
    "section_index": 4,
    "section_title": "4. On which legal basis is 