In [19]:
import json
import pickle
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch

# Loading small version of our gdpr riles
with open("gdpr_articles_flattened.json", "r") as f:
    gdpr_articles = json.load(f)

# Processing gdpr rule and putting them in map
gdpr_map = {}
texts = []
for article in gdpr_articles:
    number = article["article_number"]
    title = article["article_title"]
    content = article["content"]
    full_text = f"Article {number}: {title}. {content}"
    gdpr_map[number] = {"title": title, "text": full_text}
    texts.append(full_text)

# using a small model here for effecience, there are ofcourse other models we can try
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# we will generate embeddings
embeddings = []
for text in texts:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        output = model(**inputs)
        embedding = output.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(embedding[0])

gdpr_embeddings = np.array(embeddings)

# to be effecient, I am saving it as cached pickle
with open("gdpr_cached_embeddings.pkl", "wb") as f:
    pickle.dump((gdpr_map, gdpr_embeddings), f)

print("✅ GDPR map + embeddings cached to 'gdpr_cached_embeddings.pkl'")

✅ GDPR map + embeddings cached to 'gdpr_cached_embeddings.pkl'


Below is for debug/test

In [18]:
def load_gdpr_embeddings(pkl_path="gdpr_cached_embeddings-gc.pkl"):
    with open(pkl_path, "rb") as f:
        gdpr_map, gdpr_embeddings = pickle.load(f)
    gdpr_map = {str(int(k)): v for k, v in gdpr_map.items()}
    return gdpr_map, gdpr_embeddings
gdpr_map, gdpr_embeddings = load_gdpr_embeddings("gdpr_cached_embeddings.pkl")

print(gdpr_map)


{'6': {'title': 'Lawfulness of processing', 'text': 'Article 6: Lawfulness of processing. 1. Processing shall be lawful only if and to the extent that at least one of the following applies: the data subject has given consent to the processing of his or her personal data for one or more specific purposes; processing is necessary for the performance of a contract to which the data subject is party or in order to take steps at the request of the data subject prior to entering into a contract; processing is necessary for compliance with a legal obligation to which the controller is subject; processing is necessary in order to protect the vital interests of the data subject or of another natural person; processing is necessary for the performance of a task carried out in the public interest or in the exercise of official authority vested in the controller; processing is necessary for the purposes of the legitimate interests pursued by the controller or by a third party, except where such in

The following code distilbert-base-uncased model to compare the pre-embeded GDPR laws with the companies GDPR Complaince

In [None]:
import os
import re
import json
import pickle
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch


# Step 1. Load Precomputed GDPR Embeddings
def load_gdpr_embeddings(pkl_path="gdpr_cached_embeddings-gc.pkl"):
    with open(pkl_path, "rb") as f:
        gdpr_map, gdpr_embeddings = pickle.load(f)
    gdpr_map = {str(int(k)): v for k, v in gdpr_map.items()}
    #print(gdpr_map)
    return gdpr_map, gdpr_embeddings


# Step 2. Load Company Article Sections
def load_company_articles(folder_path="scraped-policies-3/atlassian_com_classifications"):
    article_sections = {}
    for fname in os.listdir(folder_path):
        #print(f"Checking file: {fname}")
        match = re.match(r"^Article_(\d+)_.*\.txt$", fname.strip())
        if match:
            print(f"Matched: {fname}")
            article_number = match.group(1)
            with open(os.path.join(folder_path, fname), "r", encoding="utf-8") as f:
                article_sections[article_number] = f.read()
    #print(f"Loaded articles: {list(article_sections.keys())}")
    return article_sections


# Step 3. Embed Text
def get_embeddings(texts, tokenizer, model, device):
    embeddings = []
    for text in texts:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            output = model(**inputs)
            embedding = output.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(embedding[0])
    return np.array(embeddings)


# Step 4. Compare and Score
def run_compliance_check(company_texts, gdpr_map, gdpr_embeddings, tokenizer, model, device):
    results = []
    article_nums = sorted(company_texts.keys(), key=lambda x: int(x))
    company_text_list = [company_texts[num] for num in article_nums]
    company_embeddings = get_embeddings(company_text_list, tokenizer, model, device)

    for i, article_num in enumerate(article_nums):
        article_id = str(int(article_num))  # normalize key
        if article_id not in gdpr_map:
            print(f"Skipping Article {article_id} - not in GDPR map")
            continue

        #print(gdpr_map[article_id])
        gdpr_text = gdpr_map[article_id]["text"]
        gdpr_vec = get_embeddings([gdpr_text], tokenizer, model, device)
        policy_vec = company_embeddings[i].reshape(1, -1)
        similarity = cosine_similarity(gdpr_vec, policy_vec)[0][0]

        presence_threshold = 0.35
        score = round(max(0, (similarity - presence_threshold) / (1 - presence_threshold) * 100), 2)

        results.append({
            "article_number": article_id,
            "article_title": gdpr_map[article_id]["title"],
            "similarity": round(similarity, 4),
            "compliance_score": score
        })

    return results



# Step 5 Load model and tokenizer
#model_name = "nlpaueb/bert-base-uncased-eurlex"
model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()


gdpr_map, gdpr_embeddings = load_gdpr_embeddings("gdpr_cached_embeddings.pkl")
company_articles = load_company_articles("scraped-policies-3/atlassian_com_classifications")

# Step 6 Run matching
results = run_compliance_check(company_articles, gdpr_map, gdpr_embeddings, tokenizer, model, device)


pd.DataFrame(results)


Matched: Article_7_Consent.txt
Matched: Article_20_Data_Portability.txt
Matched: Article_37_Data_Protection_Officer.txt
Matched: Article_12_Transparent_Information.txt
Matched: Article_6_Lawful_Basis.txt
Matched: Article_21_Right_to_Object.txt
Matched: Article_25_Data_Protection_by_Design.txt
Matched: Article_16_Right_to_Rectification.txt
Matched: Article_15_Right_of_Access.txt
Matched: Article_13_Information_Collection.txt
Matched: Article_14_Information_Third_Parties.txt
Matched: Article_33_34_Data_Breach.txt
Skipping Article 33 - not in GDPR map


Unnamed: 0,article_number,article_title,similarity,compliance_score
0,6,Lawfulness of processing,0.9349,89.980003
1,7,Conditions for consent,0.9112,86.330002
2,12,"Transparent information, communication and mod...",0.8831,82.019997
3,13,Information to be provided where personal data...,0.8926,83.480003
4,14,Information to be provided where personal data...,0.9013,84.809998
5,15,Right of access by the data subject,0.9041,85.239998
6,16,Right to rectification,0.9277,88.879997
7,20,Right to data portability,0.8962,84.040001
8,21,Right to object,0.8679,79.68
9,25,Data protection by design and by default,0.9018,84.900002
