# GDPR compliance text classification (Multinomial NB & logistic regression)

In [None]:
import json
import joblib
import nltk
from pathlib import Path
from nltk.tokenize import sent_tokenize
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [5]:
# load gdpr articles from json file
with open("gdpr_articles_baseline.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# extract all sections into a flat list: one record per (article, section) pair
records = []
for article in data:
    article_number = article.get("article_number")
    article_title = article.get("article_title")
    for section in article.get("sections", []):
        for sec_num, sec_text in section.items():
            records.append({
                "article_number": article_number,
                "article_title": article_title,
                "section_number": sec_num,
                "section_text": sec_text
            })

# convert to dataframe and create label column ("Art. <article_number>")
df_gdpr = pd.DataFrame(records)
df_gdpr["label"] = "Art. " + df_gdpr["article_number"].astype(str)

# The labelling step needs the policy sentences. This cell used to read a `df`
# variable that no earlier cell defines, so it raised NameError on a fresh
# kernel. Load the sentences from policy_sentences.json instead, and skip the
# labelling when that file has not been generated yet.
policy_sentences_path = Path("policy_sentences.json")
if policy_sentences_path.exists():
    with open(policy_sentences_path, "r", encoding="utf-8") as f:
        df = pd.DataFrame(json.load(f))

    # combine policy and gdpr texts into tf-idf vectors (one shared vocabulary)
    vectorizer = TfidfVectorizer()
    X_all = vectorizer.fit_transform(df["paragraph_text"].tolist() + df_gdpr["section_text"].tolist())
    X_policy = X_all[:len(df)]
    X_gdpr = X_all[len(df):]

    # calculate similarity and assign best matching gdpr label to each paragraph
    similarity = cosine_similarity(X_policy, X_gdpr)
    best_matches = similarity.argmax(axis=1)
    df["label"] = [df_gdpr["label"].iloc[i] for i in best_matches]
else:
    print(f"{policy_sentences_path} not found - skipping similarity labelling in this cell")

## Load policy texts and split them into sentences

In [13]:
# download the tokenizer data used for sentence splitting
# ("punkt_tab" is the resource newer NLTK releases load for sent_tokenize;
# "punkt" is kept for older installations)
nltk.download("punkt")
nltk.download("punkt_tab")

# set the folder where the txt files are
data_dir = Path("data")
MIN_SENTENCE_CHARS = 20  # stripped sentences shorter than this are skipped
output_data = []

# go through each txt file and split it into sentences
# sorted(): glob yields files in a filesystem-dependent order; a fixed order
# keeps row positions (and any seeded split of the rows) reproducible
for txt_file in sorted(data_dir.glob("*.txt")):
    # errors="ignore" drops undecodable bytes instead of aborting the run
    content = txt_file.read_text(encoding="utf-8", errors="ignore")
    sentences = sent_tokenize(content)
    for i, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if len(sentence) >= MIN_SENTENCE_CHARS:
            # `i` is the position among all sentences of the file, so ids of
            # skipped short sentences are simply left out
            output_data.append({
                "paragraph_id": f"{txt_file.stem}_{i}",
                "source": txt_file.name,
                "paragraph_text": sentence
            })

# save the result as json file
output_json_path = "policy_sentences.json"
with open(output_json_path, "w", encoding="utf-8") as f:
    json.dump(output_data, f, indent=2, ensure_ascii=False)

output_json_path

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lenaw\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'policy_sentences.json'

## Load GDPR articles and create labels

In [6]:
# read the gdpr articles from the json file
with open('gdpr_articles_baseline.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# flatten the nested structure: one record per section, each carrying the
# number and title of the article it belongs to
records = [
    {
        "article_number": article.get("article_number"),
        "article_title": article.get("article_title"),
        "section_number": sec_num,
        "section_text": sec_text,
    }
    for article in data
    for section in article.get("sections", [])
    for sec_num, sec_text in section.items()
]

# build the dataframe and derive the label "Art. <article_number>"
df_gdpr = pd.DataFrame(records)
df_gdpr = df_gdpr.assign(label="Art. " + df_gdpr["article_number"].astype(str))

## Create training set (simulated labels for testing)

In [7]:
# read the sentence records stored in policy_sentences.json into a dataframe
with open("policy_sentences.json", "r", encoding="utf-8") as f:
    sentence_data = json.load(f)
df = pd.DataFrame(sentence_data)

# fit a single tf-idf vocabulary over policy sentences followed by gdpr
# sections, so both sets of vectors live in the same feature space
policy_texts = df["paragraph_text"].tolist()
gdpr_texts = df_gdpr["section_text"].tolist()
vectorizer = TfidfVectorizer()
X_all = vectorizer.fit_transform(policy_texts + gdpr_texts)

# the first len(df) rows are policy sentences, the remaining rows gdpr sections
n_policy = len(df)
X_policy, X_gdpr = X_all[:n_policy], X_all[n_policy:]

# cosine similarity of every policy sentence against every gdpr section;
# argmax picks the column (gdpr section) with the highest score per row
similarity = cosine_similarity(X_policy, X_gdpr)
best_matches = similarity.argmax(axis=1)

# simulated label = label of the most similar gdpr section, plus its score
df["label"] = df_gdpr["label"].iloc[best_matches].tolist()
df["similarity_score"] = similarity.max(axis=1)

df.head()

Unnamed: 0,paragraph_id,source,paragraph_text,label,similarity_score
0,aol_com_policy_0,aol_com_policy.txt,Welcome to the Yahoo Privacy PolicyLast update...,Art. 24,0.068724
1,aol_com_policy_1,aol_com_policy.txt,"We serve our consumers, partners, advertisers ...",Art. 57,0.073494
2,aol_com_policy_2,aol_com_policy.txt,"If you have an existing Yahoo or AOL account, ...",Art. 58,0.115544
3,aol_com_policy_3,aol_com_policy.txt,If you have not yet agreed to this Privacy Pol...,Art. 41,0.098341
4,aol_com_policy_4,aol_com_policy.txt,For Yahoo products or services that are access...,Art. 99,0.367017


## Train classifier (MultinomialNB + TF-IDF)

In [18]:
# split the data into training and test sets
# (stratified on the label; train_test_split raises ValueError if a label
# occurs only once, because it cannot be placed in both parts)
X_train, X_test, y_train, y_test = train_test_split(
    df["paragraph_text"], df["label"],
    test_size=0.2, stratify=df["label"], random_state=42
)

# convert text to tf-idf features; the vocabulary is learned from the
# training texts only and reused unchanged for the test texts
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# train a naive bayes model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# zero_division=0: labels that are never predicted have undefined precision.
# Reporting them as 0 explicitly yields the same numbers as the default
# setting but without an UndefinedMetricWarning for each metric.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

      Art. 1       1.00      0.18      0.31        88
     Art. 10       0.00      0.00      0.00         2
     Art. 11       0.96      0.15      0.27       156
     Art. 12       0.94      0.33      0.49       257
     Art. 13       0.31      0.96      0.46      2460
     Art. 14       0.63      0.68      0.65      1260
     Art. 15       0.44      0.89      0.59      1531
     Art. 16       0.00      0.00      0.00         3
     Art. 17       0.97      0.38      0.54       402
     Art. 18       0.99      0.50      0.66       268
     Art. 19       0.00      0.00      0.00         1
      Art. 2       0.94      0.28      0.43       112
     Art. 20       0.96      0.30      0.46       160
     Art. 21       1.00      0.26      0.42       242
     Art. 22       0.98      0.33      0.50       276
     Art. 23       0.84      0.79      0.82       972
     Art. 24       0.00      0.00      0.00        15
     Art. 25       1.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Prediction function for new paragraphs (Multinomial NB)

In [19]:
# return top n gdpr predictions for a given text
def top_n_predictions(text, n=3, clf=None, vec=None):
    """Return the ``n`` most probable labels for ``text``.

    Args:
        text: Raw paragraph text to classify.
        n: Maximum number of (label, probability) pairs to return.
        clf: Fitted classifier exposing ``predict_proba`` and ``classes_``.
            Defaults to the notebook-level ``model``.
        vec: Fitted vectorizer exposing ``transform``. Defaults to the
            notebook-level ``vectorizer``.

    Returns:
        List of ``(label, probability)`` tuples ordered from most to least
        probable, probabilities rounded to 4 decimals. Values are plain
        Python objects rather than numpy scalars.

    Raises:
        ValueError: If ``n`` is negative (a negative slice bound would
            silently drop entries from the end instead of limiting the count).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # `model` and `vectorizer` are rebound by every training cell; passing the
    # pair explicitly avoids depending on which of those cells ran last
    clf = model if clf is None else clf
    vec = vectorizer if vec is None else vec

    proba = clf.predict_proba(vec.transform([text]))[0]  # class probabilities
    top_indices = proba.argsort()[::-1][:n]  # indices of the n largest, descending
    labels = clf.classes_[top_indices].tolist()
    scores = proba[top_indices].tolist()
    return [(label, round(score, 4)) for label, score in zip(labels, scores)]

# take the sentence stored at row position 100 as an example and request its
# five most probable gdpr articles
text = df["paragraph_text"].iloc[100]
top_preds = top_n_predictions(text, n=5)

# show the first 300 characters of the sentence, then one line per prediction
print("Paragraph:", text[:300])
print("Top GDPR predictions:")
for article, probability in top_preds:
    print(f"  {article}: {probability:.2f}")

Paragraph: Please see ourLegal basespage for more information.Data Processing and TransfersWhen you use or interact with any of our Services, you consent to the data processing, sharing, transferring and uses of your information as outlined in this Privacy Policy.
Top GDPR predictions:
  Art. 23: 0.44
  Art. 13: 0.12
  Art. 40: 0.11
  Art. 15: 0.09
  Art. 47: 0.04


In [20]:
# save the model
# `model` is rebound by every training cell, so verify its type first: this
# prevents a classifier trained by another cell from being written under the
# naive bayes file name when cells are executed out of order
if not isinstance(model, MultinomialNB):
    raise TypeError(
        f"expected a MultinomialNB in `model`, found {type(model).__name__}; "
        "re-run the naive bayes training cell before saving"
    )
joblib.dump(model, "multinomialNB_model.joblib")
joblib.dump(vectorizer, "multinomialNB_vectorizer.joblib")

['multinomialNB_vectorizer.joblib']

## Train classifier (logistic regression + TF-IDF)

In [11]:
# split data into training and test sets
# (stratified on the label; train_test_split raises ValueError if a label
# occurs only once, because it cannot be placed in both parts)
X_train, X_test, y_train, y_test = train_test_split(
    df["paragraph_text"], df["label"],
    test_size=0.2, stratify=df["label"], random_state=42
)

# convert text to tf-idf vectors; the vocabulary is learned from the
# training texts only and reused unchanged for the test texts
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# train logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# make predictions on the test set
y_pred = model.predict(X_test_tfidf)

# zero_division=0: labels that are never predicted have undefined precision.
# Reporting them as 0 explicitly yields the same numbers as the default
# setting but without an UndefinedMetricWarning for each metric.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

      Art. 1       0.97      0.80      0.88        88
     Art. 10       0.00      0.00      0.00         2
     Art. 11       0.93      0.89      0.91       156
     Art. 12       0.95      0.90      0.93       257
     Art. 13       0.85      0.95      0.90      2460
     Art. 14       0.91      0.92      0.92      1260
     Art. 15       0.87      0.94      0.91      1531
     Art. 16       0.00      0.00      0.00         3
     Art. 17       0.93      0.89      0.91       402
     Art. 18       0.96      0.86      0.91       268
     Art. 19       0.00      0.00      0.00         1
      Art. 2       0.97      0.87      0.92       112
     Art. 20       0.97      0.88      0.92       160
     Art. 21       0.95      0.93      0.94       242
     Art. 22       0.97      0.92      0.94       276
     Art. 23       0.91      0.95      0.93       972
     Art. 24       1.00      0.73      0.85        15
     Art. 25       0.95    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Prediction function for new paragraphs (logistic regression)

In [16]:
# return top n gdpr predictions for a given text
def top_n_predictions(text, n=5, clf=None, vec=None):
    """Return the ``n`` most probable labels for ``text``.

    Args:
        text: Raw paragraph text to classify.
        n: Maximum number of (label, probability) pairs to return.
        clf: Fitted classifier exposing ``predict_proba`` and ``classes_``.
            Defaults to the notebook-level ``model``.
        vec: Fitted vectorizer exposing ``transform``. Defaults to the
            notebook-level ``vectorizer``.

    Returns:
        List of ``(label, probability)`` tuples ordered from most to least
        probable, probabilities rounded to 4 decimals. Values are plain
        Python objects rather than numpy scalars.

    Raises:
        ValueError: If ``n`` is negative (a negative slice bound would
            silently drop entries from the end instead of limiting the count).
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # `model` and `vectorizer` are rebound by every training cell; passing the
    # pair explicitly avoids depending on which of those cells ran last
    clf = model if clf is None else clf
    vec = vectorizer if vec is None else vec

    proba = clf.predict_proba(vec.transform([text]))[0]  # prediction probabilities
    top_indices = proba.argsort()[::-1][:n]  # indices of the n largest, descending
    labels = clf.classes_[top_indices].tolist()
    scores = proba[top_indices].tolist()
    return [(label, round(score, 4)) for label, score in zip(labels, scores)]

# worked example: the sentence at row position 100 and its five most
# probable gdpr articles
example_row = df.iloc[100]
text = example_row["paragraph_text"]
top_preds = top_n_predictions(text, n=5)

# print at most 300 characters of the sentence, then the ranked predictions
print("Paragraph:", text[:300])
print("Top GDPR predictions:")
for line in (f"  {label}: {score:.2f}" for label, score in top_preds):
    print(line)

Paragraph: Please see ourLegal basespage for more information.Data Processing and TransfersWhen you use or interact with any of our Services, you consent to the data processing, sharing, transferring and uses of your information as outlined in this Privacy Policy.
Top GDPR predictions:
  Art. 23: 0.77
  Art. 15: 0.04
  Art. 13: 0.03
  Art. 9: 0.02
  Art. 7: 0.01


In [17]:
# save the model
# `model` is rebound by every training cell, so verify its type first: this
# prevents a classifier trained by another cell from being written under the
# logistic regression file name when cells are executed out of order
if not isinstance(model, LogisticRegression):
    raise TypeError(
        f"expected a LogisticRegression in `model`, found {type(model).__name__}; "
        "re-run the logistic regression training cell before saving"
    )
joblib.dump(model, "logistic_regression_model.joblib")
joblib.dump(vectorizer, "logistic_regression_vectorizer.joblib")

['logistic_regression_vectorizer.joblib']