In [2]:
!pip install torch transformers scikit-learn pandas numpy matplotlib plotly spacy
!python -m spacy download nl_core_news_sm
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Collecting nl-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/nl_core_news_sm-3.8.0/nl_core_news_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     - -------------------------------------- 0.5/12.8 MB 3.4 MB/s eta 0:00:04
     ---- ----------------------------------- 1.3/12.8 MB 3.4 MB/s eta 0:00:04
     ------ --------------------------------- 2.1/12.8 MB 3.4 MB/s eta 0:00:04
     -------- ------------------------------- 2.6/12.8 MB 3.4 MB/s eta 0:00:03
     ---------- ----------------------------- 3.4/12.8 MB 3.5 MB/s eta 0:00:03
     ------------- -------------------------- 4.5/12.8 MB 3.6 MB/s eta 0:00:03
     ---------------- ----------------------- 5.2/12.8 MB 3.7 MB/s eta 0:00:03
     ------------------ --------------------- 6.0/12.8 MB 3.8 MB/s eta 0:00:02
     --------------------- ------------------

In [8]:
#policy_Embedder

import torch
import pandas as pd
import numpy as np
import spacy
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline, AutoModel
import pickle

# Load dictionary
dictionary_df = pd.read_excel('policy_dictionairy.xlsx')

# Load policy documents
unseen_policy_df = pd.read_excel("Policy-documents/2015_selectedtypes_cleaned (1).xlsx")
nlp = spacy.load("nl_core_news_sm")

# Load topic mappings
with open('policy-label2topic.pickle', 'rb') as fp:
    label2topic = pickle.load(fp)
with open('policy-topic2label.pickle', 'rb') as fp:
    topic2label = pickle.load(fp)

display(label2topic)

# Device detection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

MODEL_DIR = "bertje_policy/checkpoint-63"
BASE_MODEL = "GroNLP/bert-base-dutch-cased"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
embedder = AutoModel.from_pretrained(BASE_MODEL).to(device)

def split_sentences(text):
    doc = nlp(str(text))
    return [sent.text.strip() for sent in doc.sents]

# Collect all sentences and doc_ids
all_sentences = []
doc_ids = []
for i, row in unseen_policy_df.iterrows():
    doc_id = row['filename'] if 'filename' in row else f'doc_{i}'
    text = row['clean_text']
    if not pd.isna(text):
        sents = split_sentences(text)
        all_sentences.extend(sents)
        doc_ids.extend([doc_id]*len(sents))

# HuggingFace pipeline for topic prediction
nlp_pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Predict topics (batched)
results = nlp_pipe(all_sentences, truncation=True, max_length=128, batch_size=32)

# Keep results and sentences aligned; fill label/score only if score >= 0.7
#labels = []
#cores = []
#for r in results:
#    if r['score'] >= 0.3:
#        labels.append(r['label'])
#        scores.append(r['score'])
#    else:
#        labels.append(None)
#        scores.append(None)

# DataFrame build
results_df = pd.DataFrame({
    'document': doc_ids,
    'sentence': all_sentences,
    'labels' : [r['label'] for r in results],
    'scores': [r['score'] for r in results]
})

# Label int and topic only for confident predictions
results_df['label_int'] = results_df['label'].apply(lambda x: int(x.replace('LABEL_', '')) if pd.notnull(x) else None)
results_df['topic'] = results_df['label_int'].map(label2topic)

# Embedding function (only for sentences with valid label)
def bertje_embed(sentences, tokenizer, model, device=device, batch_size=32):
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        encoded = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            output = model(**encoded)
        batch_embeds = output.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeds)
    return np.array(embeddings)

# Only embed sentences with a confident topic label
mask = results_df['label'].notnull()
embed_sentences = results_df.loc[mask, 'sentence'].tolist()
sentence_embeddings = bertje_embed(embed_sentences, tokenizer, embedder, device=device, batch_size=32)

# Store embeddings back in the DataFrame (as lists), else None
embedding_col = [None] * len(results_df)
for i, idx in enumerate(results_df.index[mask]):
    embedding_col[idx] = sentence_embeddings[i].tolist()
results_df['embedding'] = embedding_col

# Save as CSV and pickle
results_df.to_csv('policy-bertje-matched_results-2015.csv', index=False)
with open("policy-bertje-matched_results-2015.pickle", "wb") as f:
    pickle.dump(results_df, f)

print(results_df.head())
print(results_df['topic'].value_counts())


{np.int64(0): 'Cultuur_en_sport',
 np.int64(1): 'Ramp_ongeval',
 np.int64(2): 'arbeid',
 np.int64(3): 'buitenlandse_zaken',
 np.int64(4): 'burgerrechten',
 np.int64(5): 'criminaliteit',
 np.int64(6): 'democratie_en_bestuur',
 np.int64(7): 'gezondheidszorg',
 np.int64(8): 'huisvesting',
 np.int64(9): 'immigratie',
 np.int64(10): 'milieu',
 np.int64(11): 'not-matched',
 np.int64(12): 'onderwijs',
 np.int64(13): 'religie',
 np.int64(14): 'ruimtelijke_ordening',
 np.int64(15): 'sociale_zaken'}

Using device: cuda


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


KeyError: 'label'

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

# Load dictionary
dictionary_df = pd.read_excel('policy_dictionairy.xlsx')

# Load training_df
train_df = pd.read_excel('policy-regex-matched_sentences.csv', index=False)

# Load policy documents
unseen_policy_df = pd.read_excel("Policy-documents/2015_selectedtypes_cleaned (1).xlsx")
nlp = spacy.load("nl_core_news_sm")

# Load topic mappings
with open('policy-label2topic.pickle', 'rb') as fp:
    label2topic = pickle.load(fp)
with open('policy-topic2label.pickle', 'rb') as fp:
    topic2label = pickle.load(fp)

display(label2topic)

# Device detection
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

MODEL_DIR = "bertje_policy/checkpoint-63"
BASE_MODEL = "GroNLP/bert-base-dutch-cased"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
embedder = AutoModel.from_pretrained(BASE_MODEL).to(device)

def split_sentences(text):
    doc = nlp(str(text))
    return [sent.text.strip() for sent in doc.sents]

# DataFrame build
results_df = pd.DataFrame({
    'document': doc_ids,
    'sentence': all_sentences,
    'labels' : [r['label'] for r in results],
    'scores': [r['score'] for r in results]
})


# STEP 1: Group training sentences by topic name using label2topic
topic_sentences = defaultdict(list)
for i, row in train_df.iterrows():
    if pd.notnull(row['sentence']) and pd.notnull(row['label_int']):
        topic = label2topic[row['label_int']]
        topic_sentences[topic].append(row['sentence'])

# STEP 2: Embed topic sentences with fine-tuned embedder
def bertje_embed(sentences, tokenizer, model, device, batch_size=32):
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        encoded = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            output = model(**encoded)
        batch_embeds = output.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeds)
    return np.array(embeddings)

topic_centroids = {}
for topic, sents in topic_sentences.items():
    embeds = bertje_embed(sents, tokenizer, embedder, device=device, batch_size=32)
    topic_centroids[topic] = np.mean(embeds, axis=0)

# STEP 3: Embed all new policy sentences (from your policy documents)
test_embeddings = bertje_embed(all_sentences, tokenizer, embedder, device=device, batch_size=32)

# STEP 4: Similarity and topic assignment
topic_names = list(topic_centroids.keys())
topic_matrix = np.stack([topic_centroids[t] for t in topic_names])

similarities = cosine_similarity(test_embeddings, topic_matrix)
best_indices = similarities.argmax(axis=1)
best_scores = similarities.max(axis=1)
labels = [topic_names[idx] if best_scores[i] > 0.65 else None for i, idx in enumerate(best_indices)]

# STEP 5: Collect results
results_centroid_df = pd.DataFrame({
    'document': doc_ids,
    'sentence': all_sentences,
    'topic_centroid': labels,
    'centroid_score': best_scores
})

print(results_centroid_df.head())
print(results_centroid_df['topic_centroid'].value_counts())


In [13]:
import torch
import pandas as pd
import numpy as np
import spacy
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import pickle

# Load resources
dictionary_df = pd.read_excel('policy_dictionairy.xlsx')
train_df = pd.read_csv('Policy_regex_filtered_matched')  # Removed index=False
unseen_policy_df = pd.read_excel("Policy-documents/2015_selectedtypes_cleaned (1).xlsx")
nlp = spacy.load("nl_core_news_sm")

with open('policy-label2topic.pickle', 'rb') as fp:
    label2topic = pickle.load(fp)
with open('policy-topic2label.pickle', 'rb') as fp:
    topic2label = pickle.load(fp)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

MODEL_DIR = "bertje_policy/checkpoint-63"
BASE_MODEL = "GroNLP/bert-base-dutch-cased"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
embedder = AutoModel.from_pretrained(BASE_MODEL).to(device)

def split_sentences(text):
    doc = nlp(str(text))
    return [sent.text.strip() for sent in doc.sents]

# ---- Split policy docs into sentences ----
all_sentences = []
doc_ids = []
for i, row in unseen_policy_df.iterrows():
    doc_id = row['filename'] if 'filename' in row else f'doc_{i}'
    text = row['clean_text']
    if not pd.isna(text):
        sents = split_sentences(text)
        all_sentences.extend(sents)
        doc_ids.extend([doc_id]*len(sents))

# ---- Group training sentences by topic ----
topic_sentences = defaultdict(list)
for i, row in train_df.iterrows():
    if pd.notnull(row['sentence']) and pd.notnull(row['label']):
        topic = label2topic[row['label']]
        topic_sentences[topic].append(row['sentence'])

# ---- Embedder function ----
def bertje_embed(sentences, tokenizer, model, device, batch_size=32):
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        encoded = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            output = model(**encoded)
        batch_embeds = output.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeds)
    return np.array(embeddings)

# ---- Build centroids ----
topic_centroids = {}
for topic, sents in topic_sentences.items():
    embeds = bertje_embed(sents, tokenizer, embedder, device=device, batch_size=32)
    topic_centroids[topic] = np.mean(embeds, axis=0)

# ---- Embed all policy sentences ----
test_embeddings = bertje_embed(all_sentences, tokenizer, embedder, device=device, batch_size=32)

# ---- Similarity and topic assignment ----
topic_names = list(topic_centroids.keys())
topic_matrix = np.stack([topic_centroids[t] for t in topic_names])

similarities = cosine_similarity(test_embeddings, topic_matrix)
best_indices = similarities.argmax(axis=1)
best_scores = similarities.max(axis=1)
labels = [topic_names[idx] if best_scores[i] > 0.65 else None for i, idx in enumerate(best_indices)]

# ---- Collect results ----
results_centroid_df = pd.DataFrame({
    'document': doc_ids,
    'sentence': all_sentences,
    'topic_centroid': labels,
    'centroid_score': best_scores
})

print(results_centroid_df.head())
print(results_centroid_df['topic_centroid'].value_counts())


Using device: cuda


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


                                            document  \
0  mariene-strategie-voor-het-nederlandse-deel-va...   
1  mariene-strategie-voor-het-nederlandse-deel-va...   
2  mariene-strategie-voor-het-nederlandse-deel-va...   
3  mariene-strategie-voor-het-nederlandse-deel-va...   
4  mariene-strategie-voor-het-nederlandse-deel-va...   

                                            sentence      topic_centroid  \
0                                            Mariene         not-matched   
1  Strategie voor het Nederlandse deel van de Noo...  buitenlandse_zaken   
2                                            Mariene         not-matched   
3  Strategie voor het Nederlandse deel van de Noo...  buitenlandse_zaken   
4              KRM-programma van maatregelen Bijlage       sociale_zaken   

   centroid_score  
0        0.766507  
1        0.772560  
2        0.766507  
3        0.772560  
4        0.865638  
topic_centroid
not-matched              25282
buitenlandse_zaken       17451
sociale_z

In [19]:
import torch
import pandas as pd
import numpy as np
import spacy
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict
import pickle

# --- LOAD RESOURCES ---
dictionary_df = pd.read_excel('policy_dictionairy.xlsx')
train_df = pd.read_csv('Policy_regex_filtered_matched')  # Use read_csv for .csv
unseen_policy_df = pd.read_excel("Policy-documents/2015_selectedtypes_cleaned (1).xlsx")
nlp = spacy.load("nl_core_news_sm")

with open('policy-label2topic.pickle', 'rb') as fp:
    label2topic = pickle.load(fp)
with open('policy-topic2label.pickle', 'rb') as fp:
    topic2label = pickle.load(fp)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

MODEL_DIR = "bertje_policy/checkpoint-63"
BASE_MODEL = "GroNLP/bert-base-dutch-cased"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)
embedder = AutoModel.from_pretrained(BASE_MODEL).to(device)

def split_sentences(text):
    doc = nlp(str(text))
    return [sent.text.strip() for sent in doc.sents]

# --- BUILD SENTENCE LISTS FROM POLICY DOCUMENTS ---
all_sentences = []
doc_ids = []
for i, row in unseen_policy_df.iterrows():
    doc_id = row['filename'] if 'filename' in row else f'doc_{i}'
    text = row['clean_text']
    if not pd.isna(text):
        sents = split_sentences(text)
        all_sentences.extend(sents)
        doc_ids.extend([doc_id]*len(sents))


# --- GROUP TRAINING SENTENCES BY TOPIC ---
topic_sentences = defaultdict(list)
for i, row in train_df.iterrows():
    if pd.notnull(row['sentence']) and pd.notnull(row['label']):
        topic = label2topic[row['label']]
        topic_sentences[topic].append(row['sentence'])

# --- OPTIONALLY EXCLUDE TOPICS BEFORE EMBEDDING ---
exclude_topics = ['not-matched', 'buitenlandse_zaken','democratie_en_bestuur','milieu','sociale_zaken','ruimtelijke_ordening' ]  # <--- EDIT THIS LIST AS NEEDED
for excl in exclude_topics:
    if excl in topic_sentences:
        del topic_sentences[excl]

print("Topics included for centroid construction:", list(topic_sentences.keys()))

# --- EMBEDDER FUNCTION ---
def bertje_embed(sentences, tokenizer, model, device, batch_size=32):
    embeddings = []
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i+batch_size]
        encoded = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            output = model(**encoded)
        batch_embeds = output.last_hidden_state.mean(dim=1).cpu().numpy()
        embeddings.extend(batch_embeds)
    return np.array(embeddings)

# --- BUILD CENTROIDS ONLY FOR INCLUDED TOPICS ---
topic_centroids = {}
for topic, sents in topic_sentences.items():
    embeds = bertje_embed(sents, tokenizer, embedder, device=device, batch_size=32)
    topic_centroids[topic] = np.mean(embeds, axis=0)

# --- EMBED ALL POLICY SENTENCES ---
test_embeddings = bertje_embed(all_sentences, tokenizer, embedder, device=device, batch_size=32)

# --- SIMILARITY AND TOPIC ASSIGNMENT ---
topic_names = list(topic_centroids.keys())
topic_matrix = np.stack([topic_centroids[t] for t in topic_names])

similarities = cosine_similarity(test_embeddings, topic_matrix)
best_indices = similarities.argmax(axis=1)
best_scores = similarities.max(axis=1)
labels = [topic_names[idx] if best_scores[i] > 0.65 else None for i, idx in enumerate(best_indices)]

# --- COLLECT RESULTS ---
results_centroid_df = pd.DataFrame({
    'document': doc_ids,
    'sentence': all_sentences,
    'topic_centroid': labels,
    'centroid_score': best_scores
})

print(results_centroid_df.head())
print(results_centroid_df['topic_centroid'].value_counts())



Using device: cuda


Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Topics included for centroid construction: ['criminaliteit', 'arbeid', 'onderwijs', 'Cultuur_en_sport', 'huisvesting', 'immigratie', 'gezondheidszorg', 'Ramp_ongeval', 'burgerrechten']
                                            document  \
0  mariene-strategie-voor-het-nederlandse-deel-va...   
1  mariene-strategie-voor-het-nederlandse-deel-va...   
2  mariene-strategie-voor-het-nederlandse-deel-va...   
3  mariene-strategie-voor-het-nederlandse-deel-va...   
4  mariene-strategie-voor-het-nederlandse-deel-va...   

                                            sentence    topic_centroid  \
0                                            Mariene        immigratie   
1  Strategie voor het Nederlandse deel van de Noo...  Cultuur_en_sport   
2                                            Mariene        immigratie   
3  Strategie voor het Nederlandse deel van de Noo...  Cultuur_en_sport   
4              KRM-programma van maatregelen Bijlage     criminaliteit   

   centroid_score  
0        0.75

In [22]:
results_centroid_df.to_csv('Policy_2015_centroid.csv')

In [4]:
print(results_df.head())
print(results_df['topic'].value_counts())

                                            document  \
0  mariene-strategie-voor-het-nederlandse-deel-va...   
1  mariene-strategie-voor-het-nederlandse-deel-va...   
2  mariene-strategie-voor-het-nederlandse-deel-va...   
3  mariene-strategie-voor-het-nederlandse-deel-va...   
4  mariene-strategie-voor-het-nederlandse-deel-va...   

                                            sentence     label     score  \
0                                            Mariene  LABEL_12  0.118065   
1  Strategie voor het Nederlandse deel van de Noo...   LABEL_3  0.167812   
2                                            Mariene  LABEL_12  0.118065   
3  Strategie voor het Nederlandse deel van de Noo...   LABEL_3  0.167812   
4              KRM-programma van maatregelen Bijlage   LABEL_3  0.234807   

   label_int               topic  \
0         12           onderwijs   
1          3  buitenlandse_zaken   
2         12           onderwijs   
3          3  buitenlandse_zaken   
4          3  buitenlandse

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd


try:
    print(results_df)
except NameError:
    with open("slavery-bertje-matched_results-2015.pickle", "rb") as f:
        results_df = pickle.load(fp)


grouped = results_df.groupby('topic')

topic_cohesion = {}
for topic, group in grouped:
    embeds = np.vstack(group['embedding'].values)
    if embeds.shape[0] > 1:  # Only if there's more than one sentence
        sims = cosine_similarity(embeds,dense_output=True)
        # Take mean of upper triangle (excluding the diagonal)
        mean_sim = np.mean(sims[np.triu_indices_from(sims, k=1)])
        topic_cohesion[topic] = mean_sim
    else:
        topic_cohesion[topic] = np.nan  # Or 0



cohesion_df = pd.DataFrame(
    list(topic_cohesion.items()), columns=['topic', 'mean_cosine_similarity']
).sort_values('mean_cosine_similarity', ascending=True)

all_topics = set(label2topic.values())
existing_topics = set(cohesion_df['topic'])
missing_topics = all_topics - existing_topics

print(cohesion_df)

print(results_df['topic'].value_counts())

print(results_df['label'].unique())

: 

In [None]:
import pandas as pd

# Ensure 'label_int' is numeric
results_df['label_int'] = pd.to_numeric(results_df['label_int'], errors='coerce')

# Drop rows where label_int is null or 3
filtered_df = results_df[~(results_df['label_int'].isnull() | (results_df['label_int'] == 11))].copy()

display(filtered_df.head())



Unnamed: 0,document,sentence,label,score,label_int,topic,embedding
0,mariene-strategie-voor-het-nederlandse-deel-va...,Mariene,LABEL_12,0.118065,12,onderwijs,"[-0.012886330485343933, -0.18227195739746094, ..."
1,mariene-strategie-voor-het-nederlandse-deel-va...,Strategie voor het Nederlandse deel van de Noo...,LABEL_3,0.167812,3,buitenlandse_zaken,"[0.5770301222801208, -0.16932857036590576, -0...."
2,mariene-strategie-voor-het-nederlandse-deel-va...,Mariene,LABEL_12,0.118065,12,onderwijs,"[-0.012886330485343933, -0.18227195739746094, ..."
3,mariene-strategie-voor-het-nederlandse-deel-va...,Strategie voor het Nederlandse deel van de Noo...,LABEL_3,0.167812,3,buitenlandse_zaken,"[0.5770301222801208, -0.16932857036590576, -0...."
4,mariene-strategie-voor-het-nederlandse-deel-va...,KRM-programma van maatregelen Bijlage,LABEL_3,0.234807,3,buitenlandse_zaken,"[0.5225617289543152, -0.26490160822868347, -0...."


In [None]:
import pandas as pd

# Ensure 'label_int' is numeric
results_df['label_int'] = pd.to_numeric(results_df['label_int'], errors='coerce')

# Drop rows where label_int is null or 3
filtered_df = results_df[~(results_df['label_int'].isnull() | (results_df['label_int'] == 3))].copy()

display(filtered_df.head())
