In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import pandas as pd
import spacy
import torch

# Ensure CUDA is detected
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Paths
MODEL_DIR = "./bertje-finetuned-final"    # Path to your local fine-tuned model directory
BASE_MODEL = "GroNLP/bert-base-dutch-cased"  # Always use base model for tokenizer

# Load tokenizer and model (from local)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR).to(device)

# Load your policy docs (CSV or Excel)
unseen_policy_df = pd.read_excel("2015_selectedtypes_cleaned (1).xlsx")  # Or use .csv

# SpaCy for Dutch sentence splitting
nlp = spacy.load("nl_core_news_sm")

def split_sentences(text):
    doc = nlp(str(text))
    return [sent.text.strip() for sent in doc.sents]

# Prepare sentences
all_sentences = []
doc_ids = []
for i, row in unseen_policy_df.iterrows():
    doc_id = row['filename'] if 'filename' in row else f'doc_{i}'
    text = row['clean_text']
    if not pd.isna(text):
        sents = split_sentences(text)
        all_sentences.extend(sents)
        doc_ids.extend([doc_id]*len(sents))

# Set up HuggingFace pipeline for local CUDA inference
nlp_pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

# Predict topics
results = nlp_pipe(all_sentences, truncation=True, max_length=128, batch_size=32)

# Combine results
results_df = pd.DataFrame({
    'document': doc_ids,
    'sentence': all_sentences,
    'label': [r['label'] for r in results],
    'score': [r['score'] for r in results]
})

# You must have your label2topic mapping available!
# Example:
label2topic = {
    0: "onderwijs",
    1: "milieu",
    # ... fill in all labels as needed
}
results_df['topic'] = results_df['label'].apply(lambda x: label2topic[int(x.replace('LABEL_', ''))])

display(results_df.head())

# Save results
results_df.to_csv('bertje_topic_predictions.csv', index=False)


Using device: cuda


tokenizer_config.json:   0%|          | 0.00/254 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/242k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

HFValidationError: Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are forbidden, '-' and '.' cannot start or end the name, max length is 96: './bertje-finetuned-final'.