# In [None]:
# Pre-processing pipeline

import pandas as pd
import string
import spacy

# Load data
df = pd.read_csv("Dutch_Migration_News.csv")

# Expected columns
TITLE_COL = "title"
BODY_COL  = "body"

# Fill missing values with "" BEFORE casting to str: on a column that contains
# NaN, astype(str) alone yields the literal string "nan", which would then be
# lemmatized and end up in the corpus as a bogus token.
df[TITLE_COL] = df[TITLE_COL].fillna("").astype(str)
df[BODY_COL]  = df[BODY_COL].fillna("").astype(str)

# Combine title + body; strip so that an empty title or body does not leave a
# stray leading/trailing space in the combined text.
df["text"] = (df[TITLE_COL] + " " + df[BODY_COL]).str.strip()

# Load SpaCy model (Dutch)
# Run once in your own environment if not installed:
#!python -m spacy download nl_core_news_sm
nlp = spacy.load("nl_core_news_sm")

# preprocessing function
# - POS filtering
# - Lemmatization
# - Stopword removal
# - Remove punctuation & digits
# - Lowercasing

# Coarse POS tags whose tokens are kept (content words only).
_INCLUDE_POS = frozenset({"NOUN", "PROPN", "ADJ", "VERB"})

# Single translation table that deletes ASCII punctuation and digits in one pass.
_STRIP_TABLE = str.maketrans("", "", string.punctuation + string.digits)


def clean_text_spacy(doc):
    """Return a cleaned, space-separated string of lemmas for one document.

    Processing steps:
      1. keep tokens whose ``pos_`` is NOUN, PROPN, ADJ or VERB and whose
         ``is_stop`` flag is false;
      2. take each kept token's ``lemma_``;
      3. delete ASCII punctuation and digits;
      4. lowercase;
      5. collapse whitespace, so lemmas that became empty in step 3
         (e.g. "2020" or "--") leave no double, leading or trailing spaces.

    Args:
        doc: An iterable of token-like objects exposing ``lemma_``, ``pos_``
            and ``is_stop`` (e.g. a spaCy ``Doc``).

    Returns:
        The cleaned text as a single string; ``""`` if no token survives.
    """
    lemmas = (
        token.lemma_.strip()
        for token in doc
        if token.pos_ in _INCLUDE_POS and not token.is_stop
    )

    text = " ".join(lemmas).translate(_STRIP_TABLE).lower()

    # split()/join normalizes the gaps left behind by lemmas that were
    # reduced to the empty string when punctuation and digits were removed.
    return " ".join(text.split())

# Apply SpaCy pipeline
# nlp.pipe processes the texts as a batched stream, which is spaCy's
# recommended way to handle many documents; it yields one Doc per input text
# in input order, so the list lines up positionally with the DataFrame rows.
df["text_nlp"] = list(nlp.pipe(df["text"]))
df["clean_text"] = df["text_nlp"].apply(clean_text_spacy)

# Token list version (useful for LDA?)
# Whitespace split of the cleaned string -> one list of tokens per row.
df["clean_tokens"] = df["clean_text"].str.split()

# Quick sanity check

df[["text", "clean_text", "clean_tokens"]].head()