# **Load & basic preparation**
The dataset is loaded, inspected, and prepared by combining titles and bodies into a single text field for analysis.

In [None]:
import pandas as pd
import numpy as np
from google.colab import files

import io

# Load data
# Name the dataset is expected to have; also used as the on-disk fallback.
DATA_FILE = "Dutch_Migration_News.csv"

# files.upload() opens the Colab upload dialog and returns a dict that maps
# each uploaded file's original name to its raw bytes.
uploaded = files.upload()

# Read from the returned bytes rather than from a fixed path on disk.
# NOTE(review): Colab appears to store a re-uploaded file under a
# de-duplicated name (e.g. "name (1).csv"), in which case reading the fixed
# path would silently load the older copy -- confirm against the Colab docs.
if DATA_FILE in uploaded:
    csv_source = io.BytesIO(uploaded[DATA_FILE])
elif uploaded:
    # The file was uploaded under a different name: use the first upload.
    csv_source = io.BytesIO(next(iter(uploaded.values())))
else:
    # Nothing was uploaded (dialog cancelled): fall back to a copy on disk.
    csv_source = DATA_FILE

df = pd.read_csv(csv_source)

# Quick exploration
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nMissing values per column:")
print(df.isna().sum())

display(df.head(3))

# Basic cleaning / preparation
# Drop index column if present
if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])

# Check required text columns
expected_cols = {"title", "body"}
missing = expected_cols - set(df.columns)
if missing:
    raise ValueError(f"Dataset is missing columns: {missing}")

# Fill missing text values. astype(str) guards against non-string cells
# (e.g. a purely numeric title): the .str accessor returns NaN for those,
# which would make the row silently disappear in the length filter below.
df["title"] = df["title"].fillna("").astype(str)
df["body"] = df["body"].fillna("").astype(str)

# Combine title and body. The ". " separator is only inserted when both
# parts are non-empty, so an article without a title does not start with a
# stray ". " and one without a body does not end with a dangling ".".
title_clean = df["title"].str.strip()
body_clean = df["body"].str.strip()
needs_separator = (title_clean != "") & (body_clean != "")
df["text"] = title_clean.mask(needs_separator, title_clean + ". ") + body_clean

# Remove very short texts
df["text_len"] = df["text"].str.len()
df = df[df["text_len"] >= 50].copy()

print("\nAfter basic cleaning:")
print("Shape:", df.shape)
print("Text length stats:")
print(df["text_len"].describe()[["min", "25%", "50%", "mean", "75%", "max"]])

# Keep relevant columns. Only "title" and "body" were validated above, so the
# metadata columns are kept when present instead of raising a bare KeyError.
wanted_cols = ["outlet", "year", "authors", "title", "text"]
skipped_cols = [col for col in wanted_cols if col not in df.columns]
if skipped_cols:
    print("Note: columns not found in dataset and skipped:", skipped_cols)
keep_cols = [col for col in wanted_cols if col in df.columns]
df = df[keep_cols].copy()

# sample() raises when asked for more rows than exist, so cap the sample size.
display(df.sample(min(3, len(df)), random_state=1))

# **Text preprocessing**
Text preprocessing is applied to normalize the texts and prepare them for topic modeling and sentiment analysis.

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Stopwords: fetch the NLTK stopword corpus, then build one lookup set per
# language plus a combined set used by the preprocessing step.
nltk.download("stopwords")

dutch_stopwords = set(stopwords.words("dutch"))
english_stopwords = set(stopwords.words("english"))

# Union of both languages, so English words in the articles are filtered too
all_stopwords = dutch_stopwords | english_stopwords

# Report the size of each stopword set
print("Number of Dutch stopwords:", len(dutch_stopwords))
print("Number of English stopwords:", len(english_stopwords))
print("Total stopwords (combined):", len(all_stopwords))

# Text preprocessing function
# Matches every character that is neither a lowercase letter nor whitespace.
# The accented range is split into à-ö and ø-ÿ so that U+00F7 (the division
# sign), which sits between them, is treated as a symbol and removed.
_NON_LETTER_RE = re.compile(r"[^a-zà-öø-ÿ\s]")


def preprocess_text(text):
    """
    Basic Dutch text preprocessing:
    - lowercasing
    - remove punctuation, symbols and numbers (replaced by spaces)
    - tokenization by whitespace
    - remove Dutch + English stopwords (module-level ``all_stopwords``)
    - keep tokens with length >= 3

    Parameters
    ----------
    text : str
        Raw article text. Non-string input (e.g. None or NaN) yields [].

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Missing values have no tokens; return early instead of raising.
    if not isinstance(text, str):
        return []

    # Lowercase first so the character class only needs lowercase ranges.
    lowered = text.lower()

    # Replace with a space (not an empty string) so that words separated only
    # by punctuation do not get glued together.
    letters_only = _NON_LETTER_RE.sub(" ", lowered)

    # Whitespace tokenization, then length and stopword filtering.
    return [
        token for token in letters_only.split()
        if len(token) >= 3 and token not in all_stopwords
    ]

# Tokenize every article, then rebuild a space-separated string per article
# (the vectorizer in the next section consumes strings, not token lists).
df["tokens"] = df["text"].map(preprocess_text)
df["clean_text"] = df["tokens"].map(" ".join)

# Sanity checks on the first article
print("Example tokens:")
print(df["tokens"].iat[0][:30])

print("\nExample clean_text:")
print(df["clean_text"].iat[0][:300])

# Articles that lost all their tokens during preprocessing
empty_docs = df["clean_text"].eq("").sum()
print("\nEmpty documents after preprocessing:", empty_docs)

# **Topic Modeling (TF-IDF + LDA)**
Topic modeling is performed using TF-IDF features and Latent Dirichlet Allocation (LDA) to identify dominant themes in migration-related news coverage.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

# TF-IDF vectorization of the cleaned article strings.
# max_df=0.9 -> drop terms found in more than 90% of the documents
# min_df=10  -> drop terms found in fewer than 10 documents
# max_features=5000 -> cap the vocabulary size
tfidf_vectorizer = TfidfVectorizer(max_features=5000, min_df=10, max_df=0.9)
X_tfidf = tfidf_vectorizer.fit_transform(df["clean_text"])
print("TF-IDF matrix shape:", X_tfidf.shape)

# LDA model
# NOTE(review): LDA is usually fitted on raw term counts (scikit-learn's own
# topic-extraction example uses CountVectorizer for LDA); TF-IDF weights are
# kept here because that is this section's stated design -- confirm intent.
N_TOPICS = 8  # number of topics

# fit() returns the estimator itself, so construction and fitting are chained.
lda_model = LatentDirichletAllocation(
    n_components=N_TOPICS,
    learning_method="batch",
    random_state=42,
).fit(X_tfidf)

# Vocabulary terms, positionally aligned with the columns of X_tfidf
feature_names = tfidf_vectorizer.get_feature_names_out()

def display_topics(model, feature_names, n_top_words=12):
    """
    Collect the highest-weighted terms of every topic in a fitted model.

    Parameters: ``model`` exposes ``components_`` (one weight row per topic),
    ``feature_names`` is indexable by column position, and ``n_top_words`` is
    the number of terms to keep per topic.

    Returns a dict mapping "Topic <index>" to a list of terms ordered from
    highest to lowest weight. Nothing is printed.
    """
    top_terms_by_topic = {}
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending; reverse it so the heaviest terms come first.
        ranked_columns = weights.argsort()[::-1]
        top_terms_by_topic[f"Topic {topic_no}"] = [
            feature_names[col] for col in ranked_columns[:n_top_words]
        ]
    return top_terms_by_topic

# Print the top terms of every topic
topics = display_topics(lda_model, feature_names)

for topic, words in topics.items():
    print(f"\n{topic}:")
    print(", ".join(words))

# Dominant topic per article: the topic with the highest probability
doc_topic_dist = lda_model.transform(X_tfidf)
df["dominant_topic"] = doc_topic_dist.argmax(axis=1)

# Articles with an all-zero TF-IDF row (no term survived preprocessing and
# vocabulary filtering) carry no evidence for any topic. argmax still returns
# an index for them (the first one on ties), which would silently count them
# as Topic 0. Mark them as missing instead; the nullable "Int64" dtype keeps
# the remaining labels as integers, and groupby/value_counts skip missing keys.
has_terms = np.asarray(X_tfidf.sum(axis=1)).ravel() > 0
df["dominant_topic"] = df["dominant_topic"].astype("Int64").where(has_terms)

n_without_terms = int((~has_terms).sum())
if n_without_terms:
    print(f"\nArticles without vocabulary terms (no topic assigned): {n_without_terms}")

df["dominant_topic"].value_counts().sort_index()

# **Sentiment Analysis**
Sentiment scores are computed on the original article texts using VADER, a lexicon-based approach, and are then aggregated per dominant topic.

In [None]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# VADER setup
# Download the lexicon resource, then create the analyzer instance that
# get_sentiment() uses for every article.
# NOTE(review): VADER's lexicon is English, while the articles appear to be
# Dutch (per the dataset name and the preprocessing docstring). Scores may
# therefore be driven by only a few recognised words -- confirm that this is
# acceptable or consider a Dutch sentiment lexicon.
nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

# Document-level sentiment
def get_sentiment(text):
    """
    Return the VADER compound polarity score of ``text``.

    Uses the module-level analyzer ``sia``. The compound score lies in
    [-1, 1]; negative values indicate negative polarity.
    """
    polarity = sia.polarity_scores(text)
    return polarity["compound"]

# Score the combined title + body text of every article
df["sentiment"] = df["text"].map(get_sentiment)

# Distribution check (summary statistics of the compound scores)
df["sentiment"].describe()

In [None]:
# Sentiment per topic: mean and median compound score plus the number of
# articles, one row per dominant topic.
topic_sentiment = (
    df["sentiment"]
    .groupby(df["dominant_topic"])
    .agg(["mean", "median", "count"])
    .sort_index()
)

topic_sentiment

In [None]:
# Sentiment categories
def sentiment_label(score):
    """
    Map a compound score to "positive", "negative" or "neutral".

    Scores above 0.05 are positive, scores below -0.05 are negative, and
    everything else (both boundary values included) is neutral.
    """
    # The two comparisons are kept separate, in this order, so that a score
    # failing both of them (e.g. NaN) falls through to "neutral".
    if score > 0.05:
        return "positive"
    if score < -0.05:
        return "negative"
    return "neutral"

# Label every article, then count the labels within each dominant topic
df["sentiment_label"] = df["sentiment"].map(sentiment_label)

# Rows: dominant topic. Columns: sentiment label. Cells: number of articles
# (0 where a topic has no article with that label).
label_counts = df.groupby(["dominant_topic", "sentiment_label"]).size()
sentiment_distribution = label_counts.unstack("sentiment_label", fill_value=0)

sentiment_distribution