In [3]:
"""
nb_sentiment_pipeline_v2.py

Pipeline for Nasjonalbibliotekets API:
- Hent avisartikler per parti per månad
- Sample artiklar
- Kjør sentimentanalyse (HuggingFace NB-modell)
- Aggreger til månadlige features
- Lagre til CSV/Parquet

Krav:
pip install requests pandas tqdm transformers sentencepiece torch pyarrow
"""

import os
import time
import random
import urllib.parse
from datetime import datetime, timedelta
import pandas as pd
from tqdm import tqdm
from transformers import pipeline
import requests

# ---------------------------
# Konfigurasjon
# ---------------------------

API_KEY = os.environ.get("NB_API_KEY")  # sett environment variable NB_API_KEY
if not API_KEY:
    print("OBS! Du må sette NB_API_KEY som miljøvariabel eller her direkte.")

NB_BASE = "https://api.nb.no/catalog/v1/items"
HF_MODEL = "NbAiLab/nb-bert-base"
SAMPLE_PER_MONTH = 50
REQUEST_SLEEP = 0.1  # pause mellom kall

# Partinøkkelord – enkle søk utan negative ord
PARTY_QUERIES = {
    "Ap": '"Arbeiderpartiet" OR "Ap"',
    "H": '"Høyre" OR "Høgre"',
    "Frp": '"Fremskrittspartiet" OR "Frp"',
    "SV": '"Sosialistisk Venstreparti" OR "SV"',
    "Sp": '"Senterpartiet" OR "Sp"',
    "KrF": '"Kristelig Folkeparti" OR "KrF"',
    "V": '"Venstre"',
    "MDG": '"Miljøpartiet De Grønne" OR "MDG"',
    "Rødt": '"Rødt"',
}

# ---------------------------
# Hjelfunksjoner
# ---------------------------

def nb_search(query: str, from_date: str, to_date: str, size: int = 100, offset: int = 0) -> dict:
    """Hent metadata fra NB API"""
    url = f"{NB_BASE}?mediatype=aviser&q={urllib.parse.quote(query)}&fromDate={from_date}&toDate={to_date}&size={size}&offset={offset}"
    headers = {"Ocp-Apim-Subscription-Key": API_KEY}
    resp = requests.get(url, headers=headers, timeout=30)
    resp.raise_for_status()
    return resp.json()

def fetch_doc_ids_for_month(query: str, year: int, month: int, max_hits: int = 5000) -> list:
    """Hent document IDs for ein månad"""
    from_date = f"{year}-{month:02d}-01"
    next_month = datetime(year, month, 1) + timedelta(days=32)
    last_day = (next_month.replace(day=1) - timedelta(days=1)).day
    to_date = f"{year}-{month:02d}-{last_day:02d}"
    ids = []
    size = 100
    offset = 0
    while True:
        resp = nb_search(query, from_date, to_date, size=size, offset=offset)
        docs = resp.get("items", [])
        if not docs:
            break
        for d in docs:
            doc_id = d.get("id")
            if doc_id:
                ids.append(doc_id)
        offset += size
        if len(ids) >= max_hits or len(docs) < size:
            break
        time.sleep(REQUEST_SLEEP)
    return ids

def download_document_text(doc_id: str) -> str:
    """Hent tekst for eit dokument (dersom tilgjengeleg)"""
    url = f"{NB_BASE}/{doc_id}"
    headers = {"Ocp-Apim-Subscription-Key": API_KEY}
    resp = requests.get(url, headers=headers, timeout=30)
    if resp.status_code == 200:
        j = resp.json()
        text = j.get("text") or j.get("fullText") or ""
        return text
    return ""

def build_sentiment_pipeline(model_name=HF_MODEL):
    print(f"Initialiserer sentiment-pipeline med {model_name} ...")
    return pipeline("sentiment-analysis", model=model_name, device=-1)

def score_texts(sent_pipeline, texts: list) -> list:
    """Kjør sentiment på tekster, returner [-1, +1]"""
    scores = []
    for t in texts:
        if not t.strip():
            scores.append(0.0)
            continue
        out = sent_pipeline(t[:4096])
        label = out[0]["label"].upper()
        score = out[0]["score"]
        val = score if "POS" in label else -score
        scores.append(val)
    return scores

def aggregate_monthly(party: str, year: int, month: int, sample_ids: list, sent_pipeline) -> dict:
    texts = []
    for doc_id in tqdm(sample_ids, desc=f"{party} {year}-{month:02d}"):
        txt = download_document_text(doc_id)
        if txt:
            texts.append(txt)
        time.sleep(REQUEST_SLEEP)
    if not texts:
        return {"party": party, "year": year, "month": month, "n_articles": 0, "mean_sent": None}
    scores = score_texts(sent_pipeline, texts)
    return {
        "party": party,
        "year": year,
        "month": month,
        "n_articles": len(texts),
        "mean_sent": float(pd.Series(scores).mean()),
        "std_sent": float(pd.Series(scores).std()),
        "pos_share": sum(1 for s in scores if s>0.05)/len(scores),
        "neg_share": sum(1 for s in scores if s<-0.05)/len(scores)
    }

# ---------------------------
# Hovudløype
# ---------------------------

def build_monthly_dataset(start_year, start_month, end_year, end_month, parties):
    sent_pipeline = build_sentiment_pipeline()
    results = []
    y, m = start_year, start_month
    while (y < end_year) or (y == end_year and m <= end_month):
        for party_key, query in parties.items():
            try:
                ids = fetch_doc_ids_for_month(query, y, m, max_hits=5000)
                sample_ids = random.sample(ids, min(len(ids), SAMPLE_PER_MONTH)) if ids else []
                agg = aggregate_monthly(party_key, y, m, sample_ids, sent_pipeline)
                agg["total_hits"] = len(ids)
                results.append(agg)
            except Exception as e:
                print(f"Feil for {party_key} {y}-{m:02d}: {e}")
        next_date = datetime(y, m, 1) + timedelta(days=32)
        y, m = next_date.year, next_date.month
    df = pd.DataFrame(results).sort_values(["party","year","month"]).reset_index(drop=True)
    return df



OBS! Du må sette NB_API_KEY som miljøvariabel eller her direkte.


In [4]:

def example_run():
    start_year, start_month = 2024, 1
    end_year, end_month = 2024, 3
    df = build_monthly_dataset(start_year, start_month, end_year, end_month, PARTY_QUERIES)
    df.to_csv("nb_monthly_party_sentiment.csv", index=False)
    print("Ferdig! CSV")

if __name__ == "__main__":
    example_run()

Initialiserer sentiment-pipeline med NbAiLab/nb-bert-base ...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/nb-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Ap 2024-01: 0it [00:00, ?it/s]
H 2024-01: 0it [00:00, ?it/s]
Frp 2024-01: 0it [00:00, ?it/s]
SV 2024-01: 0it [00:00, ?it/s]
Sp 2024-01: 0it [00:00, ?it/s]
KrF 2024-01: 0it [00:00, ?it/s]
V 2024-01: 0it [00:00, ?it/s]
MDG 2024-01: 0it [00:00, ?it/s]
Rødt 2024-01: 0it [00:00, ?it/s]
Ap 2024-02: 0it [00:00, ?it/s]
H 2024-02: 0it [00:00, ?it/s]
Frp 2024-02: 0it [00:00, ?it/s]
SV 2024-02: 0it [00:00, ?it/s]
Sp 2024-02: 0it [00:00, ?it/s]
KrF 2024-02: 0it [00:00, ?it/s]
V 2024-02: 0it [00:00, ?it/s]
MDG 2024-02: 0it [00:00, ?it/s]
Rødt 2024-02: 0it [00:00, ?it/s]
Ap 2024-03: 0it [00:00, ?it/s]
H 2024-03: 0it [00:00, ?it/s]
Frp 2024-03: 0it [00:00, ?it/s]
SV 2024-03: 0it [00:00, ?it/s]
Sp 2024-03: 0it 

Ferdig! CSV



