In [None]:
# This script runs the topic classifier using the Manifestoberta model on manifestos from 1963 to 1995

In [1]:
from pathlib import Path

import pandas as pd
import torch

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
# Raw input: a public gist holding the manifesto texts with their party and date columns.
manifesto_csv_url = "https://gist.githubusercontent.com/ogrnz/86dd6d7a1f644d89b7efaf960e435c67/raw/99684de478823932e9a7ccab2d27ce58da2f2b12/gistfile1.txt"
df_orig = pd.read_csv(manifesto_csv_url)
df_orig

Unnamed: 0,party,date,text
0,43320,196310,ARBEITSPROGRAMM 1963 Vor 75 Jahren erfolgte d...
1,43320,196310,In Würdigung dieses historischen und für die w...
2,43320,196310,Recht auf Bildung — Chancen für die Jugend De...
3,43320,196310,Jedermann soll Anteil an den kulturellen Werte...
4,43320,196310,Was dem ganzen Volke und vornehmlich der Jugen...
...,...,...,...
14722,43810,199510,• dass der körperliche Drogenentzug weiterhin ...
14723,43810,199510,• dass sowohl der Entzug mit medikamentöser Be...
14724,43810,199510,Der Entzug muss auch gegen den Willen der Betr...
14725,43810,199510,Es sind die geeigneten Massnahmen dafür bereit...


In [4]:
# Run on the first CUDA device when one is visible to torch; otherwise use the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [5]:
# Manifestoberta sequence-classification checkpoint, placed on the selected device
# as soon as it is loaded.
model = AutoModelForSequenceClassification.from_pretrained(
    "manifesto-project/manifestoberta-xlm-roberta-56policy-topics-sentence-2023-1-1"
).to(device)

# The tokenizer is taken from the xlm-roberta-large checkpoint.
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")

config.json:   0%|          | 0.00/5.59k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [6]:
# This function classifies sentences based on the 56 topics of the Manifesto Project
def classify_sentence(sentence):
    """Return the predicted topic label for a single piece of text.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Parameters
    ----------
    sentence : str
        Text to classify. Input that is too long is truncated by the
        tokenizer (``truncation=True``).

    Returns
    -------
    str
        The ``model.config.id2label`` entry for the highest-scoring logit.
    """
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        # max_length=200,  # the input was limited to 200 tokens during finetuning
        # No padding: only one sequence is encoded per call, so padding it up
        # to the maximum length would only add masked tokens for the model to
        # process without contributing to the prediction.
        truncation=True,
    )

    # Move to GPU if available
    inputs = inputs.to(device)

    # Inference only: skip autograd bookkeeping so no graph is built and kept
    # alive for each of the many calls made through DataFrame.apply.
    with torch.no_grad():
        logits = model(**inputs).logits

    return model.config.id2label[logits.argmax().item()]

In [11]:
# Classify every text and attach the label as a new "topic" column; df_orig
# itself is left untouched because assign returns a new frame.
# For a quick smoke test, run this on a slice such as df_orig.iloc[:50] first.
df = df_orig.assign(topic=df_orig["text"].apply(classify_sentence))
df

In [13]:
# Persist the classified manifestos. The target folder is created first:
# to_csv does not create missing directories, and failing here would throw
# away the long classification run above.
output_path = Path("./data/classified_manifestos_pre99.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)