In [None]:
# This script serves to compare the Manifestoberta classifier with already annotated text
# It first chunks the text into sentences
# Second, it adds a topic label 

In [13]:
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import numpy as np
import pandas as pd


[nltk_data] Downloading package punkt to C:\Users\Laura
[nltk_data]     Mauricio\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [33]:
df = pd.read_csv("./data/manifesto_combined_texts_en.csv")
df

Unnamed: 0,party,date,text
0,43110,199910,For a sustainable Switzerland Election platfor...
1,43110,200310,Another world is possible Green Switzerland Ma...
2,43110,200710,GREEN - THE ALTERNATIVE WITH A FUTURE National...
3,43110,201110,Nature has to serve for many things: It is the...
4,43110,201510,""" Shaping the future - voting green."" Election..."
5,43110,201910,ELECTION PLATFORM 2019-2023 INTRODUCTION Switz...
6,43120,200710,Assume responsibility and live and politicize ...
7,43120,201110,With the introduction of an energy tax on non-...
8,43120,201510,"Guidelines Now, but also tomorrow We understan..."
9,43120,201910,It's time - 26 Green-Liberal policy positions ...


In [34]:
df = df.sample(1)
df

Unnamed: 0,party,date,text
19,43420,201510,"Future strategy Freedom, public spirit and pro..."


In [38]:
manifesto_text = df["text"].tolist()
manifesto_text = str(manifesto_text)

# Chunk manifesto text into sentences
sentences = sent_tokenize(manifesto_text)

# Postprocessing (e.g., remove enumerations)
sentences = [sentence for sentence in sentences if len(sentence) > 7]

#This needs to be changed based on previous code chunk
party = "43420"
date = "201510"

chunked_manifesto_df = pd.DataFrame({"party": party, "date": date, "text": sentences})
chunked_manifesto_df

Unnamed: 0,party,date,text
0,43420,201510,"[""Future strategy Freedom, public spirit and p..."
1,43420,201510,Securing this is a major challenge.
2,43420,201510,This requires a party that supports the state ...
3,43420,201510,It has made modern Switzerland possible and bu...
4,43420,201510,With its strong representation in numerous gov...
...,...,...,...
92,43420,201510,4. assume responsibility We are Switzerland tu...
93,43420,201510,"As the founding party of modern Switzerland, w..."
94,43420,201510,Our joint success depends on our own courage.
95,43420,201510,Standing still means going backwards.


In [22]:
# Now apply the topic classifier 
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [23]:
# Check whether GPU available
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [26]:
# Classification model manifestoberta
model = AutoModelForSequenceClassification.from_pretrained("manifesto-project/manifestoberta-xlm-roberta-56policy-topics-sentence-2023-1-1")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [27]:
# This function classifies sentences based on the 56 topics of the Manifesto Project
def classify_sentence(sentence):
    inputs = tokenizer(
        sentence,
        return_tensors="pt",
        # max_length=200,
        padding="max_length",
        truncation=True
    )

    # Move to GPU if available 
    inputs = inputs.to(device)

    logits = model(**inputs).logits
    predicted_class = model.config.id2label[logits.argmax().item()]
    return predicted_class

In [39]:
classified_manifesto = chunked_manifesto_df
classified_manifesto.loc[:, "topic"] = classified_manifesto["text"].apply(classify_sentence)
classified_manifesto

Unnamed: 0,party,date,text,topic
0,43420,201510,"[""Future strategy Freedom, public spirit and p...",601 - National Way of Life: Positive
1,43420,201510,Securing this is a major challenge.,504 - Welfare State Expansion
2,43420,201510,This requires a party that supports the state ...,305 - Political Authority
3,43420,201510,It has made modern Switzerland possible and bu...,601 - National Way of Life: Positive
4,43420,201510,With its strong representation in numerous gov...,305 - Political Authority
...,...,...,...,...
92,43420,201510,4. assume responsibility We are Switzerland tu...,601 - National Way of Life: Positive
93,43420,201510,"As the founding party of modern Switzerland, w...",601 - National Way of Life: Positive
94,43420,201510,Our joint success depends on our own courage.,601 - National Way of Life: Positive
95,43420,201510,Standing still means going backwards.,305 - Political Authority


In [40]:
# Function to map topic to main_topic
def map_to_main_topic(topic):
    if str(topic).startswith('1'):
        return "external relations"
    elif str(topic).startswith('2'):
        return "freedom and democracy"
    elif str(topic).startswith('3'):
        return "political system"
    elif str(topic).startswith('4'):
        return "economy"
    elif str(topic).startswith('5'):
        return "welfare and quality of life"
    elif str(topic).startswith('6'):
        return "fabric of society"
    elif str(topic).startswith('7'):
        return "social groups"
    else:
        return "other" 

# Add a new column 'main_topic' based on the 'topic'
classified_manifesto['main_topic'] = classified_manifesto['topic'].apply(map_to_main_topic)
classified_manifesto

Unnamed: 0,party,date,text,topic,main_topic
0,43420,201510,"[""Future strategy Freedom, public spirit and p...",601 - National Way of Life: Positive,fabric of society
1,43420,201510,Securing this is a major challenge.,504 - Welfare State Expansion,welfare and quality of life
2,43420,201510,This requires a party that supports the state ...,305 - Political Authority,political system
3,43420,201510,It has made modern Switzerland possible and bu...,601 - National Way of Life: Positive,fabric of society
4,43420,201510,With its strong representation in numerous gov...,305 - Political Authority,political system
...,...,...,...,...,...
92,43420,201510,4. assume responsibility We are Switzerland tu...,601 - National Way of Life: Positive,fabric of society
93,43420,201510,"As the founding party of modern Switzerland, w...",601 - National Way of Life: Positive,fabric of society
94,43420,201510,Our joint success depends on our own courage.,601 - National Way of Life: Positive,fabric of society
95,43420,201510,Standing still means going backwards.,305 - Political Authority,political system


In [43]:
# Split 'topic' into 'cmp_code'
manifesto_text["cmp_code"] = manifesto_text["topic"].str.split(" - ", n=1, expand=True).loc[:,0]
manifesto_text

TypeError: string indices must be integers

In [47]:
# Compute the shares of topics in order to compare with already annotated manifestos
counts = classified_manifesto["main_topic"].value_counts()
total = len(classified_manifesto["main_topic"])
share = (counts / total) * 100
share

main_topic
fabric of society              35.051546
economy                        15.463918
political system               14.432990
welfare and quality of life    13.402062
freedom and democracy          10.309278
external relations              9.278351
social groups                   2.061856
Name: count, dtype: float64

In [65]:
# Compute share from annotated manifesto
# Load dataset from siebert sentiment analysis which contains manifesto from 1999 to 2019
annotated_manifestos_1999_2019= pd.read_pickle("./data/df_main_sentiment_1999-2019.pkl")
annotated_manifestos_1999_2019

Unnamed: 0,party,date,text,cmp_code,eu_code,sentiment,score,party_name,main_topic
0,43110,199910,For a sustainable Switzerland,,,POSITIVE,0.998674,GPS,other
1,43110,199910,Election platform 1999,,,POSITIVE,0.987673,GPS,other
2,43110,199910,The Greens are pleased to present an election ...,501,,POSITIVE,0.998852,GPS,welfare and quality of life
3,43110,199910,- to briefly present a sustainable Switzerland.,601,,POSITIVE,0.998062,GPS,fabric of society
4,43110,199910,"In responsibility towards future generations, ...",201,,POSITIVE,0.998758,GPS,freedom and democracy
...,...,...,...,...,...,...,...,...,...
18169,43811,201910,The same is true for other projects that lead ...,414,,POSITIVE,0.998668,BDP,economy
18170,43811,201910,"Various reforms in the financial center, for e...",414,,POSITIVE,0.998879,BDP,economy
18171,43811,201910,They must be seen as an opportunity.,414,,POSITIVE,0.994448,BDP,economy
18172,43811,201910,There is no way around the recognition and imp...,107,,POSITIVE,0.998304,BDP,external relations


In [66]:
#Select manifesto
selected_manifesto = annotated_manifestos_1999_2019[(annotated_manifestos_1999_2019["party"] == 43420) & (annotated_manifestos_1999_2019["date"] == 201510)]
selected_manifesto

Unnamed: 0,party,date,text,cmp_code,eu_code,sentiment,score,party_name,main_topic
10767,43420,201510,Future strategy,H,,POSITIVE,0.992831,FDP,other
10768,43420,201510,"Freedom, public spirit and progress - for the ...",H,,POSITIVE,0.998617,FDP,other
10769,43420,201510,1. success obligates,H,,POSITIVE,0.995387,FDP,other
10770,43420,201510,Switzerland is a model of success.,601.1,,POSITIVE,0.998679,FDP,fabric of society
10771,43420,201510,Securing this is a major challenge.,305.1,,NEGATIVE,0.998764,FDP,political system
...,...,...,...,...,...,...,...,...,...
10881,43420,201510,Let's take responsibility:,410,,POSITIVE,0.995482,FDP,economy
10882,43420,201510,Let's keep our country at the forefront of the...,201.1,,POSITIVE,0.998234,FDP,freedom and democracy
10883,43420,201510,Public spirit,202.1,,POSITIVE,0.995321,FDP,freedom and democracy
10884,43420,201510,and progress,410,,POSITIVE,0.997870,FDP,economy


In [67]:
# Compute the shares of topics in order to compare with already annotated manifestos
counts_selected = selected_manifesto["main_topic"].value_counts()
total_selected = len(selected_manifesto["main_topic"])
share_selected = (counts_selected / total_selected) * 100

In [None]:
# Compare shares of topic between two methods
print(f"Share of main topics using Manifestoberta classifier:",share)
print(f"Share of main topics of annotated manifestos;", share_selected)

Share of main topics using Manifestoberta classifier: main_topic
fabric of society              35.051546
economy                        15.463918
political system               14.432990
welfare and quality of life    13.402062
freedom and democracy          10.309278
external relations              9.278351
social groups                   2.061856
Name: count, dtype: float64
Share of main topics of annotated manifestos; main_topic
freedom and democracy          19.327731
external relations             19.327731
political system               15.966387
fabric of society              15.126050
economy                        14.285714
other                           7.563025
welfare and quality of life     6.722689
social groups                   1.680672
Name: count, dtype: float64
