# Setup

In [1]:
import os
import pickle
import random

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from datasets import ClassLabel
from datasets import Dataset, DatasetDict
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.models.phrases import Phrases, Phraser
from safetensors.torch import load_file, save_file
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DataCollatorWithPadding, BertModel, get_linear_schedule_with_warmup

from utils.functions import group_texts, sentiment_code, topic_code,party_deu, clean_text_loop, sentiment_code_coalition, topic_code_coalition
from utils.functions import train_loop, eval_loop, tokenize_function, d2v_reduct, scale_func, recode_tw, copy_weights
# Import the new uncertainty module
from utils.uncertainty import (
    ensemble_inference,
    train_deep_ensemble,
    load_ensemble_models,
    save_ensemble_results,
    create_ensemble_summary_dataframe
)
from utils.models import ContextScalePrediction, corpusIterator, phraseIterator
#nltk.download('stopwords') ## Remove comments and do it once if you haven't

In [2]:
# Prefer the GPU, but fall back to the CPU so the notebook can still be opened
# and run (slowly) on a machine without CUDA instead of failing in this cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Release cached GPU memory left over from earlier runs in this kernel.
    torch.cuda.empty_cache()
# Last expression is shown as the cell output: the GPU name, or 'cpu'.
torch.cuda.get_device_name(device) if device.type == 'cuda' else 'cpu'


'NVIDIA RTX PRO 6000 Blackwell Workstation Edition'

In [3]:
## Pseudo-randomness for reproducibility
seed_val = 1234
torch.manual_seed(seed_val)
random.seed(seed_val)
np.random.seed(seed_val)


# Small test of BERT embeddings

In [None]:
model_name = 'bert-base-cased'

In [None]:
model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
sentence_a = 'I went to the river bank'
sentence_b = 'I went to the bank by the river'
tok_a = tokenizer(sentence_a, return_tensors='pt')
tok_b = tokenizer(sentence_b, return_tensors='pt')

In [None]:
tok_a

In [None]:
tok_b

In [None]:
# Forward both sentences through BERT without tracking gradients (inference only).
with torch.no_grad():
    outputs_a = model(**tok_a)
    outputs_b = model(**tok_b)
# Extract word embeddings from the last hidden layer
last_hidden_states_a = outputs_a.last_hidden_state
last_hidden_states_b = outputs_b.last_hidden_state

# Extract the contextual embedding of the word "bank" in each sentence.
# Positions assume one wordpiece per word plus [CLS]/[SEP] — verify against the tok_a / tok_b cells above.
word_embedding_a = last_hidden_states_a[:, -2, :] ## position -1 is [SEP], so -2 is the last word: 'bank'
word_embedding_b = last_hidden_states_b[:, 5, :] ## [CLS]=0, I=1, went=2, to=3, the=4, bank=5

In [None]:
np.corrcoef(word_embedding_a.numpy(), word_embedding_b.numpy()).round(2)

# Implementation

## Data preparation

In [None]:
manifesto = pd.read_csv(os.path.join("data", "r_outputs","pulled_manifestoes.csv"), encoding="utf-8", dtype = {2: 'str', 18:'str'})

In [None]:
manifesto = manifesto[(manifesto.cmp_code.notna()) & ~(manifesto.cmp_code.isin(['H']))].reset_index(drop=True)
len(manifesto)

In [None]:
manifesto['sentiment'] = manifesto['cmp_code'].apply(sentiment_code)
manifesto['topic'] = manifesto['cmp_code'].apply(topic_code)
manifesto['election'] = manifesto['date'].astype(str).str[:4]

In [None]:
manifesto.groupby(['topic','sentiment']).count()

In [None]:
grouped_result = manifesto.groupby(['topic', 'sentiment', 'cmp_code']).size().reset_index(name='count')
grouped_result.to_csv('data/temps/categorization_table.csv', index=False)




In [None]:
manifesto.groupby('sentiment').count()

In [None]:
len(manifesto[manifesto.topic=="Military"])/len(manifesto)*100 ## minority group: 1.7%

In [None]:
texts = manifesto['text'].tolist()

In [None]:
from statistics import stdev, mean
## Before
seq_len = [len(i.split()) for i in texts]
seq_len_mean = mean(seq_len)
seq_len_std = stdev(seq_len)
seq_len_max = max(seq_len)
seq_len_min = min(seq_len)
print('Mean length (word) is: {}'.format(seq_len_mean))
print('Std length (word) is: {}'.format(seq_len_std))
print('Min length (word) is: {}'.format(seq_len_min))
print('Max length (word) is: {}'.format(seq_len_max))

In [None]:
pd.Series(seq_len).hist(bins = 30)

In [None]:
results = group_texts(manifesto, 
                      ['countryname','election','party','cmp_code'], 'text', 
                      max_group_factor = 5)

In [None]:
manifesto_regrouped = pd.DataFrame(results)
manifesto_regrouped = manifesto_regrouped.explode('text').reset_index(drop=True)

In [None]:
df_cols = manifesto_regrouped['labels'].str.split(';', expand=True)
manifesto_regrouped = pd.concat([manifesto_regrouped, df_cols], axis=1)


In [None]:
manifesto_regrouped.columns = ['text', 'idx', 'country','election', 'party', 'cmp_code']

In [None]:
manifesto_regrouped.head()

In [None]:
manifesto_regrouped.loc[:,'sentiment'] = manifesto_regrouped['cmp_code'].apply(sentiment_code)
manifesto_regrouped.loc[:,'topic'] = manifesto_regrouped['cmp_code'].apply(topic_code)
manifesto_regrouped = manifesto_regrouped.drop_duplicates().reset_index(drop=True)


In [None]:
manifesto_regrouped.groupby(['topic','sentiment']).count()

In [None]:
texts = manifesto_regrouped['text'].tolist()
from statistics import stdev, mean
## After regrouping: whitespace-token counts of the texts in manifesto_regrouped
## (same summary as the "Before" cell computed on the original manifesto texts)
seq_len = [len(i.split()) for i in texts]
seq_len_mean = mean(seq_len)
seq_len_std = stdev(seq_len)
seq_len_max = max(seq_len)
seq_len_min = min(seq_len)
print('Mean length (word) is: {}'.format(seq_len_mean))
print('Std length (word) is: {}'.format(seq_len_std))
print('Min length (word) is: {}'.format(seq_len_min))
print('Max length (word) is: {}'.format(seq_len_max))

In [None]:
pd.Series(seq_len).hist(bins = 30)

In [None]:
manifesto_regrouped.to_csv('data/temps/manifesto_regrouped.csv', encoding='utf-8', index=False)
manifesto.to_csv('data/temps/manifesto.csv', encoding='utf-8', index=False)

## Preparing dataloaders 

In [4]:
# Reload the intermediate tables written at the end of the data-preparation section.
# Columns 2 and 18 of the sentence-level table are read as strings, as in the raw-data read.
manifesto = pd.read_csv('data/temps/manifesto.csv', encoding='utf-8', dtype={2:'str',18: 'str'})
# cmp_code holds category codes such as '000' or '305.1'. Without an explicit dtype,
# pandas re-parses them as numbers and the original codes are lost in every table
# later exported from manifesto_regrouped.
manifesto_regrouped = pd.read_csv('data/temps/manifesto_regrouped.csv', encoding='utf-8', dtype={'cmp_code': 'str'})

In [5]:
manifesto_reduced = manifesto_regrouped[['topic','sentiment','text']].reset_index(drop=True)

In [6]:
model_name = 'xlm-roberta-base' 
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)

In [7]:
manifesto_reduced['topic_sentiment'] = manifesto_reduced['topic'] + '_' + manifesto_reduced['sentiment']

In [8]:
manifesto_dataset = Dataset.from_pandas(manifesto_reduced)
manifesto_dataset = manifesto_dataset.class_encode_column('topic')
manifesto_dataset = manifesto_dataset.class_encode_column('sentiment')
manifesto_dataset = manifesto_dataset.class_encode_column('topic_sentiment')



Casting to class labels:   0%|          | 0/337412 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/337412 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/337412 [00:00<?, ? examples/s]

In [9]:
## Save class labels
import pickle

# Class names in the order of the integer ids assigned by class_encode_column.
topic_labels = manifesto_dataset.features['topic'].names
sentiment_labels = manifesto_dataset.features['sentiment'].names

# Write each label list to its own pickle file.
for file_path, label_names in (
    ('data/temps/topic_labels', topic_labels),
    ('data/temps/sentiment_labels', sentiment_labels),
):
    with open(file_path, 'wb') as fp:
        pickle.dump(label_names, fp)

In [10]:
train_test = manifesto_dataset.train_test_split(test_size=0.1, stratify_by_column='topic_sentiment', seed=seed_val)
train_eval = train_test['train'].train_test_split(test_size=0.3, stratify_by_column='topic_sentiment', seed=seed_val )

In [11]:
manifesto_datasets = DatasetDict({
    'train': train_eval['train'],
    'test': train_test['test'],
    'eval': train_eval['test']
})
manifesto_datasets

DatasetDict({
    train: Dataset({
        features: ['topic', 'sentiment', 'text', 'topic_sentiment'],
        num_rows: 212569
    })
    test: Dataset({
        features: ['topic', 'sentiment', 'text', 'topic_sentiment'],
        num_rows: 33742
    })
    eval: Dataset({
        features: ['topic', 'sentiment', 'text', 'topic_sentiment'],
        num_rows: 91101
    })
})

In [12]:
tokenized_datasets = manifesto_datasets.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text', 'topic_sentiment'])
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

Map:   0%|          | 0/212569 [00:00<?, ? examples/s]

Map:   0%|          | 0/33742 [00:00<?, ? examples/s]

Map:   0%|          | 0/91101 [00:00<?, ? examples/s]

['topic', 'sentiment', 'input_ids', 'attention_mask']

In [13]:
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=16, shuffle=True, collate_fn = data_collator)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16, shuffle=False, collate_fn = data_collator)
eval_dataloader = DataLoader(tokenized_datasets['eval'], batch_size=16, shuffle=False, collate_fn = data_collator)

## Deep Ensemble Inference


### Configuration 

In [None]:
# Build the base architecture that the ensemble checkpoints are loaded into.
# Head sizes are read from the ClassLabel features created by class_encode_column,
# which avoids materialising the full label columns just to count distinct values.
num_topics = manifesto_dataset.features['topic'].num_classes
num_sentiments = manifesto_dataset.features['sentiment'].num_classes
model_base = ContextScalePrediction(
        roberta_model=model_name, 
        num_topics=num_topics, 
        num_sentiments=num_sentiments,
        lora=False,
        use_shared_attention=True  # Using shared attention architecture
    )

print("Model defined")

### Ensemble inference on the test set

Training is done with the `train.py` script. Here we only run ensemble inference on the test set.

In [None]:
# Generate checkpoint paths for the ensemble models trained with different splits
checkpoint_paths_splits = [
    os.path.join('results/models/ensemble', f"model_ensemble_{i}.safetensors")
    for i in range(5)
]

print("Checkpoint paths for split-based ensemble:")
for i, path in enumerate(checkpoint_paths_splits):
    print(f"  Model {i}: {path}")

# Load the ensemble models
ensemble_models = load_ensemble_models(
    model_base=model_base,
    checkpoint_paths=checkpoint_paths_splits,
    device=device
)

In [None]:
# Perform ensemble inference with uncertainty estimation
print("Performing ensemble inference with uncertainty estimation...")
print("The ensemble will compute:")
print("  - Mean position scores across all 5 models")
print("  - Position score variance for each text sequence")
print("  - Epistemic uncertainty (model disagreement)")
print("  - Aleatoric uncertainty (inherent data uncertainty)")

ensemble_results = ensemble_inference(
    models=ensemble_models,
    dataloader=test_dataloader,
    device=device,
    beta=1.0,
    topic_label='topic',
    sentiment_label='sentiment',
    use_ground_truth_topic=True
)

print(f"\nEnsemble inference completed!")
print(f"Final position scores are the mean of {len(ensemble_models)} models")
print(f"Position score variance included for each sequence")

save_ensemble_results(
    ensemble_results,
    'results/datasets/ensemble_results_test.pkl'
)

### Merge back to original test dataset

In [None]:
## Load ensemble results with pickle
file_path = 'results/datasets/ensemble_results_test.pkl'
with open(file_path, "rb") as file:
    ensemble_results = pickle.load(file)

In [None]:
ensemble_summary_df = create_ensemble_summary_dataframe(ensemble_results)

In [None]:
## Merge ensemble_summary_df with original test dataset for analysis
test_dataset = manifesto_datasets['test'].to_pandas().reset_index(drop=True)
# Rows are matched by position (test_dataloader was built with shuffle=False).
# pd.concat(axis=1) aligns on the index, so a length mismatch would otherwise
# produce NaN-padded rows without any error.
assert len(test_dataset) == len(ensemble_summary_df), (
    f"Row count mismatch: {len(test_dataset)} test rows vs "
    f"{len(ensemble_summary_df)} ensemble result rows"
)
merged_df = pd.concat([test_dataset, ensemble_summary_df.reset_index(drop=True)], axis=1)

In [None]:
## Save to csv
merged_df.to_csv('results/datasets/ensemble_test_dataset.csv', index=False)

### Inference on the entire dataset 


In [None]:
# Generate checkpoint paths for the ensemble models trained with different splits
checkpoint_paths_splits = [
    os.path.join('results/models/ensemble_scaling', f"model_ensemble_{i}.safetensors")
    for i in range(5)
]

print("Checkpoint paths for split-based ensemble:")
for i, path in enumerate(checkpoint_paths_splits):
    print(f"  Model {i}: {path}")

# Load the ensemble models
ensemble_models = load_ensemble_models(
    model_base=model_base,
    checkpoint_paths=checkpoint_paths_splits,
    device=device
)

In [None]:
full_dataset = manifesto_dataset.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text'])
full_dataset.set_format("torch")
full_dataloader = DataLoader(full_dataset, batch_size=64, shuffle=False, collate_fn=data_collator)

full_ensemble_results = ensemble_inference(
    models=ensemble_models,
    dataloader=full_dataloader,
    device=device,
    beta=1.0,
    topic_label='topic',
    sentiment_label='sentiment',
    use_ground_truth_topic=True
)

save_ensemble_results(
    full_ensemble_results,
    'results/datasets/ensemble_results_full.pkl'
)



In [16]:
## Load ensemble results with pickle
# NOTE(review): the inference cell above saves to 'results/datasets/ensemble_results_full.pkl',
# while this cell reads from 'data/py_outputs/'. Confirm which location is authoritative,
# otherwise a fresh Restart & Run All will not load the results it has just produced.
# pickle.load can execute arbitrary code: only open pickles produced by this project.
file_path = 'data/py_outputs/ensemble_results_full.pkl'
with open(file_path, "rb") as file:
    full_ensemble_results = pickle.load(file)

In [17]:
ensemble_summary_df = create_ensemble_summary_dataframe(full_ensemble_results)

In [20]:
## Merge ensemble_summary_df with the full regrouped dataset for analysis
# Rows are matched by position: the full dataloader iterates manifesto_dataset
# (built from manifesto_regrouped) with shuffle=False.
# NOTE: `manifesto_regrouped` is re-assigned in the "Validity checks" section below,
# so this cell has to run before that section.
full_text_df = manifesto_regrouped.reset_index(drop=True)
# pd.concat(axis=1) aligns on the index; fail loudly instead of NaN-padding on a mismatch.
assert len(full_text_df) == len(ensemble_summary_df), (
    f"Row count mismatch: {len(full_text_df)} text rows vs "
    f"{len(ensemble_summary_df)} ensemble result rows"
)
merged_df = pd.concat([full_text_df, ensemble_summary_df.reset_index(drop=True)], axis=1)

In [None]:
## Save to csv
merged_df.to_csv('results/datasets/ensemble_full_dataset.csv', index=False)

## Ablation: Model architecture comparison

### Base Model

In [None]:
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False).to(device)



In [None]:
# Optimisation setup for the base model: AdamW with a linear warm-up/decay schedule.
n_epochs = 5
total_steps = len(train_dataloader) * n_epochs
# Warm up over the first 10% of the steps. num_warmup_steps is documented as an int,
# so truncate instead of passing the float that total_steps*0.1 produced.
warmup = int(total_steps * 0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
# One cross-entropy loss per prediction head.
criterion_sent = nn.CrossEntropyLoss()
criterion_topic = nn.CrossEntropyLoss()


In [None]:
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    timing_log = train_loop(train_dataloader, model,optimizer, scheduler, device, criterion_sent, criterion_topic, sentiment_var='sentiment',
               topic_var='topic', timing_log=True)
    eval_loop(eval_dataloader, model, device, criterion_sent, criterion_topic, sentiment_var='sentiment', topic_var='topic')

    

In [None]:
# Persist the trained base-model weights. Make sure the target folder exists
# first so a fresh checkout does not fail on the write after training finishes.
checkpoint_file = 'results/models/manifesto_ContextScalePrediction_base/model.safetensors'
os.makedirs(os.path.dirname(checkpoint_file), exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, checkpoint_file)

In [None]:
outputs_base = scale_func(test_dataloader, 
               model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_base['res_table_topic']

In [None]:
outputs_base['res_table_topic']['f1'].mean().round(2)

In [None]:
outputs_base['res_table_sentiment']

In [None]:
outputs_base['res_table_sentiment']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_base'
with open(file_path, "wb") as file:
    pickle.dump(outputs_base, file)

In [None]:
outputs_base['res_table_sentiment'].to_csv('results/classification results/base_sentiment.csv', index=False)
outputs_base['res_table_topic'].to_csv('results/classification results/base_topic.csv', index=False)


### Model with simple flow of information

In [None]:
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False,
                               use_simple_flow=True).to(device)



In [None]:
# Optimisation setup for the simple-flow model: AdamW with a linear warm-up/decay schedule.
n_epochs = 5
total_steps = len(train_dataloader) * n_epochs
# Warm up over the first 10% of the steps. num_warmup_steps is documented as an int,
# so truncate instead of passing the float that total_steps*0.1 produced.
warmup = int(total_steps * 0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
# One cross-entropy loss per prediction head.
criterion_sent = nn.CrossEntropyLoss()
criterion_topic = nn.CrossEntropyLoss()


In [None]:
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    timing_log = train_loop(train_dataloader, model,optimizer, scheduler, device, criterion_sent, criterion_topic, sentiment_var='sentiment',
               topic_var='topic', timing_log=True)
    eval_loop(eval_dataloader, model, device, criterion_sent, criterion_topic, sentiment_var='sentiment', topic_var='topic')

    

In [None]:
# Persist the trained simple-flow weights. Make sure the target folder exists
# first so a fresh checkout does not fail on the write after training finishes.
checkpoint_file = 'results/models/manifesto_ContextScalePrediction_sf/model.safetensors'
os.makedirs(os.path.dirname(checkpoint_file), exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, checkpoint_file)

In [None]:
outputs_sf = scale_func(test_dataloader, 
               model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_sf['res_table_topic']

In [None]:
outputs_sf['res_table_topic']['f1'].mean().round(2)

In [None]:
outputs_sf['res_table_sentiment']

In [None]:
outputs_sf['res_table_sentiment']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_sf'
with open(file_path, "wb") as file:
    pickle.dump(outputs_sf, file)

In [None]:
outputs_sf['res_table_sentiment'].to_csv('results/classification results/sf_sentiment.csv', index=False)
outputs_sf['res_table_topic'].to_csv('results/classification results/sf_topic.csv', index=False)


### Model with shared attention

In [None]:
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False,
                               use_shared_attention=True).to(device)



In [None]:
# Optimisation setup for the shared-attention model: AdamW with a linear warm-up/decay schedule.
n_epochs = 5
total_steps = len(train_dataloader) * n_epochs
# Warm up over the first 10% of the steps. num_warmup_steps is documented as an int,
# so truncate instead of passing the float that total_steps*0.1 produced.
warmup = int(total_steps * 0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
# One cross-entropy loss per prediction head.
criterion_sent = nn.CrossEntropyLoss()
criterion_topic = nn.CrossEntropyLoss()


In [None]:
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    timing_log = train_loop(train_dataloader, model,optimizer, scheduler, device, criterion_sent, criterion_topic, sentiment_var='sentiment',
               topic_var='topic', timing_log=True)
    eval_loop(eval_dataloader, model, device, criterion_sent, criterion_topic, sentiment_var='sentiment', topic_var='topic')

    

In [None]:
# Persist the trained shared-attention weights. Make sure the target folder exists
# first so a fresh checkout does not fail on the write after training finishes.
checkpoint_file = 'results/models/manifesto_ContextScalePrediction_sa/model.safetensors'
os.makedirs(os.path.dirname(checkpoint_file), exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, checkpoint_file)

In [None]:
outputs_sa = scale_func(test_dataloader, 
               model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_sa['res_table_topic']

In [None]:
outputs_sa['res_table_topic']['f1'].mean().round(2)

In [None]:
outputs_sa['res_table_sentiment']

In [None]:
outputs_sa['res_table_sentiment']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_sa'
with open(file_path, "wb") as file:
    pickle.dump(outputs_sa, file)

In [None]:
outputs_sa['res_table_sentiment'].to_csv('results/classification results/sa_sentiment.csv', index=False)
outputs_sa['res_table_topic'].to_csv('results/classification results/sa_topic.csv', index=False)


### Model with dynamic gating

In [None]:
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False,
                               use_dynamic_gating=True).to(device)



In [None]:
# Optimisation setup for the dynamic-gating model: AdamW with a linear warm-up/decay schedule.
n_epochs = 5
total_steps = len(train_dataloader) * n_epochs
# Warm up over the first 10% of the steps. num_warmup_steps is documented as an int,
# so truncate instead of passing the float that total_steps*0.1 produced.
warmup = int(total_steps * 0.1)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
# One cross-entropy loss per prediction head.
criterion_sent = nn.CrossEntropyLoss()
criterion_topic = nn.CrossEntropyLoss()


In [None]:
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    timing_log = train_loop(train_dataloader, model,optimizer, scheduler, device, criterion_sent, criterion_topic, sentiment_var='sentiment',
               topic_var='topic', timing_log=True)
    eval_loop(eval_dataloader, model, device, criterion_sent, criterion_topic, sentiment_var='sentiment', topic_var='topic')

    

In [None]:
# Persist the trained dynamic-gating weights. Make sure the target folder exists
# first so a fresh checkout does not fail on the write after training finishes.
checkpoint_file = 'results/models/manifesto_ContextScalePrediction_dg/model.safetensors'
os.makedirs(os.path.dirname(checkpoint_file), exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, checkpoint_file)

In [None]:
outputs_dg = scale_func(test_dataloader, 
               model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_dg['res_table_topic']

In [None]:
outputs_dg['res_table_topic']['f1'].mean().round(2)

In [None]:
outputs_dg['res_table_sentiment']

In [None]:
outputs_dg['res_table_sentiment']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_dg'
with open(file_path, "wb") as file:
    pickle.dump(outputs_dg, file)

In [None]:
outputs_dg['res_table_sentiment'].to_csv('results/classification results/dg_sentiment.csv', index=False)
outputs_dg['res_table_topic'].to_csv('results/classification results/dg_topic.csv', index=False)


# Validity checks

### Different languages

#### Testing on test languages unseen during training

In [4]:
manifesto_test = pd.read_csv(os.path.join("data", "r_outputs","pulled_manifestoes_test.csv"), encoding="utf-8")

In [5]:
manifesto_test.head()

Unnamed: 0,text,cmp_code,eu_code,pos,manifesto_id,party,date,language,annotations,translation_en,country,party_code,countryname,abbrev,name,edate,parfam
0,安倍政権の暴走ストップ！,H,,1,71220_201412,71220,201412,japanese,True,False,71,71220,Japan,JCP,Nihon Kyōsan-tō,14/12/2014,20.0
1,国民の声が生きる新しい政治を,H,,2,71220_201412,71220,201412,japanese,True,False,71,71220,Japan,JCP,Nihon Kyōsan-tō,14/12/2014,20.0
2,日本共産党の総選挙政策,H,,3,71220_201412,71220,201412,japanese,True,False,71,71220,Japan,JCP,Nihon Kyōsan-tō,14/12/2014,20.0
3,日本共産党,H,,4,71220_201412,71220,201412,japanese,True,False,71,71220,Japan,JCP,Nihon Kyōsan-tō,14/12/2014,20.0
4,安倍政権の暴走ストップ、政治を変えるチャンスです……,305.1,,5,71220_201412,71220,201412,japanese,True,False,71,71220,Japan,JCP,Nihon Kyōsan-tō,14/12/2014,20.0


In [6]:
manifesto_test = manifesto_test[(manifesto_test.cmp_code.notna()) & ~(manifesto_test.cmp_code == 'H')].reset_index(drop=True)

In [7]:
manifesto_test['sentiment'] = manifesto_test['cmp_code'].apply(sentiment_code)
manifesto_test['topic'] = manifesto_test['cmp_code'].apply(topic_code)
manifesto_test['election'] = manifesto_test['date'].astype(str).str[:4]

In [8]:
results = group_texts(manifesto_test, 
                      ['countryname','election','party','cmp_code'], 'text', 
                      max_group_factor = 5)

In [9]:
manifesto_regrouped = pd.DataFrame(results)
manifesto_regrouped = manifesto_regrouped.explode('text').reset_index(drop=True)

In [10]:
df_cols = manifesto_regrouped['labels'].str.split(';', expand=True)
manifesto_regrouped = pd.concat([manifesto_regrouped, df_cols], axis=1)


In [11]:
manifesto_regrouped.columns = ['text', 'country_election_party_code', 'country','election', 'party', 'cmp_code']

In [12]:
manifesto_regrouped.head()

Unnamed: 0,text,country_election_party_code,country,election,party,cmp_code
0,Za základ považujeme povinnou školní docházku....,Czech Republic;2002;82320;000,Czech Republic,2002,82320,0
1,Proto chceme modernizovat armádu - Profesional...,Czech Republic;2002;82320;104,Czech Republic,2002,82320,104
2,Chceme prosadit zrušení základní vojenské služ...,Czech Republic;2002;82320;105,Czech Republic,2002,82320,105
3,"- Prosazujeme vstup do EU tak, aby se ČR stala...",Czech Republic;2002;82320;108,Czech Republic,2002,82320,108
4,"- Těm, kdo chtějí zkusit štěstí v zahraničí, o...",Czech Republic;2002;82320;108,Czech Republic,2002,82320,108


In [13]:
manifesto_regrouped.loc[:,'sentiment'] = manifesto_regrouped['cmp_code'].apply(sentiment_code)
manifesto_regrouped.loc[:,'topic'] = manifesto_regrouped['cmp_code'].apply(topic_code)
manifesto_regrouped = manifesto_regrouped.drop_duplicates().reset_index(drop=True)


In [14]:
manifesto_regrouped.groupby(['topic','sentiment']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,country_election_party_code,country,election,party,cmp_code
topic,sentiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Agriculture - Protectionism,left,1341,1341,1341,1341,1341,1341
Agriculture - Protectionism,right,123,123,123,123,123,123
Economics,left,2070,2070,2070,2070,2070,2070
Economics,neutral,3703,3703,3703,3703,3703,3703
Economics,right,2421,2421,2421,2421,2421,2421
Education,left,1534,1534,1534,1534,1534,1534
Education,right,4,4,4,4,4,4
Environment - Growth,left,1241,1241,1241,1241,1241,1241
Environment - Growth,neutral,155,155,155,155,155,155
Environment - Growth,right,1292,1292,1292,1292,1292,1292


In [15]:
manifesto_reduced = manifesto_regrouped[['topic','sentiment','text']].reset_index(drop=True)

In [16]:
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)

In [17]:
# Encode the labels with the class vocabularies saved in the "Preparing dataloaders"
# section rather than re-deriving them from this evaluation set. class_encode_column
# assigns ids from the sorted labels present in *this* data, so a class missing here
# would shift every later id and silently misalign ground truth with the predictions.
# Assumes the ensemble checkpoints were trained with that saved encoding — TODO confirm against train.py.
with open('data/temps/topic_labels', 'rb') as fp:
    topic_labels = pickle.load(fp)
with open('data/temps/sentiment_labels', 'rb') as fp:
    sentiment_labels = pickle.load(fp)

manifesto_dataset = Dataset.from_pandas(manifesto_reduced)
# Casting raises if the data contains a label that is not in the saved vocabulary.
manifesto_dataset = manifesto_dataset.cast_column('topic', ClassLabel(names=topic_labels))
manifesto_dataset = manifesto_dataset.cast_column('sentiment', ClassLabel(names=sentiment_labels))



Casting to class labels:   0%|          | 0/32343 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/32343 [00:00<?, ? examples/s]

In [18]:
tokenized_dataset = manifesto_dataset.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text'])

Map:   0%|          | 0/32343 [00:00<?, ? examples/s]

In [19]:
pred_dataloader = DataLoader(tokenized_dataset, batch_size=64, shuffle=False, collate_fn = data_collator)

In [20]:
## Load ensemble models for validity checks
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))

model_base = ContextScalePrediction(
        roberta_model=model_name,
        num_topics=num_topics,
        num_sentiments=num_sentiments,
        lora=False,
        use_shared_attention=True
    )

ensemble_checkpoint_paths = [
    os.path.join('results/models/ensemble_scaling', f"model_ensemble_{i}.safetensors")
    for i in range(5)
]

validity_ensemble_models = load_ensemble_models(
    model_base=model_base,
    checkpoint_paths=ensemble_checkpoint_paths,
    device=device
)

Loading ensemble member 0 from results/models/ensemble_scaling/model_ensemble_0.safetensors
Loading ensemble member 1 from results/models/ensemble_scaling/model_ensemble_1.safetensors
Loading ensemble member 2 from results/models/ensemble_scaling/model_ensemble_2.safetensors
Loading ensemble member 3 from results/models/ensemble_scaling/model_ensemble_3.safetensors
Loading ensemble member 4 from results/models/ensemble_scaling/model_ensemble_4.safetensors


In [21]:
def per_class_metrics(y_true, y_pred):
    """Build a per-class classification report.

    Args:
        y_true: ground-truth class ids (any array-like; flattened to 1-D).
        y_pred: predicted class ids, one per element of ``y_true``.

    Returns:
        A ``(table, matrix)`` tuple. ``table`` is a DataFrame with one row per
        class and the columns ``f1``, ``precision``, ``recall`` and ``accuracy``,
        all rounded to two decimals. ``matrix`` is the confusion matrix.
    """
    y_true = np.asarray(y_true).ravel()
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
    matrix = confusion_matrix(y_true, y_pred)
    # Per-class "accuracy" is the row-normalised diagonal, i.e. the share of each
    # true class predicted correctly (the same quantity as recall). A class that
    # occurs only among the predictions has an empty row (0/0): keep the NaN,
    # but silence the NumPy warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        accuracy = matrix.diagonal() / matrix.sum(axis=1)
    table = pd.DataFrame({
        'f1': np.round(f1, 2),
        'precision': np.round(precision, 2),
        'recall': np.round(recall, 2),
        'accuracy': np.round(accuracy, 2)
    })
    return table, matrix


# Ensemble predictions on the unseen-language data. Topics are predicted by the
# models here (use_ground_truth_topic=False) rather than taken from the labels.
outputs_dl = ensemble_inference(
    models=validity_ensemble_models,
    dataloader=pred_dataloader,
    device=device,
    beta=1.0,
    topic_label='topic',
    sentiment_label='sentiment',
    use_ground_truth_topic=False
)

# Attach per-class result tables when ground truth is available, otherwise None.
true_topics = outputs_dl.get('ground_truth_topics')
if true_topics is not None:
    outputs_dl['res_table_topic'], matrix_topic = per_class_metrics(
        true_topics, outputs_dl['ensemble_pred_topics']
    )
else:
    outputs_dl['res_table_topic'] = None

true_sentiments = outputs_dl.get('ground_truth_sentiments')
if true_sentiments is not None:
    outputs_dl['res_table_sentiment'], matrix_sentiment = per_class_metrics(
        true_sentiments, outputs_dl['ensemble_pred_sentiments']
    )
else:
    outputs_dl['res_table_sentiment'] = None

Running ensemble inference with 5 models...

Running inference with model 1/5
  Batch 100/506 | Elapsed: 5.16s, Remaining: 20.96s
  Batch 200/506 | Elapsed: 9.90s, Remaining: 15.15s
  Batch 300/506 | Elapsed: 14.69s, Remaining: 10.08s
  Batch 400/506 | Elapsed: 19.38s, Remaining: 5.14s
  Batch 500/506 | Elapsed: 24.90s, Remaining: 0.30s
  Using ground truth topic labels for position score computation

Running inference with model 2/5
  Batch 100/506 | Elapsed: 5.09s, Remaining: 20.68s
  Batch 200/506 | Elapsed: 9.97s, Remaining: 15.25s
  Batch 300/506 | Elapsed: 14.89s, Remaining: 10.22s
  Batch 400/506 | Elapsed: 19.71s, Remaining: 5.22s
  Batch 500/506 | Elapsed: 25.38s, Remaining: 0.30s

Running inference with model 3/5
  Batch 100/506 | Elapsed: 5.23s, Remaining: 21.21s
  Batch 200/506 | Elapsed: 10.22s, Remaining: 15.63s
  Batch 300/506 | Elapsed: 15.25s, Remaining: 10.47s
  Batch 400/506 | Elapsed: 20.18s, Remaining: 5.35s
  Batch 500/506 | Elapsed: 25.96s, Remaining: 0.31s

Runn

In [22]:
outputs_dl['res_table_topic'].mean().round(2)

f1           0.76
precision    0.75
recall       0.78
accuracy     0.78
dtype: float64

In [23]:
outputs_dl['res_table_sentiment'].mean().round(2)

f1           0.76
precision    0.76
recall       0.76
accuracy     0.76
dtype: float64

In [24]:
file_path = 'data/temps/outputs_dl'
with open(file_path, "wb") as file:
    pickle.dump(outputs_dl, file)

In [25]:
outputs_dl['res_table_sentiment'].to_csv('results/classification results/dl_sentiment.csv', index=False)
outputs_dl['res_table_topic'].to_csv('results/classification results/dl_topic.csv', index=False)


#### Train a model using only 10% of labelled data

In [26]:
manifesto_reduced['topic_sentiment'] = manifesto_reduced['topic'] + '_' + manifesto_reduced['sentiment']

In [27]:
# The models fine-tuned below start from a manifesto checkpoint, so topic and
# sentiment ids must follow the class vocabularies saved in the "Preparing dataloaders"
# section. class_encode_column would re-derive the ids from this subset, and a class
# missing here would shift every later id.
# Assumes the source checkpoint was trained with that saved encoding — TODO confirm against train.py.
with open('data/temps/topic_labels', 'rb') as fp:
    topic_labels = pickle.load(fp)
with open('data/temps/sentiment_labels', 'rb') as fp:
    sentiment_labels = pickle.load(fp)

manifesto_dataset = Dataset.from_pandas(manifesto_reduced)
# Casting raises if the data contains a label that is not in the saved vocabulary.
manifesto_dataset = manifesto_dataset.cast_column('topic', ClassLabel(names=topic_labels))
manifesto_dataset = manifesto_dataset.cast_column('sentiment', ClassLabel(names=sentiment_labels))
# topic_sentiment is only used to stratify the split and is dropped before
# training, so its ids can be derived from this data.
manifesto_dataset = manifesto_dataset.class_encode_column('topic_sentiment')



Casting to class labels:   0%|          | 0/32343 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/32343 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/32343 [00:00<?, ? examples/s]

In [28]:
train_test = manifesto_dataset.train_test_split(test_size=0.9, stratify_by_column='topic_sentiment', seed=seed_val)

In [29]:
manifesto_datasets = DatasetDict({
    'train': train_test['train'],
    'test': train_test['test'],
})
manifesto_datasets

DatasetDict({
    train: Dataset({
        features: ['topic', 'sentiment', 'text', 'topic_sentiment'],
        num_rows: 3234
    })
    test: Dataset({
        features: ['topic', 'sentiment', 'text', 'topic_sentiment'],
        num_rows: 29109
    })
})

In [30]:
tokenized_datasets = manifesto_datasets.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text', 'topic_sentiment'])
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

Map:   0%|          | 0/3234 [00:00<?, ? examples/s]

Map:   0%|          | 0/29109 [00:00<?, ? examples/s]

['topic', 'sentiment', 'input_ids', 'attention_mask']

In [31]:
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=64, shuffle=True, collate_fn = data_collator)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=64, shuffle=False, collate_fn = data_collator)

In [32]:
## Build the base model for the 10% labelled subset, initialised from a manifesto ensemble member
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))

# A single-argument os.path.join() returns its argument unchanged, so the path is used directly.
checkpoint_path = 'results/models/ensemble_scaling/model_ensemble_1.safetensors'

print("Manifesto ensemble members used for 10% supervision adaptation:")
print(f"  {checkpoint_path}")


def copy_source_model(checkpoint_path):
    """Build a ContextScalePrediction model and load a safetensors checkpoint into it.

    The head sizes come from the module-level ``num_topics`` and
    ``num_sentiments`` (read when the function is called), the backbone from
    ``model_name``, and the model is placed on ``device``.

    Parameters
    ----------
    checkpoint_path : str
        Path of the ``.safetensors`` file holding the state dict to load.

    Returns
    -------
    ContextScalePrediction
        The model with the checkpoint weights loaded.
    """
    source_model = ContextScalePrediction(
        roberta_model=model_name,
        num_topics=num_topics,
        num_sentiments=num_sentiments,
        lora=False,
        use_shared_attention=True
    ).to(device)
    # load_state_dict() is called with its defaults, so the checkpoint keys
    # have to match the freshly built model.
    source_state = load_file(checkpoint_path)
    source_model.load_state_dict(source_state)

    return source_model


model_base = copy_source_model(checkpoint_path=checkpoint_path)

Manifesto ensemble members used for 10% supervision adaptation:


In [33]:

# Fine-tune a 5-member ensemble on the 10% manifesto training split.
# No eval dataloader is passed, so evaluation is skipped during training
# (see the "Skipping evaluation" lines in the log below).
ensemble_training_info_dl_10 = train_deep_ensemble(
    model_base=model_base,
    device=device,
    train_dataloader=train_dataloader,
    eval_dataloader=None,
    topic_var='topic',
    sentiment_var='sentiment',
    num_models=5,
    n_epochs=5,
    lr=2e-5,
    org_seed=seed_val,
    save_dir='results/models/manifesto_ensemble_dl_10',
    model_prefix='model_ensemble_dl_10',
)


Training ensemble member 1/5
Using shuffled training data with seed 1234
Training ensemble member 1
Using shuffled data with seed 1234
Epoch: 1/5

Training...

Training epoch took: 4.14s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 2/5

Training...

Training epoch took: 3.94s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 3/5

Training...

Training epoch took: 3.89s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 4/5

Training...

Training epoch took: 3.96s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 5/5

Training...

Training epoch took: 3.98s
  Skipping evaluation (no eval_dataloader provided)
Saved model checkpoint to results/models/manifesto_ensemble_dl_10/model_ensemble_dl_10_0.safetensors

Training ensemble member 2/5
Using shuffled training data with seed 1235
Training ensemble member 2
Using shuffled data with seed 1235
Epoch: 1/5

Training...

Training epoch took: 3.96s
  Skipping evaluation (no eval_dataloader provided)
Epo

In [34]:
# Paths of the five checkpoints written by the training cell (indices 0-4).
checkpoint_paths_dl_10 = [
    os.path.join(
        'results/models/manifesto_ensemble_dl_10',
        f"model_ensemble_dl_10_{i}.safetensors",
    )
    for i in range(5)
]

print("Checkpoint paths for 10% ensemble:")
for idx, path in enumerate(checkpoint_paths_dl_10):
    print(f"  Model {idx}: {path}")

# Load every checkpoint into its own ensemble member.
manifesto_ensemble_dl_10_models = load_ensemble_models(
    checkpoint_paths=checkpoint_paths_dl_10,
    model_base=model_base,
    device=device,
)

Checkpoint paths for 10% ensemble:
  Model 0: results/models/manifesto_ensemble_dl_10/model_ensemble_dl_10_0.safetensors
  Model 1: results/models/manifesto_ensemble_dl_10/model_ensemble_dl_10_1.safetensors
  Model 2: results/models/manifesto_ensemble_dl_10/model_ensemble_dl_10_2.safetensors
  Model 3: results/models/manifesto_ensemble_dl_10/model_ensemble_dl_10_3.safetensors
  Model 4: results/models/manifesto_ensemble_dl_10/model_ensemble_dl_10_4.safetensors
Loading ensemble member 0 from results/models/manifesto_ensemble_dl_10/model_ensemble_dl_10_0.safetensors
Loading ensemble member 1 from results/models/manifesto_ensemble_dl_10/model_ensemble_dl_10_1.safetensors
Loading ensemble member 2 from results/models/manifesto_ensemble_dl_10/model_ensemble_dl_10_2.safetensors
Loading ensemble member 3 from results/models/manifesto_ensemble_dl_10/model_ensemble_dl_10_3.safetensors
Loading ensemble member 4 from results/models/manifesto_ensemble_dl_10/model_ensemble_dl_10_4.safetensors


In [35]:
# Ensemble predictions on the held-out manifesto test split.
outputs_dl_10 = ensemble_inference(
    models=manifesto_ensemble_dl_10_models,
    dataloader=test_dataloader,
    device=device,
    beta=1.0,
    topic_label='topic',
    sentiment_label='sentiment',
    use_ground_truth_topic=False,
)

# --- Per-class topic metrics (only when ground-truth topics are available) ---
true_topics_dl_10 = outputs_dl_10.get('ground_truth_topics')
if true_topics_dl_10 is None:
    outputs_dl_10['res_table_topic'] = None
else:
    true_topics_dl_10 = np.asarray(true_topics_dl_10).ravel()
    topic_precision_dl_10, topic_recall_dl_10, topic_f1_dl_10, _ = precision_recall_fscore_support(
        true_topics_dl_10, outputs_dl_10['ensemble_pred_topics'], average=None
    )
    matrix_topic_dl_10 = confusion_matrix(true_topics_dl_10, outputs_dl_10['ensemble_pred_topics'])
    # NOTE(review): diagonal / row sum is TP / (TP + FN), i.e. per-class recall,
    # so the 'accuracy' column duplicates 'recall' -- confirm this is intended.
    accuracy_topic_dl_10 = matrix_topic_dl_10.diagonal() / matrix_topic_dl_10.sum(axis=1)
    outputs_dl_10['res_table_topic'] = pd.DataFrame({
        metric: np.round(values, 2)
        for metric, values in (
            ('f1', topic_f1_dl_10),
            ('precision', topic_precision_dl_10),
            ('recall', topic_recall_dl_10),
            ('accuracy', accuracy_topic_dl_10),
        )
    })

# --- Per-class sentiment metrics (same layout as the topic table) ---
true_sentiments_dl_10 = outputs_dl_10.get('ground_truth_sentiments')
if true_sentiments_dl_10 is None:
    outputs_dl_10['res_table_sentiment'] = None
else:
    true_sentiments_dl_10 = np.asarray(true_sentiments_dl_10).ravel()
    sent_precision_dl_10, sent_recall_dl_10, sent_f1_dl_10, _ = precision_recall_fscore_support(
        true_sentiments_dl_10, outputs_dl_10['ensemble_pred_sentiments'], average=None
    )
    matrix_sentiment_dl_10 = confusion_matrix(true_sentiments_dl_10, outputs_dl_10['ensemble_pred_sentiments'])
    accuracy_sentiment_dl_10 = matrix_sentiment_dl_10.diagonal() / matrix_sentiment_dl_10.sum(axis=1)
    outputs_dl_10['res_table_sentiment'] = pd.DataFrame({
        metric: np.round(values, 2)
        for metric, values in (
            ('f1', sent_f1_dl_10),
            ('precision', sent_precision_dl_10),
            ('recall', sent_recall_dl_10),
            ('accuracy', accuracy_sentiment_dl_10),
        )
    })

# Expose the ensemble results under the shorter key names as well.
outputs_dl_10['position_scores'] = outputs_dl_10['mean_position_scores']
outputs_dl_10['pred_topics'] = outputs_dl_10['ensemble_pred_topics']
outputs_dl_10['pred_sentiment'] = outputs_dl_10['ensemble_pred_sentiments']

Running ensemble inference with 5 models...

Running inference with model 1/5
  Batch 100/455 | Elapsed: 6.10s, Remaining: 21.64s
  Batch 200/455 | Elapsed: 12.56s, Remaining: 16.01s
  Batch 300/455 | Elapsed: 18.97s, Remaining: 9.80s
  Batch 400/455 | Elapsed: 25.39s, Remaining: 3.49s
  Using ground truth topic labels for position score computation

Running inference with model 2/5
  Batch 100/455 | Elapsed: 6.20s, Remaining: 22.01s
  Batch 200/455 | Elapsed: 12.74s, Remaining: 16.24s
  Batch 300/455 | Elapsed: 19.21s, Remaining: 9.92s
  Batch 400/455 | Elapsed: 25.59s, Remaining: 3.52s

Running inference with model 3/5
  Batch 100/455 | Elapsed: 6.24s, Remaining: 22.15s
  Batch 200/455 | Elapsed: 12.82s, Remaining: 16.34s
  Batch 300/455 | Elapsed: 19.31s, Remaining: 9.98s
  Batch 400/455 | Elapsed: 25.73s, Remaining: 3.54s

Running inference with model 4/5
  Batch 100/455 | Elapsed: 6.27s, Remaining: 22.25s
  Batch 200/455 | Elapsed: 12.86s, Remaining: 16.40s
  Batch 300/455 | Elaps

In [36]:
# Unweighted mean of the (already rounded) per-class topic F1 scores.
outputs_dl_10['res_table_topic']['f1'].mean().round(2)

0.82

In [37]:
# Per-class sentiment metrics, one row per encoded sentiment class.
outputs_dl_10['res_table_sentiment']

Unnamed: 0,f1,precision,recall,accuracy
0,0.87,0.85,0.88,0.88
1,0.83,0.84,0.81,0.81
2,0.79,0.79,0.78,0.78


In [38]:
# Unweighted mean of the (already rounded) per-class sentiment F1 scores.
outputs_dl_10['res_table_sentiment']['f1'].mean().round(2)

0.83

In [39]:
# Persist the full inference output dict so later steps can reload it
# without re-running the ensemble.
file_path = 'data/temps/outputs_dl_10'
with open(file_path, mode="wb") as file:
    pickle.dump(outputs_dl_10, file)

In [40]:
# Export both per-class metric tables (row index omitted).
outputs_dl_10['res_table_sentiment'].to_csv(
    'results/classification results/dl_10_sentiment.csv',
    index=False,
)
outputs_dl_10['res_table_topic'].to_csv(
    'results/classification results/dl_10_topic.csv',
    index=False,
)


### COALITIONAGREE, same coding style

#### No supervision

In [41]:
# The first CSV column is read as the index and then replaced by a fresh
# 0..n-1 index.
coalitionagree = pd.read_csv(
    'data/r_outputs/coalitionagree_texts.csv',
    encoding='utf-8',
    index_col=0,
).reset_index(drop=True)

In [42]:
# Preview the first rows of the raw coalition-agreement sentences.
coalitionagree.head()

Unnamed: 0,sentence,domain,category2,category3,level,id,country_init,cabinet_year,country
0,1. Abkommen vom Dezember 1945,8,800,80000,0,1,AT,1945,Austria
1,Der Proporz soll nicht nur bei der Bildung der...,9,900,90001,0,2,AT,1945,Austria
2,Staatssekretäre sollen nur in Ausnahmefällen n...,9,900,90004,0,3,AT,1945,Austria
3,Das Programm der Parteien soll in der Erklärun...,9,900,90001,0,4,AT,1945,Austria
4,Die Österreichische Volkspartei bietet den Soz...,9,900,90002,0,5,AT,1945,Austria


In [43]:
# Group the 'sentence' column by country, cabinet year and the two category codes.
results = group_texts(
    coalitionagree,
    ['country', 'cabinet_year', 'category2', 'category3'],
    'sentence',
    max_group_factor=5,
)

In [44]:
# One row per regrouped text; the ';'-joined label string is split into
# separate columns and appended to the frame.
coalition_regrouped = (
    pd.DataFrame(results)
    .explode('text')
    .reset_index(drop=True)
)
df_cols = coalition_regrouped['labels'].str.split(';', expand=True)
coalition_regrouped = pd.concat([coalition_regrouped, df_cols], axis=1)
# Columns are renamed positionally: the two original columns, then the split parts.
coalition_regrouped.columns = [
    'text', 'labels',
    'country', 'year', 'cmp_short', 'cmp_long',
]


In [45]:
# Preview the regrouped texts together with their split label columns.
coalition_regrouped.head()

Unnamed: 0,text,labels,country,year,cmp_short,cmp_long
0,"Abmachungen über die Beamtenbesoldung, ebenso...",Austria;1945;303;30301,Austria,1945,303,30301
1,"Der Gemeinde Wien soll, wenn die Sozialistisch...",Austria;1945;303;30303,Austria,1945,303,30303
2,über die Behandlung der Nationalsozialisten,Austria;1945;305;30506,Austria,1945,305,30506
3,sowie über die Verstaatlichung werden in Aussi...,Austria;1945;413;41301,Austria,1945,413,41301
4,Die Schaffung eines einheitlichen Dienst- und ...,Austria;1945;506;50602,Austria,1945,506,50602


In [46]:
# Sentiment is derived from both code levels, topic from the short code only.
coalition_regrouped['sentiment'] = coalition_regrouped.apply(
    lambda row: sentiment_code_coalition(row['cmp_short'], row['cmp_long']),
    axis=1,
)
coalition_regrouped['topic'] = coalition_regrouped['cmp_short'].apply(
    topic_code_coalition
)

In [47]:
# Number of rows per (topic, sentiment) combination.
coalition_regrouped.groupby(['topic','sentiment']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,labels,country,year,cmp_short,cmp_long
topic,sentiment,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Agriculture - Protectionism,left,877,877,877,877,877,877
Agriculture - Protectionism,right,86,86,86,86,86,86
Economics,left,2025,2025,2025,2025,2025,2025
Economics,neutral,3564,3564,3564,3564,3564,3564
Economics,right,2592,2592,2592,2592,2592,2592
Education,left,1883,1883,1883,1883,1883,1883
Education,right,37,37,37,37,37,37
Environment - Growth,left,2045,2045,2045,2045,2045,2045
Environment - Growth,neutral,337,337,337,337,337,337
Environment - Growth,right,754,754,754,754,754,754


In [48]:
# Cache the processed frame (row index omitted).
coalition_regrouped.to_csv(
    'data/temps/coalitionagree_regrouped_processed.csv',
    index=False,
    encoding='utf-8',
)

In [49]:
# Independent copy holding only the label columns and the text.
cagree_reduced = coalition_regrouped.loc[:, ['sentiment', 'topic', 'text']].copy()

In [50]:
# Convert to a HF Dataset and encode both label columns as class labels.
cagree_dataset = (
    Dataset.from_pandas(cagree_reduced)
    .class_encode_column('sentiment')
    .class_encode_column('topic')
)


Casting to class labels:   0%|          | 0/39287 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/39287 [00:00<?, ? examples/s]

In [51]:
# Tokenize the whole coalition corpus; the raw text column is dropped.
tokenized_dataset = cagree_dataset.map(
    tokenize_function,
    remove_columns=['text'],
    fn_kwargs=dict(tokenizer=tokenizer, text_var='text', max_length=512),
)

Map:   0%|          | 0/39287 [00:00<?, ? examples/s]

In [52]:
# Unshuffled loader so predictions stay in dataset order.
pred_dataloader = DataLoader(
    tokenized_dataset,
    collate_fn=data_collator,
    batch_size=16,
    shuffle=False,
)

In [53]:
## Load ensemble models for validity checks
# NOTE(review): the head sizes are taken from the coalition data while the
# checkpoints come from 'results/models/ensemble_scaling'; presumably both use
# the same label sets -- confirm, otherwise the state dict will not fit.
num_topics = len(set(cagree_reduced['topic']))
num_sentiments = len(set(cagree_reduced['sentiment']))

# Template model; the checkpoints below are loaded on top of it.
model_base = ContextScalePrediction(
    roberta_model=model_name,
    num_topics=num_topics,
    num_sentiments=num_sentiments,
    use_shared_attention=True,
    lora=False,
)

ensemble_checkpoint_paths = [
    os.path.join(
        'results/models/ensemble_scaling',
        f"model_ensemble_{i}.safetensors",
    )
    for i in range(5)
]

validity_ensemble_models = load_ensemble_models(
    checkpoint_paths=ensemble_checkpoint_paths,
    model_base=model_base,
    device=device,
)

Loading ensemble member 0 from results/models/ensemble_scaling/model_ensemble_0.safetensors
Loading ensemble member 1 from results/models/ensemble_scaling/model_ensemble_1.safetensors
Loading ensemble member 2 from results/models/ensemble_scaling/model_ensemble_2.safetensors
Loading ensemble member 3 from results/models/ensemble_scaling/model_ensemble_3.safetensors
Loading ensemble member 4 from results/models/ensemble_scaling/model_ensemble_4.safetensors


In [54]:
# Ensemble predictions on the full coalition-agreement corpus (no fine-tuning).
outputs_ca_test = ensemble_inference(
    models=validity_ensemble_models,
    dataloader=pred_dataloader,
    device=device,
    beta=1.0,
    topic_label='topic',
    sentiment_label='sentiment',
    use_ground_truth_topic=False,
)

# --- Per-class topic metrics (only when ground-truth topics are available) ---
true_topics_ca = outputs_ca_test.get('ground_truth_topics')
if true_topics_ca is None:
    outputs_ca_test['res_table_topic'] = None
else:
    true_topics_ca = np.asarray(true_topics_ca).ravel()
    topic_precision_ca, topic_recall_ca, topic_f1_ca, _ = precision_recall_fscore_support(
        true_topics_ca, outputs_ca_test['ensemble_pred_topics'], average=None
    )
    matrix_topic_ca = confusion_matrix(true_topics_ca, outputs_ca_test['ensemble_pred_topics'])
    # NOTE(review): diagonal / row sum is TP / (TP + FN), i.e. per-class recall,
    # so the 'accuracy' column duplicates 'recall' -- confirm this is intended.
    accuracy_topic_ca = matrix_topic_ca.diagonal() / matrix_topic_ca.sum(axis=1)
    outputs_ca_test['res_table_topic'] = pd.DataFrame({
        metric: np.round(values, 2)
        for metric, values in (
            ('f1', topic_f1_ca),
            ('precision', topic_precision_ca),
            ('recall', topic_recall_ca),
            ('accuracy', accuracy_topic_ca),
        )
    })

# --- Per-class sentiment metrics (same layout as the topic table) ---
true_sentiments_ca = outputs_ca_test.get('ground_truth_sentiments')
if true_sentiments_ca is None:
    outputs_ca_test['res_table_sentiment'] = None
else:
    true_sentiments_ca = np.asarray(true_sentiments_ca).ravel()
    sent_precision_ca, sent_recall_ca, sent_f1_ca, _ = precision_recall_fscore_support(
        true_sentiments_ca, outputs_ca_test['ensemble_pred_sentiments'], average=None
    )
    matrix_sentiment_ca = confusion_matrix(true_sentiments_ca, outputs_ca_test['ensemble_pred_sentiments'])
    accuracy_sentiment_ca = matrix_sentiment_ca.diagonal() / matrix_sentiment_ca.sum(axis=1)
    outputs_ca_test['res_table_sentiment'] = pd.DataFrame({
        metric: np.round(values, 2)
        for metric, values in (
            ('f1', sent_f1_ca),
            ('precision', sent_precision_ca),
            ('recall', sent_recall_ca),
            ('accuracy', accuracy_sentiment_ca),
        )
    })

# Expose the ensemble results under the shorter key names as well.
outputs_ca_test['position_scores'] = outputs_ca_test['mean_position_scores']
outputs_ca_test['pred_topics'] = outputs_ca_test['ensemble_pred_topics']
outputs_ca_test['pred_sentiment'] = outputs_ca_test['ensemble_pred_sentiments']

Running ensemble inference with 5 models...

Running inference with model 1/5
  Batch 100/2456 | Elapsed: 1.54s, Remaining: 36.26s
  Batch 200/2456 | Elapsed: 2.91s, Remaining: 32.84s
  Batch 300/2456 | Elapsed: 4.20s, Remaining: 30.15s
  Batch 400/2456 | Elapsed: 5.65s, Remaining: 29.06s
  Batch 500/2456 | Elapsed: 7.25s, Remaining: 28.35s
  Batch 600/2456 | Elapsed: 8.85s, Remaining: 27.37s
  Batch 700/2456 | Elapsed: 10.45s, Remaining: 26.21s
  Batch 800/2456 | Elapsed: 12.08s, Remaining: 25.00s
  Batch 900/2456 | Elapsed: 13.15s, Remaining: 22.74s
  Batch 1000/2456 | Elapsed: 14.20s, Remaining: 20.68s
  Batch 1100/2456 | Elapsed: 15.33s, Remaining: 18.90s
  Batch 1200/2456 | Elapsed: 16.51s, Remaining: 17.28s
  Batch 1300/2456 | Elapsed: 17.66s, Remaining: 15.70s
  Batch 1400/2456 | Elapsed: 18.91s, Remaining: 14.26s
  Batch 1500/2456 | Elapsed: 20.22s, Remaining: 12.89s
  Batch 1600/2456 | Elapsed: 21.47s, Remaining: 11.49s
  Batch 1700/2456 | Elapsed: 22.65s, Remaining: 10.07s
  

In [55]:
# Unweighted mean of the (already rounded) per-class sentiment F1 scores.
outputs_ca_test['res_table_sentiment']['f1'].mean().round(2)

0.72

In [56]:
# Unweighted mean of the (already rounded) per-class topic F1 scores.
outputs_ca_test['res_table_topic']['f1'].mean().round(2)

0.71

In [57]:
# Persist the full inference output dict for the no-supervision run.
file_path = 'data/temps/outputs_ca_test'
with open(file_path, mode="wb") as file:
    pickle.dump(outputs_ca_test, file)

In [58]:
# Export both per-class metric tables (row index omitted).
outputs_ca_test['res_table_sentiment'].to_csv(
    'results/classification results/cagree_noft_sentiment.csv',
    index=False,
)
outputs_ca_test['res_table_topic'].to_csv(
    'results/classification results/cagree_noft_topic.csv',
    index=False,
)


#### 10% supervision

In [59]:
# Combined "<topic>_<sentiment>" label, used below to stratify the split.
cagree_reduced.loc[:, 'topic_sentiment'] = (
    cagree_reduced['topic'] + '_' + cagree_reduced['sentiment']
)

In [60]:
# Rebuild the dataset, this time also encoding the combined stratification label.
cagree_dataset = (
    Dataset.from_pandas(cagree_reduced)
    .class_encode_column('sentiment')
    .class_encode_column('topic')
    .class_encode_column('topic_sentiment')
)

Casting to class labels:   0%|          | 0/39287 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/39287 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/39287 [00:00<?, ? examples/s]

In [61]:
# Keep 10% of the coalition data for training and hold out 90% for testing,
# stratified on the combined topic/sentiment label.
train_test = cagree_dataset.train_test_split(
    test_size=0.9,
    seed=seed_val,
    stratify_by_column='topic_sentiment',
)

In [62]:
# Re-wrap the two splits in a DatasetDict and display its summary.
cagree_datasets = DatasetDict(
    {split: train_test[split] for split in ('train', 'test')}
)
cagree_datasets

DatasetDict({
    train: Dataset({
        features: ['sentiment', 'topic', 'text', 'topic_sentiment'],
        num_rows: 3928
    })
    test: Dataset({
        features: ['sentiment', 'topic', 'text', 'topic_sentiment'],
        num_rows: 35359
    })
})

In [63]:
# Tokenize both splits; the raw text and the stratification helper column are
# dropped so only the label and token columns remain.
tokenized_datasets = cagree_datasets.map(
    tokenize_function,
    remove_columns=['text','topic_sentiment'],
    fn_kwargs=dict(tokenizer=tokenizer, text_var='text', max_length=512),
)
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

Map:   0%|          | 0/3928 [00:00<?, ? examples/s]

Map:   0%|          | 0/35359 [00:00<?, ? examples/s]

['sentiment', 'topic', 'input_ids', 'attention_mask']

In [64]:
# Only the training loader is shuffled; the test loader keeps dataset order.
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    collate_fn=data_collator,
    batch_size=64,
    shuffle=True,
)
test_dataloader = DataLoader(
    tokenized_datasets['test'],
    collate_fn=data_collator,
    batch_size=64,
    shuffle=False,
)


In [65]:
## Build the base model for the 10% labelled subset, initialised from a manifesto ensemble member
num_topics = len(set(cagree_reduced['topic']))
num_sentiments = len(set(cagree_reduced['sentiment']))

# A single-argument os.path.join() returns its argument unchanged, so the path is used directly.
checkpoint_path = 'results/models/ensemble_scaling/model_ensemble_1.safetensors'

print("Manifesto ensemble members used for 10% supervision adaptation:")
print(f"  {checkpoint_path}")


def copy_source_model(checkpoint_path):
    """Build a ContextScalePrediction model and load a safetensors checkpoint into it.

    The head sizes come from the module-level ``num_topics`` and
    ``num_sentiments`` (read when the function is called), the backbone from
    ``model_name``, and the model is placed on ``device``.

    Parameters
    ----------
    checkpoint_path : str
        Path of the ``.safetensors`` file holding the state dict to load.

    Returns
    -------
    ContextScalePrediction
        The model with the checkpoint weights loaded.
    """
    source_model = ContextScalePrediction(
        roberta_model=model_name,
        num_topics=num_topics,
        num_sentiments=num_sentiments,
        lora=False,
        use_shared_attention=True
    ).to(device)
    # load_state_dict() is called with its defaults, so the checkpoint keys
    # have to match the freshly built model.
    source_state = load_file(checkpoint_path)
    source_model.load_state_dict(source_state)

    return source_model


model_base = copy_source_model(checkpoint_path=checkpoint_path)

Manifesto ensemble members used for 10% supervision adaptation:


In [66]:
# Fine-tune a 5-member ensemble on the 10% coalition training split.
# No eval dataloader is passed, so evaluation is skipped during training
# (see the "Skipping evaluation" lines in the log below).
ensemble_training_info_ca_10 = train_deep_ensemble(
    model_base=model_base,
    device=device,
    train_dataloader=train_dataloader,
    eval_dataloader=None,
    topic_var='topic',
    sentiment_var='sentiment',
    num_models=5,
    n_epochs=5,
    lr=2e-5,
    org_seed=seed_val,
    save_dir='results/models/coalitionagree_ensemble_10',
    model_prefix='model_ensemble_ca_10',
)


Training ensemble member 1/5
Using shuffled training data with seed 1234
Training ensemble member 1
Using shuffled data with seed 1234
Epoch: 1/5

Training...

Training epoch took: 4.76s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 2/5

Training...

Training epoch took: 4.78s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 3/5

Training...

Training epoch took: 4.83s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 4/5

Training...

Training epoch took: 4.79s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 5/5

Training...

Training epoch took: 4.83s
  Skipping evaluation (no eval_dataloader provided)
Saved model checkpoint to results/models/coalitionagree_ensemble_10/model_ensemble_ca_10_0.safetensors

Training ensemble member 2/5
Using shuffled training data with seed 1235
Training ensemble member 2
Using shuffled data with seed 1235
Epoch: 1/5

Training...

Training epoch took: 4.82s
  Skipping evaluation (no eval_dataloader provided)
E

In [67]:
# Sanity check: list the checkpoint files written by the training cell.
sorted(os.listdir('results/models/coalitionagree_ensemble_10'))

['model_ensemble_ca_10_0.safetensors',
 'model_ensemble_ca_10_1.safetensors',
 'model_ensemble_ca_10_2.safetensors',
 'model_ensemble_ca_10_3.safetensors',
 'model_ensemble_ca_10_4.safetensors']

In [68]:
# Paths of the five checkpoints written by the training cell (indices 0-4).
coalition_ensemble_checkpoint_paths = [
    os.path.join(
        'results/models/coalitionagree_ensemble_10',
        f"model_ensemble_ca_10_{i}.safetensors",
    )
    for i in range(5)
]

print("Checkpoint paths for COALITIONAGREE 10% ensemble:")
for idx, path in enumerate(coalition_ensemble_checkpoint_paths):
    print(f"  Model {idx}: {path}")

# Load every checkpoint into its own ensemble member.
coalitionagree_ensemble_10_models = load_ensemble_models(
    checkpoint_paths=coalition_ensemble_checkpoint_paths,
    model_base=model_base,
    device=device,
)

Checkpoint paths for COALITIONAGREE 10% ensemble:
  Model 0: results/models/coalitionagree_ensemble_10/model_ensemble_ca_10_0.safetensors
  Model 1: results/models/coalitionagree_ensemble_10/model_ensemble_ca_10_1.safetensors
  Model 2: results/models/coalitionagree_ensemble_10/model_ensemble_ca_10_2.safetensors
  Model 3: results/models/coalitionagree_ensemble_10/model_ensemble_ca_10_3.safetensors
  Model 4: results/models/coalitionagree_ensemble_10/model_ensemble_ca_10_4.safetensors
Loading ensemble member 0 from results/models/coalitionagree_ensemble_10/model_ensemble_ca_10_0.safetensors
Loading ensemble member 1 from results/models/coalitionagree_ensemble_10/model_ensemble_ca_10_1.safetensors
Loading ensemble member 2 from results/models/coalitionagree_ensemble_10/model_ensemble_ca_10_2.safetensors
Loading ensemble member 3 from results/models/coalitionagree_ensemble_10/model_ensemble_ca_10_3.safetensors
Loading ensemble member 4 from results/models/coalitionagree_ensemble_10/model

In [69]:
# Ensemble predictions on the held-out coalition test split.
outputs_ca_10 = ensemble_inference(
    models=coalitionagree_ensemble_10_models,
    dataloader=test_dataloader,
    device=device,
    beta=1.0,
    topic_label='topic',
    sentiment_label='sentiment',
    use_ground_truth_topic=False,
)

# --- Per-class topic metrics (only when ground-truth topics are available) ---
true_topics_ca_10 = outputs_ca_10.get('ground_truth_topics')
if true_topics_ca_10 is None:
    outputs_ca_10['res_table_topic'] = None
else:
    true_topics_ca_10 = np.asarray(true_topics_ca_10).ravel()
    topic_precision_ca_10, topic_recall_ca_10, topic_f1_ca_10, _ = precision_recall_fscore_support(
        true_topics_ca_10, outputs_ca_10['ensemble_pred_topics'], average=None
    )
    matrix_topic_ca_10 = confusion_matrix(true_topics_ca_10, outputs_ca_10['ensemble_pred_topics'])
    # NOTE(review): diagonal / row sum is TP / (TP + FN), i.e. per-class recall,
    # so the 'accuracy' column duplicates 'recall' -- confirm this is intended.
    accuracy_topic_ca_10 = matrix_topic_ca_10.diagonal() / matrix_topic_ca_10.sum(axis=1)
    outputs_ca_10['res_table_topic'] = pd.DataFrame({
        metric: np.round(values, 2)
        for metric, values in (
            ('f1', topic_f1_ca_10),
            ('precision', topic_precision_ca_10),
            ('recall', topic_recall_ca_10),
            ('accuracy', accuracy_topic_ca_10),
        )
    })

# --- Per-class sentiment metrics (same layout as the topic table) ---
true_sentiments_ca_10 = outputs_ca_10.get('ground_truth_sentiments')
if true_sentiments_ca_10 is None:
    outputs_ca_10['res_table_sentiment'] = None
else:
    true_sentiments_ca_10 = np.asarray(true_sentiments_ca_10).ravel()
    sent_precision_ca_10, sent_recall_ca_10, sent_f1_ca_10, _ = precision_recall_fscore_support(
        true_sentiments_ca_10, outputs_ca_10['ensemble_pred_sentiments'], average=None
    )
    matrix_sentiment_ca_10 = confusion_matrix(true_sentiments_ca_10, outputs_ca_10['ensemble_pred_sentiments'])
    accuracy_sentiment_ca_10 = matrix_sentiment_ca_10.diagonal() / matrix_sentiment_ca_10.sum(axis=1)
    outputs_ca_10['res_table_sentiment'] = pd.DataFrame({
        metric: np.round(values, 2)
        for metric, values in (
            ('f1', sent_f1_ca_10),
            ('precision', sent_precision_ca_10),
            ('recall', sent_recall_ca_10),
            ('accuracy', accuracy_sentiment_ca_10),
        )
    })

# Expose the ensemble results under the shorter key names as well.
outputs_ca_10['position_scores'] = outputs_ca_10['mean_position_scores']
outputs_ca_10['pred_topics'] = outputs_ca_10['ensemble_pred_topics']
outputs_ca_10['pred_sentiment'] = outputs_ca_10['ensemble_pred_sentiments']

Running ensemble inference with 5 models...

Running inference with model 1/5
  Batch 100/553 | Elapsed: 6.48s, Remaining: 29.35s
  Batch 200/553 | Elapsed: 13.00s, Remaining: 22.94s
  Batch 300/553 | Elapsed: 19.57s, Remaining: 16.51s
  Batch 400/553 | Elapsed: 26.02s, Remaining: 9.95s
  Batch 500/553 | Elapsed: 32.61s, Remaining: 3.46s
  Using ground truth topic labels for position score computation

Running inference with model 2/5
  Batch 100/553 | Elapsed: 6.59s, Remaining: 29.85s
  Batch 200/553 | Elapsed: 13.18s, Remaining: 23.27s
  Batch 300/553 | Elapsed: 19.82s, Remaining: 16.71s
  Batch 400/553 | Elapsed: 26.31s, Remaining: 10.06s
  Batch 500/553 | Elapsed: 32.95s, Remaining: 3.49s

Running inference with model 3/5
  Batch 100/553 | Elapsed: 6.63s, Remaining: 30.02s
  Batch 200/553 | Elapsed: 13.25s, Remaining: 23.39s
  Batch 300/553 | Elapsed: 19.91s, Remaining: 16.79s
  Batch 400/553 | Elapsed: 26.43s, Remaining: 10.11s
  Batch 500/553 | Elapsed: 33.09s, Remaining: 3.51s



In [71]:
# Unweighted mean of the (already rounded) per-class sentiment F1 scores.
outputs_ca_10['res_table_sentiment']['f1'].mean().round(2)

0.82

In [72]:
# Unweighted mean of the (already rounded) per-class topic F1 scores.
outputs_ca_10['res_table_topic']['f1'].mean().round(2)

0.83

In [73]:
# Persist the full inference output dict for the 10%-supervision run.
file_path = 'data/temps/outputs_ca_10'
with open(file_path, mode="wb") as file:
    pickle.dump(outputs_ca_10, file)

In [74]:
# Export both per-class metric tables (row index omitted).
outputs_ca_10['res_table_sentiment'].to_csv(
    'results/classification results/cagree_10ft_sentiment.csv',
    index=False,
)
outputs_ca_10['res_table_topic'].to_csv(
    'results/classification results/cagree_10ft_topic.csv',
    index=False,
)


### Adapting to Twitter data (sentiment is not stance)

In [75]:
# Three ground-truth stance corpora: Trump survey responses, Kavanaugh tweets
# and Women's March (WM) tweets.
tw_trump = pd.read_csv(
    'data/MOTN/MOTN_responses_groundtruth.csv',
    encoding='utf-8',
)
tw_kav = pd.read_csv(
    'data/MOTN/kavanaugh_tweets_groundtruth.csv',
    encoding='utf-8',
)
tw_wm = pd.read_csv(
    'data/MOTN/WM_tweets_groundtruth.csv',
    encoding='utf-8',
)

In [76]:
# Preview the Trump responses (text is in 'edits_clean_text', stance in 'trump_stance_auto').
tw_trump.head()

Unnamed: 0,wavenum,ideo5,edits_clean_text,qpos,trump_stance_auto,lexicoder_sentiment,fold,vader_sentiment,SVM_sentiment,BERT_sentiment,SVM_stance,BERT_stance,vader_scores
0,3,Moderate,the recent election of donald trump the freedo...,1,1,1.0,3,1.0,1,1,1,0,0.6369
1,3,Very conservative,donald trump won,1,1,1.0,1,1.0,1,1,1,1,0.5719
2,3,Conservative,that donald trump beat hillary clinton,1,1,,5,,1,1,1,1,0.0
3,3,Conservative,donald trump was elected president,1,1,,5,,1,1,1,1,0.0
4,3,Conservative,the american people saw through the obfuscatio...,1,1,1.0,3,1.0,0,1,1,1,0.4019


In [77]:
# Preview the Kavanaugh tweets.
tw_kav.head()

Unnamed: 0,text,sentiment,stance,fold,vader_sentiment,SVM_sentiment,BERT_sentiment,SVM_stance,BERT_stance,lexicoder_sentiment,vader_scores
0,RT @willchamberlain Ms. Ford sent an anonymou...,0,1,3,0.0,0,0,1,1,0.0,-0.7579
1,RT @dbongino Is there ever going to come a da...,0,1,1,0.0,0,0,1,1,0.0,-0.4767
2,RT @SuzeOrmanShow He violates every one of my...,0,0,5,1.0,0,0,0,0,0.0,0.5423
3,RT @funder Dear Judge Kavanaugh- We request ...,0,0,5,0.0,0,0,0,0,0.0,-0.802
4,RT @BrianKarem BREAKING: Montgomery MD PD Ch...,0,0,3,0.0,0,0,0,0,0.0,-0.296


In [78]:
# Preview the Women's March tweets.
tw_wm.head()

Unnamed: 0,text,stance,sentiment,balanced_train,vader_scores
0,YES! I'm still with her and always will be. ht...,1,1.0,0.0,0.5754
1,Pics or it didn't happen. https://t.co/o1GddSmwk2,1,0.0,0.0,0.0
2,I love this nasty woman. @MaribethMonroe #wome...,1,1.0,1.0,-0.0129
3,RT @YiawayYeh: Marching for love. Nashville #...,1,1.0,1.0,0.6369
4,These people are just Sad. https://t.co/0LK6iG...,0,0.0,1.0,-0.4767


In [79]:
# Bring the three corpora to the same (text, stance, topic) schema.
# The Trump frame is renamed *before* the column selection so the cell stays
# idempotent: on a re-run the source columns are already called
# 'text'/'stance', rename() ignores the missing keys by default, and the
# selection still succeeds instead of raising a KeyError.
tw_trump = (
    tw_trump
    .rename(columns={'edits_clean_text': 'text', 'trump_stance_auto': 'stance'})
    .loc[:, ['text', 'stance']]
    .copy()
)
tw_trump['topic'] = 'trump'
tw_kav = tw_kav[['text', 'stance']].copy()
tw_kav['topic'] = 'kavanaugh'
tw_wm = tw_wm[['text','stance']].copy()
tw_wm['topic'] = 'women march'


In [80]:
# Stack the three corpora into one frame with a fresh 0..n-1 index.
tw_df = pd.concat([tw_trump, tw_kav, tw_wm], ignore_index=True)

In [81]:
# Recode each (topic, stance) pair into the 'lr' label, then build the
# combined "<topic>_<lr>" label used to stratify the split.
tw_df.loc[:, 'lr'] = tw_df.apply(
    lambda row: recode_tw(row['topic'], row['stance']),
    axis=1,
)
tw_df.loc[:, 'topic_lr'] = tw_df['topic'] + '_' + tw_df['lr']

In [82]:
# Number of rows per (topic, lr) combination.
tw_df.groupby(['topic','lr']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text,stance,topic_lr
topic,lr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
kavanaugh,left,1672,1672,1672
kavanaugh,right,1988,1988,1988
trump,left,4312,4312,4312
trump,right,2834,2834,2834
women march,left,16965,16965,16965
women march,right,2647,2647,2647


In [83]:
# Check column dtypes and non-null counts of the combined frame.
tw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30418 entries, 0 to 30417
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      30418 non-null  object
 1   stance    30418 non-null  int64 
 2   topic     30418 non-null  object
 3   lr        30418 non-null  object
 4   topic_lr  30418 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.2+ MB


In [84]:
# Convert the relevant columns to a HF Dataset and encode all three label
# columns as class labels.
tw_dataset = (
    Dataset.from_pandas(tw_df[['text','lr','topic', 'topic_lr']].copy())
    .class_encode_column('lr')
    .class_encode_column('topic')
    .class_encode_column('topic_lr')
)


Casting to class labels:   0%|          | 0/30418 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/30418 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/30418 [00:00<?, ? examples/s]

In [85]:
# Keep 10% of the Twitter data for training and hold out 90% for testing,
# stratified on the combined topic/lr label.
train_test = tw_dataset.train_test_split(
    test_size=0.9,
    seed=seed_val,
    stratify_by_column='topic_lr',
)

In [86]:
# Re-wrap the two splits in a DatasetDict and display its summary.
tw_datasets = DatasetDict(
    {split: train_test[split] for split in ('train', 'test')}
)
tw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'lr', 'topic', 'topic_lr'],
        num_rows: 3041
    })
    test: Dataset({
        features: ['text', 'lr', 'topic', 'topic_lr'],
        num_rows: 27377
    })
})

In [87]:
# Backbone checkpoint, its tokenizer, and a collator that pads each batch.
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [88]:
# Tokenize both splits; the raw text and the stratification helper column are
# dropped so only the label and token columns remain.
tokenized_datasets = tw_datasets.map(
    tokenize_function,
    remove_columns=['text', 'topic_lr'],
    fn_kwargs=dict(tokenizer=tokenizer, text_var='text', max_length=512),
)
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

Map:   0%|          | 0/3041 [00:00<?, ? examples/s]

Map:   0%|          | 0/27377 [00:00<?, ? examples/s]

['lr', 'topic', 'input_ids', 'attention_mask']

In [89]:
# Only the training loader is shuffled; the test loader keeps dataset order.
train_dataloader = DataLoader(
    tokenized_datasets['train'],
    collate_fn=data_collator,
    batch_size=64,
    shuffle=True,
)
test_dataloader = DataLoader(
    tokenized_datasets['test'],
    collate_fn=data_collator,
    batch_size=64,
    shuffle=False,
)


In [92]:
checkpoint_path = os.path.join('results/models/ensemble_scaling/model_ensemble_1.safetensors')

print("Manifesto ensemble members used for 10% supervision adaptation:")



def copy_source_model(checkpoint_path, num_topics=3, num_sentiments=2,
                      source_num_topics=12, source_num_sentiments=3):
    """Build a target-sized model initialised from a source checkpoint.

    A source ``ContextScalePrediction`` model is instantiated with the head
    sizes the checkpoint was trained with, the checkpoint is loaded into it,
    and its weights are transferred with ``copy_weights`` into a freshly
    created model whose heads have the target sizes.

    Parameters
    ----------
    checkpoint_path : str
        Path to the ``.safetensors`` state dict of the source model.
    num_topics, num_sentiments : int, optional
        Head sizes of the returned model (defaults: 3 topics, 2 sentiments).
    source_num_topics, source_num_sentiments : int, optional
        Head sizes the checkpoint was trained with (defaults: 12 and 3).

    Returns
    -------
    ContextScalePrediction
        The target model on ``device``. ``copy_weights`` is called with
        ``freeze_copied=False`` and ``patterns=('topic', 'sentiment')``; per
        its log output, parameters matching those patterns are skipped, so the
        task heads keep their fresh initialisation.

    Notes
    -----
    Reads the notebook-level ``model_name`` and ``device``.
    """
    def _build(n_topics, n_sentiments):
        # Single place that fixes the architecture options shared by both models.
        return ContextScalePrediction(
            roberta_model=model_name,
            num_topics=n_topics,
            num_sentiments=n_sentiments,
            lora=False,
            use_shared_attention=True
        ).to(device)

    # Source first, target second: same construction order as before, so the
    # random initialisation of the target heads is unchanged for a given seed.
    source_model = _build(source_num_topics, source_num_sentiments)
    source_model.load_state_dict(load_file(checkpoint_path))
    target_model = _build(num_topics, num_sentiments)

    copy_weights(source_model, target_model, patterns = ('topic', 'sentiment'), freeze_copied=False)
    # Drop the only reference to the source model so its memory can be reclaimed.
    del source_model
    return target_model


model_base = copy_source_model(checkpoint_path=checkpoint_path)

Manifesto ensemble members used for 10% supervision adaptation:
Skipping topic.weight as it is not present or should be skipped in the scaling model.
Skipping topic.bias as it is not present or should be skipped in the scaling model.
Skipping sentiment.weight as it is not present or should be skipped in the scaling model.
Skipping sentiment.bias as it is not present or should be skipped in the scaling model.
Trainable Parameters after copying:
roberta.embeddings.word_embeddings.weight
roberta.embeddings.position_embeddings.weight
roberta.embeddings.token_type_embeddings.weight
roberta.embeddings.LayerNorm.weight
roberta.embeddings.LayerNorm.bias
roberta.encoder.layer.0.attention.self.query.weight
roberta.encoder.layer.0.attention.self.query.bias
roberta.encoder.layer.0.attention.self.key.weight
roberta.encoder.layer.0.attention.self.key.bias
roberta.encoder.layer.0.attention.self.value.weight
roberta.encoder.layer.0.attention.self.value.bias
roberta.encoder.layer.0.attention.output.den

In [93]:
# Fine-tune a 5-member deep ensemble on the 10% Twitter training split, starting
# from the manifesto-initialised `model_base`. No eval_dataloader is passed, so
# per-epoch evaluation is skipped (see the training log below). The log also
# shows the shuffle seed starting at org_seed and increasing by one per member,
# and the first member being saved as <save_dir>/<model_prefix>_0.safetensors.
ensemble_training_info_tw = train_deep_ensemble(
    model_base=model_base,
    train_dataloader=train_dataloader,
    eval_dataloader=None,
    device=device,
    num_models=5,
    n_epochs=5,
    lr=2e-5,
    sentiment_var='lr',
    topic_var='topic',
    save_dir='results/models/tw_ensemble',
    model_prefix='model_ensemble_tw',
    org_seed=seed_val
)


Training ensemble member 1/5
Using shuffled training data with seed 1234
Training ensemble member 1
Using shuffled data with seed 1234
Epoch: 1/5

Training...

Training epoch took: 2.03s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 2/5

Training...

Training epoch took: 2.02s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 3/5

Training...

Training epoch took: 2.05s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 4/5

Training...

Training epoch took: 2.03s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 5/5

Training...

Training epoch took: 2.03s
  Skipping evaluation (no eval_dataloader provided)
Saved model checkpoint to results/models/tw_ensemble/model_ensemble_tw_0.safetensors

Training ensemble member 2/5
Using shuffled training data with seed 1235
Training ensemble member 2
Using shuffled data with seed 1235
Epoch: 1/5

Training...

Training epoch took: 2.04s
  Skipping evaluation (no eval_dataloader provided)
Epoch: 2/5

Trainin

In [94]:
# Rebuild the five checkpoint paths written by the training cell (indices 0-4),
# list them, and load every member into a model shaped like `model_base`.
twitter_ensemble_checkpoint_paths = list(
    map(
        lambda member: os.path.join('results/models/tw_ensemble', f"model_ensemble_tw_{member}.safetensors"),
        range(5),
    )
)

print("Checkpoint paths for adapted Twitter ensemble:")
for idx, path in enumerate(twitter_ensemble_checkpoint_paths):
    print(f"  Model {idx}: {path}")

twitter_ensemble_models = load_ensemble_models(
    checkpoint_paths=twitter_ensemble_checkpoint_paths,
    model_base=model_base,
    device=device,
)

Checkpoint paths for adapted Twitter ensemble:
  Model 0: results/models/tw_ensemble/model_ensemble_tw_0.safetensors
  Model 1: results/models/tw_ensemble/model_ensemble_tw_1.safetensors
  Model 2: results/models/tw_ensemble/model_ensemble_tw_2.safetensors
  Model 3: results/models/tw_ensemble/model_ensemble_tw_3.safetensors
  Model 4: results/models/tw_ensemble/model_ensemble_tw_4.safetensors
Loading ensemble member 0 from results/models/tw_ensemble/model_ensemble_tw_0.safetensors
Loading ensemble member 1 from results/models/tw_ensemble/model_ensemble_tw_1.safetensors
Loading ensemble member 2 from results/models/tw_ensemble/model_ensemble_tw_2.safetensors
Loading ensemble member 3 from results/models/tw_ensemble/model_ensemble_tw_3.safetensors
Loading ensemble member 4 from results/models/tw_ensemble/model_ensemble_tw_4.safetensors


In [95]:
# Run the adapted Twitter ensemble on the held-out 90% split.
outputs_tw_10 = ensemble_inference(
    models=twitter_ensemble_models,
    dataloader=test_dataloader,
    device=device,
    beta=1.0,
    topic_label='topic',
    sentiment_label='lr',
    use_ground_truth_topic=False
)


def _per_class_report(y_true, y_pred):
    """Return a per-class metrics table, or None when no gold labels exist.

    Parameters
    ----------
    y_true : array-like or None
        Gold labels as returned by ``ensemble_inference`` (flattened here).
    y_pred : array-like
        Ensemble predictions for the same rows.

    Returns
    -------
    pandas.DataFrame or None
        One row per class with columns ``f1``, ``precision``, ``recall`` and
        ``accuracy``, each rounded to two decimals.
    """
    if y_true is None:
        return None
    y_true = np.asarray(y_true).ravel()
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None)
    matrix = confusion_matrix(y_true, y_pred)
    # Diagonal over row sums = share of each true class predicted correctly.
    # NOTE: numerically this is the per-class recall; the column name
    # 'accuracy' is kept so the saved CSV layout does not change.
    class_accuracy = matrix.diagonal() / matrix.sum(axis=1)
    return pd.DataFrame({
        'f1': np.round(f1, 2),
        'precision': np.round(precision, 2),
        'recall': np.round(recall, 2),
        'accuracy': np.round(class_accuracy, 2)
    })


true_topics_tw = outputs_tw_10.get('ground_truth_topics')
outputs_tw_10['res_table_topic'] = _per_class_report(
    true_topics_tw, outputs_tw_10['ensemble_pred_topics']
)

true_sentiments_tw = outputs_tw_10.get('ground_truth_sentiments')
outputs_tw_10['res_table_sentiment'] = _per_class_report(
    true_sentiments_tw, outputs_tw_10['ensemble_pred_sentiments']
)

# Aliases under the key names used for single-model outputs.
outputs_tw_10['position_scores'] = outputs_tw_10['mean_position_scores']
outputs_tw_10['pred_topics'] = outputs_tw_10['ensemble_pred_topics']
outputs_tw_10['pred_sentiment'] = outputs_tw_10['ensemble_pred_sentiments']

Running ensemble inference with 5 models...

Running inference with model 1/5
  Batch 100/428 | Elapsed: 2.50s, Remaining: 8.19s
  Batch 200/428 | Elapsed: 4.90s, Remaining: 5.58s
  Batch 300/428 | Elapsed: 7.29s, Remaining: 3.11s
  Batch 400/428 | Elapsed: 9.73s, Remaining: 0.68s
  Using ground truth topic labels for position score computation

Running inference with model 2/5
  Batch 100/428 | Elapsed: 2.53s, Remaining: 8.29s
  Batch 200/428 | Elapsed: 4.95s, Remaining: 5.64s
  Batch 300/428 | Elapsed: 7.37s, Remaining: 3.14s
  Batch 400/428 | Elapsed: 9.83s, Remaining: 0.69s

Running inference with model 3/5
  Batch 100/428 | Elapsed: 2.55s, Remaining: 8.37s
  Batch 200/428 | Elapsed: 4.99s, Remaining: 5.69s
  Batch 300/428 | Elapsed: 7.43s, Remaining: 3.17s
  Batch 400/428 | Elapsed: 9.91s, Remaining: 0.69s

Running inference with model 4/5
  Batch 100/428 | Elapsed: 2.58s, Remaining: 8.45s
  Batch 200/428 | Elapsed: 5.04s, Remaining: 5.74s
  Batch 300/428 | Elapsed: 7.49s, Remaini

In [96]:
outputs_tw_10['res_table_sentiment']['f1'].mean().round(2)

0.78

In [97]:
outputs_tw_10['res_table_topic']['f1'].mean().round(2)

0.99

In [98]:
# Persist the full inference output dict (predictions, scores, metric tables)
# as a pickle so later steps can reload it without re-running inference.
file_path = 'data/temps/outputs_tw_10'
with open(file_path, "wb") as file:
    pickle.dump(outputs_tw_10, file)

In [99]:
# Export the per-class metric tables for the 10% fine-tuning run. Both tables
# are None when inference returned no gold labels, in which case these calls fail.
outputs_tw_10['res_table_sentiment'].to_csv('results/classification results/tw_10ft_sentiment.csv', index=False)
outputs_tw_10['res_table_topic'].to_csv('results/classification results/tw_10ft_topic.csv', index=False)

# Doc2Vec scaling

In [None]:
# Load the processed manifesto table; cmp_code and is_copy_of are forced to
# strings so pandas does not parse them as numbers.
manifesto_d2v = pd.read_csv('data/temps/manifesto.csv', encoding='utf-8', dtype={'cmp_code':'str', 'is_copy_of':'str'})


In [None]:
outputs = clean_text_loop(manifesto_d2v, 'countryname')

In [None]:
manifesto_d2v.loc[:,'text_cleaned'] = outputs

In [None]:
# Document keys for the Doc2Vec models, joined with '_':
#   party_election          -> '<party>_<election>'
#   country_party_election  -> '<countryname>_<party>_<election>'
# party and election are cast to str before joining.
manifesto_d2v.loc[:,'party_election'] = manifesto_d2v.party.astype(str).str.cat(manifesto_d2v[['election']].astype(str).values, sep='_')
manifesto_d2v.loc[:,'country_party_election'] = manifesto_d2v.countryname.str.cat(manifesto_d2v[['party','election']].astype(str).values, sep='_')

## Doc2Vec scaling - original approach by R&C 

In [None]:
# Fit one Doc2Vec model per country on party-election documents and reduce the
# document vectors to two principal components per country.
country_dfs = []

# Countries are processed independently, in order of first appearance.
unique_countries = manifesto_d2v['countryname'].unique()

for country in unique_countries:
    print(f"Processing country: {country}")

    # Rows belonging to the current country
    country_data = manifesto_d2v[manifesto_d2v['countryname'] == country]

    # Learn bigram collocations, then trigram collocations on the bigrammed text
    outputs_stream = phraseIterator(country_data, 'text_cleaned')
    bigram = Phraser(Phrases(outputs_stream, min_count=1, threshold=5))
    trigram = Phrases(bigram[outputs_stream], min_count=1, threshold=5)

    # Create the Doc2Vec model and build its vocabulary from the tagged corpus
    model = Doc2Vec(vector_size=500, window=6, min_count=1, workers=16, epochs=20, seed=seed_val)
    model.build_vocab(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'))

    # Train on a fresh pass over the same corpus
    model.train(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'),
                total_examples=model.corpus_count, epochs=20)

    # d2v_reduct returns a mapping whose keys become the 'party_election' labels
    # and whose values become the rows fed to PCA.
    embed_dict = d2v_reduct(model)
    df_d2v = pd.DataFrame.from_dict(embed_dict).transpose()
    df_d2v.index.name = 'party_election'
    df_d2v = df_d2v.reset_index()
    pca = PCA(n_components=2, random_state=seed_val)
    df_d2v[['d2v_d1', 'd2v_d2']] = pca.fit_transform(df_d2v.iloc[:, 1:])
    # .copy() so the assignments below act on an independent frame rather than
    # on a slice (avoids SettingWithCopyWarning).
    df_d2v = df_d2v[['party_election', 'd2v_d1', 'd2v_d2']].copy()

    # Split on the last underscore only, so a label always yields exactly two parts.
    df_d2v[['party', 'election']] = df_d2v['party_election'].str.rsplit('_', n=1, expand=True)
    # Plain column assignment replaces the column and therefore its dtype;
    # `.loc[:, col] = ...` writes in place under pandas >= 2 and keeps object dtype.
    df_d2v['election'] = df_d2v['election'].astype(int)
    df_d2v['country'] = country  # Needed once all countries are concatenated

    country_dfs.append(df_d2v)

# Merge all country-level results into a single dataframe
final_df_d2v = pd.concat(country_dfs, ignore_index=True)

# Save the final dataframe to a CSV file
final_df_d2v.to_csv('data/py_outputs/r&c_gen_party_election.csv', index=False)

# info() prints its summary itself and returns None, so it is not wrapped in print().
final_df_d2v.info()


In [None]:
final_df_d2v.head()

In [None]:
# German rows only, with party codes translated to names by party_deu;
# parties that map to 'Other' are dropped.
d2v_germany = (
    final_df_d2v.loc[final_df_d2v['country'] == 'Germany']
    .assign(party_name=lambda frame: frame['party'].astype(str).apply(party_deu))
    .loc[lambda frame: frame['party_name'] != 'Other']
    .reset_index(drop=True)
)
d2v_germany.head()

In [None]:
# One line per German party: first Doc2Vec principal component across elections.
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in d2v_germany.groupby('party_name'):
    # Sort by election so each line runs chronologically instead of in row order.
    group = group.sort_values('election')
    ax.plot(group.election, group.d2v_d1, marker='o',  ms=4, label=name)
# Label the figure so it can be read without the surrounding code.
ax.set(
    xlabel='Election',
    ylabel='d2v_d1 (first principal component)',
    title='Germany: Doc2Vec party positions by election',
)
ax.legend()

plt.show()

In [None]:
# NOTE(review): `df_d2v` is the loop variable left over from the per-country
# loop, so this writes only the frame of the last country processed (or of
# whichever later cell last rebound the name). If the full table was intended,
# this should probably be `final_df_d2v` — confirm before relying on r&c_gen.csv.
df_d2v.to_csv('data/py_outputs/r&c_gen.csv', index=False)

## Doc2Vec scaling - relevant topics
 

In [None]:
set(manifesto_d2v['topic'])

In [None]:
# Fit one Doc2Vec model per (country, topic) pair for a fixed set of topics and
# reduce the document vectors to two principal components per pair.
country_topic_dfs = []

# Topics scaled in this section (constant across countries, so defined once).
relevant_topics = ['Environment - Growth', 'Political System', 'Economics',
                   'European Integration','Labour and Social Welfare',
                   'Immigration']

# Countries are processed independently, in order of first appearance.
unique_countries = manifesto_d2v['countryname'].unique()

for country in unique_countries:
    print(f"Processing country: {country}")

    # Rows of the current country that belong to one of the relevant topics
    country_data = manifesto_d2v[manifesto_d2v['countryname'] == country]
    country_data = country_data[country_data['topic'].isin(relevant_topics)]
    unique_topics = country_data['topic'].unique()

    for topic in unique_topics:
        print(f"Processing topic: {topic}")

        # Rows for the current country and topic
        country_topic_data = country_data[country_data['topic'] == topic]

        # Learn bigram collocations, then trigram collocations on the bigrammed text
        outputs_stream = phraseIterator(country_topic_data, 'text_cleaned')
        bigram = Phraser(Phrases(outputs_stream, min_count=1, threshold=5))
        trigram = Phrases(bigram[outputs_stream], min_count=1, threshold=5)

        # Create the Doc2Vec model and build its vocabulary from the tagged corpus
        model = Doc2Vec(vector_size=500, window=6, min_count=1, workers=16, epochs=20, seed=seed_val)
        model.build_vocab(corpusIterator(country_topic_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'))

        # Train on a fresh pass over the same corpus
        model.train(corpusIterator(country_topic_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'),
                    total_examples=model.corpus_count, epochs=20)

        # d2v_reduct returns a mapping whose keys become the 'party_election'
        # labels and whose values become the rows fed to PCA.
        embed_dict = d2v_reduct(model)
        df_d2v = pd.DataFrame.from_dict(embed_dict).transpose()
        df_d2v.index.name = 'party_election'
        df_d2v = df_d2v.reset_index()

        # A 2-component PCA needs at least two documents; topic-level subsets
        # can be that small, so skip them loudly instead of aborting the run.
        if len(df_d2v) < 2:
            print(f"Skipping {country} / {topic}: fewer than 2 documents for PCA")
            continue

        pca = PCA(n_components=2, random_state=seed_val)
        df_d2v[['d2v_d1', 'd2v_d2']] = pca.fit_transform(df_d2v.iloc[:, 1:])
        # .copy() so the assignments below act on an independent frame rather
        # than on a slice (avoids SettingWithCopyWarning).
        df_d2v = df_d2v[['party_election', 'd2v_d1', 'd2v_d2']].copy()

        # Split on the last underscore only, so a label always yields two parts.
        df_d2v[['party', 'election']] = df_d2v['party_election'].str.rsplit('_', n=1, expand=True)
        # Plain column assignment replaces the column and therefore its dtype;
        # `.loc[:, col] = ...` writes in place under pandas >= 2 and keeps object dtype.
        df_d2v['election'] = df_d2v['election'].astype(int)
        df_d2v['country'] = country  # Add country column
        df_d2v['topic'] = topic  # Add topic column

        country_topic_dfs.append(df_d2v)

# Merge all country-topic results into a single dataframe
final_df_d2v = pd.concat(country_topic_dfs, ignore_index=True)

# Save the final dataframe to a CSV file
final_df_d2v.to_csv('data/py_outputs/r&c_party_election_topic.csv', index=False)

# info() prints its summary itself and returns None, so it is not wrapped in print().
final_df_d2v.info()


## Doc2Vec scaling - Environment Protection
 

In [None]:
# Restrict to rows whose cmp_code is '501' — the category this section scales
# (environment protection, per the section heading).
manifesto_ep = manifesto_d2v[manifesto_d2v.cmp_code.isin(['501'])].reset_index(drop=True)


In [None]:
manifesto_ep.head()

In [None]:
# Fit one Doc2Vec model per country on the cmp_code-501 subset and reduce the
# document vectors to two principal components per country.
country_dfs = []

# Countries are processed independently, in order of first appearance.
unique_countries = manifesto_ep['countryname'].unique()

for country in unique_countries:
    print(f"Processing country: {country}")

    # Rows belonging to the current country
    country_data = manifesto_ep[manifesto_ep['countryname'] == country]

    # Learn bigram collocations, then trigram collocations on the bigrammed text
    outputs_stream = phraseIterator(country_data, 'text_cleaned')
    bigram = Phraser(Phrases(outputs_stream, min_count=1, threshold=5))
    trigram = Phrases(bigram[outputs_stream], min_count=1, threshold=5)

    # Create the Doc2Vec model and build its vocabulary from the tagged corpus
    model = Doc2Vec(vector_size=500, window=6, min_count=1, workers=16, epochs=20, seed=seed_val)
    model.build_vocab(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'))

    # Train on a fresh pass over the same corpus
    model.train(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'),
                total_examples=model.corpus_count, epochs=20)

    # d2v_reduct returns a mapping whose keys become the 'party_election' labels
    # and whose values become the rows fed to PCA.
    embed_dict = d2v_reduct(model)
    df_d2v = pd.DataFrame.from_dict(embed_dict).transpose()
    df_d2v.index.name = 'party_election'
    df_d2v = df_d2v.reset_index()
    pca = PCA(n_components=2, random_state=seed_val)
    df_d2v[['d2v_d1', 'd2v_d2']] = pca.fit_transform(df_d2v.iloc[:, 1:])
    # .copy() so the assignments below act on an independent frame rather than
    # on a slice (avoids SettingWithCopyWarning).
    df_d2v = df_d2v[['party_election', 'd2v_d1', 'd2v_d2']].copy()

    # Split on the last underscore only, so a label always yields exactly two parts.
    df_d2v[['party', 'election']] = df_d2v['party_election'].str.rsplit('_', n=1, expand=True)
    # Plain column assignment replaces the column and therefore its dtype;
    # `.loc[:, col] = ...` writes in place under pandas >= 2 and keeps object dtype.
    df_d2v['election'] = df_d2v['election'].astype(int)
    df_d2v['country'] = country  # Needed once all countries are concatenated

    country_dfs.append(df_d2v)

# Merge all country-level results into a single dataframe
final_df_d2v = pd.concat(country_dfs, ignore_index=True)

# Save the final dataframe to a CSV file
final_df_d2v.to_csv('data/py_outputs/r&c_ep_party_election.csv', index=False)

# info() prints its summary itself and returns None, so it is not wrapped in print().
final_df_d2v.info()


## Doc2Vec scaling - Welfare state (expansion vs. limitation), all countries
 

In [None]:
# Restrict to rows coded '504' or '505' (presumably the welfare-state expansion /
# limitation categories, going by the variable name — confirm against the codebook).
manifesto_welfare = manifesto_d2v[manifesto_d2v.cmp_code.isin(['504', '505'])].reset_index(drop=True)


In [None]:
manifesto_welfare.head()

In [None]:
# Fit one Doc2Vec model per country on the cmp_code-504/505 subset and reduce
# the document vectors to two principal components per country.
country_dfs = []

# Countries are processed independently, in order of first appearance.
unique_countries = manifesto_welfare['countryname'].unique()

for country in unique_countries:
    print(f"Processing country: {country}")

    # Rows belonging to the current country
    country_data = manifesto_welfare[manifesto_welfare['countryname'] == country]

    # Learn bigram collocations, then trigram collocations on the bigrammed text
    outputs_stream = phraseIterator(country_data, 'text_cleaned')
    bigram = Phraser(Phrases(outputs_stream, min_count=1, threshold=5))
    trigram = Phrases(bigram[outputs_stream], min_count=1, threshold=5)

    # Create the Doc2Vec model and build its vocabulary from the tagged corpus
    model = Doc2Vec(vector_size=500, window=6, min_count=1, workers=16, epochs=20, seed=seed_val)
    model.build_vocab(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'))

    # Train on a fresh pass over the same corpus
    model.train(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'),
                total_examples=model.corpus_count, epochs=20)

    # d2v_reduct returns a mapping whose keys become the 'party_election' labels
    # and whose values become the rows fed to PCA.
    embed_dict = d2v_reduct(model)
    df_d2v = pd.DataFrame.from_dict(embed_dict).transpose()
    df_d2v.index.name = 'party_election'
    df_d2v = df_d2v.reset_index()
    pca = PCA(n_components=2, random_state=seed_val)
    df_d2v[['d2v_d1', 'd2v_d2']] = pca.fit_transform(df_d2v.iloc[:, 1:])
    # .copy() so the assignments below act on an independent frame rather than
    # on a slice (avoids SettingWithCopyWarning).
    df_d2v = df_d2v[['party_election', 'd2v_d1', 'd2v_d2']].copy()

    # Split on the last underscore only, so a label always yields exactly two parts.
    df_d2v[['party', 'election']] = df_d2v['party_election'].str.rsplit('_', n=1, expand=True)
    # Plain column assignment replaces the column and therefore its dtype;
    # `.loc[:, col] = ...` writes in place under pandas >= 2 and keeps object dtype.
    df_d2v['election'] = df_d2v['election'].astype(int)
    df_d2v['country'] = country  # Needed once all countries are concatenated

    country_dfs.append(df_d2v)

# Merge all country-level results into a single dataframe
final_df_d2v = pd.concat(country_dfs, ignore_index=True)

# Save the final dataframe to a CSV file
final_df_d2v.to_csv('data/py_outputs/r&c_welfare_party_election.csv', index=False)

# info() prints its summary itself and returns None, so it is not wrapped in print().
final_df_d2v.info()


# Scale position scores for all countries (released dataset + model)

## Retrain for the entire dataset with all languages

In [None]:
# First manifesto pull exported from R. The dtype mapping is keyed by the
# integers 2 and 18 — presumably column positions that must stay strings
# (e.g. code columns); verify against the CSV header.
manifesto_org = pd.read_csv(os.path.join("data", "r_outputs","pulled_manifestoes.csv"), encoding="utf-8", dtype={2:'str',18:'str'})

In [None]:
# Second manifesto pull (the "_test" export), read with the same dtype overrides
# as the first so both frames can be concatenated.
manifesto_other = pd.read_csv(os.path.join("data", "r_outputs","pulled_manifestoes_test.csv"), encoding="utf-8", dtype={2:'str',18:'str'})

In [None]:
# Drop columns that are entirely empty in each pull, then stack the two pulls
# into one frame with a fresh 0..n-1 index.
manifesto_org_cleaned, manifesto_other_cleaned = (
    pulled.dropna(axis=1, how='all') for pulled in (manifesto_org, manifesto_other)
)
manifesto_full = pd.concat(
    [manifesto_org_cleaned, manifesto_other_cleaned],
    ignore_index=True,
)

In [None]:
manifesto_full.head()

In [None]:
len(manifesto_full)

In [None]:
# Keep only rows that carry a code, excluding those coded 'H'.
code_present = manifesto_full['cmp_code'].notna()
code_is_h = manifesto_full['cmp_code'] == 'H'
manifesto_full = manifesto_full[code_present & ~code_is_h].reset_index(drop=True)

In [None]:
# Derive the training labels from the code via the project helpers, and take
# the election year as the first four characters of the date.
manifesto_full['sentiment'] = manifesto_full['cmp_code'].apply(sentiment_code)
manifesto_full['topic'] = manifesto_full['cmp_code'].apply(topic_code)
manifesto_full['election'] = manifesto_full['date'].astype(str).str[:4]

In [None]:
manifesto_full.groupby('sentiment').count()

In [None]:
manifesto_full.groupby(['topic'])['sentiment'].value_counts().unstack(fill_value=0)

In [None]:
# Regroup the 'text' column within each country/election/party/code combination
# using the project helper. NOTE(review): the exact meaning of max_group_factor
# is defined in utils.functions.group_texts — confirm there.
results = group_texts(manifesto_full, 
                      ['countryname','election','party','cmp_code'], 'text', 
                      max_group_factor = 5)

In [None]:
# group_texts output -> DataFrame; 'text' holds a list per row, so explode it to
# one regrouped text per row and renumber the index.
manifesto_regrouped = pd.DataFrame(results)
manifesto_regrouped = manifesto_regrouped.explode('text').reset_index(drop=True)

In [None]:
# 'labels' is a ';'-separated key; split it into one column per key component
# and append those columns to the frame.
df_cols = manifesto_regrouped['labels'].str.split(';', expand=True)
manifesto_regrouped = pd.concat([manifesto_regrouped, df_cols], axis=1)


In [None]:
# Positional rename: assumes exactly six columns in this order — the text, the
# combined label string, and the four label components split out above.
manifesto_regrouped.columns = ['text', 'country_election_party_code', 'country','election', 'party', 'cmp_code']

In [None]:
manifesto_regrouped.head()

In [None]:
# Recompute sentiment/topic from the code recovered out of the label string,
# then drop exact duplicate rows and renumber the index.
manifesto_regrouped.loc[:,'sentiment'] = manifesto_regrouped['cmp_code'].apply(sentiment_code)
manifesto_regrouped.loc[:,'topic'] = manifesto_regrouped['cmp_code'].apply(topic_code)
manifesto_regrouped = manifesto_regrouped.drop_duplicates().reset_index(drop=True)
 

In [None]:
manifesto_regrouped.head()

In [None]:
# Cache both the regrouped and the sentence-level frames; the scaling section
# reloads the regrouped file from this path.
manifesto_regrouped.to_csv('data/temps/manifesto_regrouped_full_processed.csv', encoding='utf-8', index=False)
manifesto_full.to_csv('data/temps/manifesto_full_processed.csv', encoding='utf-8', index=False)

In [None]:
coalition_regrouped = pd.read_csv('data/temps/coalitionagree_regrouped_processed.csv', encoding='utf-8')

In [None]:
# Tag every row with its corpus of origin before the two frames are pooled.
for corpus_frame, corpus_tag in (
    (manifesto_regrouped, 'manifestos'),
    (coalition_regrouped, 'coalition_contracts'),
):
    corpus_frame.loc[:, 'source'] = corpus_tag

In [None]:
coalition_regrouped.info()

In [None]:
coalition_regrouped.groupby(['topic'])['sentiment'].value_counts().unstack(fill_value=0)

In [None]:
# Pool manifestos and coalition contracts, keeping only the columns they share
# for training, under a fresh 0..n-1 index.
training_columns = ['text','sentiment','topic', 'source']
final_df = pd.concat(
    [manifesto_regrouped[training_columns], coalition_regrouped[training_columns]],
    ignore_index=True,
)

In [None]:
# Tokenizer and padding collator for the XLM-RoBERTa base checkpoint used by
# the released model.
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Joint '<topic>_<sentiment>' label, used only to stratify the train/test split
# (it is removed again when the datasets are tokenised).
final_df.loc[:,'topic_sentiment'] = final_df['topic'] + '_' + final_df['sentiment']

In [None]:
# Convert the pooled frame to a Hugging Face dataset and integer-encode each
# label column in turn.
final_dataset = Dataset.from_pandas(final_df)
for label_column in ('sentiment', 'topic', 'topic_sentiment', 'source'):
    final_dataset = final_dataset.class_encode_column(label_column)



In [None]:
# 90/10 train/test split, stratified on the joint topic-sentiment label and
# seeded for reproducibility.
train_test = final_dataset.train_test_split(test_size=0.1, stratify_by_column='topic_sentiment', seed=seed_val)

In [None]:
# Re-wrap the two splits under explicit names, then display the split sizes.
final_datasets = DatasetDict(
    {split_name: train_test[split_name] for split_name in ('train', 'test')}
)
final_datasets

In [None]:
# Tokenise both splits; the raw text and the stratification-only
# 'topic_sentiment' column are dropped afterwards.
tokenized_datasets = final_datasets.map(
    tokenize_function,
    remove_columns=['text','topic_sentiment'],
    fn_kwargs=dict(tokenizer=tokenizer, text_var='text', max_length=512),
)
tokenized_datasets.set_format("torch")
tokenized_datasets.column_names

In [None]:
# Batches of 16, padded by `data_collator`; only the training loader is shuffled.
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=16, shuffle=True, collate_fn = data_collator)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16, shuffle=False, collate_fn = data_collator)


In [None]:
# Number of distinct labels actually present in the pooled training data.
num_topics = len(set(final_df['topic']))
num_sentiments = len(set(final_df['sentiment']))

# The heads stay fixed at 12 topics / 3 sentiments because the scaling section
# reloads this checkpoint with exactly those sizes. Fail early, with a readable
# message, if the data holds more classes than the heads can represent.
if num_topics > 12 or num_sentiments > 3:
    raise ValueError(
        f"Training data has {num_topics} topics and {num_sentiments} sentiments, "
        "but the model heads are sized for 12 topics and 3 sentiments."
    )
model = ContextScalePrediction(roberta_model=model_name, num_topics=12, num_sentiments=3,lora=False, use_shared_attention=True).to(device)



In [None]:
# Optimisation setup: 5 epochs, linear warm-up over the first 10% of steps,
# then linear decay.
n_epochs=5
total_steps = len(train_dataloader)*n_epochs
# NOTE(review): this is a float; the scheduler documents an int for
# num_warmup_steps — consider int(total_steps*0.1).
warmup = total_steps*0.1
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5) ## Full fine-tuning (the model is built with lora=False) at lr=2e-5.
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
# A single CrossEntropyLoss instance, reused for both task heads in the training loop.
criterion = nn.CrossEntropyLoss()

In [None]:
# Train on the full training split; no evaluation loop is run in this cell. The
# same `criterion` object is passed for both loss arguments of train_loop.
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    train_loop(train_dataloader, model,optimizer, scheduler, device, criterion, criterion, sentiment_var='sentiment',
               topic_var='topic')

In [None]:
# Save the trained weights for release. The target directory is created first
# so a fresh checkout does not fail after hours of training.
released_model_dir = 'results/models/contextscale_full_released'
os.makedirs(released_model_dir, exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, os.path.join(released_model_dir, 'model.safetensors'))

## Scale manifestos and coalition contracts

In [None]:
# Reload both processed corpora from disk so this section can run without
# repeating the preprocessing and training cells.
coalition_regrouped = pd.read_csv('data/temps/coalitionagree_regrouped_processed.csv', encoding='utf-8')
manifesto_regrouped = pd.read_csv('data/temps/manifesto_regrouped_full_processed.csv', encoding='utf-8')

In [None]:
# Tokenizer and padding collator matching the checkpoint the released model was
# fine-tuned from.
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Head sizes of the released checkpoint.
num_topics = 12
num_sentiments = 3

# Release the training-time model BEFORE allocating the scaling model, so the
# two do not have to sit on the GPU at the same time.
model=None
torch.cuda.empty_cache()

# Same architecture options as at training time, so the saved state dict loads.
scaling_model = ContextScalePrediction(roberta_model=model_name, num_topics=num_topics, num_sentiments=num_sentiments,lora=False, use_shared_attention=True).to(device)

In [None]:
# Load the released weights into the scaling model; the cell output shows
# load_state_dict's report on matched/missing keys.
loaded_tensors = load_file('results/models/contextscale_full_released/model.safetensors')
scaling_model.load_state_dict(loaded_tensors)

In [None]:
# Encode 'topic' and 'sentiment' ONCE on the pooled corpora so that both datasets
# share the integer <-> label mapping the model was trained with. The training
# data was the same pooled frame, encoded by class_encode_column. Encoding each
# corpus separately shifts the ids whenever one corpus lacks a category, which
# would feed wrong gold-topic ids into scale_func.
label_columns = ['text','topic','sentiment']
pooled_dataset = Dataset.from_pandas(
    pd.concat(
        [manifesto_regrouped[label_columns], coalition_regrouped[label_columns]],
        ignore_index=True,
    )
)
pooled_dataset = pooled_dataset.class_encode_column('topic')
pooled_dataset = pooled_dataset.class_encode_column('sentiment')

# Split back by position: manifesto rows first, coalition rows after. Row order
# within each part is unchanged, so outputs still align with the data frames.
n_manifesto_rows = len(manifesto_regrouped)
manifesto_dataset = pooled_dataset.select(range(n_manifesto_rows))
coalition_dataset = pooled_dataset.select(range(n_manifesto_rows, len(pooled_dataset)))

In [None]:
# Tokenise both corpora with identical settings; the raw text column is dropped
# and the remaining columns are exposed as torch tensors.
scaling_tokenize_kwargs = dict(
    fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512},
    remove_columns=['text'],
)

tokenized_manifesto_dataset = manifesto_dataset.map(tokenize_function, **scaling_tokenize_kwargs)
tokenized_manifesto_dataset.set_format("torch")

tokenized_coalition_dataset = coalition_dataset.map(tokenize_function, **scaling_tokenize_kwargs)
tokenized_coalition_dataset.set_format("torch")


In [None]:
# Inference loaders. shuffle=False matters: the outputs are later written back
# into the data frames by position, so batch order must follow row order.
manifesto_dataloader = DataLoader(tokenized_manifesto_dataset, batch_size=16, shuffle=False, collate_fn= data_collator)
coalition_dataloader = DataLoader(tokenized_coalition_dataset, batch_size=16, shuffle=False, collate_fn=data_collator)

In [None]:
## Compute position scores for every regrouped manifesto text.
## NOTE(review): use_ground_truth_topic=True presumably makes scale_func use the
## gold topic labels rather than predicted topics — confirm in utils.functions.
output_manifesto_final = scale_func(manifesto_dataloader, 
               scaling_model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
# Load the saved label-name lists and build id -> name lookups; the position of
# a name in its list is the integer id it decodes.
file_path = 'data/temps/topic_labels'
with open(file_path, 'rb') as topic_file:
    topic_labels = pickle.load(topic_file)
name_topic_dict = dict(enumerate(topic_labels))

file_path = 'data/temps/sentiment_labels'
with open(file_path, 'rb') as sentiment_file:
    sentiment_labels = pickle.load(sentiment_file)
name_sentiment_dict = dict(enumerate(sentiment_labels))



In [None]:
output_manifesto_final.keys()

In [None]:
# Write the model outputs back onto the manifesto rows by position (valid because
# the loader was not shuffled), and decode predicted sentiment ids to names.
manifesto_regrouped.loc[:,'position_scores'] = output_manifesto_final['position_scores'].flatten()
manifesto_regrouped.loc[:,'pred_sentiment'] = output_manifesto_final['pred_sentiment']
manifesto_regrouped.loc[:,'pred_sentiment_name'] = manifesto_regrouped.pred_sentiment.map(name_sentiment_dict)

In [None]:
manifesto_regrouped.head()

In [None]:
manifesto_regrouped.to_csv('results/datasets/manifesto_full_scaled.csv', encoding='utf-8', index=False)

In [None]:
## Compute position scores for every regrouped coalition-agreement text, with the
## same settings as for the manifestos.
output_coalition_final = scale_func(coalition_dataloader, 
               scaling_model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
# Write the model outputs back onto the coalition rows by position (valid because
# the loader was not shuffled).
coalition_regrouped.loc[:,'position_scores'] = output_coalition_final['position_scores'].flatten()
coalition_regrouped.loc[:,'pred_sentiment'] = output_coalition_final['pred_sentiment']
# Decode sentiment ids with the SENTIMENT lookup (the topic lookup was used here
# before, which labelled sentiments with topic names).
coalition_regrouped.loc[:,'pred_sentiment_name'] = coalition_regrouped.pred_sentiment.map(name_sentiment_dict)

In [None]:
coalition_regrouped.to_csv('results/datasets/coalition_full_scaled.csv', encoding='utf-8', index=False)

## Create released dataset (position scores by country-party-election)

In [None]:
# Mean position score and its standard error per country-party-election-topic,
# computed in one grouped aggregation instead of growing a frame row by row.
columns  =['country','party', 'election','topic','cs_mean_score', 'cs_se_score']
group_keys = ['country','party','election','topic']

score_stats = (
    manifesto_regrouped
    .groupby(group_keys)['position_scores']
    .agg(['mean', 'std', 'size'])   # std is the sample std (ddof=1); size counts all rows
    .reset_index()
)

# Key columns are stored as strings, as before.
df = pd.DataFrame({key: score_stats[key].astype(str) for key in group_keys})
df['cs_mean_score'] = score_stats['mean']
# Standard error of the mean; NaN for groups with a single text.
df['cs_se_score'] = score_stats['std'] / np.sqrt(score_stats['size'])
df = df[columns]

In [None]:
df.to_csv('results/datasets/contextscale_manifesto_dataset.csv', encoding='utf-8')

In [None]:
df.head()

In [None]:
# Mean position score and its standard error per country-year-topic, computed in
# one grouped aggregation instead of growing a frame row by row.
columns  =['country', 'year','topic','cs_mean_score', 'cs_se_score']
group_keys = ['country','year','topic']

score_stats = (
    coalition_regrouped
    .groupby(group_keys)['position_scores']
    .agg(['mean', 'std', 'size'])   # std is the sample std (ddof=1); size counts all rows
    .reset_index()
)

# Key columns are stored as strings, as before.
df = pd.DataFrame({key: score_stats[key].astype(str) for key in group_keys})
df['cs_mean_score'] = score_stats['mean']
# Standard error of the mean; NaN for groups with a single text.
df['cs_se_score'] = score_stats['std'] / np.sqrt(score_stats['size'])
df = df[columns]

In [None]:
df.head()

In [None]:
df.to_csv('results/datasets/contextscale_coalition_dataset.csv', encoding='utf-8')