# Setup

In [1]:
import os
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from transformers import AutoTokenizer, DataCollatorWithPadding, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
import random
from gensim.models import Doc2Vec
from gensim.models.phrases import Phrases, Phraser
from gensim.models.doc2vec import TaggedDocument
from utils.functions import group_texts, sentiment_code, topic_code,party_deu, clean_text_loop, copy_weights, sentiment_code_coalition, topic_code_coalition
from utils.functions import train_loop, eval_loop, tokenize_function, cmp_scale, scale_func, d2v_reduct, check_weights_similar, compare_architectures, get_architecture_details, recode_tw
from utils.models import ContextScalePrediction, corpusIterator, phraseIterator
from safetensors.torch import load_file, save_file
from sklearn.decomposition import PCA
import pickle
import nltk
#nltk.download('stopwords') ## Remove comments and do it once if you haven't

In [2]:
# Select the compute device. Prefer the GPU, but fall back to the CPU so that
# the notebook can still be opened and run (slowly) on a machine without CUDA
# instead of failing in this cell.
torch.cuda.empty_cache()  # release cached GPU memory left over from earlier runs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Last expression is displayed as the cell output: the GPU name, or 'cpu'
torch.cuda.get_device_name(device=None) if torch.cuda.is_available() else 'cpu'


'NVIDIA RTX PRO 6000 Blackwell Workstation Edition'

In [3]:
## Reproducibility: seed every random number generator used in this notebook
## (Python's `random`, PyTorch and NumPy) with the same value.
seed_val = 1234
random.seed(seed_val)
torch.manual_seed(seed_val)
np.random.seed(seed_val)


# Small test of BERT embeddings

In [None]:
model_name = 'bert-base-cased'

In [None]:
model = BertModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
sentence_a = 'I went to the river bank'
sentence_b = 'I went to the bank by the river'
tok_a = tokenizer(sentence_a, return_tensors='pt')
tok_b = tokenizer(sentence_b, return_tensors='pt')

In [None]:
tok_a

In [None]:
tok_b

In [None]:
# Run both tokenised sentences through the model without tracking gradients
with torch.no_grad():
    outputs_a = model(**tok_a)
    outputs_b = model(**tok_b)
# Extract word embeddings from the last hidden layer
last_hidden_states_a = outputs_a.last_hidden_state
last_hidden_states_b = outputs_b.last_hidden_state

# Extract the embedding of ONE word token per sentence (not the CLS token at position 0).
# NOTE(review): presumably both indices select the word "bank" -- position -2 is the
# token just before the final special token in sentence_a, position 5 is the fifth
# word of sentence_b -- assuming one token per word; confirm against tok_a / tok_b.
word_embedding_a = last_hidden_states_a[:, -2, :] ## position 0 is the CLS token; -2 is the last word token
word_embedding_b = last_hidden_states_b[:, 5, :]

In [None]:
np.corrcoef(word_embedding_a.numpy(), word_embedding_b.numpy()).round(2)

# Implementation

## Data preparation

In [None]:
manifesto = pd.read_csv(os.path.join("data", "r_outputs","pulled_manifestoes.csv"), encoding="utf-8", dtype = {2: 'str', 18:'str'})

In [None]:
manifesto = manifesto[(manifesto.cmp_code.notna()) & ~(manifesto.cmp_code.isin(['H']))].reset_index(drop=True)
len(manifesto)

In [None]:
manifesto['sentiment'] = manifesto['cmp_code'].apply(sentiment_code)
manifesto['topic'] = manifesto['cmp_code'].apply(topic_code)
manifesto['election'] = manifesto['date'].astype(str).str[:4]

In [None]:
manifesto.groupby(['topic','sentiment']).count()

In [None]:
grouped_result = manifesto.groupby(['topic', 'sentiment', 'cmp_code']).size().reset_index(name='count')
grouped_result.to_csv('data/temps/categorization_table.csv', index=False)




In [None]:
manifesto.groupby('sentiment').count()

In [None]:
len(manifesto[manifesto.topic=="Military"])/len(manifesto)*100 ## minority group: 1.7%

In [None]:
texts = manifesto['text'].tolist()

In [None]:
from statistics import stdev, mean
## Before
seq_len = [len(i.split()) for i in texts]
seq_len_mean = mean(seq_len)
seq_len_std = stdev(seq_len)
seq_len_max = max(seq_len)
seq_len_min = min(seq_len)
print('Mean length (word) is: {}'.format(seq_len_mean))
print('Std length (word) is: {}'.format(seq_len_std))
print('Min length (word) is: {}'.format(seq_len_min))
print('Max length (word) is: {}'.format(seq_len_max))

In [None]:
pd.Series(seq_len).hist(bins = 30)

In [None]:
results = group_texts(manifesto, 
                      ['countryname','election','party','cmp_code'], 'text', 
                      max_group_factor = 5)

In [None]:
manifesto_regrouped = pd.DataFrame(results)
manifesto_regrouped = manifesto_regrouped.explode('text').reset_index(drop=True)

In [None]:
df_cols = manifesto_regrouped['labels'].str.split(';', expand=True)
manifesto_regrouped = pd.concat([manifesto_regrouped, df_cols], axis=1)


In [None]:
manifesto_regrouped.columns = ['text', 'idx', 'country','election', 'party', 'cmp_code']

In [None]:
manifesto_regrouped.head()

In [None]:
manifesto_regrouped.loc[:,'sentiment'] = manifesto_regrouped['cmp_code'].apply(sentiment_code)
manifesto_regrouped.loc[:,'topic'] = manifesto_regrouped['cmp_code'].apply(topic_code)
manifesto_regrouped = manifesto_regrouped.drop_duplicates().reset_index(drop=True)


In [None]:
manifesto_regrouped.groupby(['topic','sentiment']).count()

In [None]:
from statistics import stdev, mean
## After regrouping: word-count statistics of the regrouped text sequences
texts = manifesto_regrouped['text'].tolist()
seq_len = [len(text.split()) for text in texts]  # whitespace-separated word count per sequence
seq_len_mean, seq_len_std = mean(seq_len), stdev(seq_len)
seq_len_max, seq_len_min = max(seq_len), min(seq_len)
print(f'Mean length (word) is: {seq_len_mean}')
print(f'Std length (word) is: {seq_len_std}')
print(f'Min length (word) is: {seq_len_min}')
print(f'Max length (word) is: {seq_len_max}')

In [None]:
pd.Series(seq_len).hist(bins = 30)

In [None]:
manifesto_regrouped.to_csv('data/temps/manifesto_regrouped.csv', encoding='utf-8', index=False)
manifesto.to_csv('data/temps/manifesto.csv', encoding='utf-8', index=False)

## Preparing dataloaders 

In [4]:
manifesto = pd.read_csv('data/temps/manifesto.csv', encoding='utf-8', dtype={2:'str',18: 'str'})
manifesto_regrouped = pd.read_csv('data/temps/manifesto_regrouped.csv', encoding='utf-8')

In [5]:
manifesto_reduced = manifesto_regrouped[['topic','sentiment','text']].reset_index(drop=True)

In [6]:
model_name = 'xlm-roberta-base' 
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)

In [7]:
manifesto_reduced['topic_sentiment'] = manifesto_reduced['topic'] + '_' + manifesto_reduced['sentiment']

In [8]:
manifesto_dataset = Dataset.from_pandas(manifesto_reduced)
manifesto_dataset = manifesto_dataset.class_encode_column('topic')
manifesto_dataset = manifesto_dataset.class_encode_column('sentiment')
manifesto_dataset = manifesto_dataset.class_encode_column('topic_sentiment')



Casting to class labels:   0%|          | 0/337412 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/337412 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/337412 [00:00<?, ? examples/s]

In [9]:
## Save class labels
import pickle
topic_labels = manifesto_dataset.features['topic'].names
file_path = 'data/temps/topic_labels'
with open(file_path, 'wb') as fp:
    pickle.dump(topic_labels, fp)

sentiment_labels = manifesto_dataset.features['sentiment'].names
file_path = 'data/temps/sentiment_labels'
with open(file_path, 'wb') as fp:
    pickle.dump(sentiment_labels, fp)

In [10]:
# Hold out 10% of the sequences as the test set, then split the remaining 90%
# into 70% train / 30% eval (roughly 63% / 27% / 10% of all sequences).
# Both splits are stratified on the combined topic_sentiment label and use the
# notebook-wide seed so the partition is reproducible.
train_test = manifesto_dataset.train_test_split(test_size=0.1, stratify_by_column='topic_sentiment', seed=seed_val)
train_eval = train_test['train'].train_test_split(test_size=0.3, stratify_by_column='topic_sentiment', seed=seed_val )

In [11]:
manifesto_datasets = DatasetDict({
    'train': train_eval['train'],
    'test': train_test['test'],
    'eval': train_eval['test']
})
manifesto_datasets

DatasetDict({
    train: Dataset({
        features: ['topic', 'sentiment', 'text', 'topic_sentiment'],
        num_rows: 212569
    })
    test: Dataset({
        features: ['topic', 'sentiment', 'text', 'topic_sentiment'],
        num_rows: 33742
    })
    eval: Dataset({
        features: ['topic', 'sentiment', 'text', 'topic_sentiment'],
        num_rows: 91101
    })
})

In [12]:
tokenized_datasets = manifesto_datasets.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text', 'topic_sentiment'])
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

Map:   0%|          | 0/212569 [00:00<?, ? examples/s]

Map:   0%|          | 0/33742 [00:00<?, ? examples/s]

Map:   0%|          | 0/91101 [00:00<?, ? examples/s]

['topic', 'sentiment', 'input_ids', 'attention_mask']

In [13]:
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=16, shuffle=True, collate_fn = data_collator)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16, shuffle=False, collate_fn = data_collator)
eval_dataloader = DataLoader(tokenized_datasets['eval'], batch_size=16, shuffle=False, collate_fn = data_collator)

## Deep Ensemble Training and Uncertainty Estimation

This section runs deep-ensemble inference with uncertainty estimation; the ensemble members themselves are trained separately with the `train.py` script.

### Configuration 

In [None]:
# Import the new uncertainty module
from utils.uncertainty import (
    ensemble_inference,
    load_ensemble_models,
    save_ensemble_results,
    create_ensemble_summary_dataframe
)

In [None]:
# Configuration for ensemble training and uncertainty estimation
ENSEMBLE_CONFIG = {
    'num_models': 5,  # Number of ensemble members
    'beta': 1.0,      # Beta parameter for exponential position score
    'n_epochs': 5,    # Epochs per model
    'lr': 2e-5,       # Learning rate
    'save_dir': 'results/models/ensemble',
    'model_prefix': 'model_ensemble'
}

print("Ensemble Configuration:")
for key, value in ENSEMBLE_CONFIG.items():
    print(f"  {key}: {value}")

### Define model



In [None]:
# Define model factory function for ensemble training.
# The number of classes is read from the ClassLabel metadata created by
# class_encode_column instead of scanning every row of the dataset.
num_topics = len(manifesto_dataset.features['topic'].names)
num_sentiments = len(manifesto_dataset.features['sentiment'].names)
def create_model():
    """Factory function to create a new model instance for ensemble training.

    Returns:
        A freshly constructed ContextScalePrediction built on `model_name`
        with `num_topics` topic classes and `num_sentiments` sentiment
        classes, LoRA disabled and the shared-attention architecture enabled.
        The instance is not moved to a device here and no checkpoint is
        loaded; presumably load_ensemble_models takes care of both -- verify
        in utils.uncertainty.
    """
    return ContextScalePrediction(
        roberta_model=model_name, 
        num_topics=num_topics, 
        num_sentiments=num_sentiments,
        lora=False,
        use_shared_attention=True  # Using shared attention architecture
    )

print("Model factory function defined")

### Ensemble inference on the test set

Training is done with the `train.py` script. Here we run ensemble inference on the test set.

In [None]:
# Generate checkpoint paths for the ensemble models trained with different splits.
# The number of members comes from ENSEMBLE_CONFIG so that changing 'num_models'
# in the configuration cell is enough to load a larger or smaller ensemble.
checkpoint_paths_splits = [
    os.path.join(ENSEMBLE_CONFIG['save_dir'], f"{ENSEMBLE_CONFIG['model_prefix']}_{i}.safetensors")
    for i in range(ENSEMBLE_CONFIG['num_models'])
]

print("Checkpoint paths for split-based ensemble:")
for i, path in enumerate(checkpoint_paths_splits):
    print(f"  Model {i}: {path}")

# Load the ensemble models (one instance per checkpoint, built by create_model)
ensemble_models = load_ensemble_models(
    model_factory=create_model,
    checkpoint_paths=checkpoint_paths_splits,
    device=device
)

In [None]:
# Perform ensemble inference with uncertainty estimation on the held-out test set.
# beta and the ensemble size are taken from ENSEMBLE_CONFIG / the loaded models,
# so the log messages cannot drift from what is actually executed.
print("Performing ensemble inference with uncertainty estimation...")
print(f"Using beta = {ENSEMBLE_CONFIG['beta']} for exponential position score computation")
print("The ensemble will compute:")
print(f"  - Mean position scores across all {len(ensemble_models)} models")
print("  - Position score variance for each text sequence")
print("  - Epistemic uncertainty (model disagreement)")
print("  - Aleatoric uncertainty (inherent data uncertainty)")

ensemble_results = ensemble_inference(
    models=ensemble_models,
    dataloader=test_dataloader,
    device=device,
    beta=ENSEMBLE_CONFIG['beta'],
    topic_label='topic',
    sentiment_label='sentiment',
    use_ground_truth_topic=True
)

print("\nEnsemble inference completed!")
print(f"Final position scores are the mean of {len(ensemble_models)} models")
print("Position score variance included for each sequence")

# Make sure the output folder exists so the (expensive) results are not lost on save
os.makedirs('results/datasets', exist_ok=True)
save_ensemble_results(
    ensemble_results,
    'results/datasets/ensemble_manifesto_results.pkl'
)

### Merge back to original test dataset

In [None]:
## Load ensemble results with pickle
file_path = 'results/datasets/ensemble_manifesto_results.pkl'
with open(file_path, "rb") as file:
    ensemble_results = pickle.load(file)

In [None]:
ensemble_summary_df = create_ensemble_summary_dataframe(ensemble_results)

In [None]:
## Merge ensemble_summary_df with original test dataset for analysis.
## The two frames are matched purely by row position (test_dataloader is built
## with shuffle=False), so both must have the same length and a positional index;
## otherwise pd.concat would silently pad with NaN or misalign rows.
test_dataset = manifesto_datasets['test'].to_pandas().reset_index(drop=True)
assert len(test_dataset) == len(ensemble_summary_df), (
    f"Row count mismatch: test set has {len(test_dataset)} rows, "
    f"ensemble summary has {len(ensemble_summary_df)}"
)
merged_df = pd.concat([test_dataset, ensemble_summary_df.reset_index(drop=True)], axis=1)

In [None]:
## Save to csv
merged_df.to_csv('results/datasets/ensemble_test_dataset.csv', index=False)

### Inference on the entire dataset 


In [None]:
# Configuration for ensemble training and uncertainty estimation
ENSEMBLE_CONFIG = {
    'num_models': 5,  # Number of ensemble members
    'beta': 1.0,      # Beta parameter for exponential position score
    'n_epochs': 5,    # Epochs per model
    'lr': 2e-5,       # Learning rate
    'save_dir': 'results/models/ensemble_scaling',
    'model_prefix': 'model_ensemble'
}

print("Ensemble Configuration:")
for key, value in ENSEMBLE_CONFIG.items():
    print(f"  {key}: {value}")

In [None]:
# Define model factory function for ensemble training.
# The number of classes is read from the ClassLabel metadata created by
# class_encode_column instead of scanning every row of the dataset.
num_topics = len(manifesto_dataset.features['topic'].names)
num_sentiments = len(manifesto_dataset.features['sentiment'].names)
def create_model():
    """Factory function to create a new model instance for ensemble training.

    Returns:
        A freshly constructed ContextScalePrediction built on `model_name`
        with `num_topics` topic classes and `num_sentiments` sentiment
        classes, LoRA disabled and the shared-attention architecture enabled.
        The instance is not moved to a device here and no checkpoint is
        loaded; presumably load_ensemble_models takes care of both -- verify
        in utils.uncertainty.
    """
    return ContextScalePrediction(
        roberta_model=model_name, 
        num_topics=num_topics, 
        num_sentiments=num_sentiments,
        lora=False,
        use_shared_attention=True  # Using shared attention architecture
    )

print("Model factory function defined")

In [None]:
# Generate checkpoint paths for the ensemble models trained with different splits.
# The number of members comes from ENSEMBLE_CONFIG so that changing 'num_models'
# in the configuration cell is enough to load a larger or smaller ensemble.
checkpoint_paths_splits = [
    os.path.join(ENSEMBLE_CONFIG['save_dir'], f"{ENSEMBLE_CONFIG['model_prefix']}_{i}.safetensors")
    for i in range(ENSEMBLE_CONFIG['num_models'])
]

print("Checkpoint paths for split-based ensemble:")
for i, path in enumerate(checkpoint_paths_splits):
    print(f"  Model {i}: {path}")

# Load the ensemble models (one instance per checkpoint, built by create_model)
ensemble_models = load_ensemble_models(
    model_factory=create_model,
    checkpoint_paths=checkpoint_paths_splits,
    device=device
)

In [None]:
# Tokenise the complete (unsplit) dataset and score every sequence with the ensemble.
# 'topic_sentiment' is only used for stratified splitting, so it is dropped together
# with the raw text -- the same columns that are removed for the train/test/eval splits.
# Columns are filtered against column_names so the cell also works if the helper
# column is absent.
full_dataset = manifesto_dataset.map(tokenize_function, 
                                     fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                     remove_columns=[col for col in ('text', 'topic_sentiment')
                                                     if col in manifesto_dataset.column_names])
full_dataset.set_format("torch")
# shuffle=False keeps predictions in the same row order as manifesto_dataset
full_dataloader = DataLoader(full_dataset, batch_size=64, shuffle=False, collate_fn=data_collator)

# NOTE(review): full_ensemble_results is not written to disk in the visible part of
# the notebook -- consider persisting it, since this inference pass is expensive.
full_ensemble_results = ensemble_inference(
    models=ensemble_models,
    dataloader=full_dataloader,
    device=device,
    beta=ENSEMBLE_CONFIG['beta'],
    topic_label='topic',
    sentiment_label='sentiment',
    use_ground_truth_topic=True
)


## Ablation: Model architecture comparison

### Base Model

In [None]:
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False).to(device)



In [None]:
# Optimisation setup for the base model: AdamW with a linear schedule that warms
# up over the first 10% of all optimisation steps, and one cross-entropy loss per head.
n_epochs=5
total_steps = len(train_dataloader)*n_epochs
warmup = int(total_steps*0.1)  # the scheduler expects an integer number of warmup steps
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5) 
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
criterion_sent = nn.CrossEntropyLoss()
criterion_topic =  nn.CrossEntropyLoss()


In [None]:
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    timing_log = train_loop(train_dataloader, model,optimizer, scheduler, device, criterion_sent, criterion_topic, sentiment_var='sentiment',
               topic_var='topic', timing_log=True)
    eval_loop(eval_dataloader, model, device, criterion_sent, criterion_topic, sentiment_var='sentiment', topic_var='topic')

    

In [None]:
# Persist the trained weights. The target folder is created first so that a
# missing directory cannot make the save fail after a full training run.
model_path = 'results/models/manifesto_ContextScalePrediction_base/model.safetensors'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, model_path)

In [None]:
outputs_base = scale_func(test_dataloader, 
               model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_base['res_table_topic']

In [None]:
outputs_base['res_table_topic']['f1'].mean().round(2)

In [None]:
outputs_base['res_table_sentiment']

In [None]:
outputs_base['res_table_sentiment']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_base'
with open(file_path, "wb") as file:
    pickle.dump(outputs_base, file)

In [None]:
outputs_base['res_table_sentiment'].to_csv('results/classification results/base_sentiment.csv', index=False)
outputs_base['res_table_topic'].to_csv('results/classification results/base_topic.csv', index=False)


### Model with simple flow of information

In [None]:
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False,
                               use_simple_flow=True).to(device)



In [None]:
# Optimisation setup for the simple-flow model: AdamW with a linear schedule that warms
# up over the first 10% of all optimisation steps, and one cross-entropy loss per head.
n_epochs=5
total_steps = len(train_dataloader)*n_epochs
warmup = int(total_steps*0.1)  # the scheduler expects an integer number of warmup steps
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5) 
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
criterion_sent = nn.CrossEntropyLoss()
criterion_topic =  nn.CrossEntropyLoss()


In [None]:
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    timing_log = train_loop(train_dataloader, model,optimizer, scheduler, device, criterion_sent, criterion_topic, sentiment_var='sentiment',
               topic_var='topic', timing_log=True)
    eval_loop(eval_dataloader, model, device, criterion_sent, criterion_topic, sentiment_var='sentiment', topic_var='topic')

    

In [None]:
# Persist the trained weights. The target folder is created first so that a
# missing directory cannot make the save fail after a full training run.
model_path = 'results/models/manifesto_ContextScalePrediction_sf/model.safetensors'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, model_path)

In [None]:
outputs_sf = scale_func(test_dataloader, 
               model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_sf['res_table_topic']

In [None]:
outputs_sf['res_table_topic']['f1'].mean().round(2)

In [None]:
outputs_sf['res_table_sentiment']

In [None]:
outputs_sf['res_table_sentiment']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_sf'
with open(file_path, "wb") as file:
    pickle.dump(outputs_sf, file)

In [None]:
outputs_sf['res_table_sentiment'].to_csv('results/classification results/sf_sentiment.csv', index=False)
outputs_sf['res_table_topic'].to_csv('results/classification results/sf_topic.csv', index=False)


### Model with shared attention

In [None]:
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False,
                               use_shared_attention=True).to(device)



In [None]:
# Optimisation setup for the shared-attention model: AdamW with a linear schedule that warms
# up over the first 10% of all optimisation steps, and one cross-entropy loss per head.
n_epochs=5
total_steps = len(train_dataloader)*n_epochs
warmup = int(total_steps*0.1)  # the scheduler expects an integer number of warmup steps
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5) 
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
criterion_sent = nn.CrossEntropyLoss()
criterion_topic =  nn.CrossEntropyLoss()


In [None]:
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    timing_log = train_loop(train_dataloader, model,optimizer, scheduler, device, criterion_sent, criterion_topic, sentiment_var='sentiment',
               topic_var='topic', timing_log=True)
    eval_loop(eval_dataloader, model, device, criterion_sent, criterion_topic, sentiment_var='sentiment', topic_var='topic')

    

In [None]:
# Persist the trained weights. The target folder is created first so that a
# missing directory cannot make the save fail after a full training run.
model_path = 'results/models/manifesto_ContextScalePrediction_sa/model.safetensors'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, model_path)

In [None]:
outputs_sa = scale_func(test_dataloader, 
               model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_sa['res_table_topic']

In [None]:
outputs_sa['res_table_topic']['f1'].mean().round(2)

In [None]:
outputs_sa['res_table_sentiment']

In [None]:
outputs_sa['res_table_sentiment']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_sa'
with open(file_path, "wb") as file:
    pickle.dump(outputs_sa, file)

In [None]:
outputs_sa['res_table_sentiment'].to_csv('results/classification results/sa_sentiment.csv', index=False)
outputs_sa['res_table_topic'].to_csv('results/classification results/sa_topic.csv', index=False)


### Model with dynamic gating

In [None]:
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False,
                               use_dynamic_gating=True).to(device)



In [None]:
# Optimisation setup for the dynamic-gating model: AdamW with a linear schedule that warms
# up over the first 10% of all optimisation steps, and one cross-entropy loss per head.
n_epochs=5
total_steps = len(train_dataloader)*n_epochs
warmup = int(total_steps*0.1)  # the scheduler expects an integer number of warmup steps
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5) 
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
criterion_sent = nn.CrossEntropyLoss()
criterion_topic =  nn.CrossEntropyLoss()


In [None]:
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    timing_log = train_loop(train_dataloader, model,optimizer, scheduler, device, criterion_sent, criterion_topic, sentiment_var='sentiment',
               topic_var='topic', timing_log=True)
    eval_loop(eval_dataloader, model, device, criterion_sent, criterion_topic, sentiment_var='sentiment', topic_var='topic')

    

In [None]:
# Persist the trained weights. The target folder is created first so that a
# missing directory cannot make the save fail after a full training run.
model_path = 'results/models/manifesto_ContextScalePrediction_dg/model.safetensors'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, model_path)

In [None]:
outputs_dg = scale_func(test_dataloader, 
               model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_dg['res_table_topic']

In [None]:
outputs_dg['res_table_topic']['f1'].mean().round(2)

In [None]:
outputs_dg['res_table_sentiment']

In [None]:
outputs_dg['res_table_sentiment']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_dg'
with open(file_path, "wb") as file:
    pickle.dump(outputs_dg, file)

In [None]:
outputs_dg['res_table_sentiment'].to_csv('results/classification results/dg_sentiment.csv', index=False)
outputs_dg['res_table_topic'].to_csv('results/classification results/dg_topic.csv', index=False)


# Validity checks

### Different languages

#### Testing on test languages unseen during training

In [None]:
manifesto_test = pd.read_csv(os.path.join("data", "r_outputs","pulled_manifestoes_test.csv"), encoding="utf-8")

In [None]:
manifesto_test.head()

In [None]:
manifesto_test = manifesto_test[(manifesto_test.cmp_code.notna()) & ~(manifesto_test.cmp_code == 'H')].reset_index(drop=True)

In [None]:
# Derive sentiment, topic and election year from the unseen-language data itself.
# These columns must come from manifesto_test, not from the training frame
# `manifesto`: assigning a Series from another frame aligns on the row index and
# would attach labels and years of unrelated training rows ('election' is later
# used as a grouping key in group_texts).
# NOTE(review): assumes manifesto_test has a 'date' column like `manifesto` -- confirm.
manifesto_test['sentiment'] = manifesto_test['cmp_code'].apply(sentiment_code)
manifesto_test['topic'] = manifesto_test['cmp_code'].apply(topic_code)
manifesto_test['election'] = manifesto_test['date'].astype(str).str[:4]

In [None]:
results = group_texts(manifesto_test, 
                      ['countryname','election','party','cmp_code'], 'text', 
                      max_group_factor = 5)

In [None]:
manifesto_regrouped = pd.DataFrame(results)
manifesto_regrouped = manifesto_regrouped.explode('text').reset_index(drop=True)

In [None]:
df_cols = manifesto_regrouped['labels'].str.split(';', expand=True)
manifesto_regrouped = pd.concat([manifesto_regrouped, df_cols], axis=1)


In [None]:
manifesto_regrouped.columns = ['text', 'country_election_party_code', 'country','election', 'party', 'cmp_code']

In [None]:
manifesto_regrouped.head()

In [None]:
manifesto_regrouped.loc[:,'sentiment'] = manifesto_regrouped['cmp_code'].apply(sentiment_code)
manifesto_regrouped.loc[:,'topic'] = manifesto_regrouped['cmp_code'].apply(topic_code)
manifesto_regrouped = manifesto_regrouped.drop_duplicates().reset_index(drop=True)


In [None]:
manifesto_regrouped.groupby(['topic','sentiment']).count()

In [None]:
manifesto_reduced = manifesto_regrouped[['topic','sentiment','text']].reset_index(drop=True)

In [None]:
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
manifesto_dataset = Dataset.from_pandas(manifesto_reduced)
manifesto_dataset = manifesto_dataset.class_encode_column('topic')
manifesto_dataset = manifesto_dataset.class_encode_column('sentiment')



In [None]:
tokenized_dataset = manifesto_dataset.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text'])

In [None]:
train_dataloader = DataLoader(tokenized_dataset, batch_size=16, shuffle=True, collate_fn = data_collator)
pred_dataloader = DataLoader(tokenized_dataset, batch_size=16, shuffle=False, collate_fn = data_collator)

In [None]:
## Load pre-trained models
# The head sizes are taken from the labels present in the unseen-language data.
# NOTE(review): the checkpoint loaded below was produced elsewhere; loading presumably
# requires the same number of topic/sentiment classes (and the same label order) as
# during training -- verify that this data covers the full label set.
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
scaling_model = ContextScalePrediction(roberta_model=model_name, num_topics=num_topics, num_sentiments=num_sentiments,lora=False,
                                       use_shared_attention=True).to(device)

# Restore the weights of the main model from its safetensors checkpoint
loaded_tensors = load_file('results/models/manifesto_ContextScalePrediction_main/model.safetensors')
scaling_model.load_state_dict(loaded_tensors)
# Drop the reference to the previously trained model (presumably to free GPU memory)
model=None

In [None]:
outputs_dl = scale_func(pred_dataloader, 
               scaling_model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_dl['res_table_topic'].mean().round(2)

In [None]:
outputs_dl['res_table_sentiment'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_dl'
with open(file_path, "wb") as file:
    pickle.dump(outputs_dl, file)

In [None]:
outputs_dl['res_table_sentiment'].to_csv('results/classification results/dl_sentiment.csv', index=False)
outputs_dl['res_table_topic'].to_csv('results/classification results/dl_topic.csv', index=False)


#### Train a model using only 10% of labelled data

In [None]:
manifesto_reduced['topic_sentiment'] = manifesto_reduced['topic'] + '_' + manifesto_reduced['sentiment']

In [None]:
manifesto_dataset = Dataset.from_pandas(manifesto_reduced)
manifesto_dataset = manifesto_dataset.class_encode_column('topic')
manifesto_dataset = manifesto_dataset.class_encode_column('sentiment')
manifesto_dataset = manifesto_dataset.class_encode_column('topic_sentiment')



In [None]:
# Keep only 10% of the labelled sequences for fine-tuning: 90% are held out as the
# test set, and the remaining 10% are split 70/30 into train and eval (about 7% and
# 3% of all sequences). Both splits are stratified on topic_sentiment and seeded.
train_test = manifesto_dataset.train_test_split(test_size=0.9, stratify_by_column='topic_sentiment', seed=seed_val)
train_eval = train_test['train'].train_test_split(test_size=0.3, stratify_by_column='topic_sentiment', seed=seed_val )

In [None]:
manifesto_datasets = DatasetDict({
    'train': train_eval['train'],
    'test': train_test['test'],
    'eval': train_eval['test']
})
manifesto_datasets

In [None]:
tokenized_datasets = manifesto_datasets.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text', 'topic_sentiment'])
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

In [None]:
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=16, shuffle=True, collate_fn = data_collator)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16, shuffle=False, collate_fn = data_collator)
eval_dataloader = DataLoader(tokenized_datasets['eval'], batch_size=16, shuffle=False, collate_fn = data_collator)

In [None]:
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False,
                               use_shared_attention=True).to(device)
loaded_tensors = load_file('results/models/manifesto_ContextScalePrediction_main/model.safetensors')
model.load_state_dict(loaded_tensors)
scaling_model = None


In [None]:
# Optimisation setup for fine-tuning on the 10% sample: AdamW with a linear schedule that
# warms up over the first 10% of all optimisation steps, and one cross-entropy loss per head.
n_epochs=5
total_steps = len(train_dataloader)*n_epochs
warmup = int(total_steps*0.1)  # the scheduler expects an integer number of warmup steps
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5) 
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
criterion_sent = nn.CrossEntropyLoss()
criterion_topic =  nn.CrossEntropyLoss()


In [None]:
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    timing_log = train_loop(train_dataloader, model,optimizer, scheduler, device, criterion_sent, criterion_topic, sentiment_var='sentiment',
               topic_var='topic', timing_log=True)
    eval_loop(eval_dataloader, model, device, criterion_sent, criterion_topic, sentiment_var='sentiment', topic_var='topic')

    

In [None]:
# Persist the fine-tuned weights. The target folder is created first so that a
# missing directory cannot make the save fail after a full training run.
model_path = 'results/models/manifesto_ContextScalePrediction_dl_10/model.safetensors'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, model_path)

In [None]:
## Load pre-trained models
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
scaling_model = ContextScalePrediction(roberta_model=model_name, num_topics=num_topics, num_sentiments=num_sentiments,lora=False,
                                       use_shared_attention=True).to(device)

loaded_tensors = load_file('results/models/manifesto_ContextScalePrediction_dl_10/model.safetensors')
scaling_model.load_state_dict(loaded_tensors)
model=None

In [None]:
outputs_dl_10 = scale_func(test_dataloader, 
               scaling_model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_dl_10['res_table_topic']

In [None]:
outputs_dl_10['res_table_topic']['f1'].mean().round(2)

In [None]:
outputs_dl_10['res_table_sentiment']

In [None]:
outputs_dl_10['res_table_sentiment']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_dl_10'
with open(file_path, "wb") as file:
    pickle.dump(outputs_dl_10, file)

In [None]:
outputs_dl_10['res_table_sentiment'].to_csv('results/classification results/dl_10_sentiment.csv', index=False)
outputs_dl_10['res_table_topic'].to_csv('results/classification results/dl_10_topic.csv', index=False)


In [None]:
outputs_dl_10_all = scale_func(pred_dataloader, 
               scaling_model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
# Index -> label-name lookups, built from the ClassLabel metadata of the dataset whose
# integer encoding the fine-tuned model was trained on. They are defined here because
# no earlier cell in the notebook creates them (fresh-kernel runs would hit a NameError).
name_topic_dict = dict(enumerate(manifesto_dataset.features['topic'].names))
name_sentiment_dict = dict(enumerate(manifesto_dataset.features['sentiment'].names))

# Attach the model outputs to the regrouped frame. Rows are matched by position:
# pred_dataloader iterates the same rows in order (shuffle=False).
manifesto_regrouped.loc[:,'position_scores'] = outputs_dl_10_all['position_scores'].flatten()
manifesto_regrouped.loc[:,'pred_sentiment_index'] = outputs_dl_10_all['pred_sentiment']
manifesto_regrouped.loc[:,'pred_sentiment'] = manifesto_regrouped.pred_sentiment_index.map(name_sentiment_dict)
manifesto_regrouped.loc[:,'pred_topic_index'] = outputs_dl_10_all['pred_topics']
manifesto_regrouped.loc[:,'pred_topic'] = manifesto_regrouped.pred_topic_index.map(name_topic_dict)

In [None]:
manifesto_regrouped.to_csv('data/py_outputs/manifesto_dl_10_all.csv', index=False)

### COALITIONAGREE, same coding style

#### No supervision

In [None]:
coalitionagree = pd.read_csv('data/r_outputs/coalitionagree_texts.csv', encoding='utf-8', index_col=0).reset_index(drop=True)

In [None]:
coalitionagree.head()

In [None]:
results = group_texts(coalitionagree, ['country','cabinet_year','category2','category3'], 'sentence', max_group_factor = 5)

In [None]:
# One row per regrouped text chunk: explode the list-valued 'text' column, then
# split the ';'-joined 'labels' string back into separate columns.
coalition_regrouped = pd.DataFrame(results)
coalition_regrouped = coalition_regrouped.explode('text').reset_index(drop=True)
df_cols = coalition_regrouped['labels'].str.split(';', expand=True)
coalition_regrouped = pd.concat([coalition_regrouped, df_cols], axis=1)
# Positional rename: assumes 'labels' always splits into exactly four parts, in the order
# of the grouping keys passed to group_texts (country, cabinet_year, category2, category3)
# — TODO confirm.
coalition_regrouped.columns =['text','labels', 'country','year', 'cmp_short','cmp_long']


In [None]:
coalition_regrouped.head()

In [None]:
# Sentiment is derived from both category levels, topic from the short code alone.
coalition_regrouped['sentiment'] = [
    sentiment_code_coalition(short_code, long_code)
    for short_code, long_code in zip(coalition_regrouped['cmp_short'], coalition_regrouped['cmp_long'])
]
coalition_regrouped['topic'] = coalition_regrouped['cmp_short'].map(topic_code_coalition)

In [None]:
coalition_regrouped.groupby(['topic','sentiment']).count()

In [None]:
coalition_regrouped.to_csv('data/temps/coalitionagree_regrouped_processed.csv', encoding='utf-8',index=False)

In [None]:
cagree_reduced = coalition_regrouped[['sentiment', 'topic','text']].copy()

In [None]:
# Wrap the frame as a Dataset and class-encode both label columns.
cagree_dataset = Dataset.from_pandas(cagree_reduced)
for label_col in ('sentiment', 'topic'):
    cagree_dataset = cagree_dataset.class_encode_column(label_col)


In [None]:
# Tokenize the 'text' column (max_length 512 is passed through to tokenize_function)
# and drop the raw text so only model inputs and label columns remain.
tokenized_dataset = cagree_dataset.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text'])

In [None]:
pred_dataloader = DataLoader(tokenized_dataset, batch_size=16, shuffle=False, collate_fn = data_collator)

In [None]:
## Rebuild the architecture and restore the manifesto-trained ("main") weights
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
scaling_model = ContextScalePrediction(
    roberta_model=model_name,
    num_topics=num_topics,
    num_sentiments=num_sentiments,
    lora=False,
    use_shared_attention=True,
).to(device)

loaded_tensors = load_file('results/models/manifesto_ContextScalePrediction_main/model.safetensors')
scaling_model.load_state_dict(loaded_tensors)
model = None  # drop the reference to any previously trained model object

In [None]:
# Apply the manifesto-trained model to the coalition-agreement corpus without any
# further training ("No supervision" section).
# NOTE(review): the reported F1 compares manifesto-trained class indices with the
# indices produced by class_encode_column on the coalition data; this only lines up
# if both corpora yield the same label sets — confirm.
outputs_ca_test = scale_func(pred_dataloader, 
               scaling_model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_ca_test['res_table_sentiment']['f1'].mean().round(2)

In [None]:
outputs_ca_test['res_table_topic']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_ca_test'
with open(file_path, "wb") as file:
    pickle.dump(outputs_ca_test, file)

In [None]:
outputs_ca_test['res_table_sentiment'].to_csv('results/classification results/cagree_noft_sentiment.csv', index=False)
outputs_ca_test['res_table_topic'].to_csv('results/classification results/cagree_noft_topic.csv', index=False)


#### 10% supervision

In [None]:
# Joint topic/sentiment key; it is class-encoded and used to stratify the split below.
cagree_reduced.loc[:, 'topic_sentiment'] = cagree_reduced['topic'].str.cat(cagree_reduced['sentiment'], sep='_')

In [None]:
# Rebuild the Dataset (now including 'topic_sentiment') and class-encode every label column.
cagree_dataset = Dataset.from_pandas(cagree_reduced)
for label_col in ('sentiment', 'topic', 'topic_sentiment'):
    cagree_dataset = cagree_dataset.class_encode_column(label_col)

In [None]:
# test_size=0.9 leaves 10% of the rows for training (the "10% supervision" setting);
# the split is stratified on the joint topic/sentiment key and seeded for reproducibility.
train_test = cagree_dataset.train_test_split(test_size=0.9, stratify_by_column='topic_sentiment', seed=seed_val)

In [None]:
# Keep both splits under explicit names and display the resulting DatasetDict.
cagree_datasets = DatasetDict({split: train_test[split] for split in ('train', 'test')})
cagree_datasets

In [None]:
# Tokenize both splits; the raw text and the stratification-only key are dropped
# so that only model inputs plus the 'sentiment' and 'topic' labels remain.
tokenized_datasets = cagree_datasets.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text','topic_sentiment'])
tokenized_datasets.set_format("torch")
# Displayed as a quick check of which columns survive.
tokenized_datasets["train"].column_names

In [None]:
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=16, shuffle=True, collate_fn = data_collator)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16, shuffle=False, collate_fn = data_collator)


In [None]:
## Start fine-tuning from the manifesto-trained ("main") checkpoint
# Head sizes are taken from the manifesto label sets so the checkpoint's tensors fit.
num_topics = len(set(manifesto_reduced['topic']))
num_sentiments = len(set(manifesto_reduced['sentiment']))
model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False,
                               use_shared_attention=True).to(device)
loaded_tensors = load_file('results/models/manifesto_ContextScalePrediction_main/model.safetensors')
model.load_state_dict(loaded_tensors)
# Drop the reference to the evaluation model from the previous section.
scaling_model=None

In [None]:
n_epochs=5
total_steps = len(train_dataloader)*n_epochs
# 10% of all optimizer steps are linear warm-up (note: this is a float, not an int).
warmup = total_steps*0.1
# lr=2e-5 is the full fine-tuning (non-LoRA) rate; the model in this section is built with lora=False.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
# Separate loss objects for the sentiment and topic heads (both plain cross-entropy).
criterion_sentiment = nn.CrossEntropyLoss()
criterion_topic =  nn.CrossEntropyLoss()

In [None]:
# Fine-tune on the 10% training split; no per-epoch evaluation is run here,
# the held-out 90% is scored once after training.
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    train_loop(train_dataloader, model,optimizer, scheduler, device, criterion_sentiment, criterion_topic, sentiment_var='sentiment',
               topic_var='topic', timing_log=True)

    

In [None]:
# Persist the fine-tuned weights. The target folder is created first so that a
# missing directory cannot make save_file fail after training has finished.
model_path = 'results/models/coalitionagree_ContextScalePrediction_10/model.safetensors'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, model_path)

In [None]:
## Reload the 10%-fine-tuned coalition model for evaluation
# NOTE(review): the class counts are hard-coded here, whereas the training cell derived
# them from manifesto_reduced; load_state_dict will fail on a shape mismatch if they differ.
num_topics = 12
num_sentiments = 3
scaling_model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False,
                               use_shared_attention=True).to(device)
loaded_tensors = load_file('results/models/coalitionagree_ContextScalePrediction_10/model.safetensors')
scaling_model.load_state_dict(loaded_tensors)
# Drop the reference to the training-time model object.
model=None

In [None]:
# Score the 90% split that was not used for fine-tuning.
outputs_ca_10 = scale_func(test_dataloader, 
               scaling_model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_ca_10['res_table_sentiment']['f1'].mean().round(2)

In [None]:
outputs_ca_10['res_table_topic']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_ca_10'
with open(file_path, "wb") as file:
    pickle.dump(outputs_ca_10, file)

In [None]:
outputs_ca_10['res_table_sentiment'].to_csv('results/classification results/cagree_10ft_sentiment.csv', index=False)
outputs_ca_10['res_table_topic'].to_csv('results/classification results/cagree_10ft_topic.csv', index=False)


#### Scale the entire corpus with 10% training

In [None]:
# Rebuild the full (unsplit) dataset. cagree_reduced still carries the 'topic_sentiment'
# column added for stratification; it stays a plain column here and is dropped at tokenization.
cagree_dataset = Dataset.from_pandas(cagree_reduced)
cagree_dataset = cagree_dataset.class_encode_column('sentiment')
cagree_dataset = cagree_dataset.class_encode_column('topic')


In [None]:
# Tokenize the whole corpus, dropping the raw text and the stratification-only key.
tokenized_dataset = cagree_dataset.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text', 'topic_sentiment'])

In [None]:
pred_dataloader = DataLoader(tokenized_dataset, batch_size=16, shuffle=False, collate_fn = data_collator)

In [None]:
# Score every row with the 10%-fine-tuned model. The metrics reported from this
# output include the 10% of rows the model was trained on.
outputs_ca_10_all = scale_func(pred_dataloader, 
               scaling_model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_ca_10_all['res_table_sentiment']['f1'].mean().round(2)

In [None]:
outputs_ca_10_all['res_table_topic']['f1'].mean().round(2)

In [None]:
# Cache the full-corpus outputs. The object written must be outputs_ca_10_all to match
# the file name; the earlier version dumped the test-split outputs (outputs_ca_10) instead.
file_path = 'data/temps/outputs_ca_10_all'
with open(file_path, "wb") as file:
    pickle.dump(outputs_ca_10_all, file)

In [None]:
# Index -> label-name maps are taken from the encoding of cagree_dataset, i.e. the
# dataset this model was fine-tuned and scored on, instead of relying on whichever
# name dictionaries an earlier (or later, if run out of order) section left in the kernel.
cagree_sentiment_names = dict(enumerate(cagree_dataset.features['sentiment'].names))
cagree_topic_names = dict(enumerate(cagree_dataset.features['topic'].names))

# Attach predictions by position; pred_dataloader was built with shuffle=False from
# the same rows, so the order matches coalition_regrouped.
coalition_regrouped.loc[:,'position_scores'] = outputs_ca_10_all['position_scores'].flatten()
coalition_regrouped.loc[:,'pred_sentiment_index'] = outputs_ca_10_all['pred_sentiment']
coalition_regrouped.loc[:,'pred_sentiment'] = coalition_regrouped.pred_sentiment_index.map(cagree_sentiment_names)
coalition_regrouped.loc[:,'pred_topic_index'] = outputs_ca_10_all['pred_topics']
coalition_regrouped.loc[:,'pred_topic'] = coalition_regrouped.pred_topic_index.map(cagree_topic_names)

In [None]:
coalition_regrouped.to_csv('data/py_outputs/cagree_10ft_all.csv', index=False)

#### Scale the COALITIONAGREE corpus using the full label information (for the official release)

In [None]:
# Re-declare the backbone, tokenizer and padding collator so this section can be run
# without the earlier setup cells.
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
coalition_regrouped = pd.read_csv('data/temps/coalitionagree_regrouped_processed.csv', encoding='utf-8')

In [None]:
coalition_regrouped.head()

In [None]:
cagree_reduced = coalition_regrouped[['sentiment', 'topic','text']].copy()

In [None]:
cagree_dataset = Dataset.from_pandas(cagree_reduced)
cagree_dataset = cagree_dataset.class_encode_column('sentiment')
cagree_dataset = cagree_dataset.class_encode_column('topic')


In [None]:
## Build a fresh model on top of the pre-trained backbone
# No task checkpoint is loaded in this cell; the model is trained on the full coalition labels below.
# NOTE(review): class counts are hard-coded; confirm they match the number of classes
# produced by class_encode_column on cagree_dataset.
num_topics = 12
num_sentiments = 3
model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False,
                               use_shared_attention=True).to(device)



In [None]:
# Tokenize the full corpus; only the raw text is dropped (this section never adds
# the 'topic_sentiment' key).
tokenized_dataset = cagree_dataset.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text'])
tokenized_dataset.set_format("torch")
tokenized_dataset.column_names

In [None]:
# Both loaders wrap the same rows: shuffled for training, in original order for
# prediction so outputs can be aligned with coalition_regrouped by position.
train_dataloader = DataLoader(tokenized_dataset, batch_size=16, shuffle=True, collate_fn = data_collator)
pred_dataloader = DataLoader(tokenized_dataset, batch_size=16, shuffle=False, collate_fn = data_collator)


In [None]:
n_epochs=5
total_steps = len(train_dataloader)*n_epochs
# 10% of all optimizer steps are linear warm-up (note: this is a float, not an int).
warmup = total_steps*0.1
# lr=2e-5 is the full fine-tuning (non-LoRA) rate; the model in this section is built with lora=False.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
# A single cross-entropy object is shared by the sentiment and topic heads.
criterion =  nn.CrossEntropyLoss()


In [None]:
# Train on the entire labelled corpus; the same criterion object is passed for both heads.
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    train_loop(train_dataloader, model,optimizer, scheduler, device, criterion, criterion, sentiment_var='sentiment',
               topic_var='topic')

In [None]:
# Persist the fully-supervised weights. The target folder is created first so that a
# missing directory cannot make save_file fail after training has finished.
model_path = 'results/models/coalitionagree_ContextScalePrediction_full/model.safetensors'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, model_path)

In [None]:
## Reload the fully-supervised coalition model for scoring
# Class counts must match those used when the checkpoint was trained (12 and 3 above).
num_topics = 12
num_sentiments = 3
scaling_model = ContextScalePrediction(roberta_model=model_name, 
                               num_topics=num_topics, 
                               num_sentiments=num_sentiments,
                               lora=False,
                               use_shared_attention=True).to(device)
loaded_tensors = load_file('results/models/coalitionagree_ContextScalePrediction_full/model.safetensors')
scaling_model.load_state_dict(loaded_tensors)
# Drop the reference to the training-time model object.
model=None

In [None]:
# Score the full corpus. pred_dataloader wraps the same rows the model was trained on,
# so the F1 values derived from this output are in-sample, not held-out, estimates.
outputs_ca_all = scale_func(pred_dataloader, 
               scaling_model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_ca_all['res_table_sentiment']['f1'].mean().round(2)

In [None]:
outputs_ca_all['res_table_topic']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_ca_all'
with open(file_path, "wb") as file:
    pickle.dump(outputs_ca_all, file)

In [None]:
outputs_ca_all['res_table_sentiment'].to_csv('results/classification results/cagree_all_sentiment.csv', index=False)
outputs_ca_all['res_table_topic'].to_csv('results/classification results/cagree_all_topic.csv', index=False)

In [None]:
# Index -> label-name maps are taken from the encoding of cagree_dataset, the dataset
# this model was trained on. This section restarts from the saved CSV, so it should not
# depend on name dictionaries left in the kernel by other sections.
cagree_sentiment_names = dict(enumerate(cagree_dataset.features['sentiment'].names))
cagree_topic_names = dict(enumerate(cagree_dataset.features['topic'].names))

# Attach predictions by position; pred_dataloader uses shuffle=False over the same rows.
coalition_regrouped.loc[:,'position_scores'] = outputs_ca_all['position_scores'].flatten()
coalition_regrouped.loc[:,'pred_sentiment_index'] = outputs_ca_all['pred_sentiment']
coalition_regrouped.loc[:,'pred_sentiment'] = coalition_regrouped.pred_sentiment_index.map(cagree_sentiment_names)
coalition_regrouped.loc[:,'pred_topic_index'] = outputs_ca_all['pred_topics']
coalition_regrouped.loc[:,'pred_topic'] = coalition_regrouped.pred_topic_index.map(cagree_topic_names)

In [None]:
coalition_regrouped.to_csv('data/py_outputs/cagree_all.csv', index=False)

### Adapting to Twitter data (sentiment is not stance)

#### Adaptation training 

In [None]:
tw_trump = pd.read_csv('data/MOTN/MOTN_responses_groundtruth.csv', encoding='utf-8')
tw_kav = pd.read_csv('data/MOTN/kavanaugh_tweets_groundtruth.csv', encoding='utf-8')
tw_wm = pd.read_csv('data/MOTN/WM_tweets_groundtruth.csv', encoding='utf-8')

In [None]:
tw_trump.head()

In [None]:
tw_kav.head()

In [None]:
tw_wm.head()

In [None]:
# Reduce each source to (text, stance) and tag it with its topic.
tw_trump = (
    tw_trump[['edits_clean_text', 'trump_stance_auto']]
    .rename(columns={'edits_clean_text': 'text', 'trump_stance_auto': 'stance'})
    .assign(topic='trump')
)
tw_kav = tw_kav[['text', 'stance']].assign(topic='kavanaugh')
tw_wm = tw_wm[['text', 'stance']].assign(topic='women march')


In [None]:
tw_df = pd.concat([tw_trump, tw_kav,tw_wm]).reset_index(drop=True)

In [None]:
# Recode each (topic, stance) pair with recode_tw into the 'lr' label that the
# sentiment head is trained on below (sentiment_var='lr').
tw_df.loc[:,'lr'] = tw_df.apply(lambda x: recode_tw(x['topic'], x['stance']), axis=1)
# Joint key used only to stratify the train/test split.
tw_df.loc[:,'topic_lr'] = tw_df['topic'] + '_' + tw_df['lr']

In [None]:
tw_df.groupby(['topic','lr']).count()

In [None]:
tw_df.info()

In [None]:
# Build the tweet Dataset from the four relevant columns and class-encode the label columns.
tw_dataset = Dataset.from_pandas(tw_df[['text', 'lr', 'topic', 'topic_lr']].copy())
for label_col in ('lr', 'topic', 'topic_lr'):
    tw_dataset = tw_dataset.class_encode_column(label_col)


In [None]:
# test_size=0.9 leaves 10% of the tweets for adaptation training; stratified on the
# joint topic/lr key and seeded for reproducibility.
train_test = tw_dataset.train_test_split(test_size=0.9, stratify_by_column='topic_lr',seed=seed_val)

In [None]:
# Keep both splits under explicit names and display the resulting DatasetDict.
tw_datasets = DatasetDict({split: train_test[split] for split in ('train', 'test')})
tw_datasets

In [None]:
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Tokenize both splits; drop the raw text and the stratification-only key so that
# only model inputs plus the 'lr' and 'topic' labels remain.
tokenized_datasets = tw_datasets.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length':512}, 
                                            remove_columns=['text', 'topic_lr'])
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

In [None]:
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=16, shuffle=True, collate_fn = data_collator)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16, shuffle=False, collate_fn = data_collator)


In [None]:
## Load the manifesto-trained model whose weights will seed the Twitter model
# 12 topics / 3 sentiments are the head sizes the 'main' checkpoint is loaded into here.
source_model = ContextScalePrediction(roberta_model=model_name, num_topics=12, num_sentiments=3,lora=False, use_shared_attention=True).to(device)
loaded_tensors = load_file('results/models/manifesto_ContextScalePrediction_main/model.safetensors')
source_model.load_state_dict(loaded_tensors)
# Drop references to models from earlier sections.
model=None
scaling_model=None


In [None]:
# Twitter model: 3 topics (trump, kavanaugh, women march).
# NOTE(review): num_sentiments=2 assumes recode_tw yields exactly two 'lr' classes — confirm.
target_model = ContextScalePrediction(roberta_model=model_name, num_topics=3, num_sentiments=2,lora=False, use_shared_attention=True).to(device)


In [None]:
target_model

In [None]:
architecture1 = get_architecture_details(target_model)
architecture2 = get_architecture_details(source_model)

In [None]:
compare_architectures(architecture1, architecture2)

In [None]:
copy_weights(source_model, target_model, patterns=('topic','sentiment'), freeze_copied=False)

In [None]:
check_weights_similar(source_model, target_model, patterns=('topic','sentiment'))

In [None]:
n_epochs=5
total_steps = len(train_dataloader)*n_epochs
# 10% of all optimizer steps are linear warm-up (note: this is a float, not an int).
warmup = total_steps*0.1
# lr=2e-5 is the full fine-tuning (non-LoRA) rate; target_model is built with lora=False.
optimizer = torch.optim.AdamW(target_model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
# A single cross-entropy object is shared by both heads in the training loop.
criterion = nn.CrossEntropyLoss()



In [None]:
## Nullify existing models (if any)
# Only target_model is needed from here on. source_model is released after its weights
# were copied. These assignments just drop the Python references.
scaling_model=None
source_model=None
model=None

In [None]:
# Adaptation training on the 10% tweet split; the 'lr' label plays the role of sentiment.
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    train_loop(train_dataloader, target_model,optimizer, scheduler, device, criterion_sent=criterion, criterion_topic=criterion, sentiment_var='lr', topic_var='topic', timing_log=True)

    

In [None]:
# Persist the adapted Twitter weights. The target folder is created first so that a
# missing directory cannot make save_file fail after training has finished.
model_path = 'results/models/tw_ContextScalePrediction/model.safetensors'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
state_dict = target_model.state_dict()
save_file(state_dict, model_path)

In [None]:
# Score the 90% held-out tweets with the adapted model.
# NOTE(review): use_ground_truth_topic=False here, while every other scale_func call in
# this notebook (including the full-corpus Twitter run) passes True — confirm this
# difference is intended.
outputs_tw_10 = scale_func(test_dataloader, 
               target_model, 
               device, 
               topic_label='topic', 
               sentiment_label='lr', 
               timing_log=True,
               use_ground_truth_topic=False)

In [None]:
outputs_tw_10['res_table_sentiment']['f1'].mean().round(2)

In [None]:
outputs_tw_10['res_table_topic']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_tw_10'
with open(file_path, "wb") as file:
    pickle.dump(outputs_tw_10, file)

In [None]:
outputs_tw_10['res_table_sentiment'].to_csv('results/classification results/tw_10ft_sentiment.csv', index=False)
outputs_tw_10['res_table_topic'].to_csv('results/classification results/tw_10ft_topic.csv', index=False)


#### Scale with fine-tuned model

In [None]:
## Reload the adapted Twitter model from disk
# Head sizes must match those used when the checkpoint was trained (3 topics, 2 sentiments).
target_model = ContextScalePrediction(roberta_model=model_name, num_topics=3, num_sentiments=2,lora=False, use_shared_attention=True).to(device)
loaded_tensors = load_file('results/models/tw_ContextScalePrediction/model.safetensors')
target_model.load_state_dict(loaded_tensors)

In [None]:
tokenized_dataset = tw_dataset.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text','topic_lr'])

In [None]:
pred_dataloader = DataLoader(tokenized_dataset, batch_size=16, shuffle=False, collate_fn=data_collator)

In [None]:
# Score every tweet (including the 10% used for adaptation training).
outputs_tw_10_all = scale_func(pred_dataloader, 
               target_model, 
               device, 
               topic_label='topic', 
               sentiment_label='lr', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
outputs_tw_10_all['res_table_sentiment']['f1'].mean().round(2)

In [None]:
outputs_tw_10_all['res_table_topic']['f1'].mean().round(2)

In [None]:
file_path = 'data/temps/outputs_tw_10_all'
with open(file_path, "wb") as file:
    pickle.dump(outputs_tw_10_all, file)

In [None]:
# Index -> label-name lookups taken from the class encoding of the tweet dataset.
list_names = tw_dataset.features['lr'].names
name_sentiment_dict = dict(enumerate(list_names))
list_names = tw_dataset.features['topic'].names
name_topic_dict = dict(enumerate(list_names))


In [None]:
# Attach predictions by position; pred_dataloader was built with shuffle=False from
# tw_dataset, which was created from tw_df in the same row order.
tw_df.loc[:,'position_scores'] = outputs_tw_10_all['position_scores'].flatten()
tw_df.loc[:,'pred_sentiment_index'] = outputs_tw_10_all['pred_sentiment']
tw_df.loc[:,'pred_sentiment'] = tw_df.pred_sentiment_index.map(name_sentiment_dict)
tw_df.loc[:,'pred_topic_index'] = outputs_tw_10_all['pred_topics']
tw_df.loc[:,'pred_topic'] = tw_df.pred_topic_index.map(name_topic_dict)

In [None]:
tw_df.to_csv('data/py_outputs/tw_10_all.csv', index=False)

# Doc2Vec scaling

In [None]:
manifesto_d2v = pd.read_csv('data/temps/manifesto.csv', encoding='utf-8', dtype={'cmp_code':'str', 'is_copy_of':'str'})


In [None]:
outputs = clean_text_loop(manifesto_d2v, 'countryname')

In [None]:
manifesto_d2v.loc[:,'text_cleaned'] = outputs

In [None]:
# Document keys for Doc2Vec: '<party>_<election>' and '<country>_<party>_<election>'.
# The per-country loops below split 'party_election' on '_' again, so the party
# identifier itself must not contain an underscore.
manifesto_d2v.loc[:,'party_election'] = manifesto_d2v.party.astype(str).str.cat(manifesto_d2v[['election']].astype(str).values, sep='_')
manifesto_d2v.loc[:,'country_party_election'] = manifesto_d2v.countryname.str.cat(manifesto_d2v[['party','election']].astype(str).values, sep='_')

## Doc2Vec scaling - original approach by R&C 

In [None]:
# Per-country Doc2Vec scaling: train one model per country on party-election documents,
# reduce the document vectors to two PCA dimensions, and stack the per-country results.
# NOTE: gensim documents that a run is only fully reproducible with workers=1 (plus a
# fixed PYTHONHASHSEED); with workers=16 the seed alone does not guarantee identical vectors.
country_dfs = []

# Get the unique list of countries from the data
unique_countries = manifesto_d2v['countryname'].unique()

# Loop through each country and process separately
for country in unique_countries:
    print(f"Processing country: {country}")

    # Filter the dataset for the current country
    country_data = manifesto_d2v[manifesto_d2v['countryname'] == country]

    # Learn bigram and trigram phrase models from this country's cleaned text
    outputs_stream = phraseIterator(country_data, 'text_cleaned')
    bigram = Phraser(Phrases(outputs_stream, min_count=1, threshold=5))
    trigram = Phrases(bigram[outputs_stream], min_count=1, threshold=5)

    # Create the Doc2Vec model and build vocabulary
    model = Doc2Vec(vector_size=500, window=6, min_count=1, workers=16, epochs=20, seed=seed_val)
    model.build_vocab(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'))

    # Train the model
    model.train(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'),
                total_examples=model.corpus_count, epochs=20)

    # Generate embeddings (one entry per party-election document)
    embed_dict = d2v_reduct(model)
    if len(embed_dict) < 2:
        # PCA(n_components=2) needs at least two documents; skip instead of crashing the run
        print(f"Skipping {country}: fewer than two party-election documents")
        continue

    df_d2v = pd.DataFrame.from_dict(embed_dict).transpose()
    df_d2v.index.name = 'party_election'
    df_d2v.reset_index(inplace=True)
    pca = PCA(n_components=2, random_state=seed_val)
    df_d2v[['d2v_d1', 'd2v_d2']] = pca.fit_transform(df_d2v.iloc[:, 1:])
    # .copy() so the assignments below act on an independent frame (no SettingWithCopyWarning)
    df_d2v = df_d2v[['party_election', 'd2v_d1', 'd2v_d2']].copy()

    # Split the 'party_election' label into separate columns
    df_d2v[['party', 'election']] = df_d2v['party_election'].str.split('_', expand=True)
    # Plain column assignment so the column really becomes integer dtype
    # (.loc[:, col] = ... writes in place and keeps the object dtype in pandas 2.x)
    df_d2v['election'] = df_d2v['election'].astype(int)
    df_d2v['country'] = country  # Add country column for merging later

    # Append the country-level dataframe to the list
    country_dfs.append(df_d2v)

# Merge all country-level datasets into a single dataframe
final_df_d2v = pd.concat(country_dfs, ignore_index=True)

# Save the final dataframe to a CSV file
final_df_d2v.to_csv('data/py_outputs/r&c_gen_party_election.csv', index=False)

# Print a summary (info() prints by itself and returns None)
final_df_d2v.info()


In [None]:
final_df_d2v.head()

In [None]:
# German subset with readable party names; rows that party_deu labels 'Other' are dropped.
d2v_germany = final_df_d2v[final_df_d2v.country == 'Germany'].copy()
d2v_germany.loc[:,'party_name'] = d2v_germany['party'].astype(str).apply(party_deu)
d2v_germany = d2v_germany[d2v_germany.party_name != 'Other'].reset_index(drop=True)
d2v_germany.head()

In [None]:
# First Doc2Vec PCA dimension over time, one line per German party.
fig, ax = plt.subplots()
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling
for name, group in d2v_germany.groupby('party_name'):
    # Sort by election so each line is drawn chronologically rather than in row order
    group = group.sort_values('election')
    ax.plot(group.election, group.d2v_d1, marker='o',  ms=4, label=name)
ax.set(xlabel='Election year', ylabel='d2v_d1 (first PCA dimension)',
       title='Germany: Doc2Vec party positions by election')
ax.legend()

plt.show()

In [None]:
# NOTE(review): df_d2v is the loop variable left over from the per-country cell above, so
# this writes only the last country processed — confirm whether d2v_germany or
# final_df_d2v was intended for 'r&c_gen.csv'.
df_d2v.to_csv('data/py_outputs/r&c_gen.csv', index=False)

## Doc2Vec scaling - relevant topics
 

In [None]:
set(manifesto_d2v['topic'])

In [None]:
# Per-country, per-topic Doc2Vec scaling for a fixed list of topics: one model per
# (country, topic) pair, document vectors reduced to two PCA dimensions, results stacked.
# NOTE: gensim documents that a run is only fully reproducible with workers=1 (plus a
# fixed PYTHONHASHSEED); with workers=16 the seed alone does not guarantee identical vectors.
country_topic_dfs = []

# Get the unique list of countries from the data
unique_countries = manifesto_d2v['countryname'].unique()

# Loop through each country
for country in unique_countries:
    print(f"Processing country: {country}")

    # Filter the dataset for the current country and the topics of interest
    country_data = manifesto_d2v[manifesto_d2v['countryname'] == country]
    country_data = country_data[country_data['topic'].isin(['Environment - Growth', 'Political System', 'Economics',
                                                            'European Integration','Labour and Social Welfare',
                                                            'Immigration'])]
    # Get the unique list of topics within this country
    unique_topics = country_data['topic'].unique()

    # Loop through each topic in the country
    for topic in unique_topics:
        print(f"Processing topic: {topic}")

        # Filter the dataset for the current country and topic
        country_topic_data = country_data[country_data['topic'] == topic]

        # Learn bigram and trigram phrase models from this subset's cleaned text
        outputs_stream = phraseIterator(country_topic_data, 'text_cleaned')
        bigram = Phraser(Phrases(outputs_stream, min_count=1, threshold=5))
        trigram = Phrases(bigram[outputs_stream], min_count=1, threshold=5)

        # Create the Doc2Vec model and build vocabulary
        model = Doc2Vec(vector_size=500, window=6, min_count=1, workers=16, epochs=20, seed=seed_val)
        model.build_vocab(corpusIterator(country_topic_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'))

        # Train the model
        model.train(corpusIterator(country_topic_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'),
                    total_examples=model.corpus_count, epochs=20)

        # Generate embeddings (one entry per party-election document)
        embed_dict = d2v_reduct(model)
        if len(embed_dict) < 2:
            # PCA(n_components=2) needs at least two documents; skip instead of crashing the run
            print(f"Skipping {country} / {topic}: fewer than two party-election documents")
            continue

        df_d2v = pd.DataFrame.from_dict(embed_dict).transpose()
        df_d2v.index.name = 'party_election'
        df_d2v.reset_index(inplace=True)
        pca = PCA(n_components=2, random_state=seed_val)
        df_d2v[['d2v_d1', 'd2v_d2']] = pca.fit_transform(df_d2v.iloc[:, 1:])
        # .copy() so the assignments below act on an independent frame (no SettingWithCopyWarning)
        df_d2v = df_d2v[['party_election', 'd2v_d1', 'd2v_d2']].copy()

        # Split the 'party_election' label into separate columns
        df_d2v[['party', 'election']] = df_d2v['party_election'].str.split('_', expand=True)
        # Plain column assignment so the column really becomes integer dtype
        # (.loc[:, col] = ... writes in place and keeps the object dtype in pandas 2.x)
        df_d2v['election'] = df_d2v['election'].astype(int)
        df_d2v['country'] = country  # Add country column
        df_d2v['topic'] = topic  # Add topic column

        # Append the country-topic-level dataframe to the list
        country_topic_dfs.append(df_d2v)

# Merge all country-topic-level datasets into a single dataframe
final_df_d2v = pd.concat(country_topic_dfs, ignore_index=True)

# Save the final dataframe to a CSV file
final_df_d2v.to_csv('data/py_outputs/r&c_party_election_topic.csv', index=False)

# Print a summary (info() prints by itself and returns None)
final_df_d2v.info()


## Doc2Vec scaling - Environment Protection
 

In [None]:
manifesto_ep = manifesto_d2v[manifesto_d2v.cmp_code.isin(['501'])].reset_index(drop=True)


In [None]:
manifesto_ep.head()

In [None]:
# Per-country Doc2Vec scaling restricted to manifesto_ep (cmp_code '501'): one model per
# country, document vectors reduced to two PCA dimensions, per-country results stacked.
# NOTE: gensim documents that a run is only fully reproducible with workers=1 (plus a
# fixed PYTHONHASHSEED); with workers=16 the seed alone does not guarantee identical vectors.
country_dfs = []

# Get the unique list of countries from the data
unique_countries = manifesto_ep['countryname'].unique()

# Loop through each country and process separately
for country in unique_countries:
    print(f"Processing country: {country}")

    # Filter the dataset for the current country
    country_data = manifesto_ep[manifesto_ep['countryname'] == country]

    # Learn bigram and trigram phrase models from this country's cleaned text
    outputs_stream = phraseIterator(country_data, 'text_cleaned')
    bigram = Phraser(Phrases(outputs_stream, min_count=1, threshold=5))
    trigram = Phrases(bigram[outputs_stream], min_count=1, threshold=5)

    # Create the Doc2Vec model and build vocabulary
    model = Doc2Vec(vector_size=500, window=6, min_count=1, workers=16, epochs=20, seed=seed_val)
    model.build_vocab(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'))

    # Train the model
    model.train(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'),
                total_examples=model.corpus_count, epochs=20)

    # Generate embeddings (one entry per party-election document)
    embed_dict = d2v_reduct(model)
    if len(embed_dict) < 2:
        # PCA(n_components=2) needs at least two documents; skip instead of crashing the run
        print(f"Skipping {country}: fewer than two party-election documents")
        continue

    df_d2v = pd.DataFrame.from_dict(embed_dict).transpose()
    df_d2v.index.name = 'party_election'
    df_d2v.reset_index(inplace=True)
    pca = PCA(n_components=2, random_state=seed_val)
    df_d2v[['d2v_d1', 'd2v_d2']] = pca.fit_transform(df_d2v.iloc[:, 1:])
    # .copy() so the assignments below act on an independent frame (no SettingWithCopyWarning)
    df_d2v = df_d2v[['party_election', 'd2v_d1', 'd2v_d2']].copy()

    # Split the 'party_election' label into separate columns
    df_d2v[['party', 'election']] = df_d2v['party_election'].str.split('_', expand=True)
    # Plain column assignment so the column really becomes integer dtype
    # (.loc[:, col] = ... writes in place and keeps the object dtype in pandas 2.x)
    df_d2v['election'] = df_d2v['election'].astype(int)
    df_d2v['country'] = country  # Add country column for merging later

    # Append the country-level dataframe to the list
    country_dfs.append(df_d2v)

# Merge all country-level datasets into a single dataframe
final_df_d2v = pd.concat(country_dfs, ignore_index=True)

# Save the final dataframe to a CSV file
final_df_d2v.to_csv('data/py_outputs/r&c_ep_party_election.csv', index=False)

# Print a summary (info() prints by itself and returns None)
final_df_d2v.info()


## Doc2Vec scaling - Welfare state (cmp_code 504 vs 505), all countries
 

In [None]:
manifesto_welfare = manifesto_d2v[manifesto_d2v.cmp_code.isin(['504', '505'])].reset_index(drop=True)


In [None]:
manifesto_welfare.head()

In [None]:
# Per-country Doc2Vec scaling restricted to manifesto_welfare (cmp_code '504'/'505'): one
# model per country, document vectors reduced to two PCA dimensions, results stacked.
# NOTE: gensim documents that a run is only fully reproducible with workers=1 (plus a
# fixed PYTHONHASHSEED); with workers=16 the seed alone does not guarantee identical vectors.
country_dfs = []

# Get the unique list of countries from the data
unique_countries = manifesto_welfare['countryname'].unique()

# Loop through each country and process separately
for country in unique_countries:
    print(f"Processing country: {country}")

    # Filter the dataset for the current country
    country_data = manifesto_welfare[manifesto_welfare['countryname'] == country]

    # Learn bigram and trigram phrase models from this country's cleaned text
    outputs_stream = phraseIterator(country_data, 'text_cleaned')
    bigram = Phraser(Phrases(outputs_stream, min_count=1, threshold=5))
    trigram = Phrases(bigram[outputs_stream], min_count=1, threshold=5)

    # Create the Doc2Vec model and build vocabulary
    model = Doc2Vec(vector_size=500, window=6, min_count=1, workers=16, epochs=20, seed=seed_val)
    model.build_vocab(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'))

    # Train the model
    model.train(corpusIterator(country_data, bigram=bigram, trigram=trigram, text='text_cleaned', labels='party_election'),
                total_examples=model.corpus_count, epochs=20)

    # Generate embeddings (one entry per party-election document)
    embed_dict = d2v_reduct(model)
    if len(embed_dict) < 2:
        # PCA(n_components=2) needs at least two documents; skip instead of crashing the run
        print(f"Skipping {country}: fewer than two party-election documents")
        continue

    df_d2v = pd.DataFrame.from_dict(embed_dict).transpose()
    df_d2v.index.name = 'party_election'
    df_d2v.reset_index(inplace=True)
    pca = PCA(n_components=2, random_state=seed_val)
    df_d2v[['d2v_d1', 'd2v_d2']] = pca.fit_transform(df_d2v.iloc[:, 1:])
    # .copy() so the assignments below act on an independent frame (no SettingWithCopyWarning)
    df_d2v = df_d2v[['party_election', 'd2v_d1', 'd2v_d2']].copy()

    # Split the 'party_election' label into separate columns
    df_d2v[['party', 'election']] = df_d2v['party_election'].str.split('_', expand=True)
    # Plain column assignment so the column really becomes integer dtype
    # (.loc[:, col] = ... writes in place and keeps the object dtype in pandas 2.x)
    df_d2v['election'] = df_d2v['election'].astype(int)
    df_d2v['country'] = country  # Add country column for merging later

    # Append the country-level dataframe to the list
    country_dfs.append(df_d2v)

# Merge all country-level datasets into a single dataframe
final_df_d2v = pd.concat(country_dfs, ignore_index=True)

# Save the final dataframe to a CSV file
final_df_d2v.to_csv('data/py_outputs/r&c_welfare_party_election.csv', index=False)

# Print a summary (info() prints by itself and returns None)
final_df_d2v.info()


# Scale position scores for all countries (released dataset + model)

## Retrain for the entire dataset with all languages

In [None]:
# Load data/r_outputs/pulled_manifestoes.csv; the columns at positions 2 and 18
# are forced to string dtype instead of letting pandas infer a type.
manifesto_org = pd.read_csv(os.path.join("data", "r_outputs","pulled_manifestoes.csv"), encoding="utf-8", dtype={2:'str',18:'str'})

In [None]:
# Load data/r_outputs/pulled_manifestoes_test.csv with the same string dtype
# overrides (column positions 2 and 18) as the first manifesto file.
manifesto_other = pd.read_csv(os.path.join("data", "r_outputs","pulled_manifestoes_test.csv"), encoding="utf-8", dtype={2:'str',18:'str'})

In [None]:
# Remove columns that contain no values at all from each source, then stack
# both sources into one frame with a fresh 0..n-1 index.
manifesto_org_cleaned, manifesto_other_cleaned = (
    frame.dropna(axis='columns', how='all')
    for frame in (manifesto_org, manifesto_other)
)
manifesto_full = pd.concat(
    [manifesto_org_cleaned, manifesto_other_cleaned],
    ignore_index=True,
)

In [None]:
manifesto_full.head()

In [None]:
len(manifesto_full)

In [None]:
# Keep only rows that carry a cmp_code and whose code is not 'H'.
manifesto_full = manifesto_full.loc[
    manifesto_full['cmp_code'].notna() & manifesto_full['cmp_code'].ne('H')
].reset_index(drop=True)

In [None]:
# Derive sentiment and topic from the CMP code, and the election year from the
# first four characters of the stringified date.
manifesto_full = manifesto_full.assign(
    sentiment=lambda frame: frame['cmp_code'].apply(sentiment_code),
    topic=lambda frame: frame['cmp_code'].apply(topic_code),
    election=lambda frame: frame['date'].astype(str).str.slice(0, 4),
)

In [None]:
manifesto_full.groupby('sentiment').count()

In [None]:
manifesto_full.groupby(['topic'])['sentiment'].value_counts().unstack(fill_value=0)

In [None]:
# Regroup the 'text' column within each country / election / party / cmp_code
# combination. NOTE(review): max_group_factor is interpreted by group_texts
# (utils.functions); its exact meaning is not visible here - confirm there.
results = group_texts(manifesto_full, 
                      ['countryname','election','party','cmp_code'], 'text', 
                      max_group_factor = 5)

In [None]:
# Turn the grouped results into a frame with one row per entry of 'text'.
manifesto_regrouped = (
    pd.DataFrame(results)
    .explode('text')
    .reset_index(drop=True)
)

In [None]:
# Break the ';'-separated label string into one column per component and
# attach those columns to the right of the existing ones (aligned on index).
df_cols = manifesto_regrouped['labels'].str.split(pat=';', expand=True)
manifesto_regrouped = manifesto_regrouped.join(df_cols)


In [None]:
# Positional rename: this requires the frame to hold exactly six columns here.
# NOTE(review): assumes the order is text, the raw label string, then the four
# pieces split from it (country, election, party, cmp_code) - confirm against
# the output of group_texts.
manifesto_regrouped.columns = ['text', 'country_election_party_code', 'country','election', 'party', 'cmp_code']

In [None]:
manifesto_regrouped.head()

In [None]:
# Recompute sentiment and topic from the CMP code of every regrouped row, then
# drop exact duplicate rows and renumber the index.
manifesto_regrouped = (
    manifesto_regrouped
    .assign(
        sentiment=lambda frame: frame['cmp_code'].apply(sentiment_code),
        topic=lambda frame: frame['cmp_code'].apply(topic_code),
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
 

In [None]:
manifesto_regrouped.head()

In [None]:
# Persist both the regrouped and the sentence-level frames; the regrouped file
# is read back in the "Scale manifestos and coalition contracts" section.
manifesto_regrouped.to_csv('data/temps/manifesto_regrouped_full_processed.csv', encoding='utf-8', index=False)
manifesto_full.to_csv('data/temps/manifesto_full_processed.csv', encoding='utf-8', index=False)

In [None]:
# Load the regrouped coalition-agreement data from data/temps; the cells below
# use its 'text', 'topic' and 'sentiment' columns.
coalition_regrouped = pd.read_csv('data/temps/coalitionagree_regrouped_processed.csv', encoding='utf-8')

In [None]:
# Tag every row with the corpus it came from so the two stay distinguishable
# after they are concatenated.
manifesto_regrouped['source'] = 'manifestos'
coalition_regrouped['source'] = 'coalition_contracts'

In [None]:
coalition_regrouped.info()

In [None]:
coalition_regrouped.groupby(['topic'])['sentiment'].value_counts().unstack(fill_value=0)

In [None]:
# Stack the manifesto and coalition rows, keeping only the four columns used
# for training, with a fresh 0..n-1 index.
final_df = pd.concat(
    [
        frame[['text', 'sentiment', 'topic', 'source']]
        for frame in (manifesto_regrouped, coalition_regrouped)
    ],
    ignore_index=True,
)

In [None]:
# Checkpoint name used for the tokenizer here and passed to
# ContextScalePrediction as roberta_model further down.
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Collator that pads each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Combined "<topic>_<sentiment>" label, used later as the stratification key
# for the train/test split.
final_df = final_df.assign(
    topic_sentiment=final_df['topic'] + '_' + final_df['sentiment']
)

In [None]:
# Convert to a Hugging Face Dataset and turn each label column into a
# ClassLabel feature (string values replaced by integer class ids).
final_dataset = Dataset.from_pandas(final_df)
final_dataset = final_dataset.class_encode_column('sentiment')
final_dataset = final_dataset.class_encode_column('topic')
# 'topic_sentiment' must be a ClassLabel to be usable as stratify_by_column.
final_dataset = final_dataset.class_encode_column('topic_sentiment')
final_dataset = final_dataset.class_encode_column('source')



In [None]:
# Hold out 10% of the rows for testing, stratified on the combined
# topic/sentiment label and seeded for reproducibility.
train_test = final_dataset.train_test_split(test_size=0.1, stratify_by_column='topic_sentiment', seed=seed_val)

In [None]:
# Repackage the two splits as a DatasetDict and display its summary.
final_datasets = DatasetDict(
    {split_name: train_test[split_name] for split_name in ('train', 'test')}
)
final_datasets

In [None]:
# Apply tokenize_function to both splits with max_length=512, then drop the
# raw text and the stratification-only 'topic_sentiment' column.
tokenized_datasets = final_datasets.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text','topic_sentiment'])
# Return torch tensors when rows are accessed.
tokenized_datasets.set_format("torch")
# Displayed so the remaining columns can be checked before building loaders.
tokenized_datasets.column_names

In [None]:
# Batches of 16, padded by data_collator; only the training loader is shuffled.
train_dataloader = DataLoader(tokenized_datasets['train'], batch_size=16, shuffle=True, collate_fn = data_collator)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16, shuffle=False, collate_fn = data_collator)


In [None]:
# Size the classification heads from the labels actually present in the
# training data instead of hard-coded literals. nunique() ignores missing
# values, whereas len(set(...)) can count every NaN as its own label.
# The scaling section below rebuilds the model with num_topics=12 and
# num_sentiments=3 before loading these weights, so both sizes must match.
num_topics = final_df['topic'].nunique()
num_sentiments = final_df['sentiment'].nunique()
model = ContextScalePrediction(roberta_model=model_name, num_topics=num_topics, num_sentiments=num_sentiments, lora=False, use_shared_attention=True).to(device)



In [None]:
# Training schedule: linear warmup over the first 10% of optimizer steps,
# then linear decay to zero over the remaining steps.
n_epochs = 5
total_steps = len(train_dataloader) * n_epochs
# The scheduler expects an integer step count, so truncate the 10% share.
warmup = int(total_steps * 0.1)
# Full fine-tuning (the model above is built with lora=False), learning rate 2e-5.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_training_steps=total_steps, num_warmup_steps=warmup)
# The same loss object is passed to train_loop twice (see the training cell).
criterion = nn.CrossEntropyLoss()

In [None]:
# Run n_epochs passes over the training loader.
# NOTE(review): criterion is passed twice - presumably one loss for sentiment
# and one for topic; confirm against the train_loop signature in utils.functions.
for epoch in range(n_epochs):
    print(f"Epoch: {epoch+1}")
    train_loop(train_dataloader, model,optimizer, scheduler, device, criterion, criterion, sentiment_var='sentiment',
               topic_var='topic')

In [None]:
# Save the trained weights in safetensors format. Create the target directory
# first so a fresh checkout does not fail only after training has finished.
released_model_dir = 'results/models/contextscale_full_released'
os.makedirs(released_model_dir, exist_ok=True)
state_dict = model.state_dict()
save_file(state_dict, os.path.join(released_model_dir, 'model.safetensors'))

## Scale manifestos and coalition contracts

In [None]:
# Reload both regrouped corpora from disk so this section does not depend on
# the in-memory frames produced by the retraining cells.
coalition_regrouped = pd.read_csv('data/temps/coalitionagree_regrouped_processed.csv', encoding='utf-8')
manifesto_regrouped = pd.read_csv('data/temps/manifesto_regrouped_full_processed.csv', encoding='utf-8')

In [None]:
# Same checkpoint name, tokenizer and padding collator as in the retraining
# section, repeated so this section can be run on its own.
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Drop the reference to the training model BEFORE allocating the scaling model
# so both do not have to sit on the GPU at the same time, then return cached
# blocks to the driver.
# NOTE(review): the optimizer created for training still references the old
# parameters; set it to None as well if GPU memory remains tight.
model = None
torch.cuda.empty_cache()

# Head sizes of the released model; they must match the saved state dict.
num_topics = 12
num_sentiments = 3
scaling_model = ContextScalePrediction(roberta_model=model_name, num_topics=num_topics, num_sentiments=num_sentiments, lora=False, use_shared_attention=True).to(device)

In [None]:
# Load the released weights into the freshly built model; load_state_dict is
# strict by default, so any key or shape mismatch raises here.
# NOTE(review): the model is never explicitly put in eval mode in this
# notebook - confirm that scale_func calls .eval() so dropout is disabled.
loaded_tensors = load_file('results/models/contextscale_full_released/model.safetensors')
scaling_model.load_state_dict(loaded_tensors)

In [None]:
manifesto_dataset = Dataset.from_pandas(manifesto_regrouped[['text','topic','sentiment']].copy())
coalition_dataset = Dataset.from_pandas(coalition_regrouped[['text','topic','sentiment']].copy())

# Encode both corpora with ONE shared label -> id mapping per column.
# class_encode_column numbers the labels it finds in a dataset in sorted order,
# so encoding the two corpora independently gives different integer ids as soon
# as one corpus lacks a label the other has. The sorted union below reproduces
# the numbering obtained when both corpora are encoded together, which is how
# the training data was encoded (assuming the same label set - TODO confirm).
for label_col in ('topic', 'sentiment'):
    shared_names = sorted(
        set(manifesto_regrouped[label_col].dropna().astype(str))
        | set(coalition_regrouped[label_col].dropna().astype(str))
    )
    shared_label2id = {label_name: label_id for label_id, label_name in enumerate(shared_names)}
    # class_encode_column creates the ClassLabel feature; align_labels_with_mapping
    # then re-indexes it to the shared mapping (names are compared case-insensitively).
    manifesto_dataset = (manifesto_dataset
                         .class_encode_column(label_col)
                         .align_labels_with_mapping(shared_label2id, label_col))
    coalition_dataset = (coalition_dataset
                         .class_encode_column(label_col)
                         .align_labels_with_mapping(shared_label2id, label_col))

In [None]:
# Tokenize the manifesto texts (max_length=512), drop the raw text column and
# return torch tensors on access.
tokenized_manifesto_dataset = manifesto_dataset.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text'])
tokenized_manifesto_dataset.set_format("torch")
# Same treatment for the coalition-agreement texts.
tokenized_coalition_dataset = coalition_dataset.map(tokenize_function, 
                                            fn_kwargs={'tokenizer': tokenizer, 'text_var': 'text', 'max_length': 512}, 
                                            remove_columns=['text'])
tokenized_coalition_dataset.set_format("torch")


In [None]:
# shuffle=False keeps batches in dataset row order, which the later cells rely
# on when they write the outputs back onto the source data frames by position.
manifesto_dataloader = DataLoader(tokenized_manifesto_dataset, batch_size=16, shuffle=False, collate_fn= data_collator)
coalition_dataloader = DataLoader(tokenized_coalition_dataset, batch_size=16, shuffle=False, collate_fn=data_collator)

In [None]:
## Compute position scores
output_manifesto_final = scale_func(manifesto_dataloader, 
               scaling_model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
# Read the pickled label-name sequences and build id -> name lookups.
# NOTE: pickle.load can execute arbitrary code; only use it on files produced
# by this project.
file_path = 'data/temps/topic_labels'
with open(file_path, mode='rb') as fp:
    topic_labels = pickle.load(fp)
name_topic_dict = dict(enumerate(topic_labels))

file_path = 'data/temps/sentiment_labels'
with open(file_path, mode='rb') as fp:
    sentiment_labels = pickle.load(fp)
name_sentiment_dict = dict(enumerate(sentiment_labels))



In [None]:
output_manifesto_final.keys()

In [None]:
# Attach the model outputs to the source rows by position; this relies on the
# dataloader having been built with shuffle=False.
manifesto_regrouped.loc[:,'position_scores'] = output_manifesto_final['position_scores'].flatten()
manifesto_regrouped.loc[:,'pred_sentiment'] = output_manifesto_final['pred_sentiment']
# Translate predicted sentiment ids into their label names.
manifesto_regrouped.loc[:,'pred_sentiment_name'] = manifesto_regrouped.pred_sentiment.map(name_sentiment_dict)

In [None]:
manifesto_regrouped.head()

In [None]:
# Persist the regrouped manifesto rows together with their scores and predictions.
manifesto_regrouped.to_csv('results/datasets/manifesto_full_scaled.csv', encoding='utf-8', index=False)

In [None]:
## Compute position scores
output_coalition_final = scale_func(coalition_dataloader, 
               scaling_model, 
               device, 
               topic_label='topic', 
               sentiment_label='sentiment', 
               timing_log=True,
               use_ground_truth_topic=True)

In [None]:
# Attach the model outputs to the coalition rows by position (the dataloader
# was built with shuffle=False, so the order matches the frame).
coalition_regrouped.loc[:,'position_scores'] = output_coalition_final['position_scores'].flatten()
coalition_regrouped.loc[:,'pred_sentiment'] = output_coalition_final['pred_sentiment']
# Sentiment ids must be translated with the SENTIMENT lookup; the topic lookup
# used here before produced topic names in a sentiment column.
coalition_regrouped.loc[:,'pred_sentiment_name'] = coalition_regrouped.pred_sentiment.map(name_sentiment_dict)

In [None]:
# Persist the regrouped coalition rows together with their scores and predictions.
coalition_regrouped.to_csv('results/datasets/coalition_full_scaled.csv', encoding='utf-8', index=False)

## Create released dataset (position scores by country-party-election)

In [None]:
# Mean position score and its standard error (std / sqrt(n)) for every
# country-party-election-topic combination. One vectorised groupby replaces
# the previous row-by-row pd.concat loop, which copied the growing frame on
# every iteration.
columns = ['country', 'party', 'election', 'topic', 'cs_mean_score', 'cs_se_score']
group_keys = ['country', 'party', 'election', 'topic']

score_stats = (
    manifesto_regrouped
    .groupby(group_keys)['position_scores']
    # 'size' counts all rows of a group (like len(group)), including NaN scores
    .agg(['mean', 'std', 'size'])
    .reset_index()
)
score_stats['cs_mean_score'] = score_stats['mean']
score_stats['cs_se_score'] = score_stats['std'] / np.sqrt(score_stats['size'])
# Identifier columns are stored as strings, as before.
score_stats[group_keys] = score_stats[group_keys].astype(str)
df = score_stats[columns].copy()

In [None]:
# NOTE(review): written without index=False, so the row index is saved as an
# unnamed first column - unlike the other CSV exports in this notebook.
# Left unchanged in case downstream code expects that column.
df.to_csv('results/datasets/contextscale_manifesto_dataset.csv', encoding='utf-8')

In [None]:
df.head()

In [None]:
# Mean position score and its standard error (std / sqrt(n)) for every
# country-year-topic combination of the coalition agreements. One vectorised
# groupby replaces the previous row-by-row pd.concat loop.
columns = ['country', 'year', 'topic', 'cs_mean_score', 'cs_se_score']
group_keys = ['country', 'year', 'topic']

score_stats = (
    coalition_regrouped
    .groupby(group_keys)['position_scores']
    # 'size' counts all rows of a group (like len(group)), including NaN scores
    .agg(['mean', 'std', 'size'])
    .reset_index()
)
score_stats['cs_mean_score'] = score_stats['mean']
score_stats['cs_se_score'] = score_stats['std'] / np.sqrt(score_stats['size'])
# Identifier columns are stored as strings, as before.
score_stats[group_keys] = score_stats[group_keys].astype(str)
df = score_stats[columns].copy()

In [None]:
df.head()

In [None]:
# NOTE(review): written without index=False, so the row index is saved as an
# unnamed first column - unlike the other CSV exports in this notebook.
# Left unchanged in case downstream code expects that column.
df.to_csv('results/datasets/contextscale_coalition_dataset.csv', encoding='utf-8')