In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader
import json

In [2]:
# Load the list of party names that the per-party flow below iterates over.
# JSON text is UTF-8; passing the encoding explicitly keeps decoding independent
# of the platform's locale default.
with open('./party_lst.json', encoding='utf-8') as json_file:
    parties = json.load(json_file)
parties

['Socialdemokratiet',
 'Venstre',
 'Socialistisk Folkeparti',
 'Det Radikale Venstre',
 'Enhedslisten',
 'Det Konservative Folkeparti',
 'Dansk Folkeparti',
 'Liberal Alliance',
 'Alternativet']

## Flow for each party

### One by one

In [3]:
# Party handled by the single-party cells below; it is used as the CSV file stem
# when reading the train/test splits and as the folder name when saving the model.
party = "Socialdemokratiet"

In [4]:
# Read the pre-split train and test tables for the selected party.
df_train = pd.read_csv(f"./by_party_train/{party}.csv")
df_test = pd.read_csv(f"./by_party_test/{party}.csv")
df_train.head(2)

Unnamed: 0.1,Unnamed: 0,Absent,Against,For,Neutral,parties,description,date
0,1,18,27,0,0,Socialdemokratiet (S),"Formålet med beslutningsforslaget var, at rege...",2010-05-06
1,0,19,0,30,0,Socialdemokratiet (S),"Regeringen (Socialdemokratiet), Radikale Venst...",2021-02-13


In [5]:
def preprocess_vote(df):
    """Build a two-column (text, label) frame from a raw per-party vote table.

    A row is labelled by the party's majority vote:
      * 1 when ``Against`` is strictly greater than both ``For`` and ``Neutral``
      * 0 when ``For`` is strictly greater than both ``Against`` and ``Neutral``
    Every other row (ties, ``Neutral`` majority, missing counts) is dropped,
    as are rows with missing text.

    Args:
        df: DataFrame with the columns ``Against``, ``For``, ``Neutral`` and
            ``description`` (a column already named ``text`` is accepted too).

    Returns:
        A new DataFrame with columns ``['text', 'label']`` (``label`` is int64)
        that keeps the original index of the surviving rows. The input frame
        is left untouched.
    """
    # Non-inplace rename: the caller's frame must not change under its feet.
    votes = df.rename(columns={'description': 'text'})

    # The two conditions are mutually exclusive (each requires a strict win).
    against_wins = ((votes['Against'] > votes['Neutral'])
                    & (votes['Against'] > votes['For'])).to_numpy()
    for_wins = ((votes['For'] > votes['Neutral'])
                & (votes['For'] > votes['Against'])).to_numpy()

    result = votes[['text']].copy()
    # NaN marks "no clear For/Against majority" so dropna() removes those rows.
    result['label'] = np.select([against_wins, for_wins], [1.0, 0.0], default=np.nan)

    return result.dropna().astype({'label': 'int64'})

## Note: label 1 = "Against", label 0 = "For"

In [6]:
# Reduce both raw vote tables to (text, label) frames.
# Re-running this cell without reloading the CSVs fails, because the vote
# columns are no longer present in the already-processed frames.
df_train, df_test = map(preprocess_vote, (df_train, df_test))
df_train.head(2)

Unnamed: 0,text,label
0,"Formålet med beslutningsforslaget var, at rege...",1
1,"Regeringen (Socialdemokratiet), Radikale Venst...",0


In [7]:
# Wrap both frames as Hugging Face datasets. preserve_index=True carries the
# pandas index along as the extra `__index_level_0__` column.
train_set, test_set = (
    Dataset.from_pandas(frame, preserve_index=True)
    for frame in (df_train, df_test)
)
dataset = DatasetDict({'train': train_set, 'test': test_set})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 281
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 70
    })
})

In [8]:
# Models
# NOTE(review): this is an absolute path at the filesystem root — presumably a
# locally stored klimaBERT checkpoint; confirm it exists on the machine running
# the notebook.
klimaBERT = "/klimaBERTe4_v2.1"
checkpoint = klimaBERT

# The tokenizer comes from the same checkpoint the classifier is loaded from.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [9]:
def tokenize(batch):
    """Tokenize the ``text`` column of a batch, truncating to 512 tokens.

    No padding is applied here on purpose. With ``padding=True`` inside a
    batched ``map`` every example is padded to the longest text of the whole
    map batch, so each training step processes needlessly long sequences.
    The ``DataCollatorWithPadding`` handed to the Trainer pads each mini-batch
    to its own longest sequence instead.

    Args:
        batch: mapping with a ``'text'`` entry holding a list of strings.

    Returns:
        The tokenizer output for the batch (unpadded, truncated).
    """
    return tokenizer(batch['text'], truncation=True, max_length=512)

# Tokenize both splits in batches; the tokenizer outputs are added as new columns.
tokenized_datasets = dataset.map(tokenize, batched=True)

# Collator built from the same tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

100%|██████████| 1/1 [00:00<00:00,  7.25ba/s]
100%|██████████| 1/1 [00:00<00:00, 24.39ba/s]


In [10]:
## Steps for processing data
# Drop the raw text and rename "label" to "labels". This cell cannot be re-run
# as-is: after the first run neither "text" nor "label" exists any more.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)

# Hand out torch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")

## The model consumes these four columns:
#['labels', 'input_ids', 'token_type_ids', 'attention_mask']
## `__index_level_0__` is still present; the Trainer log reports it as ignored.
tokenized_datasets.column_names

{'train': ['labels',
  '__index_level_0__',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'test': ['labels',
  '__index_level_0__',
  'input_ids',
  'token_type_ids',
  'attention_mask']}

In [11]:
from datasets import load_metric
from datasets import load_metric
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

def compute_metrics(pred):
    """Compute binary classification metrics for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions``.
            ``predictions`` is assumed to be an ``(n_samples, 2)`` array of
            class logits, matching the ``num_labels=2`` model built in this
            notebook — confirm if the number of labels is changed.

    Returns:
        dict with ``accuracy``, ``f1``, ``precision``, ``recall`` and
        ``roc_auc_score``. ``roc_auc_score`` is NaN when the labels contain
        fewer than two classes.
    """
    labels = pred.label_ids
    logits = pred.predictions
    preds = logits.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    if np.unique(labels).size < 2:
        # ROC AUC is undefined with a single class; report NaN rather than
        # letting roc_auc_score raise and abort the training run.
        roc_auc = float('nan')
    else:
        # Rank by the positive-class logit margin (monotonic in the softmax
        # probability of class 1). Scoring the hard argmax labels would
        # collapse the ROC curve to a single operating point.
        roc_auc = roc_auc_score(labels, logits[:, 1] - logits[:, 0])

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'roc_auc_score': roc_auc
    }

In [12]:
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification

# Training configuration: evaluate after every epoch, small batches, fixed seed.
training_args = TrainingArguments(
    output_dir="test-trainer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    num_train_epochs=5,
    seed=2022,
)

# Classification head on top of the checkpoint (change num_labels if needed).
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# The test split doubles as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [13]:
## Set to use GPU
import torch

# Prefer CUDA when it is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [14]:
# Fine-tune the classifier; with the TrainingArguments above this runs 5 epochs
# and evaluates on the test split after each one.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 281
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 355
  0%|          | 0/355 [00:00<?, ?it/s]

In [None]:
# Persist the fine-tuned model in a folder named after the party.
trainer.save_model(f"models_binary_party/{party}")

In [None]:
# Run the trained model on the test split and keep the result for inspection.
predictions = trainer.predict(tokenized_datasets["test"])

### Loop for all parties

In [None]:
for party in parties:
    # The directory and the file name need a "/" between them, exactly as in
    # the single-party cell; without it the path becomes e.g.
    # "./by_party_trainVenstre.csv" and the read fails.
    df_train = pd.read_csv("./by_party_train/" + party + ".csv")
    df_test = pd.read_csv("./by_party_test/" + party + ".csv")
