In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import pandas as pd
from datasets import Dataset, DatasetDict

import json

In [2]:
with open('./party_lst.json') as json_file:
    parties = json.load(json_file)
parties

['Socialdemokratiet',
 'Venstre',
 'Socialistisk Folkeparti',
 'Det Radikale Venstre',
 'Enhedslisten',
 'Det Konservative Folkeparti',
 'Dansk Folkeparti',
 'Liberal Alliance',
 'Alternativet']

## Flow for each party

### One by one

In [4]:
party = "Alternativet"

In [5]:
df_train = pd.read_csv("./by_party_train/" + party + ".csv")
df_test = pd.read_csv("./by_party_test/" + party + ".csv")
df_train.head(2)

Unnamed: 0.1,Unnamed: 0,Absent,Against,For,Neutral,parties,description,date
0,9,1,0,0,0,Alternativet (ALT),"Regeringen (Socialdemokratiet), Radikale Venst...",2021-02-13
1,5,4,0,6,0,Alternativet (ALT),Forslaget har til formål at gennemføre en lang...,2018-05-15


In [None]:
def preprocess_vote(df):
  
  df['label'] = df.apply(lambda x: 1 if x['Neutral']<x['Against']>x['For'] else 0 if x['Neutral']<x['For']>x['Against'] else None, axis=1)

  df.rename(columns={'description':'text'}, inplace=True)
  df = df[['text', 'label']]
  df = df.dropna()
  df['label'] = df.apply(lambda x: int(x['label']), axis=1)

  return df

## Note "Against" = 1

In [None]:
df_train = preprocess_vote(df_train)
df_test = preprocess_vote(df_test)
df_train.head(2)

In [None]:
train_set = Dataset.from_pandas(df_train, preserve_index=True)
test_set = Dataset.from_pandas(df_test, preserve_index=True)
dataset = DatasetDict({'train':train_set,
                        'test':test_set})
dataset

In [None]:
#Models
klimaBERT = "/klimaBERTe4_v2.1"
checkpoint = klimaBERT

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize(batch):
    tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=512)
    return tokenized_batch

tokenized_datasets = dataset.map(tokenize, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
## Steps for processing data
tokenized_datasets = tokenized_datasets.remove_columns(["text"])

## Rename label column to labels, if not already done
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

tokenized_datasets.set_format("torch")
tokenized_datasets.column_names
## Our model needs columns that it already knows (the 4 below, NOT any custom columns like "text"):
#['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
from datasets import load_metric
from datasets import load_metric
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    roc_auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'roc_auc_score': roc_auc
    }

In [None]:
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification

training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch", per_device_train_batch_size=2, num_train_epochs=5, seed=2022, save_strategy="no")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) #Change num  of labels if needed

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
## Set to use GPU
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

In [None]:
trainer.train()

In [None]:
trainer.save_model("models_binary_party/"+party)

##### Metrics

In [None]:
# example of a roc curve for a predictive model
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from matplotlib import pyplot
from sklearn.dummy import DummyClassifier
m = torch.nn.Softmax(dim=-1)

In [None]:

weight_pred = trainer.predict(tokenized_datasets["test"])[0]
weight_for = []
weight_against = []
for input in weight_pred:
    input2 = torch.from_numpy(input)
    (i,k) = m(input2) #i=against, k=for
    weight_for.append(i.item())
    weight_against.append(k.item())

In [None]:
## ROC CURVE
testy = tokenized_datasets["test"]["labels"]

# plot no skill roc curve
model = DummyClassifier(strategy='stratified')
model.fit(tokenized_datasets["train"], tokenized_datasets["train"]["labels"])
yhat = model.predict_proba(tokenized_datasets["test"])
pos_probs = yhat[:, 1]
roc_auc_noskill = roc_auc_score(testy, pos_probs)

pyplot.plot([0, 1], [0, 1], linestyle='--', label='No Skill')

# plot the skill curve
# retrieve just the probabilities for the positive class
pos_probs = weight_against
# calculate roc curve for model
fpr, tpr, _ = roc_curve(testy, pos_probs)

# plot model roc curve
pyplot.plot(fpr, tpr, marker='.', label=f'Baseline-{party}')

# axis labels
pyplot.xlabel('False Positive Rate')
pyplot.ylabel('True Positive Rate')

# show the legend
pyplot.legend()

# show the plot
pyplot.show()


roc_auc = roc_auc_score(testy, pos_probs)
print('No Skill ROC AUC %.3f' % roc_auc_noskill)
print('Baseline ROC AUC %.3f' % roc_auc)

In [None]:
## PR Curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
y = tokenized_datasets["test"]
no_of_pos = y["labels"].sum().item()


# calculate the no skill line as the proportion of the positive class
no_skill = (no_of_pos / len(y))
# plot the no skill precision-recall curve
model = DummyClassifier(strategy='stratified')
model.fit(tokenized_datasets["train"], tokenized_datasets["train"]["labels"])
yhat = model.predict_proba(tokenized_datasets["test"])
pos_probs = yhat[:, 1]
# calculate the precision-recall auc
precision, recall, _ = precision_recall_curve(testy, pos_probs)
auc_score_noskill = auc(recall, precision)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

# calculate model precision-recall curve
pos_probs = weight_against
precision, recall, _ = precision_recall_curve(testy, pos_probs)
# plot the model precision-recall curve
pyplot.plot(recall, precision, marker='.', label=f'Baseline-{party}')
# axis labels
pyplot.xlabel('Recall')
pyplot.ylabel('Precision')
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
auc_score = auc(recall, precision)
print('No Skill PR AUC: %.3f' % auc_score_noskill)
print('Logistic PR AUC: %.3f' % auc_score)

### Loop for all parties

In [None]:
for party in parties:
    df_train = pd.read_csv("./by_party_train" + party + ".csv")
    df_test = pd.read_csv("./by_party_test" + party + ".csv")


## For all (macro)

In [3]:
df = pd.read_pickle("../../climate_classifier/scored_data/votes_data_all_expanded_politician.pkl")
df = df.sample(frac=0.3)
print(len(df))
df_c = df[df["y_pred"]=="climate"]
print(len(df_c))
df_c["year"] = df_c["date"].astype("str").str[:4]
df_c = df_c[df_c["year"].astype("int")>=2012]
df_c.head(1)
print(len(df_c))



290893
20423
12978


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_c["year"] = df_c["date"].astype("str").str[:4]


In [4]:
df_c[["For", "Absent", "Neutral", "Against"]] = pd.get_dummies(df_c["vote"])
df_c = df_c[["For", "Absent", "Neutral", "Against", "description", "date"]]
df_c.sort_values(by="date").head(2)

Unnamed: 0,For,Absent,Neutral,Against,description,date
10,1,0,0,0,De danske ressourcer af olie og gas skal udnyt...,2012-02-23
154,1,0,0,0,De danske ressourcer af olie og gas skal udnyt...,2012-02-23


In [5]:
print(len(df_c))
print(len(df_c)*0.8)
print(len(df_c)*0.2)

df_train = df_c[:10236]
df_test = df_c[10236:]
print(len(df_train))
print(len(df_test))

12978
10382.400000000001
2595.6000000000004
10236
2742


In [6]:
def preprocess_vote(df):
  
  df['label'] = df.apply(lambda x: 1 if x['Neutral']<x['Against']>x['For'] else 0 if x['Neutral']<x['For']>x['Against'] else None, axis=1)

  df.rename(columns={'description':'text'}, inplace=True)
  df = df[['text', 'label']]
  df = df.dropna()
  df['label'] = df.apply(lambda x: int(x['label']), axis=1)

  return df

## Note "Against" = 1

In [7]:
df_train = preprocess_vote(df_train)
df_test = preprocess_vote(df_test)
df_train.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df.apply(lambda x: 1 if x['Neutral']<x['Against']>x['For'] else 0 if x['Neutral']<x['For']>x['Against'] else None, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


Unnamed: 0,text,label
49,Beslutningsforslaget pålægger regeringen at la...,1
49,Folketinget opfordrer til fastholdelse af den ...,1


In [8]:
train_set = Dataset.from_pandas(df_train, preserve_index=True)
test_set = Dataset.from_pandas(df_test, preserve_index=True)
dataset = DatasetDict({'train':train_set,
                        'test':test_set})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 5802
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 1560
    })
})

In [9]:
#Models
klimaBERT = "/klimaBERTe4_v2.1"
checkpoint = klimaBERT

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [10]:
def tokenize(batch):
    tokenized_batch = tokenizer(batch['text'], padding=True, truncation=True, max_length=512)
    return tokenized_batch

tokenized_datasets = dataset.map(tokenize, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

100%|██████████| 6/6 [00:02<00:00,  2.22ba/s]
100%|██████████| 2/2 [00:00<00:00,  2.88ba/s]


In [11]:
## Steps for processing data
tokenized_datasets = tokenized_datasets.remove_columns(["text"])

## Rename label column to labels, if not already done
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

tokenized_datasets.set_format("torch")
tokenized_datasets.column_names
## Our model needs columns that it already knows (the 4 below, NOT any custom columns like "text"):
#['labels', 'input_ids', 'token_type_ids', 'attention_mask']

{'train': ['labels',
  '__index_level_0__',
  'input_ids',
  'token_type_ids',
  'attention_mask'],
 'test': ['labels',
  '__index_level_0__',
  'input_ids',
  'token_type_ids',
  'attention_mask']}

In [12]:
from datasets import load_metric
from datasets import load_metric
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    roc_auc = roc_auc_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'roc_auc_score': roc_auc
    }

In [13]:
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification

training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch", per_device_train_batch_size=2, seed=2022, save_strategy="no")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) #Change num  of labels if needed

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [14]:
## Set to use GPU
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [15]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__. If __index_level_0__ are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5802
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 8703
  0%|          | 11/8703 [00:09<1:37:42,  1.48it/s]