In [14]:
## Imports
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from datasets import load_metric

In [15]:
## Data
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
# `df` and `df_part` are bound to the same DataFrame object (no subset is taken here).
df = df_part = pd.read_pickle("ft_meetings_2021new-2022_clean_v2.pkl")

# Wrap the frame as a Hugging Face Dataset without carrying the pandas index along as a column.
test_set = Dataset.from_pandas(df=df_part, preserve_index=False)

test_set

Dataset({
    features: ['speaker', 'politician', 'party', 'text', 'date'],
    num_rows: 10491
})

In [7]:
# Show the first two rows of the loaded frame.
df.iloc[:2]

Unnamed: 0,speaker,politician,party,text,date
50854,Henrik Møller (S),Henrik Møller,(S),Tak for det. Det er jo noget af et syn hernede...,2021-08-26
50856,Victoria Velasquez (EL),Victoria Velasquez,(EL),"Tak for det. Jeg må indrømme, at jeg synes, de...",2021-08-26


In [10]:
## Load (part1)
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from datasets import load_metric

def compute_metrics(eval_preds):
    """Score predictions with the GLUE/MRPC metric obtained from ``load_metric``.

    Args:
        eval_preds: An object that unpacks into ``(logits, labels)``.

    Returns:
        The result of the metric's ``compute`` call, given the arg-max of the
        logits over the last axis as predictions and ``labels`` as references.
    """
    glue_mrpc = load_metric("glue", "mrpc")
    raw_scores, gold = eval_preds
    # The predicted class is the position of the largest logit on the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return glue_mrpc.compute(predictions=predicted_classes, references=gold)

# Fine-tuned checkpoint directory; tokenizer and model are both loaded from it.
checkpoint = "/klimaBERTe11_v2.1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# No evaluation schedule is configured: the Trainer below receives no train/eval
# dataset and is only used for `predict()`, so a per-epoch evaluation never applies.
training_args = TrainingArguments("test-trainer")

## Load trainer API
# `args` must be passed explicitly; otherwise the Trainer falls back to its own
# default arguments and `training_args` has no effect.
trainer = Trainer(
    model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [11]:
## Set to use GPU
import torch

# Use CUDA when it is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cpu')

In [12]:
## Load FT data and tokenize it
def tokenize(batch):
    """Tokenize the ``'text'`` entries of a batch with the notebook's ``tokenizer``.

    Sequences are truncated to at most 512 tokens. They are intentionally NOT
    padded here: padding inside a batched ``map`` stretches every row to the
    longest sequence of the whole map batch, which makes inference needlessly
    slow. The ``Trainer`` in this notebook is created with ``tokenizer=tokenizer``,
    so its padding collator pads each small prediction batch to that batch's own
    longest sequence instead.

    Args:
        batch: Mapping with a ``'text'`` entry holding the texts to tokenize.

    Returns:
        The tokenizer output for ``batch['text']`` (truncated, unpadded).
    """
    return tokenizer(batch['text'], truncation=True, max_length=512)

# Tokenize the full dataset batch-wise, then switch its output format to torch.
dataset = test_set
tokenized_datasets = dataset.map(function=tokenize, batched=True)
tokenized_datasets.set_format(type="torch")

100%|██████████| 11/11 [00:13<00:00,  1.23s/ba]


In [13]:
## Run prediction over the tokenized dataset
predictions2 = trainer.predict(test_dataset=tokenized_datasets)
predictions2

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: politician, speaker, date, party, text.
***** Running Prediction *****
  Num examples = 10491
  Batch size = 8
  0%|          | 3/1312 [00:51<6:41:36, 18.41s/it]

KeyboardInterrupt: 

In [None]:
# Turn each pair of values in the first element of the prediction output into a
# text label: "non-climate" when the first value is strictly larger, otherwise
# (including ties) "climate".
label = [
    "non-climate" if score_0 > score_1 else "climate"
    for (score_0, score_1) in predictions2[0]
]

In [None]:

# Convert the dataset back to a DataFrame and attach one predicted label per row.
# (The former empty `pd.DataFrame()` placeholder was overwritten immediately and is dropped.)
df_compare = dataset.to_pandas()
df_compare["y_pred"] = label

In [None]:
# Boolean row selectors for the two predicted classes.
climate_mask = df_compare["y_pred"] == "climate"
non_climate_mask = df_compare["y_pred"] == "non-climate"

# reset_index() keeps the previous row index as an extra "index" column.
df_climate = df_compare.loc[climate_mask].reset_index()
df_non_climate = df_compare.loc[non_climate_mask].reset_index()

# Report how many rows fall into each class.
print("y_pred:climate", len(df_climate))
print("y_pred:non-climate", len(df_non_climate))

# Write one CSV per predicted class.
df_climate.to_csv("ft_meetings_climate_all_2.1.csv")
df_non_climate.to_csv("ft_meetings_non_climate_all_2.1.csv")