In [1]:
## Imports
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from datasets import load_metric

In [2]:
## Data
# Location of the pickled meetings frame that the rest of the notebook classifies.
MEETINGS_PICKLE = "../../data_collection/meetings/ft_meetings_2012-2022_clean_v2.3.pkl"

# NOTE: unpickling can execute arbitrary code -- only point this at trusted files.
df = pd.read_pickle(MEETINGS_PICKLE)

# `df_part` is just another name for the full frame here (no subset is taken).
df_part = df

# Wrap the frame as a Hugging Face Dataset; the pandas index is not carried over.
test_set = Dataset.from_pandas(df=df_part, preserve_index=False)

test_set

Dataset({
    features: ['speaker', 'politician', 'title', 'party', 'text', 'date'],
    num_rows: 335170
})

In [3]:
# Show the first two rows of the loaded frame as a quick sanity check.
df.head(n=2)

Unnamed: 0,speaker,politician,title,party,text,date
1,Statsministeren Helle Thorning-Schmidt,Helle Thorning-Schmidt,Statsministeren,(S),(Talen er under udarbejdelse) (Talen er under ...,2012-10-02
2,Statsministeren Helle Thorning-Schmidt,Helle Thorning-Schmidt,Statsministeren,(S),"000 døgninstitutioner, opholdssteder og plejef...",2012-10-02


In [5]:
## Load (part1)
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from datasets import load_metric

def compute_metrics(eval_preds):
    """Score a batch of predictions with the GLUE/MRPC metric.

    Args:
        eval_preds: A two-item iterable ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class ids
        (taken over the last axis of the logits) against ``labels``.
    """
    # The metric is loaded on every call, before the inputs are unpacked.
    glue_mrpc = load_metric("glue", "mrpc")
    scores, references = eval_preds
    predicted_ids = np.argmax(scores, axis=-1)
    return glue_mrpc.compute(predictions=predicted_ids, references=references)

# Checkpoint directory; both the tokenizer and the model are loaded from it.
checkpoint = "/klimaBERTe4_v2.1"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Two output classes; a later cell maps them to "non-climate" / "climate".
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

## Load trainer API
# No TrainingArguments are supplied, so the Trainer runs with its defaults.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


Didn't find file /klimaBERTe4_v2.1\added_tokens.json. We won't load it.
loading file /klimaBERTe4_v2.1\vocab.txt
loading file /klimaBERTe4_v2.1\tokenizer.json
loading file None
loading file /klimaBERTe4_v2.1\special_tokens_map.json
loading file /klimaBERTe4_v2.1\tokenizer_config.json
loading configuration file /klimaBERTe4_v2.1\config.json
Model config BertConfig {
  "_name_or_path": "/klimaBERTe4_v2.1",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_si

In [6]:
## Set to use GPU
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [7]:
## Load FT data and tokenize it
def tokenize(batch):
    """Tokenize the ``text`` entries of ``batch`` with the notebook-level ``tokenizer``.

    Args:
        batch: Mapping with a ``'text'`` key holding the strings to encode.

    Returns:
        Whatever ``tokenizer`` returns when called with padding enabled and
        truncation to at most 512 tokens.
    """
    # NOTE(review): with padding=True each call presumably pads to the longest
    # item in `batch`; confirm this is intended rather than padding at collate time.
    return tokenizer(
        batch['text'],
        max_length=512,
        truncation=True,
        padding=True,
    )

# `dataset` is the full test set; a later cell converts it back to pandas.
dataset = test_set

# Apply `tokenize` batch-wise, then have the dataset hand out torch tensors.
tokenized_datasets = dataset.map(function=tokenize, batched=True)
tokenized_datasets.set_format(type="torch")

100%|██████████| 336/336 [02:09<00:00,  2.59ba/s]


In [8]:
## Predict on the tokenized dataset
# `tokenized_datasets` is built from the full `test_set`, so this scores every row.
predictions2 = trainer.predict(test_dataset=tokenized_datasets)
predictions2

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: speaker, date, title, text, party, politician. If speaker, date, title, text, party, politician are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 335170
  Batch size = 8
100%|██████████| 41897/41897 [8:08:26<00:00,  2.27it/s]  

PredictionOutput(predictions=array([[ 4.548115 , -3.5720406],
       [ 4.5659766, -3.5668213],
       [ 4.5911045, -3.52495  ],
       ...,
       [ 4.7219825, -3.7248368],
       [ 4.7001395, -3.6855257],
       [ 4.675691 , -3.712501 ]], dtype=float32), label_ids=None, metrics={'test_runtime': 29309.9375, 'test_samples_per_second': 11.435, 'test_steps_per_second': 1.429})

In [9]:
# Turn each logit pair into a text label: the first logit must be strictly
# larger for "non-climate"; ties and everything else become "climate".
label = [
    "non-climate" if first_logit > second_logit else "climate"
    for first_logit, second_logit in predictions2.predictions
]

In [10]:

# Convert the dataset back to a pandas frame and attach one predicted label per row.
# `label` must contain exactly one entry per dataset row, in dataset order.
df_compare = dataset.to_pandas()
df_compare["y_pred"] = label

In [11]:
# Split the labelled rows by predicted class. reset_index() (without drop) keeps
# each row's position in `df_compare` as an extra "index" column.
climate_mask = df_compare["y_pred"] == "climate"
non_climate_mask = df_compare["y_pred"] == "non-climate"
df_climate = df_compare.loc[climate_mask].reset_index()
df_non_climate = df_compare.loc[non_climate_mask].reset_index()

# Report class sizes next to the total row count.
print("y_pred:climate", df_climate.shape[0])
print("y_pred:non-climate", df_non_climate.shape[0])
print("all", len(df_compare))

# Persist both subsets and the full labelled frame to the working directory.
for frame, target in (
    (df_climate, "ft_meetings_climate_all_2.3.pkl"),
    (df_non_climate, "ft_meetings_non_climate_all_2.3.pkl"),
    (df_compare, "ft_meetings_all_2.3.pkl"),
):
    frame.to_pickle(target)

y_pred:climate 17226
y_pred:non-climate 317944
all 335170
