In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader

In [27]:
# Datasets
path = "./ft_meetings_votes_995_covid.csv"
df = pd.read_csv(path)  # data_collection/votes_data_cleaned.pkl
# Wrap the frame as a Hugging Face Dataset and drop the 'Unnamed: 0' column
# (presumably a leftover index column from an earlier CSV export - verify).
dataset = Dataset.from_pandas(df, preserve_index=False).remove_columns('Unnamed: 0')

# Hold out 20% of the rows as the test split (fixed seed for reproducibility).
# Only 'train' and 'test' are produced here; no validation split is created.
train_test = dataset.train_test_split(test_size=0.2, seed=2022)

# Gather the splits into a single DatasetDict.
train_test_valid_dataset = DatasetDict(
    {split: train_test[split] for split in ('train', 'test')}
)

dataset = train_test_valid_dataset

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 796
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 200
    })
})

In [7]:
# Materialise the test split as a pandas DataFrame for inspection (this rebinds `df`).
df = dataset["test"].to_pandas()

In [12]:
# Count the test rows per class. Only the final expression is rendered by the
# notebook, so the number shown is the count for label == 1.
len(df.loc[df["label"] == 0])
len(df.loc[df["label"] == 1])

101

### Model selection

In [None]:
# Models: names of the candidate pretrained checkpoints
# (presumably Hugging Face Hub identifiers - verify).
Aelaectra = "Maltehb/aelaectra-danish-electra-small-cased"
BERT = "Maltehb/danish-bert-botxo"

# Checkpoint actually used; the same name is used later to load the classifier.
checkpoint = BERT

# Tokenizer matching the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize(batch):
    """Tokenize the 'text' field of a batch with the module-level `tokenizer`.

    Inputs are truncated to at most 512 tokens. No padding is applied here:
    padding inside a batched `map` would pad every example to the longest
    sequence of the whole map chunk. Padding is instead left to the
    `DataCollatorWithPadding` created right after this function, which pads
    each training/prediction batch only to that batch's longest sequence.

    Args:
        batch: mapping with a 'text' entry holding the raw strings.

    Returns:
        The tokenizer output for `batch['text']` (unpadded, truncated).
    """
    return tokenizer(batch['text'], truncation=True, max_length=512)


# Run `tokenize` over the dataset in batches (batched=True passes many rows per call).
tokenized_datasets = dataset.map(tokenize, batched=True)

# Collator built from the same tokenizer; it is handed to the Trainer further down.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
## Steps for processing data
# Drop the raw "text" column, then rename "label" to "labels"
# (rename only if not already done).
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)

# Have the datasets return torch tensors when indexed.
tokenized_datasets.set_format("torch")
tokenized_datasets.column_names
## Our model needs columns that it already knows (the 4 below, NOT any custom columns like "text"):
#['labels', 'input_ids', 'token_type_ids', 'attention_mask']

### Model specification and Training

In [None]:
import json
from datasets import load_metric
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Compute binary classification metrics and append them to a log file.

    The predicted class is the arg-max over the last axis of
    `pred.predictions`; it is compared against `pred.label_ids`.

    Args:
        pred: object exposing `label_ids` and `predictions`.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.

    Side effects:
        Appends the same dict as one JSON line to 'training_metrics.txt'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    # Build the result once so the logged line and the returned value cannot drift apart.
    metrics = {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1,
        'precision': precision,
        'recall': recall
    }
    # Append mode keeps earlier evaluations; the context manager closes the
    # file even if the write raises.
    with open('training_metrics.txt', 'a') as file_object:
        file_object.write(json.dumps(metrics) + '\n')
    return metrics

In [None]:
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification

# Training configuration: evaluate once per epoch, never write checkpoints.
training_args = TrainingArguments(
    "test-trainer",
    evaluation_strategy="epoch",
    per_device_train_batch_size=2,
    num_train_epochs=4,
    save_strategy='no',  # the comma after this argument was missing (SyntaxError)
    seed=2019,
)

# Sequence classifier with a 2-class head on top of the selected checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# The dataset built earlier in this notebook only has 'train' and 'test'
# splits, so indexing "validation" raised a KeyError. Use a validation split
# when one exists, otherwise fall back to the test split.
eval_split = "validation" if "validation" in tokenized_datasets else "test"

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets[eval_split],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
## Set to use GPU
import torch

# Pick the GPU when CUDA is available, otherwise fall back to the CPU,
# move the model there and display the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
## Save
# NOTE(review): the leading "/" makes this an absolute path at the filesystem
# root - confirm that is intended rather than a path relative to the notebook.
trainer.save_model("/klimaBERTe11_v2.1")

Load model (example)

In [None]:
## Load (part1)
from transformers import Trainer
from transformers import AutoModelForSequenceClassification
from datasets import load_metric

def compute_metrics(eval_preds):
    """Score predictions with the metric loaded via `load_metric("glue", "mrpc")`.

    `eval_preds` unpacks into `(logits, labels)`; the predicted class is the
    arg-max of the logits over the last axis.
    """
    glue_mrpc = load_metric("glue", "mrpc")
    logits, references = eval_preds
    predicted_classes = np.argmax(logits, axis=-1)
    return glue_mrpc.compute(predictions=predicted_classes, references=references)

# Reload a tokenizer and a 2-class classifier from `checkpoint2`.
# NOTE(review): this is an absolute path at the filesystem root and differs
# from the path used in the "Save" cell ("/klimaBERTe11_v2.1") - confirm.
checkpoint2 = "/klimaBERT_v2"
tokenizer2 = AutoTokenizer.from_pretrained(checkpoint2)
# NOTE(review): training_args2 is not passed to `trainer2` in the next cell.
training_args2 = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model2 = AutoModelForSequenceClassification.from_pretrained(checkpoint2, num_labels=2)


In [None]:
## Load (part2)
# Trainer around the reloaded model; it is used below for prediction.
# No TrainingArguments are passed here.
trainer2 = Trainer(
    model2,
    tokenizer=tokenizer2,
    compute_metrics=compute_metrics,
)

In [None]:
## Predict using the loaded model
# Runs the reloaded model over the tokenized test split and prints the raw result.
predictions2 = trainer2.predict(tokenized_datasets["test"])
print(predictions2)

In [None]:
# Inspect the type of the object returned by `trainer2.predict`.
type(predictions2)

### Test on Twitter Data

This Twitter dataset comes from the Hugging Face dataset library. It is not specifically about climate; it contains broader, general material from Twitter.

In [None]:
import pandas as pd 
# Read the labelled Twitter test file (this rebinds `df`).
df = pd.read_csv("labeled_tw138_testing.csv")

# Uncomment to test on only the climate quotes (rows with label == 1)
#df = df[df['label']==1]

# Wrap the frame as a Hugging Face Dataset without carrying over the pandas index.
test_set = Dataset.from_pandas(df, preserve_index=False)

test_set


In [None]:
# Keep the ground-truth labels for the later comparison, then rename the
# column to "original_label".
# `rename_column` returns a NEW dataset instead of modifying `test_set` in
# place, so the result must be assigned back (it was previously discarded).
# The guard keeps the cell re-runnable once the column has been renamed.
if "label" in test_set.column_names:
    list_of_labels = test_set['label']
    test_set = test_set.rename_column("label", "original_label")
test_set

In [None]:
## Load Twitter data and tokenize it
# `dataset_twitter` is an alias of `test_set`; it is tokenized with the same
# `tokenize` function used for the training data.
dataset_twitter = test_set
tokenized_datasets_twitter = dataset_twitter.map(tokenize, batched=True)
# Have the tokenized dataset return torch tensors when indexed.
tokenized_datasets_twitter.set_format("torch")

In [None]:
## Predictions:
#%%capture
# NOTE: the bare expression on the next line is not the last statement of the
# cell, so the notebook does not display it.
tokenized_datasets_twitter
predictions = trainer.predict(tokenized_datasets_twitter)
predictions

In [None]:
# Turn each pair of logits into a class name: "non-climate" only when the
# first logit is strictly larger, otherwise (ties included) "climate".
label = [
    "non-climate" if logit_0 > logit_1 else "climate"
    for (logit_0, logit_1) in predictions[0]
]


In [None]:
## Compare y_real with y_pred
# Start from the Twitter rows and attach the predicted class names and the
# ground-truth labels captured earlier as two columns.
df_compare = dataset_twitter.to_pandas().assign(
    y_pred=label,
    original_label=list_of_labels,
)


In [None]:

# Rows the model predicted as "non-climate". The variable name is kept
# because the next cell iterates over it.
df_twitter_climate = df_compare[df_compare["y_pred"]=="non-climate"].reset_index()
n_pred_non_climate = len(df_twitter_climate.index)
# Every row is either "climate" or "non-climate", so the climate count is the remainder.
n_pred_climate = len(df_compare.index) - n_pred_non_climate
# Previously the non-climate count was printed under the "climate" label and
# the total row count under the "non-climate" label.
print("y_pred:climate",n_pred_climate)
print("y_pred:non-climate",n_pred_non_climate)

In [None]:
# Show the quotes that were predicted as non-climate, one per paragraph.
for quote in df_twitter_climate['text']:
    print(quote, "\n")

### Performance metrics

#### Load model and test set

In [8]:
## Load (part1)
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from datasets import load_metric

def compute_metrics(eval_preds):
    """Evaluate `(logits, labels)` with the metric from `load_metric("glue", "mrpc")`.

    The class prediction for each row is the index of its largest logit
    (arg-max over the last axis).
    """
    mrpc_metric = load_metric("glue", "mrpc")
    logits, gold_labels = eval_preds
    class_ids = np.argmax(logits, axis=-1)
    return mrpc_metric.compute(predictions=class_ids, references=gold_labels)

# NOTE: this rebinds `checkpoint`, `tokenizer`, `model` and (below) `trainer`,
# replacing the objects created in the training section of the notebook.
# NOTE(review): absolute path at the filesystem root - confirm.
checkpoint = "/klimaBERTe4_v2.1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

## Load trainer API
# No TrainingArguments are passed here; the Trainer is used below for prediction.
trainer = Trainer(
    model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [9]:
def tokenize(batch):
    """Tokenize `batch['text']` with the module-level `tokenizer`.

    Sequences are padded (padding=True) and truncated to at most 512 tokens.
    """
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=512)

# Narrow `dataset` to its test split. The isinstance guard makes the cell safe
# to re-run: once `dataset` is already the test split (a plain Dataset),
# indexing it with "test" again would fail.
if isinstance(dataset, DatasetDict):
    dataset = dataset["test"]
tokenized_datasets = dataset.map(tokenize, batched=True)
# Have the tokenized split return torch tensors when indexed.
tokenized_datasets.set_format("torch")

100%|██████████| 1/1 [00:00<00:00,  3.33ba/s]


In [10]:
## Predict on sub-set
# `tokenized_datasets` here is the tokenized test split prepared in the previous cell.
predictions2 = trainer.predict(tokenized_datasets)
predictions2

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 200
  Batch size = 8
Downloading builder script: 5.76kB [00:00, 764kB/s]                    


PredictionOutput(predictions=array([[ 4.7510056, -3.6382744],
       [ 4.6295094, -3.6536262],
       [ 4.636514 , -3.5109625],
       [-3.8254824,  3.5311162],
       [ 4.570917 , -3.5767603],
       [-4.205848 ,  3.7701135],
       [ 4.658006 , -3.5874648],
       [ 4.8381867, -3.8025331],
       [-4.7697635,  4.045868 ],
       [ 4.7213674, -3.8322732],
       [-4.5998554,  4.095766 ],
       [ 4.6287847, -3.6518412],
       [-4.696137 ,  3.9940534],
       [-3.2632928,  3.12307  ],
       [ 4.6697454, -3.7154021],
       [-4.4340196,  3.8638675],
       [-4.534858 ,  3.919992 ],
       [ 4.6017013, -3.6463678],
       [ 4.6675735, -3.6929398],
       [ 4.728398 , -3.5579088],
       [ 4.426664 , -3.553609 ],
       [ 4.685045 , -3.6837354],
       [-2.6666918,  2.5495653],
       [ 4.5215707, -3.5741937],
       [ 4.6695957, -3.6921668],
       [ 4.5658965, -3.5805566],
       [ 4.626716 , -3.6279554],
       [ 4.616471 , -3.5807369],
       [ 4.5933685, -3.6979885],
       [ 4.480

#### Calculate metrics

In [12]:
# example of a roc curve for a predictive model
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from matplotlib import pyplot
import torch
from sklearn.dummy import DummyClassifier
from datasets import load_metric
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
# Softmax module applied over the last dimension.
# NOTE(review): `m` is not referenced in the cells visible here - confirm it is still needed.
m = torch.nn.Softmax(dim=-1)

In [23]:
# Turn each pair of logits into a class name: "non-climate" only when the
# first logit is strictly larger, otherwise (ties included) "climate".
label = [
    "non-climate" if logit_0 > logit_1 else "climate"
    for (logit_0, logit_1) in predictions2[0]
]

{'For': 0, 'Imod': 1}

In [25]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import roc_auc_score, f1_score
# Integer encoding of the class names, used to compare the predicted names
# against the integer labels of the test split.
lab2int = {"non-climate": 0, "climate": 1}
y_test2 = dataset["label"]
y_pred2 = list(map(lab2int.__getitem__, label))

In [26]:
# ROC AUC is a ranking metric, so it is computed from a continuous score, not
# from the already-thresholded 0/1 predictions (which reduces it to a single
# operating point). The score is the logit margin of class 1 over class 0; it
# ranks the examples exactly like the softmax probability of class 1.
logits2 = np.asarray(predictions2.predictions)
y_score2 = logits2[:, 1] - logits2[:, 0]
auc = roc_auc_score(y_test2, y_score2)

# Threshold-based metrics use the hard class predictions.
precision = precision_score(y_test2, y_pred2)
recall = recall_score(y_test2, y_pred2)
f1 = f1_score(y_test2, y_pred2)
f1_macro = f1_score(y_test2, y_pred2,average="macro")
f1_micro = f1_score(y_test2, y_pred2,average="micro")
f1_weight = f1_score(y_test2, y_pred2,average="weighted")
print('Precision: {} / Recall: {} / AUC: {} / F1: {} / F1-macro: {} / F1-micro {}  / F1-weight {} '.format(
    round(precision, 2), round(recall, 2), round(auc,2), round(f1,2), round(f1_macro,2), round(f1_micro,2), round(f1_weight,2)))

Precision: 1.0 / Recall: 1.0 / AUC: 1.0 / F1: 1.0 / F1-macro: 1.0 / F1-micro 1.0  / F1-weight 1.0 
