In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from torch.utils.data import DataLoader

In [None]:
#Datasets
path = "./ft_meetings_votes_995_covid.csv"
df = pd.read_csv(path) #data_collection/votes_data_cleaned.pkl
dataset = Dataset.from_pandas(df, preserve_index=False)
# Drop the header-less first CSV column (pandas names it 'Unnamed: 0') when present;
# the guard keeps the cell working for CSVs that were written without that column.
if 'Unnamed: 0' in dataset.column_names:
    dataset = dataset.remove_columns('Unnamed: 0')

# Fixed seed so the shuffled splits are identical on every Restart & Run All.
split_seed = 2019

# 80% train, 20% held out
train_test = dataset.train_test_split(test_size=0.2, seed=split_seed)
# Split the held-out 20% again: 90% of it (18% overall) and 10% of it (2% overall)
test_valid = train_test['test'].train_test_split(test_size=0.10, seed=split_seed)

# Gather everything in a single DatasetDict.
# Resulting proportions: 80% train, 18% validation, 2% test.
# NOTE(review): swap the 'test' and 'validation' entries below if the larger
# (18%) part was meant to be the test set.
train_test_valid_dataset = DatasetDict({
    'train': train_test['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

dataset = train_test_valid_dataset

In [None]:
# Show the train/test/validation DatasetDict as the cell output.
dataset

In [None]:
# Convert the training split to pandas; note this rebinds `df`, which previously held the full CSV.
df = dataset["train"].to_pandas()

In [None]:
# Peek at the first row of the training split.
df.head(1)

### Model selection

In [None]:
#Models
# Identifiers of the two candidate pretrained checkpoints (passed to `from_pretrained`).
Aelaectra = "Maltehb/aelaectra-danish-electra-small-cased"
BERT = "Maltehb/danish-bert-botxo"

# The checkpoint actually used by the rest of the notebook; assign `Aelaectra` to try the other one.
checkpoint = BERT

# Tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def tokenize(batch):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences longer than 512 tokens are truncated. No padding is applied
    here: padding is left to the ``DataCollatorWithPadding`` handed to the
    Trainer, so each model batch is padded only to its own longest sequence
    instead of to the longest sequence of a whole ``map`` batch.

    Args:
        batch: mapping whose ``'text'`` entry holds the raw strings to encode.

    Returns:
        The tokenizer output for the batch (un-padded, truncated encodings).
    """
    return tokenizer(batch['text'], truncation=True, max_length=512)


# Apply `tokenize` to every split of the DatasetDict, batch by batch.
tokenized_datasets = dataset.map(tokenize, batched=True)

# Collator built from the same tokenizer; it is passed to the Trainer as `data_collator`.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
## Steps for processing data
# All splits come from the same source dataset and the same `map` call, so the
# "train" columns tell us which clean-up steps are still pending. Checking first
# makes this cell safe to re-run (the unconditional calls raise on a second run).
train_columns = tokenized_datasets["train"].column_names

## Drop the raw text column, if not already done
if "text" in train_columns:
    tokenized_datasets = tokenized_datasets.remove_columns(["text"])

## Rename label column to labels, if not already done
if "label" in train_columns:
    tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

tokenized_datasets.set_format("torch")
tokenized_datasets.column_names
## Our model needs columns that it already knows (the 4 below, NOT any custom columns like "text"):
#['labels', 'input_ids', 'token_type_ids', 'attention_mask']

### Model specification and Training

In [None]:
import json
from datasets import load_metric
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score

def compute_metrics(pred):
    """Score one evaluation pass and append the scores to ``training_metrics.txt``.

    Args:
        pred: evaluation output exposing ``label_ids`` (the reference labels)
            and ``predictions`` (per-class scores; the argmax over the last
            axis is taken as the predicted class).

    Returns:
        dict with ``accuracy``, ``f1``, ``precision`` and ``recall`` as plain
        floats; precision/recall/F1 use binary averaging.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    # Build the result once; plain floats keep it JSON-serializable.
    metrics = {
        'accuracy': float(accuracy_score(labels, preds)),
        'f1': float(f1),
        'precision': float(precision),
        'recall': float(recall),
    }
    # Append one JSON object per evaluation; `with` closes the file even if
    # the write raises.
    with open('training_metrics.txt', 'a') as metrics_file:
        metrics_file.write(json.dumps(metrics) + '\n')
    return metrics

In [None]:
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification

# Training configuration: evaluate after every epoch, train for 4 epochs with
# 2 examples per device, never write checkpoints, and fix the seed.
training_args = TrainingArguments(
                    "test-trainer",
                    evaluation_strategy="epoch",
                    per_device_train_batch_size=2,
                    num_train_epochs=4,
                    save_strategy='no',  # the missing trailing comma here was a SyntaxError
                    seed=2019,
                    )

# Load the selected checkpoint with a 2-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Train on the "train" split and evaluate on the "validation" split;
# evaluation scores are produced by `compute_metrics`.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
## Set to use GPU
import torch

# Prefer the GPU when CUDA is available, otherwise stay on the CPU,
# then move the model there and show the chosen device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

In [None]:
# Start fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
## Save
# Write the fine-tuned model to disk.
# NOTE(review): the leading "/" makes this an absolute path at the filesystem
# root — confirm that location is intended and writable.
trainer.save_model("/klimaBERTe11_v2.1")

### Load model (example)

In [None]:
## Load (part1)
from transformers import Trainer
from transformers import AutoModelForSequenceClassification
from datasets import load_metric

def compute_metrics(eval_preds):
    """Return accuracy and binary F1 for one evaluation pass.

    Produces the same ``accuracy`` / ``f1`` keys as the GLUE MRPC metric, but
    computes them directly with NumPy so no metric has to be loaded on every
    evaluation call.

    Note: this definition reuses the name of the ``compute_metrics`` defined
    in the training section; code that looks the name up after this cell has
    run gets this version.

    Args:
        eval_preds: a ``(logits, labels)`` pair. The argmax over the last axis
            of ``logits`` is taken as the predicted class; class ``1`` is
            treated as the positive class for F1.

    Returns:
        dict with ``accuracy`` and ``f1`` as plain floats. F1 is ``0.0`` when
        there are no predicted or actual positives; accuracy is ``0.0`` for
        empty input.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    accuracy = float((predictions == labels).mean()) if labels.size else 0.0

    true_pos = int(np.sum((predictions == 1) & (labels == 1)))
    false_pos = int(np.sum((predictions == 1) & (labels != 1)))
    false_neg = int(np.sum((predictions != 1) & (labels == 1)))
    # F1 = 2TP / (2TP + FP + FN); defined as 0 when the denominator vanishes.
    denominator = 2 * true_pos + false_pos + false_neg
    f1 = 2 * true_pos / denominator if denominator else 0.0

    return {"accuracy": accuracy, "f1": f1}

# Directory of a previously saved model.
# NOTE(review): this path differs from the one `trainer.save_model` writes to
# ("/klimaBERTe11_v2.1") — confirm which saved model should be loaded.
checkpoint2 = "/klimaBERT_v2"
tokenizer2 = AutoTokenizer.from_pretrained(checkpoint2)
# NOTE(review): `training_args2` is not passed to `trainer2` in the next cell,
# so it appears to be unused — confirm.
training_args2 = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model2 = AutoModelForSequenceClassification.from_pretrained(checkpoint2, num_labels=2)


In [None]:
## Load (part2)
# Wrap the reloaded model in a Trainer so it can be used for prediction.
# No TrainingArguments, datasets or data collator are passed here.
trainer2 = Trainer(
    model2,
    tokenizer=tokenizer2,
    compute_metrics=compute_metrics,
)

In [None]:
## Predict using the loaded model
# Run the reloaded model on the held-out "test" split and print the raw prediction output.
predictions2 = trainer2.predict(tokenized_datasets["test"])
print(predictions2)

In [None]:
# Inspect the type of the object returned by `trainer2.predict`.
type(predictions2)

### Test on Twitter Data

This Twitter dataset comes from Hugging Face's dataset library. It is not specifically climate-related; it contains a broader range of material from Twitter.

In [None]:
import pandas as pd 
# Load the labelled tweets; note this rebinds `df`, which earlier cells used for the training data.
df = pd.read_csv("labeled_tw138_testing.csv")

#uncomment if you want to test on only climate quotes
#df = df[df['label']==1]

# Wrap the frame as a Dataset (without the pandas index) so it can go through the same `map`/Trainer pipeline.
test_set = Dataset.from_pandas(df, preserve_index=False)

# Display the dataset summary as the cell output.
test_set


In [None]:
# Keep the gold labels aside, then rename the column to "original_label".
# `rename_column` returns a new Dataset instead of modifying `test_set` in
# place, so the result must be assigned back; otherwise the rename is lost.
# The column check keeps the cell safe to re-run after the rename has happened.
label_column = "label" if "label" in test_set.column_names else "original_label"
list_of_labels = test_set[label_column]
if label_column == "label":
    test_set = test_set.rename_column("label", "original_label")
test_set

In [None]:
## Load Twitter data and tokenize it
dataset_twitter = test_set
# Reuses the `tokenize` helper (and therefore the training tokenizer) on the tweets.
tokenized_datasets_twitter = dataset_twitter.map(tokenize, batched=True)
tokenized_datasets_twitter.set_format("torch")

In [None]:
## Predictions:
#%%capture
# NOTE: a bare name in the middle of a cell is evaluated but not displayed.
tokenized_datasets_twitter
# Predictions come from `trainer` (the model fine-tuned in this notebook), not from `trainer2`.
predictions = trainer.predict(tokenized_datasets_twitter)
predictions

In [None]:
# Turn every pair of class scores into a readable class name.
# The first score wins only when it is strictly larger, so ties go to "climate".
label = [
    "non-climate" if score_first > score_second else "climate"
    for (score_first, score_second) in predictions[0]
]


In [None]:
## Compare y_real with y_pred
# One frame holding the tweets, the predicted class names and the gold labels.
# (The former empty `pd.DataFrame()` placeholder was overwritten immediately,
# so it has been dropped.)
df_compare = dataset_twitter.to_pandas().assign(
    y_pred=label,
    original_label=list_of_labels,
)


In [None]:

# Rows the model predicted as "non-climate".
# NOTE: despite its name, `df_twitter_climate` holds the non-climate
# predictions; the name is kept because the following cell reads it.
df_twitter_climate = df_compare[df_compare["y_pred"]=="non-climate"].reset_index()

# Report each count under its own label (previously the non-climate count was
# printed as "climate" and the total row count as "non-climate").
n_pred_climate = int((df_compare["y_pred"] == "climate").sum())
n_pred_non_climate = len(df_twitter_climate.index)
print("y_pred:climate", n_pred_climate)
print("y_pred:non-climate", n_pred_non_climate)

In [None]:
# List every quote the model put in the non-climate bucket
# (these are the false negatives when the test set holds only climate quotes).
for quote_text in df_twitter_climate['text']:
  print(quote_text, "\n")