In [10]:
# Name of the directory the project repository is cloned into on Google Colab;
# later cells use it as the clone target and to build the import path.
GITHUB_DIR = "aspects-space"

In [None]:
!pip install transformers
!pip install transformers datasets evaluate
!pip install accelerate -U
!pip install pynvml
!pip install stanza

In [None]:
import sys
import os

try:  # When on Google Colab, clone the repository to download any necessary cache.
    import google.colab
    repo_path = GITHUB_DIR
    !git -C $repo_path pull origin || git clone "https://github.com/katrinrohrb/aspects-space-dev.git" $repo_path
except:
    repo_path = '.'  # Use the local path if not on Google Colab

In [19]:
!wget https://raw.githubusercontent.com/katrinrohrb/aspects-space-dev/refs/heads/colabtest/data/annotations.csv annotations.csv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100     9  100     9    0     0     29      0 --:--:-- --:--:-- --:--:--    29


In [13]:
# Make the project package importable. `repo_path` is the clone directory on Colab
# and '.' otherwise, so resolving it (instead of hard-coding '/content') also works
# for local runs. The membership check keeps re-runs from stacking duplicate entries.
module_path = os.path.abspath(repo_path)
if module_path not in sys.path:
    sys.path.insert(0, module_path)

from katspace.core import MODEL_DIR
from pathlib import Path

In [None]:
import numpy as np

from datasets import load_dataset

from transformers import AutoTokenizer

from datasets import ClassLabel

from transformers import TrainingArguments, Trainer, logging
from transformers import AutoModelForSequenceClassification

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

from pynvml import *

import evaluate

# Load the "accuracy" metric through the `evaluate` library.
# NOTE(review): `accuracy` is not referenced by any cell visible here
# (compute_metrics relies on sklearn) -- confirm it is still needed.
accuracy = evaluate.load("accuracy")

import numpy as np
from sklearn.metrics import f1_score

# Standard-library logging. Note that this import rebinds the name `logging`, which
# the cell also imported from `transformers` further up.
import logging

# Send INFO-and-above records to a UTF-8 log file; force=True replaces any handlers
# already attached to the root logger.
LOG_FILENAME = Path("session.log")
logging.basicConfig(
    filename=LOG_FILENAME,
    level=logging.INFO,
    encoding='utf-8',
    force=True,
)
logging.info("Start logging")

## Training

In [None]:
# Read the downloaded annotation CSV and keep its "train" split as the working dataset.
annotations_csv = Path("annotations.csv").as_posix()
ds = load_dataset('csv', data_files=annotations_csv)["train"]

In [None]:
# The five space-type classes and the integer id assigned to each.
label2id = {
    "perceived_space": 0,
    "action_space": 1,
    "visual_space": 2,
    "descriptive_space": 3,
    "no_space": 4,
}
label_names = list(label2id)

# Discard rows whose annotation is not one of the known classes.
ds = ds.filter(lambda row: row["space_type"] in label_names)

# Encode the string annotations as a ClassLabel feature, drop the ID / filename /
# sentence_id columns, and expose the target column under the name "label".
labels = ClassLabel(num_classes=5, names=label_names)
ds = (
    ds.cast_column("space_type", labels)
    .remove_columns(['ID', 'filename', 'sentence_id'])
    .rename_column("space_type", "label")
)

In [None]:
# Hold out 30% of the rows for evaluation. The fixed seed makes the random split
# reproducible across kernel restarts, so reported metrics can be compared between runs.
ds = ds.train_test_split(test_size=0.3, seed=42)
ds

In [None]:
# Tokenizer belonging to the same checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("lkonle/fiction-gbert-large")

In [46]:
def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with the notebook-level tokenizer.

    Truncation is enabled; padding is not applied here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

In [None]:
# Tokenize the dataset by mapping preprocess_function over it in batches.
ds = ds.map(preprocess_function, batched=True)

In [54]:
def print_gpu_utilization():
    """Print how much memory is currently in use on GPU 0, as reported by NVML.

    The value is the used byte count integer-divided by 1024**2.
    """
    nvmlInit()
    memory = nvmlDeviceGetMemoryInfo(nvmlDeviceGetHandleByIndex(0))
    used = memory.used // 1024**2
    print(f"GPU memory occupied: {used} MB.")

def print_summary(result):
    """Print runtime and throughput from a training result, then current GPU memory use.

    `result` must expose a `metrics` mapping containing 'train_runtime' and
    'train_samples_per_second'.
    """
    metrics = result.metrics
    for caption, key in (
        ("Time", 'train_runtime'),
        ("Samples/second", 'train_samples_per_second'),
    ):
        print(f"{caption}: {metrics[key]:.2f}")
    print_gpu_utilization()

def compute_metrics(eval_pred):
    """Compute overall and per-class F1 for an evaluation pass.

    Args:
        eval_pred: pair `(predictions, labels)`. `predictions` holds one row of
            per-class scores per sample (argmax over axis 1 yields the predicted
            class id); `labels` holds the gold class ids.

    Returns:
        dict with 'f1_score' (micro-averaged F1 over all samples) and one
        'f1_score_<label name>' entry per class in the notebook-level `id2label`.

    Side effects:
        Prints sklearn's classification report.
    """
    predictions, gold = eval_pred
    predictions = np.argmax(predictions, axis=1)

    print(classification_report(y_true=gold, y_pred=predictions, zero_division=0))

    f1_dict = {
        'f1_score': float(
            f1_score(y_true=gold, y_pred=predictions, average='micro', zero_division=0)
        )
    }

    # Per-class F1 in one call. Scoring only the rows of a single gold class with
    # micro averaging (as done previously) yields that class's recall, not its F1,
    # and breaks down when a class has no rows in the evaluation set.
    class_ids = list(id2label.keys())
    per_class = f1_score(
        y_true=gold,
        y_pred=predictions,
        labels=class_ids,
        average=None,
        zero_division=0,
    )
    for class_id, score in zip(class_ids, per_class):
        f1_dict[f"f1_score_{id2label[class_id]}"] = float(score)

    return f1_dict

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers releases.
# Because the install cell does not pin a version, use whichever name the installed
# TrainingArguments dataclass actually declares.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# Keyword arguments forwarded to TrainingArguments (the batch size is passed separately).
train_args = {
    "output_dir": Path(MODEL_DIR, "test.model").as_posix(),
    "learning_rate": 1e-5,
    # Save and evaluate once per epoch; load_best_model_at_end needs both to line up.
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
    _eval_strategy_key: "epoch",
    "num_train_epochs": 5,
    "log_level": "error",
    "report_to": "none",
}

# Per-device training batch size; also the multiple the datasets are trimmed to.
batch_size = 14

#logging.set_verbosity_error()

In [None]:
# Load the pretrained checkpoint with a sequence-classification head. The head size
# and the label maps are derived from `label2id`, so the class count is not repeated
# as a magic number and the model config carries the human-readable label names.
model = AutoModelForSequenceClassification.from_pretrained(
    "lkonle/fiction-gbert-large",
    num_labels=len(label2id),
    label2id=label2id,
    id2label={idx: name for name, idx in label2id.items()},
).to("cuda")

In [52]:
from transformers import DataCollatorWithPadding

# Collator built from the notebook's tokenizer; handed to the Trainer below.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Class name -> id, and the inverse id -> class name lookup used when reporting metrics.
label2id = {
    "perceived_space": 0,
    "action_space": 1,
    "visual_space": 2,
    "descriptive_space": 3,
    "no_space": 4,
}
id2label = {idx: name for name, idx in label2id.items()}

In [53]:
def trim(dataset, batch_size):
  """Return `dataset` cut down to the largest row count divisible by `batch_size`.

  Rows are kept from the start; the trailing incomplete batch is dropped.
  """
  full_batches = dataset.num_rows // batch_size
  return dataset.select(range(full_batches * batch_size))

# Trim both splits to a whole number of batches.
train_dataset, eval_dataset = (
    trim(ds[split], batch_size) for split in ("train", "test")
)

In [None]:
# Assemble the training configuration and the Trainer, run fine-tuning, then report
# runtime, throughput and GPU memory use.
training_args = TrainingArguments(
    per_device_train_batch_size=batch_size,
    **train_args,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
result = trainer.train()
print_summary(result)

# Inspect Results

In [None]:
# Run the trained model over the (trimmed) evaluation split.
# NOTE(review): `predictions` is not read by the cells visible below --
# print_predictions calls trainer.predict itself.
predictions = trainer.predict(eval_dataset)

In [57]:
def print_predictions(dataset, trainer = None, class_label = None):
  """Print every sentence of `dataset` with its predicted and gold label.

  Args:
    dataset: object exposing `num_rows` and a "text" column (read as dataset["text"]).
    trainer: object whose `predict(dataset)` returns an output with `predictions`
      (one row of per-class scores per sample) and `label_ids` (gold ids, or None
      when the dataset has no labels). Required despite the None default.
    class_label: optional object with an `int2str(int)` method used to turn class
      ids into names; defaults to the notebook-level `labels`.

  Raises:
    ValueError: if `trainer` is None.
  """
  if trainer is None:
    raise ValueError("print_predictions() needs a trainer to run predictions with")
  decoder = labels if class_label is None else class_label

  output = trainer.predict(dataset)
  label_ids = np.argmax(output.predictions, axis = 1)

  gold_ids = output.label_ids
  if gold_ids is None:
    # One placeholder per sample: the sample count is the number of rows, not columns.
    gold_ids = [None] * dataset.num_rows

  print(f"Number of samples: {dataset.num_rows}")
  for sentence, label, gold in zip(dataset["text"], label_ids, gold_ids):
    label = decoder.int2str(int(label))
    gold = "?" if gold is None else decoder.int2str(int(gold))
    print(f"\n\n{format(sentence)}\n\nPRED:\t{label}\n\nGOLD:\t{gold}\n\n+++++++++++++++++++++++++++++++++++++")

In [None]:
 # Show at most the first 554 evaluation samples; clamping to num_rows keeps select() from failing on a smaller split.
 print_predictions(eval_dataset.select(range(min(554, eval_dataset.num_rows))), trainer)