# SCIBERT in HuggingFace

In [1]:
import os

# needs to be executed before importing torch or transformers
# server specific: only use last 3 gpus (on rattle.ifi.uzh.ch)
os.environ["CUDA_VISIBLE_DEVICES"] = "3,4,5,6"
# set the home directory for huggingface transformers (where the models are saved)
# by default this is '~/.cache/huggingface/hub'
# see https://stackoverflow.com/questions/61798573/where-does-hugging-faces-transformers-save-models
# server specific:
os.environ["HF_HOME"] = "/srv/scratch2/dbielik/.cache/huggingface/hub"

import torch
from pathlib import Path
from transformers import set_seed
from dotenv import load_dotenv

# path of the directory containing this file
BASE_DIR_PATH = Path.cwd().parent
# path of the data directory
DATA_DIR_PATH = BASE_DIR_PATH / "data" / "swisstext-2024-sharedtask"

CHECKPOINT_PATH = "/srv/scratch2/dbielik/.cache/huggingface/checkpoints"

load_dotenv(BASE_DIR_PATH.parent / ".env")

torch.set_printoptions(threshold=10_000)

USE_DETERMINISTIC_ALGORITHMS = False
torch.use_deterministic_algorithms(USE_DETERMINISTIC_ALGORITHMS)
if USE_DETERMINISTIC_ALGORITHMS:
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"

if not torch.cuda.is_available():
    print("Warning: CUDA not available!")

SEED = 1337
set_seed(SEED)

In [2]:
from datasets import load_dataset

# load the dataset
# note: if you don't have the data in the folder, use the download-data.sh script
dataset = load_dataset("json", data_files=str(DATA_DIR_PATH / "task1.jsonl")).class_encode_column("SDG")
dataset = dataset["train"].train_test_split(test_size=0.3, stratify_by_column="SDG", seed=SEED)

example = dataset["train"][0]
print("Example instance:\t", example)

labels = set(dataset["train"]["SDG"])
# identity because labels are already ids and vice-versa
id2label = lambda i: i
label2id = id2label
labels

Example instance:	 {'ID': 'oai:www.zora.uzh.ch:168503', 'TITLE': 'The carbon bubble and the pricing of bank loans', 'ABSTRACT': 'Neglecting the possibility that fossil fuel reserves can become ‘stranded’ could result in a ‘carbon bubble’ as fossil fuel firms become overvalued. This column studies whether banks price the climate policy risk of fossil fuel firms. Prior to 2015, banks did not appear to price climate policy risk. After 2015, however, the risk is priced to a certain extent, especially for firms holding more fossil fuel reserves.', 'URL': 'https://www.zora.uzh.ch/id/eprint/168503', 'SDG': 5}


{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17}

In [3]:
from transformers import AutoTokenizer
from transformers.data import DataCollatorWithPadding

# base model
HF_MODEL_NAME = "meta-llama/Meta-Llama-3-8B"
# final model
MODEL_NAME = f"{HF_MODEL_NAME}-ft-task1"

tokenizer = AutoTokenizer.from_pretrained(HF_MODEL_NAME)
tokenizer.add_special_tokens({"pad_token":"<pad>"})

def preprocess_data(instances):
    # take a batch of titles and abstracts and concat them
    titles = instances["TITLE"]
    abstracts = instances["ABSTRACT"]
    texts = [f"{title} {abstract}" for title, abstract in zip(titles, abstracts)]
    # encode
    encoding = tokenizer(texts, padding='longest', max_length=512, truncation=True, return_tensors="pt")

    # add labels
    encoding["label"] = torch.tensor([label2id(label) for label in instances["SDG"]])

    return encoding

tokenized_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)
data_collator = DataCollatorWithPadding(tokenizer)
tokenized_dataset.set_format("torch")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/301 [00:00<?, ? examples/s]

Map:   0%|          | 0/129 [00:00<?, ? examples/s]

In [4]:
example = tokenized_dataset["train"][0]
print("Example instance:\t", example)

tokenizer.decode(example["input_ids"])

Example instance:	 {'input_ids': tensor([128000,    791,  12782,  24529,    323,    279,  21913,    315,   6201,
         17017,  24952,    772,    287,    279,  13336,    430,  31376,  10633,
         30600,    649,   3719,   3451,    496,   6601,    529,   1436,   1121,
           304,    264,   3451,  74441,  24529,    529,    439,  31376,  10633,
         19339,   3719,    927,    838,   3340,     13,   1115,   3330,   7978,
          3508,  14286,   3430,    279,  10182,   4947,   5326,    315,  31376,
         10633,  19339,     13,  32499,    311,    220,    679,     20,     11,
         14286,   1550,    539,   5101,    311,   3430,  10182,   4947,   5326,
            13,   4740,    220,    679,     20,     11,   4869,     11,    279,
          5326,    374,  33705,    311,    264,   3738,  13112,     11,   5423,
           369,  19339,  10168,    810,  31376,  10633,  30600,     13, 128256,
        128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256, 128256,
       

'<|begin_of_text|>The carbon bubble and the pricing of bank loans Neglecting the possibility that fossil fuel reserves can become ‘stranded’ could result in a ‘carbon bubble’ as fossil fuel firms become overvalued. This column studies whether banks price the climate policy risk of fossil fuel firms. Prior to 2015, banks did not appear to price climate policy risk. After 2015, however, the risk is priced to a certain extent, especially for firms holding more fossil fuel reserves.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><p

In [11]:
from transformers.models.llama import LlamaForSequenceClassification
from peft import LoraConfig, TaskType, get_peft_model


model = LlamaForSequenceClassification.from_pretrained(
    HF_MODEL_NAME,
    num_labels=len(labels),
    token=os.environ["HF_TOKEN"]
).bfloat16()

peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=1, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 499,712 || all params: 7,505,498,112 || trainable%: 0.006657945849070916


In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import TrainingArguments, Trainer, EvalPrediction

BATCH_SIZE = 1
METRIC_NAME = "accuracy"

args = TrainingArguments(
    f"{CHECKPOINT_PATH}/{MODEL_NAME}",
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=10,
    eval_steps=10,
    logging_steps=10,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=20,
    load_best_model_at_end=True,
    save_total_limit=2,
    metric_for_best_model=METRIC_NAME,
    seed=SEED
)

def compute_metrics(pred: EvalPrediction):
    labels = pred.label_ids
    accuracy = accuracy_score(labels, pred.predictions.argmax(-1))
    precision, recall, f1, _ = precision_recall_fscore_support(labels, pred.predictions.argmax(-1), average="weighted")
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


Detected kernel version 4.19.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 

In [None]:
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
10,2.4437,2.292037,0.364341,0.132744,0.364341,0.194591
20,2.2613,2.152817,0.364341,0.132744,0.364341,0.194591
30,1.9506,2.037462,0.387597,0.204955,0.387597,0.241803
40,1.7841,1.933319,0.465116,0.303963,0.465116,0.349424
50,1.5414,1.839464,0.503876,0.373604,0.503876,0.395588
60,1.4301,1.784772,0.488372,0.345318,0.488372,0.393404
70,1.2049,1.744073,0.503876,0.396392,0.503876,0.439102
80,1.0367,1.723694,0.496124,0.381137,0.496124,0.422098
90,0.9642,1.719963,0.503876,0.425652,0.503876,0.455959
100,0.841,1.710947,0.488372,0.401942,0.488372,0.438292


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

TrainOutput(global_step=140, training_loss=1.3060387577329362, metrics={'train_runtime': 363.6788, 'train_samples_per_second': 16.553, 'train_steps_per_second': 0.385, 'total_flos': 1584156096552960.0, 'train_loss': 1.3060387577329362, 'epoch': 20.0})

In [9]:
import gc

try:
    del model
except NameError:
    pass
gc.collect()
torch.cuda.empty_cache()


### Investigating sklearn precision warning

In [None]:
p = torch.ones((86, 18)).argmax(-1)
p[:10] = torch.Tensor([i for i in range(10)])
t = torch.ones(86, dtype=torch.long)
t[:18] = torch.Tensor([i for i in range(18)])

# no Warning
precision_recall_fscore_support(t, p, average="weighted", labels=[i for i in range(10)])

(0.9873459873459873, 0.1282051282051282, 0.12816755893678972, None)

In [None]:
# Warning
precision_recall_fscore_support(t, p, average="weighted", labels=[i for i in range(11)])

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


(0.9748479368732533, 0.12658227848101267, 0.12654518477303286, None)