In [None]:
! pip install transformers datasets evaluate accelerate peft -q

In [2]:
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import pandas as pd
from datasets import Dataset

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the CSV files loaded further down are read via relative paths,
# not from /content/drive — confirm whether the working directory is expected
# to point into the mounted drive.
from google.colab import drive
drive.mount('/content/drive')

# Dataset

Any annotated dataset can be used for domain adaptation. If the dataset is changed, `tokenize_function` also has to be adapted to access the correct fields.

In [None]:
# Load the label table and the train/test splits from CSV files in the working directory.
labels_df = pd.read_csv('medical_tc_labels.csv')
train_df = pd.read_csv('medical_tc_train.csv')
test_df = pd.read_csv('medical_tc_test.csv')

# The number of rows in the labels file is used as the number of output classes.
num_labels = labels_df.shape[0]
print(num_labels)

# Fail fast with a readable message: tokenize_function reads these two columns and
# shifts the 1-based labels down by one, so anything outside 1..num_labels would
# otherwise only surface later as an obscure error inside the training loop.
for split_name, split_df in (('train', train_df), ('test', test_df)):
    missing_columns = {'medical_abstract', 'condition_label'} - set(split_df.columns)
    if missing_columns:
        raise ValueError(
            f"{split_name} split is missing required column(s): {sorted(missing_columns)}"
        )
    labels_in_range = (split_df['condition_label'] >= 1) & (split_df['condition_label'] <= num_labels)
    if not labels_in_range.all():
        raise ValueError(
            f"{split_name} split has 'condition_label' values outside 1..{num_labels}"
        )

# Convert the DataFrame to a Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [6]:
# Identifier of the pretrained base checkpoint; reused below when the
# sequence-classification model is loaded.
model_name = "bert-base-uncased"

# Tokenizer belonging to the base checkpoint; used by tokenize_function.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of abstracts and attach zero-based class labels.

    Args:
        examples: A batch mapping with a 'medical_abstract' column (texts) and a
            'condition_label' column (numeric labels that start at 1).

    Returns:
        The tokenizer output for the batch, truncated/padded to 512 tokens, with
        an added 'labels' entry holding each label shifted down by one so that
        the classes start at 0.
    """
    encoded_batch = tokenizer(
        examples['medical_abstract'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )

    # The CSV labels are 1-based; the model expects class ids starting at 0.
    zero_based_labels = []
    for raw_label in examples['condition_label']:
        zero_based_labels.append(raw_label - 1)
    encoded_batch['labels'] = zero_based_labels

    return encoded_batch

# Run tokenize_function over both splits; batched=True hands it whole batches,
# which is why the function iterates over the label column.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = test_dataset.map(tokenize_function, batched=True)

In [None]:
# Accuracy metric from the `evaluate` library, loaded once and reused by compute_metrics.
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    """Score a set of predictions with the module-level accuracy metric.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class for each
            row is the index of the largest logit along the last axis.

    Returns:
        Whatever ``metric.compute`` returns for the predicted vs. reference labels.
    """
    class_scores, reference_labels = eval_pred
    predicted_classes = np.argmax(class_scores, axis=-1)
    return metric.compute(
        predictions=predicted_classes,
        references=reference_labels,
    )

# LoRA

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA settings for sequence classification: rank-32 adapters with alpha 32 and
# 10% dropout on the modules named "query" and "value"; bias terms are not trained.
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "value"],
    task_type=TaskType.SEQ_CLS,
)

# Load the base classifier and wrap it with the LoRA adapter in one step, so
# `model_lora` only ever refers to the adapter-wrapped model.
model_lora = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels),
    lora_config,
)
model_lora.print_trainable_parameters()  # see % trainable parameters

# Training

In [None]:
from dataclasses import fields

# The per-epoch evaluation option is called `eval_strategy` in newer transformers
# releases and `evaluation_strategy` in older ones. Since the install cell does not
# pin a version, pick whichever name the installed TrainingArguments defines.
_training_arg_names = {f.name for f in fields(TrainingArguments)}
_eval_strategy_arg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Train for 15 epochs, evaluating on the eval split after every epoch;
# checkpoints and logs go to the "bert_peft_trainer" directory.
training_args = TrainingArguments(
    output_dir="bert_peft_trainer",
    num_train_epochs=15,
    **{_eval_strategy_arg: "epoch"},
)
bert_peft_trainer = Trainer(
    model=model_lora,
    args=training_args,
    train_dataset=tokenized_train_dataset, # training dataset requires column input_ids
    eval_dataset=tokenized_eval_dataset,
    compute_metrics=compute_metrics,
)
# Baseline: evaluate once before any training to see the starting accuracy.
print(bert_peft_trainer.evaluate())
bert_peft_trainer.train()
# Final metrics after training (shown as the cell output).
bert_peft_trainer.evaluate()

In [13]:
# Save the trained model to the local "model" directory.
# NOTE(review): the merge step further down loads this directory as a PEFT
# adapter, so this presumably stores the adapter weights rather than a full
# model — confirm.
bert_peft_trainer.save_model('model')

# Combining the LoRA adapter with the network

Instead of the final adapter, any checkpoint created during training can be used for the adapter, for example the one from the epoch with the best accuracy.

In [None]:
from peft import PeftModel
# Rebuild the base classifier from the same checkpoint and with the same number
# of labels that were used for training (instead of hard-coded values), so the
# saved adapter still fits if the dataset or base model is changed above.
original_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
# Attach the adapter stored in the "model" directory to the base model.
original_with_adapter = PeftModel.from_pretrained(original_model, "model")
# Merge the adapter into the base model, unload the adapter wrapper, and save the result.
merged_model = original_with_adapter.merge_and_unload()
merged_model.save_pretrained("merged-model")

After this step, the merged, LoRA fine-tuned BERT model has been saved to `merged-model` and can be used further.