In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import DatasetDict, Dataset, load_from_disk
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from transformers import BitsAndBytesConfig
from accelerate import Accelerator
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, get_peft_model

In [2]:
path_to_retrieve = "../tokenized_dataset"


In [3]:
dataset_dict = load_from_disk(path_to_retrieve)

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [5]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [6]:
model_name = "bert-large-uncased"

In [7]:
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5, quantization_config=bnb_config)
#model.gradient_checkpointing_enable()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
print_trainable_parameters(model)

trainable params: 31886341 || all params: 183627781 || trainable%: 17.36


target_modules = [

    "bert.encoder.layer.{}.attention.self.query".format(i) for i in range(12)
] + [
    "bert.encoder.layer.{}.attention.self.key".format(i) for i in range(12)
] + [
    "bert.encoder.layer.{}.attention.self.value".format(i) for i in range(12)
]

In [9]:
target_modules = ['q_proj','k_proj','v_proj','o_proj','gate_proj','down_proj','up_proj','lm_head']


In [10]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["query", "value"]
)

In [None]:
peft_config = LoraConfig(
    r=16, # rank
    lora_alpha=16, # lora scaling factor
    target_modules=["query", "value"], # modules to apply LoRA
    lora_dropout=0.1, # dropout
    bias="none",
    modules_to_save=["classifier"], # additional modules to save
)

In [11]:
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainable params: 1583114 || all params: 185205770 || trainable%: 0.85


In [12]:
def compute_metrics(p):
    logits, labels = p.predictions, p.label_ids
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

In [13]:
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='/results',
    num_train_epochs=1,
    evaluation_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    remove_unused_columns=False,
    run_name='run_name',
    logging_dir='/logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    compute_metrics=compute_metrics, 
)

In [15]:
trainer.train()


[34m[1mwandb[0m: Currently logged in as: [33mlukemonington3[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss
10,0.8852,No log


KeyError: 'eval_loss'