In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import DatasetDict, Dataset, load_from_disk
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from transformers import BitsAndBytesConfig
from accelerate import Accelerator
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, get_peft_model
from transformers import TrainingArguments, AutoConfig, \
    AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig, DataCollatorWithPadding
from peft import (
    PeftConfig,
    PeftModel,
)

In [2]:
# References:
# - PEFT quicktour: https://huggingface.co/docs/peft/quicktour
# - LoRA conceptual guide: https://huggingface.co/docs/peft/conceptual_guides/lora

In [3]:
# Relative path of the saved, already-tokenized dataset that is read with
# `load_from_disk` in the next cell.
path_to_retrieve = "../tokenized_dataset"


In [4]:
# Load the dataset stored at `path_to_retrieve`; its "train" and "test" splits
# are handed to the Trainer further down.
dataset_dict = load_from_disk(path_to_retrieve)

In [5]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides ``numel()``
            and a ``requires_grad`` attribute.

    Returns:
        None. A single summary line is written to stdout in the form
        ``trainable params: T || all params: A || trainable%: P``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct:.2f}"
    )

In [6]:
# Identifier of the pretrained checkpoint passed to `from_pretrained`.
model_id = "bert-large-uncased"
# Number of output classes for the classification head.
# NOTE(review): presumably equals the number of distinct labels in the tokenized dataset — confirm.
num_labels=5

In [7]:
# bitsandbytes quantization settings, supplied to `from_pretrained` through
# `quantization_config` when the model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_use_double_quant=True,  # nested quantization: the quantization constants are quantized too
    bnb_4bit_quant_type="nf4",  # use the NF4 4-bit data type
    bnb_4bit_compute_dtype=torch.bfloat16  # dtype used for computation
)

In [8]:

# Load the pretrained checkpoint with a sequence-classification head, quantized
# according to `bnb_config`. The head size comes from `num_labels` (config cell)
# instead of a repeated literal, so the two cannot drift apart.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    quantization_config=bnb_config,
    device_map={"": 0},  # map the root module, i.e. the whole model, to device 0
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Turn on gradient checkpointing, then run PEFT's preparation step for training
# a k-bit (quantized) model.
# NOTE(review): `prepare_model_for_kbit_training` appears to enable gradient
# checkpointing itself by default (`use_gradient_checkpointing=True`), which would
# make the explicit call redundant — confirm against the installed PEFT version.
# NOTE(review): `model` is rebound here, so re-running this cell operates on an
# already-prepared model; re-run from the model-loading cell instead.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [10]:
# NOTE(review): `config` is not referenced anywhere else in the visible notebook.
# The adapter is built from `peft_config` (r=12, no `bias` argument) in the cells
# that follow, so the settings below (r=32, bias="all") are not applied to the model.
# Either remove this cell or pass `config` to `get_peft_model` if it was the
# intended configuration.
config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="all",
    task_type=TaskType.SEQ_CLS
)


In [11]:
# LoRA settings for the sequence-classification task; this is the configuration
# that is handed to `get_peft_model`.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=12,
    lora_alpha=32,
    lora_dropout=0.1,
)


In [12]:
# Wrap the prepared base model with the LoRA adapters described by `peft_config`.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this cell
# would wrap an already-wrapped model; re-run from the model-loading cell instead.
model = get_peft_model(model, peft_config)


In [13]:
# Report trainable vs. total parameter counts via the method on the wrapped model.
# The module-level `print_trainable_parameters(model)` helper defined earlier in
# this notebook is not the one being called here.
model.print_trainable_parameters()


trainable params: 1,189,898 || all params: 336,331,786 || trainable%: 0.35378695964228607


In [14]:
def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, reduced with
            ``argmax`` over the last axis) and ``label_ids`` (reference labels).

    Returns:
        dict: ``{"accuracy": <value returned by accuracy_score>}``.
    """
    # The highest-scoring class along the last axis is the predicted label.
    predicted_classes = p.predictions.argmax(axis=-1)
    return {"accuracy": accuracy_score(p.label_ids, predicted_classes)}

In [15]:
# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # NOTE(review): '/results' and '/logs' are absolute paths at the filesystem
    # root; relative paths ('./results', './logs') may have been intended — confirm.
    output_dir='/results',
    num_train_epochs=1,
    evaluation_strategy="steps",
    # Evaluation interval made explicit. When unset it falls back to
    # `logging_steps`, so the value is unchanged (10), but changing
    # `logging_steps` later can no longer silently shift evaluation away from
    # `save_steps`, which `load_best_model_at_end=True` needs to stay aligned.
    eval_steps=10,
    save_steps=10,
    save_total_limit=2,  # keep at most two checkpoints on disk
    remove_unused_columns=False,
    run_name='run_name',
    logging_dir='/logs',
    logging_steps=10,
    load_best_model_at_end=True,
    learning_rate=5e-4
)

In [16]:
# Build the Trainer for the LoRA-wrapped model using `training_args`; accuracy is
# reported through `compute_metrics` at every evaluation.
# NOTE(review): no tokenizer or data collator is passed, so this presumably relies
# on the stored dataset already being padded to a uniform length — confirm.
# NOTE(review): the "test" split is used for periodic evaluation while
# `load_best_model_at_end=True`, so checkpoint selection is driven by the test
# split; consider a separate validation split if one is available.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    compute_metrics=compute_metrics, 
)

In [17]:
# Run fine-tuning. With the settings in `training_args`, the training loss is
# logged and a checkpoint is saved every 10 steps; the recorded output below also
# shows an evaluation row every 10 steps.
trainer.train()


[34m[1mwandb[0m: Currently logged in as: [33mlukemonington3[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Accuracy
10,1.8085,1.620777,0.27
20,1.5321,1.38492,0.395
30,1.3305,1.295095,0.415
40,1.3296,1.122131,0.59
50,1.2295,1.099754,0.51
60,1.0218,0.954341,0.605
70,0.8702,0.802804,0.72
80,0.8117,0.735125,0.67
90,0.8554,0.709131,0.755
100,0.7464,0.679095,0.775


TrainOutput(global_step=100, training_loss=1.1535734462738036, metrics={'train_runtime': 149.5552, 'train_samples_per_second': 5.349, 'train_steps_per_second': 0.669, 'total_flos': 376090681344000.0, 'train_loss': 1.1535734462738036, 'epoch': 1.0})