In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import DatasetDict, Dataset, load_from_disk
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig

from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from transformers import BitsAndBytesConfig
import bitsandbytes as bnb
from accelerate import Accelerator
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, get_peft_model
from transformers import TrainingArguments, AutoConfig, \
    AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig, DataCollatorWithPadding
from peft import (
    PeftConfig,
    PeftModel,
)

In [14]:
from transformers import EarlyStoppingCallback, IntervalStrategy


In [2]:
#https://huggingface.co/docs/peft/quicktour
#https://huggingface.co/docs/peft/conceptual_guides/lora
#https://blog.ovhcloud.com/fine-tuning-llama-2-models-using-a-single-gpu-qlora-and-ai-notebooks/

In [3]:
path_to_retrieve = "../tokenized_dataset"


In [4]:
dataset_dict = load_from_disk(path_to_retrieve)

In [5]:
set(dataset_dict['train']['labels'])

{0, 1, 2, 3, 4}

In [6]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param:.2f}"
    )

In [10]:
model_id = "bert-large-uncased"
num_labels=5

In [11]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [12]:

model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=5,quantization_config=bnb_config, device_map={"":0})

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [35]:
def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

In [36]:
modules = find_all_linear_names(model)

In [37]:
modules

['query', 'dense', 'key', 'value']

In [38]:
config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="all",
    task_type=TaskType.SEQ_CLS,
    target_modules = modules
)


In [39]:
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS, inference_mode=False, r=12, lora_alpha=32, lora_dropout=0.1)


In [40]:
model = get_peft_model(model, peft_config)


In [41]:
model.print_trainable_parameters()


trainable params: 1,189,898 || all params: 336,331,786 || trainable%: 0.35378695964228607


In [42]:
def compute_metrics(p):
    logits, labels = p.predictions, p.label_ids
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

In [43]:
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    #gradient_accumulation_steps=4,
    output_dir='/results',
    num_train_epochs=5,
    evaluation_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    remove_unused_columns=False,
    run_name='run_name',
    logging_dir='/logs',
    logging_steps=10,
    load_best_model_at_end=True,
    learning_rate=5e-4,
    optim="paged_adamw_8bit",
)

In [44]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    compute_metrics=compute_metrics, 
)

In [45]:
trainer.train()


Step,Training Loss,Validation Loss,Accuracy
10,1.6621,1.542344,0.325
20,1.4721,1.320019,0.46
30,1.3624,1.143827,0.585
40,1.195,0.985453,0.61
50,1.0086,0.733378,0.715
60,0.6556,0.58684,0.845
70,0.6403,0.461379,0.86
80,0.4495,0.334911,0.87
90,0.2796,0.228962,0.935
100,0.2057,0.197788,0.93


TrainOutput(global_step=500, training_loss=0.19404975135391578, metrics={'train_runtime': 741.3762, 'train_samples_per_second': 5.395, 'train_steps_per_second': 0.674, 'total_flos': 1880453406720000.0, 'train_loss': 0.19404975135391578, 'epoch': 5.0})

In [46]:
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()

output_merged_dir = "results/llama2/final_merged_checkpoint"
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir, safe_serialization=True)

# save tokenizer for easy inference
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_merged_dir)

NameError: name 'AutoPeftModelForCausalLM' is not defined