In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EvalPrediction
from datasets import load_dataset
from peft import VeraConfig, get_peft_model
import numpy as np
import torch
import os
import evaluate

In [2]:
torch.set_printoptions(threshold=torch.inf)  # Display all elements
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
BASE_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tok  = AutoTokenizer.from_pretrained(BASE_ID, use_fast=True)

model = AutoModelForSequenceClassification.from_pretrained(
        BASE_ID,
        num_labels=2,
        # load_in_4bit=True,
        device_map="auto")

vera_cfg = VeraConfig(
        r=128,
        target_modules=["q_proj","v_proj"],
        fan_in_fan_out=False,)


model = get_peft_model(model, vera_cfg)
model.config.pad_token_id = tok.pad_token_id

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
core_model   = model.get_base_model()        # → LlamaForSequenceClassification
llama_blocks = core_model.model.layers       # → ModuleList of decoder layers
qproj_0      = llama_blocks[0].self_attn.q_proj

print(type(qproj_0))          # should be your Linear4bit / Linear8bitLt
print(qproj_0.weight.shape)   # should be (out, in)  e.g.  (4096, 2048)


<class 'peft.tuners.vera.layer.Linear'>
torch.Size([2048, 2048])


In [5]:
print(model)

PeftModel(
  (base_model): VeraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-3): 4 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): vera.Linear(
                in_features=2048, out_features=2048, bias=False
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (vera_dropout): ModuleDict(
                  (default): Identity()
                )
                (vera_lambda_b): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 2048 (cuda:0)])
                (vera_lambda_d): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 128 (cuda:0)])
                (vera_A): BufferDict(  (default): Buffer containing: [torch.cuda.FloatTensor of size 128x2048 (GPU 3)])
                (vera_B): BufferDict(  (default): Buffer con

In [6]:
# ---------- data ----------
raw_ds = load_dataset("glue", "sst2")

def tokenize(batch):
    natural   = tok(batch["sentence"], add_special_tokens=True)
    true_lens = [len(ids) for ids in natural["input_ids"]]

    padded = tok(
        batch["sentence"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    padded["real_length"] = true_lens
    return padded

tokenized_ds = raw_ds.map(
    tokenize,
    batched=True,
    remove_columns=["sentence", "idx"]
)

# rename + set Torch format
tokenized_ds = tokenized_ds.rename_column("label", "labels")
tokenized_ds.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels", "real_length"],
)

# ---------- stats ----------
max_len = max(tokenized_ds["train"]["real_length"])
print(f"Longest raw sentence: {max_len} tokens")

Longest raw sentence: 74 tokens


In [7]:
max_len = max(tokenized_ds["test"]["real_length"])
print(f"Longest raw sentence: {max_len} tokens")

Longest raw sentence: 70 tokens


In [8]:
# ---------- data ----------
raw_datasets = load_dataset("glue", "sst2")
def tokenize_function(example):
    return tok(example["sentence"], truncation=True, padding="max_length", max_length=128)

# Tokenize the entire dataset
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
del raw_datasets
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [9]:
# ---------- trainer ----------
args = TrainingArguments(
        output_dir="vera-tiny-sst2",
        per_device_train_batch_size=32,
        num_train_epochs=2,
        learning_rate=3e-3,
        # fp16=True,
        evaluation_strategy="epoch",
        save_strategy="no",
        logging_steps=50)


trainer = Trainer(model=model,
                  args=args,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["validation"])



In [10]:
trainer.train()

forward called!!!!!!!!!!!!!!!!
forward called!!!!!!!!!!!!!!!!
forward called!!!!!!!!!!!!!!!!
forward called!!!!!!!!!!!!!!!!
forward called!!!!!!!!!!!!!!!!



KeyboardInterrupt



In [7]:
predictions = trainer.predict(tokenized_datasets["validation"])
logits = predictions.predictions[1]
preds = np.argmax(logits, axis=-1)
metric = evaluate.load("accuracy")

tokenized_datasets["validation"]["labels"]
metric.compute(predictions=preds, references=tokenized_datasets["validation"]["labels"])

{'accuracy': 0.9392201834862385}

In [None]:
# For non trained model accuracy 0.4919
# For r=128 one epoch lr 3e-3 accuracy 0.932
# For r=128 two epochs lr 3e-3 accuracy 0.939
/home/storage/guy_bilitski/Advanced experiments/vera-tiny-sst2/LLM adaptors.ipynb

In [9]:
# after trainer.train()
adapter_dir = "vera_adapter"
model.save_pretrained(adapter_dir, safe_serialization=True)  # adapter only
tok.save_pretrained(adapter_dir)                             # optional, for easy reload

('vera_adapter/tokenizer_config.json',
 'vera_adapter/special_tokens_map.json',
 'vera_adapter/tokenizer.model',
 'vera_adapter/added_tokens.json',
 'vera_adapter/tokenizer.json')

In [10]:
base = AutoModelForSequenceClassification.from_pretrained(
           BASE_ID, load_in_4bit=True, device_map="auto")
from peft import PeftModel
model = PeftModel.from_pretrained(base, "vera_adapter").to(device)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at TinyLlama/TinyLlama-1.1B-Chat-v1.0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
