In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import DatasetDict, Dataset, load_from_disk
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import LabelEncoder
from transformers import BitsAndBytesConfig
from accelerate import Accelerator
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSequenceClassification

In [2]:
# this thing says I should use peft_model_for_sequence_classification:
#https://huggingface.co/docs/peft/package_reference/peft_model

In [3]:
path_to_retrieve = "../tokenized_dataset"


In [4]:
dataset_dict = load_from_disk(path_to_retrieve)

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [6]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [7]:
model_name = "bert-base-uncased"

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5, device_map={"":0})
#model.gradient_checkpointing_enable()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
print_trainable_parameters(model)

trainable params: 109486085 || all params: 109486085 || trainable%: 100.0


In [10]:
from peft import PeftModelForSequenceClassification, get_peft_config


In [11]:
config = {
    "peft_type": "PREFIX_TUNING",
    "task_type": "SEQ_CLS",
    "inference_mode": False,
    "num_virtual_tokens": 20,
    "token_dim": 768,
    "num_transformer_submodules": 1,
    "num_attention_heads": 12,
    "num_layers": 12,
    "encoder_hidden_size": 768,
    "prefix_projection": False,
    #"postprocess_past_key_value_function": None,
}

In [12]:
peft_config = get_peft_config(config)


In [13]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
peft_model = PeftModelForSequenceClassification(model, peft_config)


In [15]:
peft_model.print_trainable_parameters()


trainable params: 371,716 || all params: 108,681,988 || trainable%: 0.34202171568668766


In [9]:
#model.gradient_checkpointing_enable()

model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

In [10]:
target_modules = [

    "bert.encoder.layer.{}.attention.self.query".format(i) for i in range(12)
] + [
    "bert.encoder.layer.{}.attention.self.key".format(i) for i in range(12)
] + [
    "bert.encoder.layer.{}.attention.self.value".format(i) for i in range(12)
]

In [11]:

peft_config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=target_modules,
    lora_dropout=0.05, 
    bias="none", 
    task_type="SEQ2SEQ_LM"
)

In [12]:
model = get_peft_model(model, peft_config)

In [13]:
#model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainable params: 442368 || all params: 109928453 || trainable%: 0.4024144686180565


In [16]:
def compute_metrics(p):
    logits, labels = p.predictions, p.label_ids
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc}

In [17]:
training_args = TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    output_dir='/results',
    num_train_epochs=1,
    evaluation_strategy="steps",
    save_steps=10,
    save_total_limit=2,
    remove_unused_columns=False,
    run_name='run_name',
    logging_dir='/logs',
    logging_steps=10,
    load_best_model_at_end=True,
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["test"],
    compute_metrics=compute_metrics, 
)

In [19]:
trainer.train()


[34m[1mwandb[0m: Currently logged in as: [33mlukemonington3[0m. Use [1m`wandb login --relogin`[0m to force relogin


../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [1,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [2,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [6,0,0] Assertion `t >= 0 && t < n_classes` failed.
../aten/src/ATen/native/cuda/Loss.cu:240: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [7,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


wandb: Network error (ReadTimeout), entering retry loop.


In [16]:
pip freeze

absl-py==2.0.0
accelerate==0.23.0
aiohttp==3.8.5
aiosignal==1.3.1
anyio==4.0.0
appdirs==1.4.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.4.0
async-lru==2.0.4
async-timeout==4.0.3
attrs==23.1.0
Babel==2.12.1
backcall==0.2.0
beautifulsoup4==4.12.2
bitsandbytes==0.41.1
bleach==6.0.0
blinker==1.4
cachetools==5.3.1
certifi==2023.7.22
cffi==1.15.1
charset-normalizer==3.2.0
click==8.1.7
cmake==3.27.5
comm==0.1.4
contourpy==1.1.1
cryptography==3.4.8
cycler==0.11.0
datasets==2.14.5
dbus-python==1.2.18
debugpy==1.8.0
decorator==5.1.1
deepspeed==0.10.3
defusedxml==0.7.1
dill==0.3.7
distro==1.7.0
distro-info==1.1+ubuntu0.1
docker-pycreds==0.4.0
evaluate==0.4.0
exceptiongroup==1.1.3
executing==1.2.0
fastjsonschema==2.18.0
filelock==3.12.4
fonttools==4.42.1
fqdn==1.5.1
frozenlist==1.4.0
fsspec==2023.6.0
gitdb==4.0.10
GitPython==3.1.37
glob2==0.7
google-auth==2.23.1
google-auth-oauthlib==1.0.0
grpcio==1.58.0
hjson==3.1.0
httplib2==0.20.2
huggingface-hub==0.17.3
idna==3