In [1]:
# from huggingface_hub import hf_hub_download 
import torch
from datasets import load_dataset, load_from_disk
import evaluate
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModel
from transformers.modeling_outputs import TokenClassifierOutput
from transformers import TrainingArguments, Trainer, AdamW, get_scheduler
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from peft import PeftConfig, PeftModel
from torch import nn


import pandas as pd
import numpy as np
import os

Could not find the bitsandbytes CUDA binary at WindowsPath('D:/projects/LLM/env/lib/site-packages/bitsandbytes/libbitsandbytes_cuda116.dll')
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [2]:
print(transformers.__version__)

4.30.2


In [3]:
print(torch.__version__)
print(torch.cuda.is_available())

1.12.0+cu116
True


In [4]:
import peft
peft.__version__

'0.1.0'

In [5]:
! nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:41:10_Pacific_Daylight_Time_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [6]:
# del model
# del trainer
# torch.cuda.empty_cache()

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def load_llm(model_path, num_labels):
    """
    run this for different experiments (freezing different params)
    """
    if not os.path.isfile(model_path + '/model.safetensors'):
        return 'model does not exist. Create model first'
    
    model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                           num_labels=num_labels, 
                                                           cache_dir=cache_dir, 
                                                           local_files_only=True)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(device)
    model.to(device) # use GPU
    return model

In [8]:
llm_repo_dir = 'D:/projects/LLM'
cache_dir = '/cygdrive/d/projects/LLM/.cache'
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['HF_HOME'] = cache_dir + '/huggingface'
os.environ['XDG_CACHE_HOME'] = cache_dir
os.environ['HF_DATASETS_CACHE'] = cache_dir

model_path = cache_dir + '/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e'
dataset_path = cache_dir + '/parquet/yelp_polarity' # cache_dir + '/parquet/yelp_review_full-e22176106d6e7534'
dataset_name = 'yelp_polarity' # yelp_review_full
tokenized_data_path = cache_dir + '/tokenized_dataset_yelp_polarity'
num_labels = 2

In [9]:
if not os.path.isdir(dataset_path):
    dataset = load_dataset(dataset_name, cache_dir=cache_dir + '/parquet')
else:
    dataset = load_dataset(dataset_path)
    
if not os.path.isdir(tokenized_data_path):
    print('tokenized dataset does not exist. Download dataset')
    if not os.path.isfile(model_path + '/tokenizer.json'):
        print('tokenizer does not exist. Create and save tokenized dataset')
        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased", 
                                                  cache_dir=cache_dir) # to load tokenizer to cache
    else:
        print('tokenizer exists. Load from existing tokenizer')
        tokenizer = AutoTokenizer.from_pretrained(model_path, 
                                              cache_dir=cache_dir, 
                                              local_flies_only=True) # to load tokenizer from cache
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets.save_to_disk(tokenized_data_path)
else:
    print('tokenized dataset exists. Load from disk')
    tokenized_datasets = load_from_disk(tokenized_data_path)

tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

Found cached dataset arrow (C:/cygwin64/home/jacky/.cache/huggingface/datasets/arrow/yelp_polarity-fa5030fa747c4f91/0.0.0/74f69db2c14c2860059d39860b1f400a03d11bf7fb5a8258ca38c501c878c137)


  0%|          | 0/2 [00:00<?, ?it/s]

tokenized dataset exists. Load from disk


Loading cached shuffled indices for dataset at D:\cygdrive\d\projects\LLM\.cache\tokenized_dataset_yelp_polarity\train\cache-0b983b74d94eba28.arrow
Loading cached shuffled indices for dataset at D:\cygdrive\d\projects\LLM\.cache\tokenized_dataset_yelp_polarity\test\cache-01e541604b191dc4.arrow


In [39]:
for i in range(2):
    try:
        del model
    except:
        pass
    
    torch.cuda.empty_cache()
    torch.manual_seed(123)
    
    # load model (use this if no MLP head is needed)
    if not os.path.isfile(model_path + '/pytorch_model.bin'):
        print('bert clf does not exist. Download model')
        model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", 
                                                                   num_labels=num_labels, 
                                                                   cache_dir=cache_dir,
                                                                  output_attentions=True,
                                                                  output_hidden_states=True)
    else:
        print('bert clf exists. Load from local file')
        model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                                   num_labels=num_labels, 
                                                                   cache_dir=cache_dir, 
                                                                   local_files_only=True,
                                                                  output_attentions=True,
                                                                  output_hidden_states=True)
    
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device) # use GPU
    
    print('iter %i'%i)
    print('last output dense weight')
    print(model.bert.encoder.layer[-1].output.dense.weight[0,:10])
    print('classifier weight')
    print(model.classifier.weight[:10])

bert clf exists. Load from local file


Some weights of the model checkpoint at /cygdrive/d/projects/LLM/.cache/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequ

iter 0
last output dense weight
tensor([ 0.0267,  0.0778,  0.0361,  0.0079,  0.0181,  0.0295,  0.0151,  0.0394,
         0.0039, -0.0348], device='cuda:0', grad_fn=<SliceBackward0>)
classifier weight
tensor([[ 0.0311,  0.0180, -0.0131,  ...,  0.0065, -0.0095,  0.0194],
        [-0.0302,  0.0217,  0.0429,  ...,  0.0158,  0.0167,  0.0128]],
       device='cuda:0', grad_fn=<SliceBackward0>)
bert clf exists. Load from local file


Some weights of the model checkpoint at /cygdrive/d/projects/LLM/.cache/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequ

iter 1
last output dense weight
tensor([ 0.0267,  0.0778,  0.0361,  0.0079,  0.0181,  0.0295,  0.0151,  0.0394,
         0.0039, -0.0348], device='cuda:0', grad_fn=<SliceBackward0>)
classifier weight
tensor([[ 0.0311,  0.0180, -0.0131,  ...,  0.0065, -0.0095,  0.0194],
        [-0.0302,  0.0217,  0.0429,  ...,  0.0158,  0.0167,  0.0128]],
       device='cuda:0', grad_fn=<SliceBackward0>)


## MLP clf head

In [17]:
# peft.peft_model.PeftModel
# peft.peft_model.PeftModelForSequenceClassification
class ModelPeftMLP(peft.peft_model.PeftModel):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.num_labels = 2
        self.mlp = nn.Sequential(
            nn.Linear(768,32),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(32,16),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(16,8),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(8, num_labels)
        )
        
        
    def forward(self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        task_ids=None,
        **kwargs,):
        outputs = self.model(input_ids=input_ids, 
                             attention_mask=attention_mask,
                             #output_hidden_states=True
                            )
        logits = self.mlp(outputs['last_hidden_state'][:,0,:].view(-1, 768))
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return TokenClassifierOutput(loss=loss,
                                         logits=logits,
                                         #hidden_states=outputs.hidden_states,
                                         #attentions=outputs.attentions
                                        )

In [18]:
if not os.path.isfile(model_path + '/pytorch_model.bin'):
    print('bert clf does not exist. Download model')
    model = AutoModel.from_pretrained("google-bert/bert-base-cased", 
                                      cache_dir=cache_dir,
                                      output_attentions=True,
                                      output_hidden_states=True)
else:
    print('bert clf exists. Load from local file')
    model = AutoModel.from_pretrained(model_path,  
                                      cache_dir=cache_dir, 
                                      local_files_only=True,
                                      output_attentions=True,
                                      output_hidden_states=True)
    
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device) # use GPU
print(type(model))

bert clf exists. Load from local file


Some weights of the model checkpoint at /cygdrive/d/projects/LLM/.cache/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<class 'transformers.models.bert.modeling_bert.BertModel'>


In [19]:
lora_config = LoraConfig(
    #task_type=TaskType.SEQ_CLS, 
    r=32, # rank of the lower dimensional space
    lora_alpha=50, # effectively learning rate
    lora_dropout=0.1
) 
model_peft = get_peft_model(model, lora_config)
print_trainable_parameters(model_peft)
print(type(model_peft))

trainable params: 1179648 || all params: 109489920 || trainable%: 1.0774032897274928
<class 'peft.peft_model.PeftModel'>


In [20]:
model_peft_mlp = ModelPeftMLP(model=model_peft, peft_config=lora_config)
print(model_peft_mlp)
print_trainable_parameters(model_peft_mlp)

ModelPeftMLP(
  (base_model): LoraModel(
    (model): PeftModel(
      (base_model): LoraModel(
        (model): BertModel(
          (embeddings): BertEmbeddings(
            (word_embeddings): Embedding(28996, 768, padding_idx=0)
            (position_embeddings): Embedding(512, 768)
            (token_type_embeddings): Embedding(2, 768)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (encoder): BertEncoder(
            (layer): ModuleList(
              (0): BertLayer(
                (attention): BertAttention(
                  (self): BertSelfAttention(
                    (query): Linear(
                      in_features=768, out_features=768, bias=True
                      (lora_dropout): Dropout(p=0.1, inplace=False)
                      (lora_A): Linear(in_features=768, out_features=32, bias=False)
                      (lora_B): Linear(in_features=32, out_features=768

In [24]:
##### metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(output_dir="test_trainer", 
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=8,
                                  per_device_eval_batch_size=8,
                                  seed=123)

trainer = Trainer(
    model=model_peft_mlp,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [25]:
%%time
trainer.train()



Epoch,Training Loss,Validation Loss


NameError: name 'metric' is not defined

In [None]:
del model_peft_mlp
del model_peft
del model
del trainer
torch.cuda.empty_cache()

In [14]:
train_dataloader = DataLoader(small_train_dataset, 
                             batch_size=8)

In [27]:
model_peft_mlp.to(device)
num_epochs = 3
num_training_steps = num_epochs*len(small_train_dataset)
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model_peft_mlp.parameters(), lr=5e-5)

lr_scheduler = get_scheduler("linear",
                             optimizer=optimizer,
                             num_warmup_steps=0,
                             num_training_steps=num_training_steps)

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        
        outputs = model_peft_mlp(**batch)
        loss = criterion(outputs['logits'], batch['labels'])
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        print(f"Epoch {epoch}, Loss: {loss.item()}")

Epoch 0, Loss: 0.6554639935493469
Epoch 0, Loss: 0.6666975021362305
Epoch 0, Loss: 0.65705406665802
Epoch 0, Loss: 0.7516437768936157
Epoch 0, Loss: 0.7530192136764526
Epoch 0, Loss: 0.698760986328125
Epoch 0, Loss: 0.6818865537643433
Epoch 0, Loss: 0.6730867624282837
Epoch 0, Loss: 0.6506271362304688
Epoch 0, Loss: 0.7049846649169922
Epoch 0, Loss: 0.6568168997764587
Epoch 0, Loss: 0.6922946572303772
Epoch 0, Loss: 0.780484676361084
Epoch 0, Loss: 0.6415077447891235
Epoch 0, Loss: 0.6557684540748596
Epoch 0, Loss: 0.7308101654052734
Epoch 0, Loss: 0.6956213116645813
Epoch 0, Loss: 0.6455085277557373
Epoch 0, Loss: 0.7390742301940918
Epoch 0, Loss: 0.6416972875595093
Epoch 0, Loss: 0.6507484316825867
Epoch 0, Loss: 0.663865864276886
Epoch 0, Loss: 0.7179645895957947
Epoch 0, Loss: 0.7015106678009033
Epoch 0, Loss: 0.6436869502067566
Epoch 0, Loss: 0.7092452049255371
Epoch 0, Loss: 0.6659911274909973
Epoch 0, Loss: 0.6872844696044922
Epoch 0, Loss: 0.6446219682693481
Epoch 0, Loss: 0.70

KeyboardInterrupt: 

### Hyperparameter tuning