In [23]:
# from huggingface_hub import hf_hub_download 
import torch
from datasets import load_dataset, load_from_disk
import evaluate
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModel
from transformers.modeling_outputs import TokenClassifierOutput
from transformers import TrainingArguments, Trainer, AdamW, get_scheduler, EarlyStoppingCallback
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from peft import PeftConfig, PeftModel
from torch import nn


import pandas as pd
import numpy as np
import os

In [2]:
print(transformers.__version__)

4.30.2


In [3]:
print(torch.__version__)
print(torch.cuda.is_available())

1.12.0+cu116
True


In [4]:
import peft
peft.__version__

'0.1.0'

In [5]:
! nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:41:10_Pacific_Daylight_Time_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [6]:
# del model
# del trainer
# torch.cuda.empty_cache()

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def load_llm(model_path, num_labels, seed=123):
    """
    run this for different experiments (freezing different params)
    """
    if not os.path.isfile(model_path + '/model.safetensors'):
        return 'model does not exist. Create model first'
    
    torch.manual_seed(seed)
    model = AutoModelForSequenceClassification.from_pretrained(model_path, 
                                                           num_labels=num_labels, 
                                                           cache_dir=cache_dir, 
                                                           local_files_only=True)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(device)
    model.to(device) # use GPU
    return model

In [8]:
llm_repo_dir = 'D:/projects/LLM'
cache_dir = '/cygdrive/d/projects/LLM/.cache'
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['HF_HOME'] = cache_dir + '/huggingface'
os.environ['XDG_CACHE_HOME'] = cache_dir
os.environ['HF_DATASETS_CACHE'] = cache_dir

model_path = cache_dir + '/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e'
dataset_path = cache_dir + '/parquet/yelp_polarity' # cache_dir + '/parquet/yelp_review_full-e22176106d6e7534'
dataset_name = 'yelp_polarity' # yelp_review_full
tokenized_data_path = cache_dir + '/tokenized_dataset_yelp_polarity'
num_labels = 2

In [9]:
class WeightedTrainer(Trainer):    
    def __init__(self, weights, **kwargs):
        super().__init__(**kwargs)
        DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        self.weights = torch.tensor(weights, dtype=torch.float32).to(DEVICE)
        
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get('labels')
        outputs = model(**inputs)
        logits = outputs['logits']
        loss = nn.CrossEntropyLoss(weight=self.weights)(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [10]:
# peft.peft_model.PeftModel
# peft.peft_model.PeftModelForSequenceClassification
class ModelPeftMLP(peft.peft_model.PeftModel):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.num_labels = 2
        self.mlp = nn.Sequential(
            nn.Linear(768,32),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(32,16),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(16,8),
            nn.ReLU(),
            nn.Dropout(p=0.1),
            nn.Linear(8, num_labels)
        )
              
    def forward(self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        task_ids=None,
        **kwargs,):
        outputs = self.model(input_ids=input_ids, 
                             attention_mask=attention_mask,
                            )
        logits = self.mlp(outputs['last_hidden_state'][:,0,:].view(-1, 768))
        
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return TokenClassifierOutput(loss=loss,
                                         logits=logits,
                                        )

In [11]:
def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8,16,32,64]),
        "warmup_steps":trial.suggest_categorical("warmup_steps", [0,10,100])
    }

In [12]:
if not os.path.isfile(model_path + '/pytorch_model.bin'):
    print('bert clf does not exist. Download model')
    model = AutoModel.from_pretrained("google-bert/bert-base-cased", 
                                      cache_dir=cache_dir,
                                      output_attentions=True,
                                      output_hidden_states=True)
else:
    print('bert clf exists. Load from local file')
    model = AutoModel.from_pretrained(model_path,  
                                      cache_dir=cache_dir, 
                                      local_files_only=True,
                                      output_attentions=True,
                                      output_hidden_states=True)
    
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device) # use GPU
print(type(model))

bert clf exists. Load from local file


Some weights of the model checkpoint at /cygdrive/d/projects/LLM/.cache/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<class 'transformers.models.bert.modeling_bert.BertModel'>


In [20]:
lora_config = LoraConfig(
    #task_type=TaskType.SEQ_CLS, 
    r=32, # rank of the lower dimensional space
    lora_alpha=50, # effectively learning rate
    lora_dropout=0.1
) 
model_peft = get_peft_model(model, lora_config)
torch.manual_seed(123)
model_peft_mlp = ModelPeftMLP(model=model_peft, peft_config=lora_config)
print(model_peft_mlp)
print_trainable_parameters(model_peft_mlp)

ModelPeftMLP(
  (base_model): LoraModel(
    (model): PeftModel(
      (base_model): LoraModel(
        (model): BertModel(
          (embeddings): BertEmbeddings(
            (word_embeddings): Embedding(28996, 768, padding_idx=0)
            (position_embeddings): Embedding(512, 768)
            (token_type_embeddings): Embedding(2, 768)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (encoder): BertEncoder(
            (layer): ModuleList(
              (0): BertLayer(
                (attention): BertAttention(
                  (self): BertSelfAttention(
                    (query): Linear(
                      in_features=768, out_features=768, bias=True
                      (lora_dropout): Dropout(p=0.1, inplace=False)
                      (lora_A): Linear(in_features=768, out_features=32, bias=False)
                      (lora_B): Linear(in_features=32, out_features=768

In [24]:
##### metric = evaluate.load("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

def model_init(trials):
    torch.manual_seed(123)
    model_peft = get_peft_model(model, lora_config)
    model_peft_mlp = ModelPeftMLP(model=model_peft, 
                                  peft_config=model_peft.peft_config['default'])
    return model_peft_mlp


training_args = TrainingArguments(output_dir="test_trainer", 
                                  evaluation_strategy="epoch",
                                  seed=123,
                                  data_seed=123)

earlystopping = EarlyStoppingCallback(3, 0.1)

NameError: name 'small_train_dataset' is not defined

In [25]:
if not os.path.isdir(dataset_path):
    dataset = load_dataset(dataset_name, cache_dir=cache_dir + '/parquet')
else:
    dataset = load_dataset(dataset_path)
    
if not os.path.isdir(tokenized_data_path):
    print('tokenized dataset does not exist. Download dataset')
    if not os.path.isfile(model_path + '/tokenizer.json'):
        print('tokenizer does not exist. Create and save tokenized dataset')
        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased", 
                                                  cache_dir=cache_dir) # to load tokenizer to cache
    else:
        print('tokenizer exists. Load from existing tokenizer')
        tokenizer = AutoTokenizer.from_pretrained(model_path, 
                                              cache_dir=cache_dir, 
                                              local_flies_only=True) # to load tokenizer from cache
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets.save_to_disk(tokenized_data_path)
else:
    print('tokenized dataset exists. Load from disk')
    tokenized_datasets = load_from_disk(tokenized_data_path)

tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

Found cached dataset arrow (C:/cygwin64/home/jacky/.cache/huggingface/datasets/arrow/yelp_polarity-fa5030fa747c4f91/0.0.0/74f69db2c14c2860059d39860b1f400a03d11bf7fb5a8258ca38c501c878c137)


  0%|          | 0/2 [00:00<?, ?it/s]

tokenized dataset exists. Load from disk


Loading cached shuffled indices for dataset at D:\cygdrive\d\projects\LLM\.cache\tokenized_dataset_yelp_polarity\train\cache-0b983b74d94eba28.arrow
Loading cached shuffled indices for dataset at D:\cygdrive\d\projects\LLM\.cache\tokenized_dataset_yelp_polarity\test\cache-01e541604b191dc4.arrow


In [26]:
trainer = Trainer(
    model=model_peft_mlp,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
    model_init=model_init,
    callbacks=[earlystopping, lambda study, trial: gc.collect()]
)

best_trials = trainer.hyperparameter_search(
    direction='maximize',
    backend='optuna',
    hp_space=optuna_hp_space,
    n_trials=5,
)



AttributeError: 'function' object has no attribute 'on_init_end'