In [3]:
# from huggingface_hub import hf_hub_download 
import torch
from datasets import load_dataset, load_from_disk
import evaluate
import transformers
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from peft import LoraConfig, TaskType, get_peft_model
from torch.utils.data import DataLoader
from torch.utils.data.dataloader import default_collate
from peft import PeftConfig, PeftModel

import pandas as pd
import numpy as np
import os

In [4]:
print(transformers.__version__)

4.30.2


In [5]:
print(torch.__version__)
print(torch.cuda.is_available())

1.12.0+cu116
True


In [7]:
import peft
peft.__version__

'0.1.0'

In [None]:
! nvcc --version

In [5]:
# del model
# del trainer
# torch.cuda.empty_cache()

In [6]:
def tokenize_function(examples):
    """Tokenize the "text" entries of `examples` with the notebook-level `tokenizer`.

    The tokenizer is called with padding="max_length" and truncation=True;
    its return value is passed back unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, padding="max_length", truncation=True)

def compute_metrics(eval_pred):
    """Score predictions with the notebook-level `metric`.

    `eval_pred` unpacks into (logits, labels). The predicted class is the
    argmax over the last axis of the logits; the result of
    `metric.compute(...)` is returned as-is.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

def print_trainable_parameters(model):
    """
    Print how many of the model's parameters have `requires_grad` set,
    the total parameter count, and the trainable share as a percentage.
    """
    # (element count, is-trainable) for every named parameter
    sizes = [(p.numel(), p.requires_grad) for _, p in model.named_parameters()]
    all_param = sum(count for count, _ in sizes)
    trainable_params = sum(count for count, needs_grad in sizes if needs_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

def load_llm(model_path, num_labels):
    """
    Load a fresh sequence-classification model from a local snapshot directory.

    Call this once per experiment so that each parameter-freezing variant
    starts from a newly loaded model rather than one modified by an earlier
    experiment.

    Args:
        model_path: Directory that holds the saved model weights.
        num_labels: Number of output classes, forwarded to `from_pretrained`.

    Returns:
        The loaded model, moved to CUDA when available and to CPU otherwise.

    Raises:
        FileNotFoundError: If `model_path` contains neither
            `model.safetensors` nor `pytorch_model.bin`.
    """
    # Accept either weight format; the loading cell elsewhere in this notebook
    # checks for pytorch_model.bin, so both are treated as "model exists".
    weight_files = ('model.safetensors', 'pytorch_model.bin')
    if not any(os.path.isfile(os.path.join(model_path, name)) for name in weight_files):
        # Raise instead of returning a message string: callers immediately use
        # the result as a model, so a string would only fail later and obscurely.
        raise FileNotFoundError(
            f"model does not exist. Create model first (no {' or '.join(weight_files)} in {model_path})"
        )

    # `cache_dir` is the notebook-level cache location set in the config cell.
    model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                               num_labels=num_labels,
                                                               cache_dir=cache_dir,
                                                               local_files_only=True)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(device)
    model.to(device)  # GPU when available, otherwise CPU
    return model

In [None]:
# Local project checkout; not read by the other cells visible in this notebook.
llm_repo_dir = 'D:/projects/LLM'

# Root for every cached model/dataset/artifact path below. Set LLM_CACHE_DIR to
# relocate it on another machine; the default is the original location.
cache_dir = os.environ.get('LLM_CACHE_DIR', '/cygdrive/d/projects/LLM/.cache')

# NOTE(review): these variables are set after transformers/datasets were
# imported in the first cell, so the libraries may already have resolved their
# default cache locations - confirm. The cells below pass cache_dir explicitly.
os.environ['TRANSFORMERS_CACHE'] = cache_dir
os.environ['HF_HOME'] = cache_dir + '/huggingface'
os.environ['XDG_CACHE_HOME'] = cache_dir
os.environ['HF_DATASETS_CACHE'] = cache_dir

# Dataset selection. When switching datasets, update dataset_path and
# num_labels together with dataset_name.
dataset_name = 'yelp_polarity' # yelp_review_full
num_labels = 2

model_path = cache_dir + '/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e'
dataset_path = cache_dir + '/parquet/yelp_polarity' # cache_dir + '/parquet/yelp_review_full-e22176106d6e7534'
# Derived from dataset_name so a dataset switch cannot silently reload the
# tokenized copy of a different dataset.
tokenized_data_path = cache_dir + '/tokenized_dataset_' + dataset_name

In [8]:
# --- Raw dataset ---
# Load from the local directory when it exists; otherwise fetch the dataset by
# name into the cache.
if not os.path.isdir(dataset_path):
    dataset = load_dataset(dataset_name, cache_dir=cache_dir + '/parquet')
else:
    dataset = load_dataset(dataset_path)

# --- Tokenized dataset ---
# The tokenized result is saved to disk and reloaded on later runs. The
# tokenizer is therefore only created on the run that actually tokenizes.
if not os.path.isdir(tokenized_data_path):
    print('tokenized dataset does not exist. Create and save tokenized dataset')
    if not os.path.isfile(model_path + '/tokenizer.json'):
        print('tokenizer does not exist. Download tokenizer')
        tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased",
                                                  cache_dir=cache_dir) # to load tokenizer to cache
    else:
        print('tokenizer exists. Load from existing tokenizer')
        # local_files_only must be spelled exactly; a misspelled keyword does
        # not restrict loading to files already on disk.
        tokenizer = AutoTokenizer.from_pretrained(model_path,
                                                  cache_dir=cache_dir,
                                                  local_files_only=True) # to load tokenizer from cache
    # tokenize_function reads the notebook-level `tokenizer` assigned just above.
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets.save_to_disk(tokenized_data_path)
else:
    print('tokenized dataset exists. Load from disk')
    tokenized_datasets = load_from_disk(tokenized_data_path)

# Drop the raw text, rename the label column to "labels", and return torch tensors.
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
# Fixed-seed 1000-example subsets keep training and evaluation quick.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

# --- Classifier ---
# Download the model when no local weights file is present; otherwise load the
# local snapshot without contacting the Hub.
if not os.path.isfile(model_path + '/pytorch_model.bin'):
    print('bert clf does not exist. Download model')
    model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased",
                                                               num_labels=num_labels,
                                                               cache_dir=cache_dir)
else:
    print('bert clf exists. Load from local file')
    model = AutoModelForSequenceClassification.from_pretrained(model_path,
                                                               num_labels=num_labels,
                                                               cache_dir=cache_dir,
                                                               local_files_only=True)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device) # GPU when available, otherwise CPU

tokenized dataset exists. Load from disk


Loading cached shuffled indices for dataset at D:\cygdrive\d\projects\LLM\.cache\tokenized_dataset_yelp_polarity\train\cache-9e3a9f62d4fbf69a.arrow
Loading cached shuffled indices for dataset at D:\cygdrive\d\projects\LLM\.cache\tokenized_dataset_yelp_polarity\test\cache-7ac1eeeafb61d0c9.arrow


bert clf exists. Load from local file


Some weights of the model checkpoint at /cygdrive/d/projects/LLM/.cache/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification m

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [9]:
# Sanity check: token count of the first training example. Expect 512, the
# padded sequence length (not the 768-wide embedding dimension).
len(small_train_dataset[0]['input_ids'])

512

In [10]:
# Compare how many parameters stay trainable under each fine-tuning strategy.
# A fresh model is loaded for every strategy so freezing never carries over.

# Baseline: nothing frozen
model = load_llm(model_path, num_labels)
print('no param freezing')
print_trainable_parameters(model)

# Two freezing variants: every encoder layer except the last, then every
# encoder layer. Parameters outside bert.encoder.layer are left untouched.
for description, frozen_layers in (
    ('freeze all but last encoder layer', slice(None, -1)),
    ('freeze all encoder layer', slice(None)),
):
    model = load_llm(model_path, num_labels)
    for weight in model.bert.encoder.layer[frozen_layers].parameters():
        weight.requires_grad = False
    print(description)
    print_trainable_parameters(model)

# LoRA: wrap the freshly loaded model with PEFT adapters
model = load_llm(model_path, num_labels)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=50,
    lora_alpha=50,
    lora_dropout=0.1,
)
model = get_peft_model(model, lora_config)
print('LoRA')
print_trainable_parameters(model)

Some weights of the model checkpoint at /cygdrive/d/projects/LLM/.cache/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification m

cuda
no param freezing
trainable params: 108311810 || all params: 108311810 || trainable%: 100.0


Some weights of the model checkpoint at /cygdrive/d/projects/LLM/.cache/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification m

cuda
freeze all but last encoder layer
trainable params: 30345218 || all params: 108311810 || trainable%: 28.016536700845457


Some weights of the model checkpoint at /cygdrive/d/projects/LLM/.cache/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification m

cuda
freeze all encoder layer
trainable params: 23257346 || all params: 108311810 || trainable%: 21.472585491831406


Some weights of the model checkpoint at /cygdrive/d/projects/LLM/.cache/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification m

cuda
LoRA
trainable params: 1844738 || all params: 110155010 || trainable%: 1.6746746244224389


## Fine-tuning

In [11]:
# Mean of the training labels as float64. Presumably the two classes are coded
# 0/1, in which case this is the share of class 1 (about 0.5 when balanced).
small_train_dataset['labels'].double().mean()

tensor(0.5050, dtype=torch.float64)

### LoRA

In [12]:
# LoRA fine-tuning setup: accuracy metric, a Trainer that evaluates after every
# epoch, and a freshly loaded classifier wrapped with PEFT adapters.
metric = evaluate.load("accuracy")
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)

model = load_llm(model_path, num_labels)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=50,  # rank of the low-rank adapter matrices
    lora_alpha=50,  # scaling factor for the adapter update (see the PEFT LoraConfig docs), not a learning rate
    lora_dropout=0.1,
)
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

Some weights of the model checkpoint at /cygdrive/d/projects/LLM/.cache/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification m

cuda
trainable params: 1844738 || all params: 110155010 || trainable%: 1.6746746244224389


In [13]:
%%time
trainer.train() # very slow even on GPU: the recorded run took about 78 minutes for 375 steps



Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.633136,0.722
2,No log,0.311632,0.884
3,No log,0.26071,0.907


Wall time: 1h 17min 41s


TrainOutput(global_step=375, training_loss=0.4719854736328125, metrics={'train_runtime': 4661.1727, 'train_samples_per_second': 0.644, 'train_steps_per_second': 0.08, 'total_flos': 806320097280000.0, 'train_loss': 0.4719854736328125, 'epoch': 3.0})

In [15]:
# Save the trainer's current model into the LoRA output directory under the cache.
ft_model_dir = cache_dir + '/ft_lora_yelp_polarity'
# makedirs(exist_ok=True) replaces the isdir + mkdir pair: it also creates
# missing parent directories and has no check-then-create race.
os.makedirs(ft_model_dir, exist_ok=True)
trainer.save_model(ft_model_dir)

### Freeze all encoder layers

In [None]:
# Fine-tune with every encoder layer frozen.
metric = evaluate.load("accuracy")
training_args = TrainingArguments(output_dir="test_trainer",
                                  evaluation_strategy="epoch")

# load_llm takes num_labels as a required second argument; calling it with
# model_path alone raises a TypeError before any training starts.
model = load_llm(model_path, num_labels)
# Only the encoder layers are frozen; parameters outside bert.encoder.layer
# (for example the embeddings) stay trainable.
for param in model.bert.encoder.layer.parameters():
    param.requires_grad = False
print_trainable_parameters(model)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
%%time
trainer.train() # trains the model held by the current `trainer`; %%time reports the wall-clock cost

In [None]:
# Save the trainer's current model into the frozen-encoder output directory under the cache.
ft_model_dir = cache_dir + '/ft_encoder_all_freeze'
# makedirs(exist_ok=True) replaces the isdir + mkdir pair: it also creates
# missing parent directories and has no check-then-create race.
os.makedirs(ft_model_dir, exist_ok=True)
trainer.save_model(ft_model_dir)

## Performance evaluation

In [None]:
# Serve the 1000-example evaluation subset in batches of 8; no shuffling is requested.
eval_dataloader = DataLoader(dataset=small_eval_dataset, batch_size=8)

### LoRA fine-tuned model

In [None]:
# Rebuild the fine-tuned model: load the base classifier from the local
# snapshot, then attach the adapter saved in ft_model_dir.
ft_model_dir = cache_dir + '/ft_lora_yelp_polarity'
model_pretrained = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=num_labels,
    cache_dir=cache_dir,
    local_files_only=True,
)
model = PeftModel.from_pretrained(model_pretrained, ft_model_dir)
model.to(device)
model.eval()  # inference mode: disables dropout
# Confirm whether the weights ended up on the GPU.
on_gpu = next(model.parameters()).is_cuda
print(on_gpu)

In [18]:
# Accuracy of the current `model` on the evaluation batches.
metric = evaluate.load("accuracy")

for raw_batch in eval_dataloader:
    # Move every tensor of the batch to the model's device.
    on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
    with torch.no_grad():
        scores = model(**on_device).logits
    class_ids = torch.argmax(scores, dim=-1)
    metric.add_batch(predictions=class_ids, references=on_device["labels"])
metric.compute()

{'accuracy': 0.907}

### No fine-tuning (baseline)

In [20]:
# Baseline: the same local snapshot loaded as a classifier, with no
# fine-tuning applied in this notebook.
baseline_kwargs = dict(
    num_labels=num_labels,
    cache_dir=cache_dir,
    local_files_only=True,
)
model = AutoModelForSequenceClassification.from_pretrained(model_path, **baseline_kwargs)
model.to(device)
model.eval()

Some weights of the model checkpoint at /cygdrive/d/projects/LLM/.cache/models--google-bert--bert-base-cased/snapshots/cd5ef92a9fb2f889e972770a36d4ed042daf221e were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification m

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [21]:
# Accuracy of the current `model` on the same evaluation batches.
metric = evaluate.load("accuracy")

for cpu_batch in eval_dataloader:
    gpu_batch = {}
    for field, values in cpu_batch.items():
        gpu_batch[field] = values.to(device)
    with torch.no_grad():
        result = model(**gpu_batch)
    predicted = torch.argmax(result.logits, dim=-1)
    metric.add_batch(predictions=predicted, references=gpu_batch["labels"])
metric.compute()

{'accuracy': 0.516}

## Hyperparameter tuning