In [1]:
import transformers
transformers.__version__

'4.38.1'

##### Initial setup

In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    default_data_collator,
    get_linear_schedule_with_warmup,
)
from datasets import (
    load_dataset,
    Dataset,
)
from peft import (
    LoraConfig,
    PeftConfig,
    PromptEncoderConfig,
    PrefixTuningConfig,
    IA3Config,
    get_peft_model,
    TaskType,
    PromptTuningInit,
    PromptTuningConfig,
    prepare_model_for_int8_training,
    prepare_model_for_kbit_training
)
import torch
from torch import optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import os


In [2]:
device = "cuda"
# model_path = "google/gemma-2b-it"
# model_path = "t5-large"  # is a seq2seqlm
# resorting to below model, since gemma_2B was creating issues 
# with lora_config. 
model_path = "mistralai/Mistral-7B-v0.1" 
# model_path = "bigscience/bloomz-560m" 
lr = 1e-3
num_epochs = 1
batch_size = 1
steps = 150

save_location = "/home/aicoder/training/multi_mistral_models"
# loading tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [4]:
# loading model no-quantisation, device_map="auto" takes 10.25GB VRAM
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")

In [3]:
# creating BnB confing for 8-bit loading
bnb_8bit = BitsAndBytesConfig(load_in_8bit=True)
bnb_4bit = BitsAndBytesConfig(load_in_4bit=True)
bnb_4bit_nested = BitsAndBytesConfig(load_in_4bit=True,
                                    bnb_4bit_use_double_quant=True)
bnb_4bit_nested_nf4 = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_quant_type='nf4',
                                         bnb_4bit_use_double_quant=True,
                                         bnb_4bit_compute_dtype=torch.bfloat16)

In [5]:
# loading model 8-bit quantisation device_map=auto takes 3.62 GB VRAM
# loading mistral model 8-bit quantisation device_map=auto takes 8 GB VRAM
# training fails due to multiple issues, scroll down
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             # load_in_8bit=True, depricated,
                                             quantization_config=bnb_8bit,
                                             device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# loading model 4-bit quantisation device_map=auto takes 2.80 GB VRAM
# loading mistral model 4-bit quantisation device_map=auto takes 5.2 GB VRAM
# training doesn't works with mistral_7B model
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             # load_in_4bit=True, depricated,
                                             quantization_config=bnb_4bit,
                                             device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# loading model 4-bit quantisation with nesting, device_map=auto takes 2.70 GB VRAM
# did not try with mistral
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             # load_in_4bit=True, depricated,
                                             quantization_config=bnb_4bit_nested,
                                             device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# loading model 4-bit quantisation with nesting & nf4, device_map=auto takes 2.70 GB VRAM
# loading mistral model 4-bit quantisation device_map=auto takes 4.7 GB VRAM
# training throws trying to unscale fp16 error, change to bf16=True in t_args
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             # load_in_4bit=True, depricated,
                                             quantization_config=bnb_4bit_nested_nf4,
                                             device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
def test_prompt(prompt, your_model):
    input_ids = tokenizer(prompt, return_tensors='pt')
    logits = your_model.generate(**input_ids.to(device),
                           max_new_tokens=50)
    return tokenizer.decode(logits[0])

In [7]:
# Testing the loaded model
# DONT RUN THIS WHEN PLANNING TO TRAIN
input_text = "What is the zodiac signs"
print(test_prompt(input_text,model))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


<s> What is the zodiac signs of the month of May?

The zodiac signs of the month of May are Taurus, Gemini, Cancer, Leo, Virgo, Libra, Scorpio, Sagittarius, Capricorn, Aquarius


##### Moving the models

In [5]:
# try moving the model back to cpu 
# will still keep the gpu memory
# trying this with quantised model will throw error
cpu_model = model.to('cpu')

In [6]:
# try deleting quantised model & releasing memory will also not work
del model

In [7]:
# Now the memory will be released from VRAM.
torch.cuda.empty_cache()

##### Preparing Datasets

In [6]:
# preping the dataset for training.
import pandas as pd
import json

data = []

with open('/home/aicoder/gitfolder/python_de_learners_data/code_script_notebooks/projects/learning_hf_philosophy/databricks-dolly-15k.jsonl') as file:
    for line in file:
        features = json.loads(line)
        # Filter out examples with context, to keep it simple.
        if features["context"]:
            continue
        # Format the entire example as a single string.
        template = "Instruction:\n{instruction}\n\nResponse:\n{response}"
        data.append(template.format(**features))

data_500 = [x for x in data if len(x) <= 500]
json_ds = Dataset.from_pandas(pd.DataFrame(data=data_500))

# Add tokens <|im_start|> and <|im_end|>, latter is special eos token 
# tokenizer.pad_token = "</s>"
# tokenizer.add_tokens(["<|im_start|>"])
# tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
# model.resize_token_embeddings(len(tokenizer))
# model.config.eos_token_id = tokenizer.eos_token_id

tokenizer.pad_token = tokenizer.eos_token 

def tokenize_instruction(row):
    # print(row)
    input_token = tokenizer(row['0'], padding=True,
                            truncation=True, max_length=512)
    input_token['labels'] = input_token['input_ids'].copy()
    return input_token

dataset_tokenized = json_ds.map(tokenize_instruction,
                                remove_columns=["0"],
                                batched=True)

process_ds = dataset_tokenized.train_test_split(test_size=0.2)

process_ds

Map:   0%|          | 0/7339 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5871
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1468
    })
})

##### Model size details

In [7]:
# reviewing model footprint, use this for all models
model.get_memory_footprint()

4551360512

In [8]:
# looking at the model architecture
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

### Run below cells when doing PEFT

In [7]:
# Creating Low Ranking Adapters and checking trainable parameters
lora_config = config = LoraConfig(
    r=16, 
    lora_alpha=16, 
    target_modules = ['q_proj', 'k_proj',  'v_proj', 'o_proj', 'gate_proj', 'down_proj', 'up_proj'],
    # target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj',],
    # target_modules = ['q_proj', 'k_proj',],
    # target_modules = ['self_attn',],
    lora_dropout=0.1, 
    bias="none", 
    modules_to_save = ["lm_head", "embed_tokens"],		# needed because we added new tokens to tokenizer/model
    task_type="CAUSAL_LM"
)

# LoRA decomposes the weight update matrix into two smaller matrices. 
# The size of these low-rank matrices is determined by its rank or r. 
# A higher rank means the model has more parameters to train, but it also means the model has 
# more learning capacity. 
# Specify the target_modules which determine where the smaller matrices are inserted. 
# Other important parameters to set are lora_alpha (scaling factor), 
# bias (whether none, all or only the LoRA bias parameters should be trained), 
# and modules_to_save 

In [8]:
# prep_model = prepare_model_for_int8_training(model) # deprecated
# will take 2.0 GB of VRAM
# don't use this
# https://huggingface.co/docs/peft/v0.8.2/en/package_reference/peft_model#peft.prepare_model_for_kbit_training
prep_model = prepare_model_for_kbit_training(model)

In [8]:
# Takes > 7.0 GB of VRAM & with r = 16 throws OOM when using 8-bit prepped model
# Takes ~ 4.0 GB of VRAM & with r = 16 throws when using 8-bit no_prep model
# Takes ~ 7.0 GB of VRAM & with r = 16 works when using 4-bit prep model
# Takes ~ 3.0 GB of VRAM & with r = 16 works when using 4-bit no-prep model
# lora_model = get_peft_model(prep_model, lora_config)
lora_model_noprep = get_peft_model(model, lora_config)

In [9]:
# lora_model.print_trainable_parameters()
lora_model_noprep.print_trainable_parameters()  # trainable is 1.Mil params

trainable params: 304,087,040 || all params: 7,545,819,136 || trainable%: 4.029874484391565


Errors:

Value Error: Attempting to Unscale FP Gradients

- issue seems to be due to float16 https://github.com/huggingface/transformers/issues/23165

- https://discuss.pytorch.org/t/valueerror-attemting-to-unscale-fp16-gradients/81372/18

Solution was to use the plain 4-bit quantisation.

### Training part

In [14]:
# creating training_args
args = TrainingArguments(
    output_dir=save_location,
    evaluation_strategy='steps',
    save_strategy='steps',
    eval_steps=steps,
    save_steps=steps,
    per_device_eval_batch_size=batch_size,
    per_device_train_batch_size=batch_size,
    push_to_hub=False,
    report_to="none",
    #learning_rate=lr,
    lr_scheduler_type='constant',
    optim='paged_adamw_32bit',
    bf16=True,
)
# We have to use bf16 in the args above

In [15]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=False)

In [16]:
trainer = Trainer(
    # model=model,
    model=lora_model_noprep,
    # model=lora_model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=process_ds['train'],
    eval_dataset=process_ds['test'],
    args=args,
)

In [None]:
# model w/o quantisation will throw OOM Error with 11.9GB VRAM Usage
# when using lora_unprep_8bit model with bfloat16, throws Attempting to Unscale FP16 grad
# when using lora_prep_4bit model, throws OOM error
# using the model quantised with 4-bit and compute_type of bf_16 it
# started training.
trainer.train()