# Efficient Fine Tuning of Quantized LLMs

### Installing required packages
#### Setting up virtual env

In [1]:
# conda create -n qlora python=3.10
# !conda activate qlora
# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# pip install scipy
# pip install -q -U bitsandbytes
# pip install -q -U git+https://github.com/huggingface/transformers.git
# pip install -q -U git+https://github.com/huggingface/peft.git
# pip install -q -U git+https://github.com/huggingface/accelerate.git
# pip install -q datasets
# pip install einops  # needed for falcon

In [6]:
import torch
import bitsandbytes
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [3]:
import os
os.getcwd()

'/data/finetuning/qlora'

First let's load the model we are going to use - falcon-7b! Note that the model itself is around 14GB in half precision

## Loading LLM

In [7]:
#model_id = "EleutherAI/gpt-neox-20b"
# model_id="google/flan-ul2" -- doesn't work
model_id="tiiuae/falcon-7b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# how does device_map interact with CUDA_VISIBLE_DEVICES
# maybe device_map="auto"?
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})
#model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")
# for falcon
# model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto", trust_remote_code=True)

# Load the model from local
model = AutoModelForCausalLM.from_pretrained('/data/finetuning/lit-parrot/checkpoints/tiiuae/falcon-7b/', quantization_config=bnb_config, device_map="auto", local_files_only=True, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Pre-processing Model

Then we have to apply some preprocessing to the model to prepare it for training. For that use the prepare_model_for_kbit_training method from PEFT.

In [None]:
from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )



trainable params: 9437184 || all params: 3618182016 || trainable%: 0.260826679207064


In [None]:
# Regularization strength (lora_alpha)
# Dropout probability (lora_dropout)
# target modules to be compressed (target_modules).
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=32,
    lora_alpha=32,
    target_modules=["query_key_value"],
    lora_dropout=0.2,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

In [6]:
# print(model)

## Loading Data

Let's load a dolly instruct dataset, to fine tune our model on famous quotes.

In [None]:
from datasets import load_dataset
# data = load_dataset("Abirate/english_quotes")
data = load_dataset('json', data_files='dolly-instruct-dataset.json')
data = data.map(lambda samples: tokenizer(samples["instruction"]), batched=True)

## Fine-tuning model

In [7]:
import transformers
# needed for gpt-neo-x tokenizer
tokenizer.pad_token = tokenizer.eos_token

trainer = transformers.Trainer(
    model=model,
    train_dataset=data["train"],
    args=transformers.TrainingArguments(
        per_device_train_batch_size=100,
        gradient_accumulation_steps=6,
        warmup_steps=2,
        max_steps=30, #10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs2",
        optim="paged_adamw_8bit",
        report_to="none"  # turns off wandb -> def. turn off for CCC.
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Found cached dataset json (/root/.cache/huggingface/datasets/json/default-061f7d4818b7ea6d/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/json/default-061f7d4818b7ea6d/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96/cache-9546399c3240be86.arrow
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1,2.8298
2,2.8298
3,2.801
4,2.7098
5,2.5811
6,2.4215
7,2.2506
8,2.0732
9,1.8673
10,1.6208


TrainOutput(global_step=30, training_loss=0.9821900140494109, metrics={'train_runtime': 193.0904, 'train_samples_per_second': 93.221, 'train_steps_per_second': 0.155, 'total_flos': 3534702046848000.0, 'train_loss': 0.9821900140494109, 'epoch': 10.0})

## Saving Model

In [8]:
model.save_pretrained("outputs2/")


## Inference

In [10]:
from peft import PeftModelForCausalLM, get_peft_config

compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# model = AutoModelForCausalLM.from_pretrained(
#     "tiiuae/falcon-7b-instruct", quantization_config=bnb_config, device_map="auto", trust_remote_code=True
# )

# You can comment and un comment this line to either use base model 
# or the peft model during the inference.
model = PeftModelForCausalLM.from_pretrained(model, 'outputs',local_files_only=True)

tok = AutoTokenizer.from_pretrained('ybelkada/falcon-7b-sharded-bf16')
tok.pad_token = tok.eos_token

In [22]:
%%time
prompt = f"""
: How can I create an account?
:
""".strip()

encoding = tok(prompt, return_tensors="pt")
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
#         generation_config=generation_config,
    )
print(tok.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


: How can I create an account?
: How can I change my password?
: How
CPU times: user 3.07 s, sys: 2.6 ms, total: 3.07 s
Wall time: 3.07 s


In [35]:
prompt = f"""
Answer the question based only on the context below. \
Context: it a debate that modern football fans have been having over recent years Ive got to get your take on it who better Messi or Ronaldo Ronaldo come two or three years as the one of the best in Europe I think in the moment no doubt no doubt that Ronald was the best teacher watch out lava sure Bobby what going on is really hot it that rule very well so they need to get the ball they have enough nobody gonna pick now Ronaldo really good bright sharp start to the game I just said pick test from but Im sure in rise to the occasion chickens are trying to take him a different levels different inscription here we see him come on the ball terrific balance look look at that lovely little trick that he has and his profes 21 international and you saw him take apart England under-21 side one with a couple of those well Kresna has gone and seen to that really speed of the ball either freaking manipulating it make decision. \
Question: Who is the best football player?
""".strip()

# inputs = tok(prompt, return_tensors="pt")
# inputs = inputs.to(0)
# output = model.generate(inputs["input_ids"])
# tokenizer.decode(output[0].tolist())

encoding = tok(prompt, return_tensors="pt")
with torch.inference_mode():
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        max_new_tokens=1000,
#         generation_config=generation_config,
    )
print(tok.decode(outputs[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Answer the question based only on the context below. Context: it a debate that modern football fans have been having over recent years Ive got to get your take on it who better Messi or Ronaldo Ronaldo come two or three years as the one of the best in Europe I think in the moment no doubt no doubt that Ronald was the best teacher watch out lava sure Bobby what going on is really hot it that rule very well so they need to get the ball they have enough nobody gonna pick now Ronaldo really good bright sharp start to the game I just said pick test from but Im sure in rise to the occasion chickens are trying to take him a different levels different inscription here we see him come on the ball terrific balance look look at that lovely little trick that he has and his profes 21 international and you saw him take apart England under-21 side one with a couple of those well Kresna has gone and seen to that really speed of the ball either freaking manipulating it make decision. Question: Who is t