# Fine-tune with Own Dataset

In [None]:
import torch
import os
import transformers
from transformers import AutoTokenizer
from peft import LoraConfig
import intel_extension_for_pytorch as ipex
from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, PeftModel
from bigdl.llm.transformers import AutoModelForCausalLM
from datasets import load_dataset

In [None]:
!rm -rf outputs/own-lora-model
!rm -rf outputs/own-merged-llm

<img src="imgs/finetune_pipe.png" width="800" />

## Load Desired Model

In [None]:
# Identifier of the base checkpoint that will be fine-tuned.
model_path = "mistralai/Mistral-7B-Instruct-v0.1"

# Load the base model with its weights stored in the 4-bit "nf4" low-bit
# format; float16 is requested as the torch dtype and optimize_model is
# explicitly turned off.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    load_in_low_bit="nf4",
    optimize_model=False,
)

## Load Tokenizer

*In the context of natural language processing and machine learning, **pad_token_id** typically refers to the identifier or index assigned to a special token representing padding in a sequence. When working with sequences of varying lengths, it's common to pad shorter sequences with a special token to make them uniform in length.*

Example:
```
data = [ "I like cat", "Do you like cat too?"]
tokenizer(data, padding=True, truncation=True, return_tensors='pt')
```
Output:
```
'input_ids': tensor([[101,146,1176,5855,102,0,0,0],[101,2091,1128,1176,5855,1315,136,102]])
'token_type_ids': tensor([[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0]])
'attention_mask': tensor([[1,1,1,1,1,0,0,0],[1,1,1,1,1,1,1,1]])
```
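In this output, the trailing `0` entries in the first row of `input_ids` are the padding tokens, and the matching `0` entries in `attention_mask` mark the positions the model should ignore.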

In [None]:
# Build the tokenizer that belongs to the base checkpoint.
# NOTE: the length limit keyword is `model_max_length`; the previous spelling
# (`max_model_length`) is not a tokenizer argument, so the intended 512-token
# limit was not being applied.
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    add_eos_token=True,
    model_max_length=512,
    use_fast=True,
    trust_remote_code=True,
)
# Pad with token id 0 (presumably the unknown token in this vocabulary --
# confirm), and pad on the left side of each sequence.
tokenizer.pad_token_id = 0
tokenizer.padding_side = "left"

## Load Own Dataset

In [None]:
import pickle
from datasets import Dataset, concatenate_datasets

# Template that turns one question/answer record into a single training text.
custom_prompt_format = "### Question: {question} ### Answer: {answer}"

# NOTE: unpickling can execute arbitrary code -- only load a pickle file that
# you produced yourself.
with open('outputs/custom_data.pickle', 'rb') as pickle_file:
    custom_data = pickle.load(pickle_file)


def process_data(sample):
    """Format one record as a prompt, echo it, and return its tokenization.

    `sample` must provide the "question" and "answer" keys; the returned value
    is whatever the notebook-level `tokenizer` yields for the formatted text.
    """
    text = custom_prompt_format.format(
        question=sample["question"],
        answer=sample["answer"],
    )
    print(text)
    return tokenizer(text)


# Build the dataset from the loaded records and tokenize every record.
# (To duplicate the data, wrap the dataset first:
#  dataset = concatenate_datasets([dataset, dataset]).)
dataset = Dataset.from_list(custom_data).map(process_data)

## Select xPU accelerator for Fine-tuning

In [None]:
# Move the model onto the 'xpu' device so that training runs there.
# NOTE(review): presumably the 'xpu' backend is made available by the
# intel_extension_for_pytorch import at the top of the notebook -- confirm.
model = model.to('xpu')

## Prepare Model for QLoRA INT4 Fine Tuning

<img src="imgs/qlora.png" width="800"/>

[Reference](https://www.linkedin.com/pulse/trends-llms-qlora-efficient-finetuning-quantized-vijay/?trk=article-ssr-frontend-pulse_more-articles_related-content-card)

Summary:
1. 4-bit quantization of the full pretrained language model to compress weights and reduce memory requirements using a novel NormalFloat encoding optimized for the distribution of neural network weights.
2. Low-rank adapters added densely throughout the layers of the 4-bit quantized base model. The adapters use full 16-bit precision and are finetuned while the base model remains fixed.
3. Double quantization further reduces memory by quantizing the first-stage quantization constants themselves from 32-bit to 8-bit with no accuracy loss.
4. Paged optimizers leverage unified memory to gracefully handle gradient checkpointing memory spikes and finetune models larger than the physical GPU memory through automatic paging.
5. Mixed precision combines 4-bit weights with 16-bit adapter parameters and activations, maximizing memory savings while retaining accuracy.

In [None]:
# Run the bigdl-llm QLoRA preparation helper on the low-bit model and keep the
# returned model; this must happen before the LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)

### LoRA Configuration

LoraConfig allows you to control how LoRA is applied to the base model through the following parameters:

***target_modules** - The modules (for example, attention blocks) to apply the LoRA update matrices* [Reference](https://www.databricks.com/blog/efficient-fine-tuning-lora-guide-llms)



In [None]:
# Sub-modules that receive LoRA update matrices.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "lm_head",
]

# LoRA hyper-parameters: rank 8, scaling alpha 16, 5% adapter dropout (a
# conventional value), and no bias terms trained.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=lora_target_modules,
)

# Wrap the prepared model so that it carries the LoRA adapters.
model = get_peft_model(model, config)

In [None]:
# Settings for a short run: 200 optimizer steps with batch size 1 and no
# gradient accumulation; checkpoints every 100 steps, logs every 20.
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    max_steps=200,
    warmup_steps=20,
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    logging_steps=20,
    save_steps=100,
    bf16=True,  # bf16 is more stable in training
    optim="adamw_hf",  # paged_adamw_8bit is not supported yet
    # gradient_checkpointing=True,  # can further reduce memory but slower
)

# Causal-LM collation (mlm=False), i.e. no masked-language-model objective.
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=causal_lm_collator,
)

# Silence the warnings during training. Please re-enable for inference!
model.config.use_cache = False

result = trainer.train()

# Persist the trained adapter for the merge step.
model.save_pretrained("outputs/own-lora-model")

## Merge LoRA Adapter with Base Model

In [None]:
import torch
import os
import transformers
from transformers import AutoTokenizer
from peft import LoraConfig
import intel_extension_for_pytorch as ipex
from bigdl.llm.transformers.qlora import get_peft_model, prepare_model_for_kbit_training, PeftModel
from bigdl.llm.transformers import AutoModelForCausalLM

In [None]:
# Base checkpoint identifier and the directory holding the trained adapter.
model_path = "mistralai/Mistral-7B-Instruct-v0.1"
adapter_path = "outputs/own-lora-model"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Reload the ORIGINAL (non-quantized) model on the CPU in float16 for merging;
# load_in_low_bit="nf4" is deliberately not passed here.
base_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map={"": "cpu"},
    torch_dtype=torch.float16,
)

In [None]:
# Attach the saved adapter to the CPU-resident base model.
lora_model = PeftModel.from_pretrained(
    base_model,
    adapter_path,
    torch_dtype=torch.float16,
    device_map={"": "cpu"},
)

# Keep a handle on the first layer's query-projection weight for inspection.
first_decoder_layer = lora_model.base_model.model.model.layers[0]
lora_weight = first_decoder_layer.self_attn.q_proj.weight

In [None]:
# Merge the adapter into the model and switch training mode off.
lora_model = lora_model.merge_and_unload()
lora_model.train(False)

# Build the state dict to save: skip every entry whose name mentions "lora"
# and strip the "base_model.model." wrapper text from the remaining names.
lora_model_sd = lora_model.state_dict()
deloreanized_sd = {
    name.replace("base_model.model.", ""): tensor
    for name, tensor in lora_model_sd.items()
    if "lora" not in name
}

In [None]:
# Write the merged weights and the tokenizer side by side so that the output
# directory can be loaded as one standalone model.
merged_model_dir = "outputs/own-merged-llm"
base_model.save_pretrained(merged_model_dir, state_dict=deloreanized_sd)
tokenizer.save_pretrained(merged_model_dir)

## Notices & Disclaimers 

Intel technologies may require enabled hardware, software or service activation. 

No product or component can be absolutely secure.  

Your costs and results may vary.  

No license (express or implied, by estoppel or otherwise) to any intellectual property rights is granted by this document, with the sole exception that code included in this document is licensed subject to the Zero-Clause BSD open source license (0BSD), Open Source Initiative. No rights are granted to create modifications or derivatives of this document. 

© Intel Corporation.  Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries.  Other names and brands may be claimed as the property of others.  