## Run assistant train

The following configuration is tested working:
* disable Flash_Attn
* create peft_model with get_peft_model before calling SFTTrainer (without passing peft_config to the SFTTrainer)

If we pass `peft_config` to the `SFTTrainer` to create the `peft_model`, the following error is raised and still needs to be debugged:
`RuntimeError: expected scalar type BFloat16 but found Float`

### Prepare Environment (only need to do once!)

We first create a virtual environment and install the required packages.

```shell
cat /etc/os-release
nvcc -V
cd personal_copilot
python3.11 -m venv .copilot
source .copilot/bin/activate
pip install --upgrade pip setuptools wheel
pip install torch torchvision torchaudio
pip install packaging
pip install flash-attn
pip install -r training/requirements.txt
pip install -r dataset_generation/requirements.txt
```

### Train Model

```shell
python train.py \
    --model_name_or_path "bigcode/starcoder2-7b" \
    --lora_r 32 \
    --lora_alpha 64 \
    --lora_dropout 0.0 \
    --lora_target_modules "c_proj,c_attn,q_attn,c_fc,c_proj" \
    --use_nested_quant \
    --bnb_4bit_compute_dtype "bfloat16" \
    --use_flash_attn \
    --use_peft_lora \
    --use_4bit_quantization \
    --dataset_name "smangrul/hug_stack" \
    --dataset_text_field "text" \
    --max_seq_length 1024 \
    --fim_rate 0.5 \
    --fim_spm_rate 0.5 \
    --splits "train" \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --gradient_accumulation_steps 4 \
    --bf16 \
    --learning_rate 5e-4 \
    --lr_scheduler_type "cosine" \
    --weight_decay 0.01 \
    --max_steps 1000 \
    --warmup_steps 30 \
    --dataloader_num_workers 4 \
    --evaluation_strategy "steps" \
    --eval_steps 50 \
    --save_steps 50 \
    --logging_steps 25 \
    --output_dir "peft-lora-starcoder2-7b-personal-copilot-dual-3090-local" 
```

```shell
python train.py \
--model_name "codellama/CodeLlama-7b-hf" \
--lora_r 8 \
--lora_alpha 32 \
--lora_target_modules "all-linear" \
--use_nested_quant True \
--bnb_4bit_compute_dtype "bfloat16" \
--use_flash_attn True \
--use_peft_lora \
--use_4bit_quantization \
--dataset_name "smangrul/code-chat-assistant-v1" \
--dataset_text_field "content" \
--max_seq_len 512 \
--per_device_train_batch_size 1 \
--gradient_accumulation_steps 4 \
--bf16 True \
--learning_rate 5e-4 \
--lr_scheduler_type "cosine" \
--weight_decay 0.01 \
--save_steps 50 \
--dataloader_num_workers 4 \
--num_train_epochs 3 \
--logging_steps 25 \
--packing True \
--output_dir "peft-lora-codellama-7b-chat-asst-dual-3090-local"
```

### Using Tensorboard

```shell
cd personal_copilot/training/peft-lora-starcoder2-7b-personal-copilot-dual-3090-local
tensorboard --logdir=runs --bind_all
```

## Deep Dive 

### Dependencies

Now that we can run the training, let's go back to understand what is actually going on.

In [None]:
import sys
sys.path

In [None]:
import os
os.getcwd()

In [None]:
# add the parent directory to the path
sys.path.append('../chat_assistant/sft/training')
sys.path

In [None]:
# Extra packages needed only by this notebook; each one is installed on demand.
packages = ['ipywidgets']  # Add your packages here

for package in packages:
    # IPython shell escape: `pip install` runs only when `pip show` fails
    # (shell `||`), and the output of `pip show` is discarded.
    !pip show {package} > /dev/null || pip install {package}

In [None]:
from dataclasses import dataclass, field
import os
import sys
from typing import Optional
import torch
from transformers import (
    AutoModelForCausalLM,
    set_seed,
    BitsAndBytesConfig,
    HfArgumentParser, 
    TrainingArguments,
    AutoTokenizer,
    Trainer
)
from datasets import DatasetDict, load_dataset, load_from_disk
from datasets.builder import DatasetGenerationError
from trl import SFTTrainer
# from utils import (
#     create_and_prepare_model,
#     create_datasets,
#     loftq_init,
#     get_module_class_from_name,
# )
from train import ModelArguments, DataTrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training


We start by defining an `HfArgumentParser`. This class from the Hugging Face transformers library parses command-line arguments into the model, data, and training configuration dataclasses.

* We can place all the arguments in a `json` file and use `parse_json_file`.
* or place them in the command line and use `parse_args_into_dataclasses`. 

### Inputs 

#### from command line

In [None]:
# Command-line style arguments for this notebook run; they are parsed below
# into the model, data and training argument dataclasses.
# NOTE(review): `--model_name` and `--max_seq_len` are shortened spellings of
# the `model_name_or_path` / `max_seq_length` fields read later in this
# notebook -- presumably accepted through argparse prefix matching; confirm.
args = [
    "--model_name", "bigcode/starcoder2-3b",
    "--lora_r", "8",
    "--lora_alpha", "32",
    "--lora_target_modules", "all-linear",
    "--use_nested_quant", "True",
    "--bnb_4bit_compute_dtype", "bfloat16",
    "--use_flash_attn", "False",  # flash attention is switched off for this run
    "--use_peft_lora",
    "--use_4bit_quantization",
    "--dataset_name", "smangrul/code-chat-assistant-v1",
    "--dataset_text_field", "content",
    "--max_seq_len", "512",
    "--per_device_train_batch_size", "1",
    "--gradient_accumulation_steps", "1",
    "--bf16",
    "--learning_rate", "5e-4",
    "--lr_scheduler_type", "cosine",
    "--weight_decay", "0.01",
    "--save_steps", "50",
    "--dataloader_num_workers", "4",
    "--num_train_epochs", "3",
    "--logging_steps", "25",
    "--packing", "True",
    "--output_dir", "peft-lora-starcoder2-3b-chat-asst-dual-3090-local"
]

In [None]:
# Build one parser over the three argument dataclasses, then split the `args`
# list into one populated instance per dataclass.
dataclass_types = (ModelArguments, DataTrainingArguments, TrainingArguments)
parser = HfArgumentParser(dataclass_types)
parsed_dataclasses = parser.parse_args_into_dataclasses(args)
model_args, data_args, training_args = parsed_dataclasses

In [None]:
vars(model_args)

In [None]:
vars(data_args)

In [None]:
training_args;

### create_and_prepare_model

In [None]:
args = model_args
args

In [None]:
data_args = data_args
data_args

In [None]:
training_args = training_args
training_args;

In [None]:
# device_map = None
bnb_config = None
quant_storage_stype = None

In [None]:
load_in_8bit = model_args.use_8bit_qunatization
load_in_8bit

In [None]:
model_args.use_unsloth

In [None]:
if model_args.use_unsloth:
    from unsloth import FastLanguageModel

In [None]:
load_in_4bit = model_args.use_4bit_quantization
load_in_4bit

In [None]:
# Refuse to continue when Unsloth is requested inside a multi-process run.
# The checks are chained with `and`, so the process group is only queried
# when torch.distributed is available and already initialized.
running_distributed = (
    torch.distributed.is_available()
    and torch.distributed.is_initialized()
    and torch.distributed.get_world_size() > 1
)
if running_distributed and args.use_unsloth:
    raise NotImplementedError("Unsloth is not supported in distributed training")

#### Quantization & bnb config

We are using [QLoRA](https://huggingface.co/papers/2305.14314). QLoRA is a method for fine-tuning models that employs a two-pronged approach. 

Firstly, it quantizes the model to 4-bits, thereby reducing the computational resources required. 

Secondly, it incorporates a set of Low-Rank Adaptation (LoRA) weights into the model, which are fine-tuned via the quantized weights. 

In addition to the conventional Float4 data type (LinearFP4), QLoRA introduces a new 4-bit NormalFloat (LinearNF4) data type. This new data type is specifically designed for quantizing normally distributed data, and can enhance the model's performance.

##### 4bit quantization

In [None]:
bnb_4bit_compute_dtype = model_args.bnb_4bit_compute_dtype
bnb_4bit_compute_dtype

In [None]:
quant_storage_stype = getattr(torch, model_args.bnb_4bit_quant_storage_dtype)
quant_storage_stype

In [None]:
bnb_4bit_quant_type = model_args.bnb_4bit_quant_type
bnb_4bit_quant_type

In [None]:
bnb_4bit_use_double_quant = model_args.use_nested_quant
bnb_4bit_use_double_quant

In [None]:
# if load_in_4bit:
compute_dtype = getattr(torch, model_args.bnb_4bit_compute_dtype)
compute_dtype

In [None]:
# Quantization settings handed to `from_pretrained`.
# `bnb_4bit_quant_storage` is passed explicitly so the storage dtype of the
# packed 4-bit weights follows `quant_storage_stype` -- the same value that
# `torch_dtype` is derived from further down. Without it, a non-default
# `--bnb_4bit_quant_storage_dtype` would change `torch_dtype` but not the
# quantized weights themselves.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_quant_storage=quant_storage_stype,
)
bnb_config

##### what does this `compute_type` do?

We can change the compute data type from the default `float32` to `bfloat16` to speed up computation. This requires a GPU whose CUDA capability supports `torch.bfloat16`.

In [None]:
# Friendly hint: when 4-bit loading is combined with a float16 compute dtype
# and the GPU's major compute capability is at least 8, suggest --bf16.
if load_in_4bit and compute_dtype == torch.float16:
    capability_major = torch.cuda.get_device_capability()[0]
    if capability_major >= 8:
        banner = "=" * 80
        print(banner)
        print(
            "Your GPU supports bfloat16, you can accelerate training with the argument --bf16"
        )
        print(banner)

In [None]:
torch.cuda.get_device_capability()

##### quantization type

In [None]:
quant_storage_stype 

In [None]:
quant_storage_stype.is_floating_point

In [None]:
# Choose the dtype passed to `from_pretrained`: reuse the quant-storage dtype
# when it is set and is a floating-point type, otherwise use float32.
if quant_storage_stype and quant_storage_stype.is_floating_point:
    torch_dtype = quant_storage_stype
else:
    torch_dtype = torch.float32
torch_dtype

In [None]:
# bnb_4bit_quant_type = model_args.bnb_4bit_quant_type
# bnb_4bit_quant_type

[NF4](https://huggingface.co/docs/transformers/main/en/quantization?bnb=4-bit) is a 4-bit data type adapted for weights initialized from a normal distribution.

In [None]:
from bitsandbytes.nn import modules

We've set the `compute_dtype` for bnb to be `torch.bfloat16`.  

##### Nested quantization

[Nested quantization](https://huggingface.co/docs/transformers/main/en/quantization?bnb=4-bit) performs a second round of quantization on quantized weights to achieve additional 0.4 bits/parameter memory savings. 

In [None]:
# bnb_4bit_use_double_quant

##### Device Map (either 4bit or 8bit quantization)

```
if args.use_4bit_quantization or args.use_8bit_qunatization:
    device_map = (
        int(os.environ.get("LOCAL_RANK", -1))
        if torch.distributed.is_available() and torch.distributed.is_initialized()
        else "auto"
    )  # {"": 0}
```

In [None]:
# os.environ.get("LOCAL_RANK", -1)

In [None]:
# torch.distributed.is_available() 

In [None]:
# torch.distributed.is_initialized()

`torch.distributed.is_initialized()` is false so the `device_map` is set to "auto".

In [None]:
# device_map = (
#     int(os.environ.get("LOCAL_RANK", -1))
#     if torch.distributed.is_available() and torch.distributed.is_initialized()
#     else "auto"
# )  # {"": 0}
# device_map

The `device_map` variable is used to determine the device mapping for distributed training when using quantization.

In the context of distributed training, each process runs on a specific device (like a GPU). The `device_map` variable is used to specify which device the current process should run on.

`int(os.environ.get("LOCAL_RANK", -1))` tries to get the `LOCAL_RANK` environment variable, which is typically set in distributed training to indicate the rank of the current process. The rank is a unique identifier assigned to each process in a distributed training setup. If `LOCAL_RANK` is not set, it defaults to -1.

`torch.distributed.is_available()` and `torch.distributed.is_initialized()` checks ensure that the PyTorch distributed package is available and has been initialized. If these conditions are met, it means the code is running in a distributed training setup.

If `device_map` is set to "auto" during training, it'll automatically load the model on a GPU. 

When using the 8-bit quantized model, it is possible to [offload weights between the CPU and GPU](https://huggingface.co/docs/transformers/main/en/quantization?bnb=4-bit#offloading) with a custom `device_map` setting such as:

```python
device_map = {
    "transformer.word_embeddings": 0,
    "transformer.word_embeddings_layernorm": 0,
    "lm_head": "cpu",
    "transformer.h": 0,
    "transformer.ln_f": 0,
}
```

`0` is the index of the first GPU, while `"cpu"` keeps that module in CPU memory. Offloading this way makes it possible to fit very large models into memory.

#### Load model 

Depending on whether `unsloth` is used, we use different methods to load the model. 

We also specify different attention mechanisms.

In [None]:
model_args.use_unsloth

If `unsloth` is not used, we initialize the model with `AutoModelForCausalLM`.

```python
if args.use_unsloth:
    # Load model
    model, _ = FastLanguageModel.from_pretrained(
        model_name=args.model_name_or_path,
        max_seq_length=data_args.max_seq_length,
        dtype=None,
        load_in_4bit=load_in_4bit,
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name_or_path,
        load_in_8bit=load_in_8bit,
        quantization_config=bnb_config,
        device_map=device_map,
        trust_remote_code=True,
        attn_implementation="flash_attention_2" if args.use_flash_attn else "eager",
    )
```

In [None]:
model_args.model_name_or_path

In [None]:
bnb_config

See also [quantization with bits and bytes](https://huggingface.co/docs/transformers/main/en/quantization?bnb=4-bit)

In [None]:
# device_map

##### flash attention

Using [Flash Attention 2](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#Flash-Attention-2) in transformers can help speed up the training throughput. 

In [None]:
model_args.use_flash_attn

In [None]:
torch_dtype

In [None]:
# Pick the attention backend first, then load the base model with the
# quantization config and `torch_dtype` prepared in the earlier cells.
attn_implementation = "flash_attention_2" if args.use_flash_attn else "eager"
model = AutoModelForCausalLM.from_pretrained(
    args.model_name_or_path,
    quantization_config=bnb_config,
    load_in_8bit=load_in_8bit,
    torch_dtype=torch_dtype,
    attn_implementation=attn_implementation,
    trust_remote_code=True,
)
# model = AutoModelForCausalLM.from_pretrained(
#     model_args.model_name_or_path,
#     load_in_8bit=load_in_8bit,
#     quantization_config=bnb_config,
#     device_map=device_map,
#     trust_remote_code=True,
#     attn_implementation="flash_attention_2" if model_args.use_flash_attn else "eager",
# )

In [None]:
model

In [None]:
print(f"The memory footprint of the model is: {model.get_memory_footprint():,}")

### Prep peft_lora with quantization and no unsloth

#### LORA PEFT

Parameter-Efficient Fine-Tuning (PEFT) is a technique that allows you to fine-tune large models with limited resources. It does so by freezing the pretrained model parameters during fine-tuning and adding a small set of trainable parameters, called adapters, on top of them. This significantly [reduces the memory](https://huggingface.co/docs/transformers/model_memory_anatomy#anatomy-of-models-memory) required to fine-tune the model. 

Low-Rank Adaptation [(LoRA)](https://huggingface.co/docs/peft/main/en/conceptual_guides/lora) is a popular adapter-based method. It represents the weight updates with two smaller "update matrices" obtained through low-rank decomposition. The original weight matrix is frozen, while the "update matrices" are trained on the new data. At the end, the original weights and the adapter weights are combined to create the new weights.

The performance of LoRA fine-tuned models has been found to be comparable to that of fully fine-tuned models. Once the adapter weights are merged with the base model, LoRA does not introduce any additional inference latency.

LoRA is orthogonal to and can be combined with other PEFT methods. 

For fine-tuning transformer models, LoRA is typically applied only to the attention blocks for simplicity. The number of parameters in the adapter is determined by the rank parameter `r` and the shape of the original weight matrix.





* If we are using 4-bit or 8-bit quantization for peft_lora and
* We are NOT using unsloth 

Here is how we prepare for kbit training.

```python
if (
    (args.use_4bit_quantization or args.use_8bit_qunatization)
    and args.use_peft_lora
    and not args.use_unsloth
):
    model = prepare_model_for_kbit_training(
        model,
        use_gradient_checkpointing=training_args.gradient_checkpointing,
        gradient_checkpointing_kwargs={"use_reentrant": model_args.use_reentrant},
    )
```

In [None]:
prepare_model_for_kbit_training?

### Create peft model 

Depending on whether unsloth is used, we use different methods:

```python
if args.use_peft_lora and not args.use_unsloth:
    peft_config = LoraConfig(
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        r=args.lora_r,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=args.lora_target_modules.split(",")
        if args.lora_target_modules != "all-linear"
        else args.lora_target_modules,
    )
    model = get_peft_model(model, peft_config)
elif args.use_peft_lora and args.use_unsloth:
    # Do model patching and add fast LoRA weights
    model = FastLanguageModel.get_peft_model(
        model,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        r=args.lora_r,
        target_modules=args.lora_target_modules.split(",")
        if args.lora_target_modules != "all-linear"
        else args.lora_target_modules,
        use_gradient_checkpointing=training_args.gradient_checkpointing,
        random_state=training_args.seed,
        max_seq_length=data_args.max_seq_length,
    )
```

##### lora_config

In [None]:
peft_config = None
chat_template = None

In [None]:
model_args.use_peft_lora

In [None]:
model_args.use_unsloth

##### lora_alpha

Scaling factor

In [None]:
model_args.lora_alpha

In [None]:
model_args.lora_dropout

##### lora_r

rank of the "update matrices" in int. Lower rank leads to smaller update matrices and fewer trainable parameters.

In [None]:
model_args.lora_r

##### bias

Whether `bias` parameters should be trained.

##### target modules

The modules (e.g., attention blocks etc.) to which the LoRA weights are applied.

In [None]:
model_args.lora_target_modules.split(",")

In [None]:
# "all-linear" is forwarded to LoraConfig as a plain string; any other value
# is split on commas into a list of module names.
if model_args.lora_target_modules != "all-linear":
    lora_targets = model_args.lora_target_modules.split(",")
else:
    lora_targets = model_args.lora_target_modules

peft_config = LoraConfig(
    r=model_args.lora_r,
    lora_alpha=model_args.lora_alpha,
    lora_dropout=model_args.lora_dropout,
    target_modules=lora_targets,
    bias="none",
    task_type="CAUSAL_LM",
)
peft_config

In [None]:
vars(peft_config)

#### tokenizer

```python
    special_tokens = None
    chat_template = None
    if args.chat_template_format == "chatml":
        special_tokens = ChatmlSpecialTokens
        chat_template = DEFAULT_CHATML_CHAT_TEMPLATE
    elif args.chat_template_format == "zephyr":
        special_tokens = ZephyrSpecialTokens
        chat_template = DEFAULT_ZEPHYR_CHAT_TEMPLATE

    if special_tokens is not None:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path,
            pad_token=special_tokens.pad_token.value,
            bos_token=special_tokens.bos_token.value,
            eos_token=special_tokens.eos_token.value,
            additional_special_tokens=special_tokens.list(),
            trust_remote_code=True,
        )
        tokenizer.chat_template = chat_template
        # make embedding resizing configurable?
        model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=8)
    else:
        tokenizer = AutoTokenizer.from_pretrained(
            args.model_name_or_path, trust_remote_code=True
        )
        tokenizer.pad_token = tokenizer.eos_token
```

In [None]:
special_tokens = None
chat_template = None

In [None]:
model_args.chat_template_format

In [None]:
model_args.model_name_or_path

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, trust_remote_code=True)

In [None]:
tokenizer.pad_token = tokenizer.eos_token

In [None]:
vars(tokenizer)

In [None]:
model

In [None]:
peft_config

**We call `get_peft_model` here** 

In [None]:
model = get_peft_model(model, peft_config)
model

In [None]:
tokenizer

### Configure gradient checkpointing

Gradient checkpointing is a technique used to reduce the memory usage when training deep learning models, at the cost of increased computation time. It's useful when training large models that would otherwise not fit in memory.

```python
    model.config.use_cache = not training_args.gradient_checkpointing
```

This line turns off the model's `use_cache` option whenever gradient checkpointing is enabled. `use_cache` makes the model keep the attention key/value states of previous tokens so that they can be reused during generation. It is not needed for training, it costs extra memory, and it is incompatible with gradient checkpointing, so it is disabled in that case.


In [None]:
model.config.use_cache = not training_args.gradient_checkpointing
model.config.use_cache


```python
    training_args.gradient_checkpointing = (
        training_args.gradient_checkpointing and not model_args.use_unsloth
    )
    if training_args.gradient_checkpointing:
        training_args.gradient_checkpointing_kwargs = {
            "use_reentrant": model_args.use_reentrant
        }
```

We enable gradient checkpointing only if it was initially enabled and `use_unsloth` is not set in the model arguments.

If gradient checkpointing is enabled, we set the `use_reentrant` argument according to the provided input arguments.

In [None]:
training_args.gradient_checkpointing

In [None]:
training_args.gradient_checkpointing and not model_args.use_unsloth

In [None]:
model_args.use_reentrant

In [None]:
# Gradient checkpointing stays on only if it was requested and Unsloth is
# not in use; when it is on, forward the `use_reentrant` choice as a kwarg.
keep_checkpointing = training_args.gradient_checkpointing and not model_args.use_unsloth
training_args.gradient_checkpointing = keep_checkpointing
if training_args.gradient_checkpointing:
    training_args.gradient_checkpointing_kwargs = dict(
        use_reentrant=model_args.use_reentrant
    )

In [None]:
training_args.gradient_checkpointing

### Datasets

In [None]:
vars(data_args)

In [None]:
model_args.chat_template_format

In [None]:
apply_chat_template=model_args.chat_template_format != "none"
apply_chat_template

#### Load dataset

In [None]:
raw_datasets = DatasetDict()

In [None]:
data_args.splits.split(",")

In [None]:
data_args.dataset_name

In [None]:
data_args.splits

In [None]:
# Load every requested split: try the Hub first and fall back to a local
# on-disk dataset, then file the result under "train" or "test".
for split in data_args.splits.split(","):
    try:
        dataset = load_dataset(data_args.dataset_name, split=split)
    except DatasetGenerationError:
        local_path = os.path.join(data_args.dataset_name, split)
        dataset = load_from_disk(local_path)

    # "train" is checked before "test", mirroring an if/elif chain.
    bucket = next((key for key in ("train", "test") if key in split), None)
    if bucket is None:
        raise ValueError(
            f"Split type {split} not recognized as one of test or train."
        )
    raw_datasets[bucket] = dataset

In [None]:
raw_datasets['train']

In [None]:
raw_datasets['test']

```python
    if apply_chat_template:
        raw_datasets = raw_datasets.map(
            preprocess,
            batched=True,
            remove_columns=raw_datasets["train"].column_names,
        )
```

In [None]:
apply_chat_template

In [None]:
train_data = raw_datasets["train"]
valid_data = raw_datasets["test"]

In [None]:
print(
    f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}"
)

In [None]:
print(f"A sample of train dataset: {train_data[0]}")

In [None]:
train_dataset = train_data
train_dataset

In [None]:
eval_dataset = valid_data
eval_dataset

In [None]:
data_column = data_args.dataset_text_field
data_column

### Review all the arguments

In [None]:
vars(model_args)

In [None]:
vars(data_args)

In [None]:
vars(training_args)

Let's discuss those parameters that we have not yet covered

#### batch size

Batch size is recommended to be a power of two (2^N), often a multiple of 8.

[Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc) define the multiplier based on the dtype and the hardware. For instance, 
* for fp16 data type a multiple of 8 is recommended 
* but for an A100 GPU, a multiple of 64 is recommended

#### gradient accumulation

Gradient accumulation is a technique designed to compute gradients in smaller, more manageable increments rather than processing the entire batch simultaneously. This method involves a series of forward and backward passes through the model, during which gradients are calculated and accumulated. After a sufficient number of gradients have been gathered, the optimization step of the model is carried out. 

The advantage of using gradient accumulation is that it allows for an increase in the effective batch size, surpassing the constraints set by the GPU's memory. However, it's crucial to be aware that the extra forward and backward passes required by this method can potentially decelerate the training process.

In [None]:
training_args.gradient_accumulation_steps

In [None]:
training_args.per_device_train_batch_size

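With the settings from the first training command at the top (`--per_device_train_batch_size 2` and `--gradient_accumulation_steps 4`), this results in an effective batch size of 4x2 = 8 on a single GPU. With the arguments parsed in this notebook (both set to 1), the effective batch size is 1.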

#### gradient checkpointing

Gradient checkpointing is a technique that balances memory usage and computational speed during model training. Instead of storing all activations from the forward pass for gradient computation, which can consume significant memory, or discarding and recalculating them, which can slow down training, gradient checkpointing selectively saves certain activations. This means only a subset of activations need to be recalculated, optimizing both memory and computation resources.

But it comes with a cost of [slowing down the training by approximately 20%](https://huggingface.co/docs/transformers/main/en/perf_train_gpu_one)

In [None]:
training_args.gradient_checkpointing

#### Mixed precision

Mixed precision training is a method that enhances computational efficiency in model training by using lower-precision numerical formats for certain variables. While most models traditionally use 32-bit floating point precision (fp32), not all variables need this level of precision. By lowering the precision of some variables to formats like 16-bit floating point (fp16), computations can be sped up.

Typically in mixed precision training: 
* Activations are in half precision (fp16)
* Despite gradients being computed in half precision, they are converted back to full precision for optimization, so no memory is saved in this step. 
* It could also lead to more GPU memory being utilized, especially for small batch sizes. 

Newer GPU architectures, like the Ampere architecture, offer the bf16 and tf32 data types. The traditional one is fp16.  

In [None]:
print(training_args.tf32)

In [None]:
print(training_args.bf16)

In [None]:
training_args.optim

### Trainer

In [None]:
len(train_dataset)

In [None]:
train_dataset

In [None]:
len(eval_dataset)

In [None]:
eval_dataset

In [None]:
# # trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
# )

In [None]:
# Supervised fine-tuning trainer. `model` was already wrapped by
# `get_peft_model` earlier, so no `peft_config` is passed here.
sft_dataset_kwargs = {
    "append_concat_token": data_args.append_concat_token,
    "add_special_tokens": data_args.add_special_tokens,
}
trainer = SFTTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    dataset_text_field=data_args.dataset_text_field,
    max_seq_length=data_args.max_seq_length,
    packing=data_args.packing,
    dataset_kwargs=sft_dataset_kwargs,
)

In [None]:
trainer.accelerator.print(f"{trainer.model}")

In [None]:
model_args.use_peft_lora

In [None]:
if model_args.use_peft_lora:
    trainer.model.print_trainable_parameters()

##### loftq

For QLoRA training, when we're preparing to quantize the base model, it's worth considering the use of LoftQ initialization. This method has demonstrated its ability to enhance performance in conjunction with quantization. The underlying concept is to initialize the LoRA weights in a way that minimizes the quantization error. 

In [None]:
model_args.use_loftq

In [None]:
# LoftQ initialization when using QLoRA.
# The top-level `from utils import ...` in this notebook is commented out, so
# `loftq_init` is imported here; otherwise this cell raises NameError as soon
# as both `use_4bit_quantization` and `use_loftq` are set.
if model_args.use_4bit_quantization and model_args.use_loftq:
    from utils import loftq_init

    loftq_init(trainer.model, tokenizer, train_dataset, data_args.max_seq_length, model_args)

If enabled, `loftq_init` will call `replace_lora_weights_loftq` to replace the LoRA weights with LoftQ-initialized weights.

##### checkpoint

In [None]:
print(training_args.resume_from_checkpoint)

In [None]:
# Resume from the configured checkpoint if there is one; stays None otherwise.
checkpoint = training_args.resume_from_checkpoint

In [None]:
vars(model_args)

### Train

In [None]:
trainer.train(resume_from_checkpoint=checkpoint)

### Save model result

In [None]:
trainer.is_fsdp_enabled

In [None]:
# When FSDP is enabled, switch the FSDP plugin to the "FULL_STATE_DICT"
# state-dict type -- presumably so that a later save writes full (unsharded)
# weights; confirm against the accelerate FSDP documentation.
if trainer.is_fsdp_enabled:
    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")

# NOTE: the explicit final save is left commented out in this notebook.
#trainer.save_model()