In [1]:
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments,AutoConfig
from datasets import Dataset
import torch
import bitsandbytes as bnb
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

[2023-12-25 03:13:49,134] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
import pandas as pd
import requests
from datasets import Dataset

train_df = pd.read_csv('train.csv')
train_dataset = Dataset.from_pandas(train_df)
validation_df = pd.read_csv('val.csv')
validation_dataset = Dataset.from_pandas(validation_df)

In [3]:
import wandb, os
wandb_project = "mix-finetune"
if len(wandb_project) > 0:
    os.environ["WANDB_PROJECT"] = wandb_project

# Load model

In [4]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16,
    bnb_4bit_use_double_quant=True,
)
config = AutoConfig.from_pretrained('mistralai/Mixtral-8x7B-v0.1')
config.use_cache = False
config.gradient_checkpointing = True
torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16


model = AutoModelForCausalLM.from_pretrained('mistralai/Mixtral-8x7B-v0.1',
                                             config=config,
                                             quantization_config=bnb_config,
                                             trust_remote_code=False,
                                             torch_dtype=torch_dtype,
                                             device_map="auto")

Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [6]:
pad_token_id = 0
tokenizer = AutoTokenizer.from_pretrained('mistralai/Mixtral-8x7B-v0.1',
                                          trust_remote_code=False,
                                          use_fast=True)
tokenizer.pad_token_id = pad_token_id
tokenizer.pad_token = tokenizer.convert_ids_to_tokens(pad_token_id)

# Qlora configuration

In [12]:
def find_all_linear_names(model, add_lm_head=True):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if add_lm_head and not "lm_head" in lora_module_names:
        lora_module_names.add("lm_head")

    return list(lora_module_names)

In [13]:
target_modules = find_all_linear_names(model)
target_modules

['k_proj', 'w3', 'v_proj', 'gate', 'lm_head', 'q_proj', 'w2', 'w1', 'o_proj']

In [14]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
    bias="none",
    task_type="CAUSAL_LM",
    inference_mode=False
)
model = prepare_model_for_kbit_training(model,
                                        use_gradient_checkpointing=True)

model = get_peft_model(model, peft_config)

# refine training arguments

In [15]:
################################################################################
# TrainingArguments parameters
################################################################################

# Number of training epochs
num_train_epochs = 3

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 2

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 2

# Initial learning rate (AdamW optimizer)
learning_rate = 1e-4

# Optimizer to use
optim = "paged_adamw_8bit"

# Log every X updates steps
logging_steps = 10

eval_steps = 10
save_steps = 10
logging_steps = 10
lr_scheduler_type = "constant"
warmup_steps = 50
gradient_checkpointing = True
weight_decay = 0.05
save_total_limit = 3

training_args = TrainingArguments(
    do_train=True,
    do_eval=True,
    output_dir="./checkpoints-3",
    dataloader_drop_last=True,
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_strategy="steps",
    num_train_epochs=num_train_epochs,
    eval_steps=eval_steps,
    save_steps=save_steps,
    logging_steps=logging_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_train_batch_size*2,
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_steps=warmup_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    weight_decay=weight_decay,
    report_to="wandb",
    load_best_model_at_end=True,
    save_total_limit=save_total_limit,
    bf16=True if torch.cuda.is_bf16_supported() else False,
    fp16=False if torch.cuda.is_bf16_supported() else True,
)

# define trainer

In [16]:
block_size = 1024
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    dataset_text_field="text",
    max_seq_length=block_size,
    tokenizer=tokenizer,
    data_collator=None,
    packing=None
)

Map:   0%|          | 0/329 [00:00<?, ? examples/s]

Map:   0%|          | 0/37 [00:00<?, ? examples/s]



# Train the model

In [None]:
trainer.train()

In [None]:
trainer.save_model('mix-qlora-result')