In [1]:
from transformers import (AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
HfArgumentParser,
AutoTokenizer,
TrainingArguments,
Trainer,
GenerationConfig)
from tqdm import tqdm
import torch
import time
from trl import SFTTrainer
import pandas as pd
import numpy as np
from huggingface_hub import interpreter_login
device = 'cuda' if torch.cuda.is_available() else 'cpu'
interpreter_login(new_session=False)

print(torch.cuda.is_available())

  from .autonotebook import tqdm as notebook_tqdm


User is already logged in.
True


In [2]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig

fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)

accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [3]:
compute_dtype = getattr(torch,'float16')
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype = torch.bfloat16,
)

In [4]:
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'
original_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map='auto',
    quantization_config=bnb_config,
    trust_remote_code=True,
    token=True
)

Loading checkpoint shards: 100%|██████████| 3/3 [02:24<00:00, 48.12s/it]


In [5]:
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side='left',
    add_eos_token=True,
    add_bos_token = True
)
tokenizer.pad_token = tokenizer.eos_token

In [6]:
import json 
import random
with open('bt_dataset.json') as f:
    dataset = json.load(f)

test_samples = random.sample(range(len(dataset)),26)

train_jsons = []
test_jsons = []

for i in range(len(dataset)):
    if i in test_samples:
        test_jsons.append(dataset[i])
    else:
        train_jsons.append(dataset[i])

print("train dataset samples:",len(train_jsons))
print("test dataset samples:",len(test_jsons))

with open('bt_dataset_train.json','w') as f:
    json.dump(train_jsons,f)

with open('bt_dataset_test.json','w') as f:
    json.dump(test_jsons,f)

train dataset samples: 568
test dataset samples: 26


In [7]:
from datasets import load_dataset

train_dataset= load_dataset('json',data_files = 'bt_dataset_train.json')
test_dataset = load_dataset('json',data_files = 'bt_dataset_test.json')
print(train_dataset)
print(test_dataset)

Generating train split: 568 examples [00:00, 15217.72 examples/s]
Generating train split: 26 examples [00:00, 6606.40 examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 568
    })
})
DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'instruction'],
        num_rows: 26
    })
})





In [8]:
import json

def create_prompt_formats(sample):
    system_prompt = f"<s>[INST]{sample['instruction']}"
    summary = f"summary:{sample['input']}[/INST]"
    output = f"output:{sample['output']}</s>" if sample["output"] else None
    
    parts = [part for part in [system_prompt,summary,output] if part]

    formatted_prompt = "\n\n".join(parts)
    sample["text"] = formatted_prompt
    return sample

In [9]:
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length

In [10]:
from functools import partial

def preprocess_batch(batch,tokenizer,max_length):
    return tokenizer(batch['text'],max_length = max_length,truncation=True,padding=True,return_tensors = "pt")

def preprocess_dataset(tokenizer,max_length, seed, dataset):
    print("---preprocessing dataset---")
    dataset = dataset.map(create_prompt_formats)
    _preprocessing_function = partial(preprocess_batch,max_length = max_length,tokenizer = tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=['instruction','input','output','text'],
    )
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)
    dataset = dataset.shuffle(seed=seed)
    # dataset.set_format("pt",columns=["input_ids","attention_mask"],output_all_columns=True) 
    # requires_grad=True
    return dataset

In [11]:
max_length = get_max_length(original_model)
prompt_train = train_dataset['train'].map(create_prompt_formats)
prompt_test = test_dataset['train'].map(create_prompt_formats)

Found max lenth: 32768


Map: 100%|██████████| 568/568 [00:00<00:00, 6364.84 examples/s]
Map: 100%|██████████| 26/26 [00:00<00:00, 4151.20 examples/s]


In [12]:
train = preprocess_dataset(tokenizer,max_length,20,train_dataset['train'])
train

---preprocessing dataset---


Map: 100%|██████████| 568/568 [00:03<00:00, 150.35 examples/s]
Filter: 100%|██████████| 568/568 [00:05<00:00, 96.18 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 568
})

In [13]:
test = preprocess_dataset(tokenizer,max_length,20,test_dataset['train'])
test

---preprocessing dataset---


Map: 100%|██████████| 26/26 [00:00<00:00, 247.08 examples/s]
Filter: 100%|██████████| 26/26 [00:00<00:00, 281.61 examples/s]


Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 26
})

In [None]:
train['input_ids']

## Fine Tuning

In [14]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

## lora config
config = LoraConfig(
    r = 8,
    lora_alpha = 16,
    target_modules = [
       "gate_proj",
        "v_proj",
        "o_proj",
        "down_proj",
        "up_proj",
        "k_proj",
        "q_proj"
    ],
    bias = "none",
    lora_dropout = 0.05,
    task_type = "CAUSAL_LM"
)

original_model.gradient_checkpointing_enable()
kbit_model = prepare_model_for_kbit_training(original_model)

In [15]:
peft_model = get_peft_model(kbit_model,config)

In [16]:
peft_model.print_trainable_parameters()

trainable params: 20,971,520 || all params: 7,262,703,616 || trainable%: 0.2887563792882719


In [17]:
accelerate_model = accelerator.prepare_model(peft_model)

In [18]:
import transformers

output_dir = 'fine_tuned_mistral'
finetuned_name = "mistral-bt-finetuned"
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps = 1,
    per_device_train_batch_size = 2,
    max_steps = 350,
    learning_rate = 3e-4,
    optim = "paged_adamw_8bit",
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    fp16=True,
    logging_steps=50,
    logging_dir = "./logs",
    save_strategy = "steps",
    save_steps = 50,
    evaluation_strategy = "steps",
    eval_steps = 50,
    do_eval=True,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length = True,
    load_best_model_at_end = True,
)

accelerate_model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model = accelerate_model,
    train_dataset = train,
    eval_dataset = test,
    args = peft_training_args,
    data_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

peft_trainer.train()

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
50,0.6809,0.448083
100,0.3798,0.366858
150,0.2832,0.33356
200,0.1861,0.311269
250,0.1307,0.313911
300,0.0971,0.339954
350,0.0642,0.335184




TrainOutput(global_step=350, training_loss=0.26029074123927526, metrics={'train_runtime': 40621.2409, 'train_samples_per_second': 0.069, 'train_steps_per_second': 0.009, 'total_flos': 1.7898626711519232e+18, 'train_loss': 0.26029074123927526, 'epoch': 4.929577464788732})