In [None]:
!pip install peft
!pip install datasets
!pip install bitsandbytes
!pip install accelerate

In [None]:
import os
import json

import torch
from torch.nn.utils.rnn import pad_sequence

from datasets import load_dataset,Dataset,concatenate_datasets
from peft import get_peft_model,LoraConfig,TaskType
from transformers import AutoModelForCausalLM,AutoTokenizer
from transformers import BitsAndBytesConfig
from transformers import TrainingArguments
from transformers import Trainer


In [None]:
# Model load
# The base checkpoint is loaded with 8-bit quantization and automatic device placement.
model_id = "MLP-KTLim/llama-3-Korean-Bllossom-8B"
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    # Passing `load_in_8bit=True` directly to `from_pretrained` is deprecated;
    # quantization settings are supplied through a BitsAndBytesConfig instead.
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse the EOS token as the padding token and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side='right'

In [None]:
# Wrap the base model with LoRA adapters.
# ref. https://arxiv.org/abs/2106.09685

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    # Module names that receive an adapter.
    target_modules=[
        'q_proj', 'k_proj', 'v_proj', 'o_proj',
        'gate_proj', 'up_proj', 'down_proj',
    ],
    r=4,
    lora_alpha=8,
    lora_dropout=0.05,
    # Optional, currently disabled: modules_to_save=['embed_tokens','lm_head']
)

model = get_peft_model(base_model, lora_config)

In [None]:
# Load the 'train' split of the KoAlpaca v1.1a dataset and
# echo the resulting Dataset summary as the cell output.
train_dataset = load_dataset(path='beomi/KoAlpaca-v1.1a', split='train')
train_dataset

Downloading readme:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/12.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/21155 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'output', 'url'],
    num_rows: 21155
})

In [None]:
# System message used for every conversation: a Korean sentence followed,
# on a new line, by its English counterpart.
PROMPT = (
    "당신은 유용한 AI 어시스턴트입니다. 사용자의 질의에 대해 친절하고 정확하게 답변해야 합니다.\n"
    "You are a helpful AI assistant, you'll need to answer users' queries in a friendly and accurate manner."
)

In [None]:
# Dataset Preprocessing
def formatting_func(examples):
    """Tokenize a batch of instruction/response pairs for causal-LM fine-tuning.

    For every pair, the system prompt and the instruction are rendered through
    the tokenizer's chat template (generation prompt included); the tokenized
    response and a closing ``<|eot_id|>`` token are appended after it.

    Args:
        examples: Batched mapping holding parallel ``'instruction'`` and
            ``'output'`` sequences.

    Returns:
        A dict with ``'input_ids'`` and ``'labels'``, each a list with one
        token-id list per example. Labels hold ``-100`` at the prompt positions
        and the actual token ids for the response and the trailing ``<|eot_id|>``.
    """
    # The end-of-turn id is the same for every example, so look it up once.
    eot_id = tokenizer.convert_tokens_to_ids('<|eot_id|>')

    batch_input_ids, batch_labels = [], []

    for user_text, answer_text in zip(examples['instruction'], examples['output']):
        conversation = [
            {'role': 'system', 'content': f"{PROMPT}"},
            {'role': 'user', 'content': f"{user_text}"},
        ]

        prompt_ids = tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            add_generation_prompt=True,
        )
        answer_ids = tokenizer(
            answer_text,
            return_attention_mask=False,
            add_special_tokens=False,
        )['input_ids']

        # Response tokens followed by the end-of-turn marker.
        completion_ids = answer_ids + [eot_id]

        batch_input_ids.append(prompt_ids + completion_ids)
        # Prompt positions are set to -100; only the completion keeps its ids.
        batch_labels.append([-100] * len(prompt_ids) + completion_ids)

    return {'input_ids': batch_input_ids, 'labels': batch_labels}

In [None]:
# Draw a small, reproducible subset of the data and tokenize it.
SEED = 42              # fixed seed so the same rows are selected on every run
N_TRAIN_SAMPLES = 50   # number of examples kept for this run

train_dataset = train_dataset.shuffle(seed=SEED)
# Never ask for more rows than the dataset holds.
train_dataset = train_dataset.select(range(min(N_TRAIN_SAMPLES, len(train_dataset))))
# Mapped in the main process: forking worker processes for this few rows is
# unnecessary and is what triggered the os.fork() message in the cell output.
train_dataset = train_dataset.map(
    formatting_func,
    batched=True,
    remove_columns=train_dataset.column_names,
)

  self.pid = os.fork()


Map (num_proc=4):   0%|          | 0/50 [00:00<?, ? examples/s]

In [None]:
# Plain alias: no train/eval split is made here, so `split_dataset` refers to
# the same Dataset object as `train_dataset`.
split_dataset = train_dataset

In [None]:
class CustomDataCollator(object):
    """Pad a list of tokenized examples into a single batch of tensors.

    Args:
        tokenizer: Tokenizer associated with the data (stored on the instance;
            not used for padding).
        prompt: System prompt text (stored on the instance).
        padding_value: Id used to pad ``input_ids``.
        batch_first: If True the outputs are ``(batch, seq)``; otherwise
            ``(seq, batch)``.
        label_padding_value: Value used to pad ``labels``. Defaults to ``-100``,
            the same marker the labels already use for positions that must not
            contribute to the loss.
    """

    def __init__(self,tokenizer,prompt,padding_value,batch_first,label_padding_value=-100):
        self.tokenizer = tokenizer
        self.prompt = prompt
        self.padding_value=padding_value
        self.batch_first=batch_first
        self.label_padding_value=label_padding_value

    def __call__(self, examples):
        """Collate ``examples`` into padded tensors.

        Args:
            examples: List of dicts, each with ``'input_ids'`` and ``'labels'``
                given as sequences of ints.

        Returns:
            Dict with ``'input_ids'`` (long), ``'labels'`` (long) and
            ``'attention_mask'`` (bool), all padded to the longest sequence.
        """
        input_ids = [torch.tensor(ex['input_ids'],dtype=torch.long) for ex in examples]
        labels = [torch.tensor(ex['labels'],dtype=torch.long) for ex in examples]

        padded_input_ids = pad_sequence(input_ids,padding_value=self.padding_value,batch_first=self.batch_first)
        # Labels are padded with the ignore marker, not the pad token id, so
        # padded positions are never treated as prediction targets.
        padded_labels = pad_sequence(labels,padding_value=self.label_padding_value,batch_first=self.batch_first)
        # Build the mask from the true sequence lengths. Comparing input_ids
        # with the pad id would also mask real tokens that share that id
        # (e.g. when the pad token is the EOS token).
        attention_mask = pad_sequence(
            [torch.ones(len(ids),dtype=torch.long) for ids in input_ids],
            padding_value=0,
            batch_first=self.batch_first,
        ).bool()

        return {'input_ids': padded_input_ids, 'labels': padded_labels,'attention_mask':attention_mask}


In [None]:
# Batch collator: pads with the tokenizer's pad token id, batch dimension first.
data_collator = CustomDataCollator(
    tokenizer=tokenizer,
    prompt=PROMPT,
    padding_value=tokenizer.pad_token_id,
    batch_first=True,
)

In [None]:
# Training configuration: a single epoch, batch size 1 per device, fp16 enabled.
training_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir='/content/result',
    save_strategy='epoch',
    save_total_limit=1,
    save_only_model=True,
    # --- schedule, precision, optimizer ---
    num_train_epochs=1,
    per_device_train_batch_size=1,
    fp16=True,
    optim='adamw_bnb_8bit',
    # --- logging ---
    logging_strategy='steps',
    logging_steps=30,
    # --- data ---
    label_names=['labels'],
    # Not set (library defaults apply): per_device_eval_batch_size,
    # gradient_accumulation_steps, evaluation_strategy, load_best_model_at_end.
)

In [None]:
# Assemble the Trainer and run training.
# No eval_dataset is passed, so only the training set is used.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss
30,0.0062




TrainOutput(global_step=50, training_loss=0.00591467022895813, metrics={'train_runtime': 78.8826, 'train_samples_per_second': 0.634, 'train_steps_per_second': 0.634, 'total_flos': 562182678429696.0, 'train_loss': 0.00591467022895813, 'epoch': 1.0})