In [46]:
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset
from peft import LoraConfig, get_peft_model

In [47]:
local_dir = "./model/kanana-1.5-2.1b-instruct-2505"

tokenizer = AutoTokenizer.from_pretrained(local_dir, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

In [48]:
df = pd.read_excel("./data/emotion_data.xlsx")
df = df.fillna('')

In [49]:
data_list = []
for _, row in df.iterrows():
    data_list.append({
        'emotion': row['감정_소분류'],
        '사람문장1': row['사람문장1'],
        '시스템문장1': row['시스템문장1'],
        '사람문장2': row['사람문장2'],
        '시스템문장2': row['시스템문장2'],
        '사람문장3': row['사람문장3'],
        '시스템문장3': row['시스템문장3'],
    })

dataset = Dataset.from_dict({key: [d[key] for d in data_list] for key in data_list[0].keys()})
dataset[0]

{'emotion': '노여워하는',
 '사람문장1': '일은 왜 해도 해도 끝이 없을까? 화가 난다.',
 '시스템문장1': '많이 힘드시겠어요. 주위에 의논할 상대가 있나요?',
 '사람문장2': '그냥 내가 해결하는 게 나아. 남들한테 부담 주고 싶지도 않고.',
 '시스템문장2': '혼자 해결하기로 했군요. 혼자서 해결하기 힘들면 주위에 의논할 사람을 찾아보세요. ',
 '사람문장3': '',
 '시스템문장3': ''}

In [50]:
def formatting_prompts_func(examples):
    texts = []

    for i in range(len(examples['사람문장1'])):
        messages = []
        for turn in range(1, 4):
            user = examples[f'사람문장{turn}'][i].strip()
            assistant = examples[f'시스템문장{turn}'][i].strip()

            if user:
                messages.append({"role": "user", "content": user})
            if assistant:
                messages.append({"role": "assistant", "content": assistant})

        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=False
        )

        texts.append(text)

    return {"text": texts}

formatted_dataset = dataset.map(formatting_prompts_func, batched=True, remove_columns=dataset.column_names)
formatted_dataset[0]

Map:   0%|          | 0/51630 [00:00<?, ? examples/s]

{'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n일은 왜 해도 해도 끝이 없을까? 화가 난다.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n많이 힘드시겠어요. 주위에 의논할 상대가 있나요?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n그냥 내가 해결하는 게 나아. 남들한테 부담 주고 싶지도 않고.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n혼자 해결하기로 했군요. 혼자서 해결하기 힘들면 주위에 의논할 사람을 찾아보세요.<|eot_id|>'}

In [51]:
def tokenize_function(examples):
    tokens = tokenizer(examples["text"], padding=True, return_tensors="pt")
    return tokens

tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset[0]

Map:   0%|          | 0/51630 [00:00<?, ? examples/s]

{'input_ids': [128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  128009,
  12800

In [52]:
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=[
        "q_proj", 
        "k_proj", 
        "v_proj"]
)
base_model = AutoModelForCausalLM.from_pretrained(local_dir, dtype=torch.bfloat16, device_map="auto", trust_remote_code=True)
model = get_peft_model(base_model, lora_config)

In [53]:
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_ds = split_dataset["train"]
eval_ds = split_dataset["test"]

In [54]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

args=TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # max_steps = 60,
        num_train_epochs=1,
        learning_rate = 2e-4, 
        bf16 = True,
        seed = 1234,
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        output_dir = "outputs",

        logging_steps=500,
        eval_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False
)

trainer = Trainer(
    model=model,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    data_collator=data_collator,
    args=args
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [55]:
trainer.train()

Step,Training Loss,Validation Loss
500,2.0691,2.001827
1000,1.9484,1.993265
1500,1.9273,1.986358
2000,1.9081,1.988279


KeyboardInterrupt: 