In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import Dataset

In [2]:
# Directory holding the locally stored checkpoint (weights + tokenizer files).
local_dir = "./model/kanana-1.5-2.1b-instruct-2505"

# Load the causal-LM weights in bfloat16; `device_map="auto"` delegates device
# placement to the library.
base_model = AutoModelForCausalLM.from_pretrained(
    local_dir,
    dtype=torch.bfloat16,
    device_map="auto",
    # NOTE(review): this permits custom modelling code shipped with the
    # checkpoint to run — confirm the checkpoint actually requires it.
    trust_remote_code=True,
)

# Tokenizer pads on the left, and the EOS token is reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(local_dir, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Read the training samples from the CSV file into a `Dataset`.
dataset = Dataset.from_csv("./data/sample_train_data.csv")
# Display the first record to inspect the available columns.
dataset[0]

{'Unnamed: 0': 0,
 'instruction': '카카오클라우드의 AI 서비스는 어떤 기능을 제공하나요?',
 'output': '카카오클라우드는 고객이 필요로 하는 서비스에 AI 기술을 편리하게 적용할 수 있도록 현재는 Kubeflow 서비스를 제공합니다.',
 'input': None}

In [14]:
def formatting_prompts_func(examples):
    """Render each instruction/output pair as a single chat-formatted string.

    Every pair becomes a two-turn conversation (user -> assistant) that is
    passed through the module-level ``tokenizer``'s chat template without
    tokenizing and without a trailing generation prompt.

    Args:
        examples: Column-wise batch mapping with parallel ``"instruction"``
            and ``"output"`` sequences. Any other keys are ignored.

    Returns:
        dict: ``{"text": [...]}`` holding one rendered conversation per pair,
        in input order.
    """
    turn_pairs = zip(examples["instruction"], examples["output"])

    rendered = [
        tokenizer.apply_chat_template(
            [
                {"role": "user", "content": question},
                {"role": "assistant", "content": answer},
            ],
            tokenize=False,
            # The assistant turn is already present, so no open-ended prompt is appended.
            add_generation_prompt=False,
        )
        for question, answer in turn_pairs
    ]

    return {"text": rendered}

# Run the formatter over the dataset in batches; all original columns are
# removed, so the result keeps only the new "text" column.
formatted_dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
    remove_columns=dataset.column_names,
)
# Display the first rendered sample.
formatted_dataset[0]

Map:   0%|          | 0/111 [00:00<?, ? examples/s]

{'text': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n카카오클라우드의 AI 서비스는 어떤 기능을 제공하나요?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n카카오클라우드는 고객이 필요로 하는 서비스에 AI 기술을 편리하게 적용할 수 있도록 현재는 Kubeflow 서비스를 제공합니다.<|eot_id|>'}

In [None]:
# def formatting_prompts_func(examples):
#     alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# {}

# ### Input:
# {}

# ### Response:
# {}"""

#     instructions = examples["instruction"]
#     inputs = examples["input"]
#     outputs = examples["output"]

#     EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

#     texts = []
#     for instruction, input, output in zip(instructions, inputs, outputs):
#         # Must add EOS_TOKEN, otherwise your generation will go on forever!
#         text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
#         texts.append(text)

#     return {"text": texts}


# dataset = dataset.map(formatting_prompts_func, batched=True)
# dataset = dataset.remove_columns(['Unnamed: 0'])
# dataset