In [1]:
import os

from datasets import load_dataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face access token, read from the HF_TOKEN environment variable so the
# secret never has to be typed into (or saved with) the notebook.
# When the variable is not set this falls back to an empty string, the same
# placeholder value the cell used before.
AUTH_TOKEN = os.environ.get("HF_TOKEN", "")

In [3]:
# Hub identifier of the KoAlpaca instruction dataset used in this notebook.
DATASET_NAME = "beomi/KoAlpaca-v1.1a"
data = load_dataset(DATASET_NAME)

Found cached dataset parquet (/Users/id4thomas/.cache/huggingface/datasets/beomi___parquet/beomi--KoAlpaca-v1.1a-1465f66eb846fd61/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 1/1 [00:00<00:00, 148.06it/s]


In [4]:
# Chat markup used when building prompts: the instruction markers delimit the
# user turn and the system markers delimit the system prompt.
B_INST = "[INST]"
E_INST = "[/INST]"
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"

# Default system prompt (the one from the llama git repo), assembled from its
# two paragraphs separated by a blank line.
DEFAULT_SYSTEM_PROMPT = "\n\n".join(
    (
        "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.",
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.",
    )
)

In [5]:
# Original Koalpaca preprocessing
def koalpaca_map_fn(x):
    """Format one record with the original KoAlpaca prompt template.

    Args:
        x: Mapping with "instruction" and "output" entries.

    Returns:
        A dict with a single "text" key: the question/answer prompt followed
        by the "<|endoftext|>" marker.
    """
    question = x['instruction']
    answer = x['output']
    return {'text': f"### 질문: {question}\n\n### 답변: {answer}<|endoftext|>"}

In [6]:
def llama_dialog_preprocess(dialog):
	"""Merge the system prompt into the first turn of a chat dialog.

	If the first message is not a "system" message, one carrying
	DEFAULT_SYSTEM_PROMPT is placed in front. The system content is then
	wrapped in B_SYS/E_SYS and prepended to the content of the message that
	follows it, so the returned list has no separate system entry.

	Args:
		dialog: List of message dicts with "role" and "content" keys.

	Returns:
		A new list whose first message holds the combined system and
		first-turn content, followed by the remaining messages.

	Raises:
		AssertionError: If the resulting messages do not alternate
			"user"/"assistant" starting with "user".
	"""
	def _all_have_role(turns, role):
		# True only when every message in `turns` carries the given role.
		return all([turn["role"] == role for turn in turns])

	if dialog[0]["role"] != "system":
		default_system = {"role": "system", "content": DEFAULT_SYSTEM_PROMPT}
		dialog = [default_system] + dialog

	system_msg, first_turn = dialog[0], dialog[1]
	merged_turn = {
		"role": first_turn["role"],
		"content": B_SYS + system_msg["content"] + E_SYS + first_turn["content"],
	}
	dialog = [merged_turn] + dialog[2:]

	# Even positions must be user turns, odd positions assistant turns.
	assert _all_have_role(dialog[::2], "user") and _all_have_role(dialog[1::2], "assistant"), (
		"model only supports 'system', 'user' and 'assistant' roles, "
		"starting with 'system', then 'user' and alternating (u/a/u/a/u...)"
	)
	return dialog

def map_dialog_to_tokenizer_input(dialog):
	"""Render the first two messages of a dialog as one prompt string.

	Args:
		dialog: List of message dicts; only the "content" of the first two
			entries is read.

	Returns:
		The whitespace-stripped first content wrapped in B_INST/E_INST,
		followed by the whitespace-stripped second content, all separated by
		single spaces.
	"""
	user_text = dialog[0]['content'].strip()
	assistant_text = dialog[1]['content'].strip()
	return " ".join((B_INST, user_text, E_INST, assistant_text))

def map_data_to_lambda_text(x):
	"""Turn one instruction/output record into a chat-formatted prompt string.

	Args:
		x: Mapping with "instruction" and "output" entries.

	Returns:
		The string produced by running a two-message user/assistant dialog
		through llama_dialog_preprocess and map_dialog_to_tokenizer_input.
	"""
	user_turn = {"role": "user", "content": x["instruction"]}
	assistant_turn = {"role": "assistant", "content": x["output"]}
	# No system message is supplied here; llama_dialog_preprocess adds the default one.
	return map_dialog_to_tokenizer_input(
		llama_dialog_preprocess([user_turn, assistant_turn])
	)


In [7]:
# Walk the first training example through the preprocessing steps one at a
# time so the final prompt layout can be inspected by eye.
sample = data["train"][0]
instruction, output = sample["instruction"], sample["output"]

# To use a custom system prompt, put a {"role": "system", "content": ...}
# message in front of the user turn.
dialog = [
	dict(role="user", content=instruction),
	dict(role="assistant", content=output),
]

processed_dialog = llama_dialog_preprocess(dialog)
processed_dialog_input = map_dialog_to_tokenizer_input(processed_dialog)
print(processed_dialog_input)

[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

양파는 어떤 식물 부위인가요? 그리고 고구마는 뿌리인가요? [/INST] 양파는 잎이 아닌 식물의 줄기 부분입니다. 고구마는 식물의 뿌리 부분입니다. 

식물의 부위의 구분에 대해 궁금해하는 분이라면 분명 이 질문에 대한 답을 찾고 있을 것입니다. 양파는 잎이 아닌 줄기 부분입니다. 고구마는 다른 질문과 답변에서 언급된 것과 같이 뿌리 부분입니다. 따라서, 양파는 식물의 줄기 부분이 되고, 고구마는 식물의 뿌리 부분입니다.

 덧붙이는 답변: 고구마 줄기도 볶아먹을 수 있나요? 

고구마 줄기도 식용으로 볶아먹을 수 있습니다. 하지만 줄기 뿐만 아니라, 잎, 씨, 뿌리까지 모든 부위가 식용으로 활용되기도 합니다. 다만, 한국에서는 일반적으로 뿌리 부분인 고구마를 주로 먹습니다.


In [8]:
# Load the tokenizer for meta-llama/Llama-2-7b-chat-hf.
# AUTH_TOKEN may still be the empty-string placeholder; it is mapped to None
# so that an empty string is never forwarded as a credential. With None the
# library is expected to fall back to its own credential lookup (for example
# a token cached by a previous Hugging Face login).
tokenizer = AutoTokenizer.from_pretrained(
	"meta-llama/Llama-2-7b-chat-hf",
	use_auth_token = AUTH_TOKEN or None
)



In [9]:
# Show each special token next to its id: BOS first, then EOS.
print(f"{tokenizer.bos_token} {tokenizer.bos_token_id}")
print(f"{tokenizer.eos_token} {tokenizer.eos_token_id}")

<s> 1
</s> 2


In [10]:
def _add_text_column(x):
    """Build the "text" entry for one record: BOS token + prompt + EOS token."""
    bos = tokenizer.bos_token
    prompt = map_data_to_lambda_text(x)
    eos = tokenizer.eos_token
    return {'text': bos + prompt + eos}


# Add (or overwrite) the "text" field used for tokenization further down.
data = data.map(_add_text_column)

Loading cached processed dataset at /Users/id4thomas/.cache/huggingface/datasets/beomi___parquet/beomi--KoAlpaca-v1.1a-1465f66eb846fd61/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-316fcef9c9555a24.arrow


In [11]:
# Peek at the generated "text" field of the first two training examples.
data["train"][:2]["text"]

["<s>[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n양파는 어떤 식물 부위인가요? 그리고 고구마는 뿌리인가요? [/INST] 양파는 잎이 아닌 식물의 줄기 부분입니다. 고구마는 식물의 뿌리 부분입니다. \n\n식물의 부위의 구분에 대해 궁금해하는 분이라면 분명 이 질문에 대한 답을 찾고 있을 것입니다. 양파는 잎이 아닌 줄기 부분입니다. 고구마는 다른 질문과 답변에서 언급된 것과 같이 뿌리 부분입니다. 따라서, 양파는 식물의 줄기 부분이 되고, 고구마는 식물의 뿌리 부분입니다.\n\n 덧붙이는 답변: 고구마 줄기도 볶아먹을 수 있나요? \n\n고구마 줄기도 식용으로 볶아먹을 수 있습니다. 하지만 줄기 뿐만 아니라, 잎, 씨, 뿌리까지 모든 부위가 식용으로 활용되기도 합니다. 다만, 한국에서는 일반적으로 뿌리 부분인 고구마를 주로 먹습니다.</s>",
 "<s>[INST] <<SYS>>\nYou are a helpful, respectful and hones

In [12]:
def _tokenize_text_batch(samples):
    """Tokenize the "text" field of a batch without adding special tokens.

    The "text" strings already start with the BOS token and end with the EOS
    token, so the tokenizer is told not to add them again.
    """
    return tokenizer(samples["text"], add_special_tokens=False)


train_ds = data["train"].map(_tokenize_text_batch, batched=True)

Loading cached processed dataset at /Users/id4thomas/.cache/huggingface/datasets/beomi___parquet/beomi--KoAlpaca-v1.1a-1465f66eb846fd61/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-1c10eb386fe5a352.arrow


In [13]:
# Show the token ids produced for the first training example.
print(train_ds[0]["input_ids"])

[1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889, 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641, 9109, 29889, 3575, 6089, 881, 451, 3160, 738, 10311, 1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916, 391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793, 29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443, 5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644, 263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338, 451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012, 310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915, 29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016, 29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 239, 153, 148, 240, 143, 143, 31081, 29871, 31129, 238, 153, 167, 29871, 31895, 238, 175, 191, 29871, 31279, 31724, 30918, 30903, 31527, 29973, 29871, 31607, 30826, 31137, 29871, 31137, 31231, 31417, 31081, 29871, 238