In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import json

model_name = "meta-llama/Meta-Llama-3.1-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print(f"Vocab size: {tokenizer.vocab_size}")
print(f"Model max length: {tokenizer.model_max_length}")

Vocab size: 128000
Model max length: 131072


In [2]:
# encode: 문자열 -> token IDs
text = "Hello, how are you?"
token_ids = tokenizer.encode(text)
print(f"원문: {text}")
print(f"Token IDs: {token_ids}")
print(f"Token 수: {len(token_ids)}")
print()

# 각 token ID가 어떤 문자열에 대응하는지 확인
for tid in token_ids:
    print(f"  {tid:>6d} -> '{tokenizer.decode([tid])}'")

print()

# decode: token IDs -> 문자열
decoded = tokenizer.decode(token_ids)
print(f"Decoded: {decoded}")

원문: Hello, how are you?
Token IDs: [128000, 9906, 11, 1268, 527, 499, 30]
Token 수: 7

  128000 -> '<|begin_of_text|>'
    9906 -> 'Hello'
      11 -> ','
    1268 -> ' how'
     527 -> ' are'
     499 -> ' you'
      30 -> '?'

Decoded: <|begin_of_text|>Hello, how are you?


In [3]:
# padding: 배치 처리를 위해 길이를 맞춤
tokenizer.pad_token = tokenizer.eos_token  # Llama는 pad token이 없으므로 설정 필요

# padding="max_length": 지정한 길이까지 PAD 토큰 추가
result = tokenizer(
    text,
    padding="max_length",
    max_length=15,
    return_tensors="pt",
)
print("=== padding='max_length', max_length=15 ===")
print(f"input_ids:      {result['input_ids'].tolist()}")
print(f"attention_mask: {result['attention_mask'].tolist()}")
print(f"  -> PAD token ID: {tokenizer.pad_token_id}")
print(f"  -> attention_mask에서 0인 위치가 padding")

=== padding='max_length', max_length=15 ===
input_ids:      [[128000, 9906, 11, 1268, 527, 499, 30, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001]]
attention_mask: [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]
  -> PAD token ID: 128001
  -> attention_mask에서 0인 위치가 padding


In [4]:
# truncation: 최대 길이 초과 시 잘라냄
long_text = "This is a very long sentence that we want to truncate to a shorter length."
result_trunc = tokenizer(
    long_text,
    truncation=True,
    max_length=10,
    return_tensors="pt",
)
print("=== truncation=True, max_length=10 ===")
print(f"원문 token 수: {len(tokenizer.encode(long_text))}")
print(f"Truncated input_ids: {result_trunc['input_ids'].tolist()}")
print(f"Truncated 후 token 수: {result_trunc['input_ids'].shape[1]}")
print(f"Decoded: '{tokenizer.decode(result_trunc['input_ids'][0])}'")

=== truncation=True, max_length=10 ===
원문 token 수: 17
Truncated input_ids: [[128000, 2028, 374, 264, 1633, 1317, 11914, 430, 584, 1390]]
Truncated 후 token 수: 10
Decoded: '<|begin_of_text|>This is a very long sentence that we want'


In [5]:
# padding side 설정
print(f"현재 padding_side: {tokenizer.padding_side}")

# right padding (default)
tokenizer.padding_side = "right"
right = tokenizer("Hi there", padding="max_length", max_length=10, return_tensors="pt")
print(f"\nRight padding: {right['input_ids'].tolist()}")
print(f"  -> [tokens... PAD PAD PAD]")

# left padding
tokenizer.padding_side = "left"
left = tokenizer("Hi there", padding="max_length", max_length=10, return_tensors="pt")
print(f"\nLeft padding:  {left['input_ids'].tolist()}")
print(f"  -> [PAD PAD PAD tokens...]")

# 다시 기본값으로 복원
tokenizer.padding_side = "right"

현재 padding_side: right

Right padding: [[128000, 13347, 1070, 128001, 128001, 128001, 128001, 128001, 128001, 128001]]
  -> [tokens... PAD PAD PAD]

Left padding:  [[128001, 128001, 128001, 128001, 128001, 128001, 128001, 128000, 13347, 1070]]
  -> [PAD PAD PAD tokens...]


In [8]:
with open("/teamspace/studios/this_studio/finetuning-tutorial/alpaca_gpt4_data.json", "r") as f:
    alpaca = json.load(f)

print(f"데이터셋 크기: {len(alpaca)}개")
print(f"\n--- 예시 1: input이 없는 경우 ---")
print(json.dumps(alpaca[0], indent=2, ensure_ascii=False))

# input이 있는 샘플 찾기
with_input = [row for row in alpaca if row["input"] != ""]
print(f"\n--- 예시 2: input이 있는 경우 ({len(with_input)}개 존재) ---")
print(json.dumps(with_input[0], indent=2, ensure_ascii=False))

데이터셋 크기: 52002개

--- 예시 1: input이 없는 경우 ---
{
  "instruction": "Give three tips for staying healthy.",
  "input": "",
  "output": "1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night."
}

--- 예시 2: input이 있는 경우 (20679개 존재) ---
{
  "instruction": "Identify the odd one out.",
  "input": "Twitter

In [9]:
# Instruct 모델의 tokenizer를 로드해야 chat_template이 있음
tokenizer_instruct = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")

# chat_template 확인
print("=== Chat Template (Jinja2) ===")
print(tokenizer_instruct.chat_template[:500], "...")

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

=== Chat Template (Jinja2) ===
{{- bos_token }}
{%- if custom_tools is defined %}
    {%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
    {%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
    {%- set date_string = "26 Jul 2024" %}
{%- endif %}
{%- if not tools is defined %}
    {%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
  ...


In [10]:
# Alpaca 데이터를 chat format으로 변환
row = alpaca[0]  # input이 없는 예시
print(f"Instruction: {row['instruction']}")
print(f"Input: '{row['input']}'")
print(f"Output: {row['output'][:100]}...")
print()

# chat message 형식으로 구성
user_content = row["instruction"]
if row["input"]:
    user_content += f"\n\n{row['input']}"

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": user_content},
    {"role": "assistant", "content": row["output"]},
]

# apply_chat_template으로 변환
formatted = tokenizer_instruct.apply_chat_template(messages, tokenize=False)
print("=== apply_chat_template 결과 ===")
print(formatted)

Instruction: Give three tips for staying healthy.
Input: ''
Output: 1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and...

=== apply_chat_template 결과 ===
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Give three tips for staying healthy.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.

2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exerc

In [11]:
# input이 있는 샘플도 확인
row_with_input = with_input[0]
print(f"Instruction: {row_with_input['instruction']}")
print(f"Input: {row_with_input['input'][:100]}...")
print()

user_content = row_with_input["instruction"] + f"\n\n{row_with_input['input']}"
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": user_content},
    {"role": "assistant", "content": row_with_input["output"]},
]

formatted = tokenizer_instruct.apply_chat_template(messages, tokenize=False)
print("=== input이 있는 경우 ===")
print(formatted)

Instruction: Identify the odd one out.
Input: Twitter, Instagram, Telegram...

=== input이 있는 경우 ===
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>

Identify the odd one out.

Twitter, Instagram, Telegram<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The odd one out is Telegram. Twitter and Instagram are social media platforms mainly for sharing information, images and videos while Telegram is a cloud-based instant messaging and voice-over-IP service.<|eot_id|>


In [12]:
# tokenize=True로 하면 바로 token ID로 변환
token_ids = tokenizer_instruct.apply_chat_template(messages, tokenize=True)
print(f"Token 수: {len(token_ids)}")
print(f"앞부분 token IDs: {token_ids[:20]}")
print()

# 앞부분 decode해서 special token 확인
print("=== 앞부분 decode (특수 토큰 포함) ===")
for i, tid in enumerate(token_ids[:15]):
    print(f"  [{i:2d}] {tid:>8d} -> '{tokenizer_instruct.decode([tid])}'")

Token 수: 2
앞부분 token IDs: [Encoding(num_tokens=91, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

=== 앞부분 decode (특수 토큰 포함) ===


TypeError: unsupported format string passed to tokenizers.Encoding.__format__

In [13]:
# special_tokens_map으로 전체 확인
print("=== special_tokens_map ===")
for key, value in tokenizer.special_tokens_map.items():
    print(f"  {key}: {value}")

print(f"\n=== 개별 확인 ===")
print(f"BOS token: '{tokenizer.bos_token}' (ID: {tokenizer.bos_token_id})")
print(f"EOS token: '{tokenizer.eos_token}' (ID: {tokenizer.eos_token_id})")
print(f"PAD token: '{tokenizer.pad_token}' (ID: {tokenizer.pad_token_id})")
print(f"UNK token: '{tokenizer.unk_token}' (ID: {tokenizer.unk_token_id})")

=== special_tokens_map ===
  bos_token: <|begin_of_text|>
  eos_token: <|end_of_text|>
  pad_token: <|end_of_text|>

=== 개별 확인 ===
BOS token: '<|begin_of_text|>' (ID: 128000)
EOS token: '<|end_of_text|>' (ID: 128001)
PAD token: '<|end_of_text|>' (ID: 128001)
UNK token: 'None' (ID: None)


In [14]:
# Llama 3의 추가 special token 확인
print("=== Llama 3 추가 special tokens ===")
added_tokens = tokenizer.added_tokens_encoder
for token, idx in sorted(added_tokens.items(), key=lambda x: x[1]):
    print(f"  [{idx:>6d}] {token}")

=== Llama 3 추가 special tokens ===
  [128000] <|begin_of_text|>
  [128001] <|end_of_text|>
  [128002] <|reserved_special_token_0|>
  [128003] <|reserved_special_token_1|>
  [128004] <|finetune_right_pad_id|>
  [128005] <|reserved_special_token_2|>
  [128006] <|start_header_id|>
  [128007] <|end_header_id|>
  [128008] <|eom_id|>
  [128009] <|eot_id|>
  [128010] <|python_tag|>
  [128011] <|reserved_special_token_3|>
  [128012] <|reserved_special_token_4|>
  [128013] <|reserved_special_token_5|>
  [128014] <|reserved_special_token_6|>
  [128015] <|reserved_special_token_7|>
  [128016] <|reserved_special_token_8|>
  [128017] <|reserved_special_token_9|>
  [128018] <|reserved_special_token_10|>
  [128019] <|reserved_special_token_11|>
  [128020] <|reserved_special_token_12|>
  [128021] <|reserved_special_token_13|>
  [128022] <|reserved_special_token_14|>
  [128023] <|reserved_special_token_15|>
  [128024] <|reserved_special_token_16|>
  [128025] <|reserved_special_token_17|>
  [128026] <|re

In [15]:
# Reserved special token 확인
# Llama 3는 미리 예약된 빈 슬롯을 가지고 있음
reserved = [t for t in added_tokens.keys() if "reserved_special_token" in t]
print(f"Reserved special tokens: {len(reserved)}개")
print(f"예시: {reserved[:5]}")
print(f"\n-> vocab size가 고정이므로, 미리 예약된 자리를 활용할 수 있음")

Reserved special tokens: 248개
예시: ['<|reserved_special_token_0|>', '<|reserved_special_token_1|>', '<|reserved_special_token_2|>', '<|reserved_special_token_3|>', '<|reserved_special_token_4|>']

-> vocab size가 고정이므로, 미리 예약된 자리를 활용할 수 있음


In [16]:
import torch

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
)

print(f"추가 전 vocab size: {len(tokenizer)}")
print(f"추가 전 embedding shape: {model.get_input_embeddings().weight.shape}")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading weights:   0%|          | 0/291 [00:00<?, ?it/s]

추가 전 vocab size: 128256
추가 전 embedding shape: torch.Size([128256, 4096])


In [17]:
# 새로운 special token 추가
new_tokens = ["<TIME>", "<DATE>", "<LOCATION>"]

num_added = tokenizer.add_special_tokens({
    "additional_special_tokens": new_tokens
})
print(f"추가된 토큰 수: {num_added}")
print(f"추가 후 vocab size: {len(tokenizer)}")

# 새 토큰 확인
for token in new_tokens:
    token_id = tokenizer.convert_tokens_to_ids(token)
    print(f"  '{token}' -> ID: {token_id}")

추가된 토큰 수: 3
추가 후 vocab size: 128259
  '<TIME>' -> ID: 128256
  '<DATE>' -> ID: 128257
  '<LOCATION>' -> ID: 128258


In [19]:
# embedding layer resize
model.resize_token_embeddings(len(tokenizer))
print(f"Resize 후 embedding shape: {model.get_input_embeddings().weight.shape}")
print()

# 새 토큰이 제대로 tokenize되는지 확인
test_text = "today we have meeting at <DATE> Seoul <LOCATION>"
encoded = tokenizer.encode(test_text)
print(f"원문: {test_text}")
print(f"Token IDs: {encoded}")
print()
for tid in encoded:
    print(f"  {tid:>8d} -> '{tokenizer.decode([tid])}'")

print(f"\n⚠️ 새 토큰의 embedding은 random init 상태 -> fine-tuning으로 학습 필요!")

Resize 후 embedding shape: torch.Size([128259, 4096])

원문: today we have meeting at <DATE> Seoul <LOCATION>
Token IDs: [128000, 31213, 584, 617, 6574, 520, 220, 128257, 51289, 220, 128258]

    128000 -> '<|begin_of_text|>'
     31213 -> 'today'
       584 -> ' we'
       617 -> ' have'
      6574 -> ' meeting'
       520 -> ' at'
       220 -> ' '
    128257 -> '<DATE>'
     51289 -> ' Seoul'
       220 -> ' '
    128258 -> '<LOCATION>'

⚠️ 새 토큰의 embedding은 random init 상태 -> fine-tuning으로 학습 필요!


In [20]:
# 1) 기본 Causal LM label: input을 한 칸 shift
text = "The cat sat on the mat"
token_ids = tokenizer.encode(text)
tokens = [tokenizer.decode([tid]) for tid in token_ids]

input_tokens = tokens[:-1]
label_tokens = tokens[1:]

print("=== Causal LM: Next Token Prediction ===")
print(f"Input:  {input_tokens}")
print(f"Label:  {label_tokens}")
print()
for i, (inp, lbl) in enumerate(zip(input_tokens, label_tokens)):
    print(f"  Step {i}: '{inp}' -> 예측 대상: '{lbl}'")

=== Causal LM: Next Token Prediction ===
Input:  ['<|begin_of_text|>', 'The', ' cat', ' sat', ' on', ' the']
Label:  ['The', ' cat', ' sat', ' on', ' the', ' mat']

  Step 0: '<|begin_of_text|>' -> 예측 대상: 'The'
  Step 1: 'The' -> 예측 대상: ' cat'
  Step 2: ' cat' -> 예측 대상: ' sat'
  Step 3: ' sat' -> 예측 대상: ' on'
  Step 4: ' on' -> 예측 대상: ' the'
  Step 5: ' the' -> 예측 대상: ' mat'


In [21]:
# 2) SFT Label Masking 시연
# Instruction 부분은 -100으로 masking, response 부분만 loss 계산

instruction = "What is the capital of France?"
response = "The capital of France is Paris."
full_text = instruction + " " + response

# tokenize
instruction_ids = tokenizer.encode(instruction)
full_ids = tokenizer.encode(full_text)

# label 생성: instruction 부분은 -100, response 부분은 실제 token ID
labels = [-100] * len(instruction_ids) + full_ids[len(instruction_ids):]

print("=== SFT Label Masking ===")
print(f"전체 token 수: {len(full_ids)}")
print(f"Instruction token 수: {len(instruction_ids)} (이 부분은 loss에서 제외)")
print()

print(f"{'Token':>20s} | {'Token ID':>10s} | {'Label':>10s} | Loss 계산?")
print("-" * 65)
for i, (tid, lbl) in enumerate(zip(full_ids, labels)):
    token_str = tokenizer.decode([tid])
    loss_yn = "❌ 제외" if lbl == -100 else "✅ 계산"
    print(f"{token_str:>20s} | {tid:>10d} | {lbl:>10d} | {loss_yn}")

=== SFT Label Masking ===
전체 token 수: 15
Instruction token 수: 8 (이 부분은 loss에서 제외)

               Token |   Token ID |      Label | Loss 계산?
-----------------------------------------------------------------
   <|begin_of_text|> |     128000 |       -100 | ❌ 제외
                What |       3923 |       -100 | ❌ 제외
                  is |        374 |       -100 | ❌ 제외
                 the |        279 |       -100 | ❌ 제외
             capital |       6864 |       -100 | ❌ 제외
                  of |        315 |       -100 | ❌ 제외
              France |       9822 |       -100 | ❌ 제외
                   ? |         30 |       -100 | ❌ 제외
                 The |        578 |        578 | ✅ 계산
             capital |       6864 |       6864 | ✅ 계산
                  of |        315 |        315 | ✅ 계산
              France |       9822 |       9822 | ✅ 계산
                  is |        374 |        374 | ✅ 계산
               Paris |      12366 |      12366 | ✅ 계산
                   . |         13 |  

In [22]:
# Packing 전: 개별 샘플 + padding
samples = [
    "Give three tips for staying healthy.",
    "What are primary colors?",
    "Describe the structure of an atom.",
    "How does photosynthesis work?",
]

max_len = 20  # 시연용 짧은 길이

print("=== Packing 전: 개별 padding ===")
total_tokens = 0
total_pad = 0
for s in samples:
    ids = tokenizer.encode(s)
    pad_count = max_len - len(ids)
    total_tokens += max_len
    total_pad += pad_count
    
    # 시각화: T=실제토큰, _=padding
    visual = "".join(["T" for _ in ids] + ["_" for _ in range(pad_count)])
    print(f"  [{visual}] 실제:{len(ids)} pad:{pad_count}")

print(f"\n  전체 {total_tokens} tokens 중 padding: {total_pad} ({total_pad/total_tokens*100:.1f}% 낭비!)")

=== Packing 전: 개별 padding ===
  [TTTTTTTT____________] 실제:8 pad:12
  [TTTTTT______________] 실제:6 pad:14
  [TTTTTTTT____________] 실제:8 pad:12
  [TTTTTTT_____________] 실제:7 pad:13

  전체 80 tokens 중 padding: 51 (63.7% 낭비!)


In [23]:
# Packing 후: 샘플들을 EOS로 연결
print("=== Packing 후: 연결된 시퀀스 ===")

all_ids = []
for s in samples:
    ids = tokenizer.encode(s)
    all_ids.extend(ids + [tokenizer.eos_token_id])  # EOS로 구분

# max_seq_len 단위로 잘라서 packed sequence 생성
packed_seqs = []
for i in range(0, len(all_ids), max_len):
    seq = all_ids[i : i + max_len]
    pad_count = max_len - len(seq)
    packed_seqs.append(seq + [tokenizer.pad_token_id] * pad_count)

total_tokens_packed = len(packed_seqs) * max_len
total_real = len(all_ids)
total_pad_packed = total_tokens_packed - total_real

for i, seq in enumerate(packed_seqs):
    visual = ""
    for tid in seq:
        if tid == tokenizer.pad_token_id and tid != tokenizer.eos_token_id:
            visual += "_"
        elif tid == tokenizer.eos_token_id:
            visual += "|"  # EOS를 구분자로 표시
        else:
            visual += "T"
    print(f"  Seq {i}: [{visual}]")

print(f"\n  전체 {total_tokens_packed} tokens 중 padding: {max(total_pad_packed, 0)} ({max(total_pad_packed,0)/total_tokens_packed*100:.1f}% 낭비)")
print(f"  -> 기존 {total_pad/total_tokens*100:.1f}% 낭비에서 {max(total_pad_packed,0)/total_tokens_packed*100:.1f}%로 개선!")

=== Packing 후: 연결된 시퀀스 ===
  Seq 0: [TTTTTTTT|TTTTTT|TTTT]
  Seq 1: [TTTT|TTTTTTT||||||||]

  전체 40 tokens 중 padding: 7 (17.5% 낭비)
  -> 기존 63.7% 낭비에서 17.5%로 개선!


In [24]:
# 실제 vanilla_train.py의 pack 함수와 동일한 방식
def pack(dataset, tokenizer, max_seq_len=1024):
    """Packing: 모든 샘플을 이어붙이고 max_seq_len+1 단위로 잘라서
    input_ids와 labels (1칸 shift) 생성"""
    all_token_ids = []
    for text in dataset:
        all_token_ids.extend(tokenizer.encode(text))

    packed_ds = []
    for i in range(0, len(all_token_ids), max_seq_len + 1):
        chunk = all_token_ids[i : i + max_seq_len + 1]
        if len(chunk) == (max_seq_len + 1):
            packed_ds.append({
                "input_ids": chunk[:-1],   # [0:max_seq_len]
                "labels": chunk[1:],        # [1:max_seq_len+1] (1칸 shift)
            })
    return packed_ds

# 샘플 데이터로 실행
packed = pack(samples, tokenizer, max_seq_len=15)
print(f"Packed sequences: {len(packed)}개")
for i, p in enumerate(packed):
    print(f"\n  Seq {i}:")
    print(f"    input_ids ({len(p['input_ids'])}): {p['input_ids'][:10]}...")
    print(f"    labels    ({len(p['labels'])}): {p['labels'][:10]}...")
    print(f"    -> input을 1칸 shift한 것이 label!")

Packed sequences: 1개

  Seq 0:
    input_ids (15): [128000, 36227, 2380, 10631, 369, 19994, 9498, 13, 128000, 3923]...
    labels    (15): [36227, 2380, 10631, 369, 19994, 9498, 13, 128000, 3923, 527]...
    -> input을 1칸 shift한 것이 label!
