In [None]:
import torch
# from transformer_lens import HookedTransformer
# import pandas as pd

# # 1. 모델 로드
# model = HookedTransformer.from_pretrained("models/gpt2/gpt2-small-124M.pth")
from transformer_lens import HookedTransformer, HookedTransformerConfig

# 1. GPT-2 Small에 맞는 설정 생성
cfg = HookedTransformerConfig.from_dict({
    "d_model": 768,
    "d_head": 64,
    "n_heads": 12,
    "n_layers": 12,
    "n_ctx": 1024,
    "d_mlp": 3072,
    "act_fn": "gelu",
    "tokenizer_name": "gpt2",
})

# 2. 빈 모델 생성
model = HookedTransformer(cfg)
model


In [None]:
# 3. 로컬 .pth 파일에서 가중치 로드
state_dict = torch.load(r"models\gpt2\gpt2-small-124M.pth")

# 4. 가중치 주입 (strict=False를 권장, 레이어 이름 형식이 다를 수 있음)
model.load_state_dict(state_dict, strict=False)

# 2. 분석할 프롬프트 설정 (Mary와 John 문제)
prompt = "When Mary and John went to the store, John gave a drink to"
logits, cache = model.run_with_cache(prompt)

# 3. 마지막 토큰 위치에서 각 레이어의 잔차 연결(Residual Stream) 값 추출
# [n_layers, pos, d_model] 형태의 데이터를 가져옵니다.
accumulated_residual, labels = cache.accumulated_resid(pos_slice=-1, return_labels=True)

# 4. 각 레이어의 결과물을 최종 출력(Logits)으로 변환
# model.unembed()는 내부 벡터를 우리가 읽을 수 있는 단어 점수로 바꿔줍니다.
logit_lens_logits = model.unembed(accumulated_residual) # [n_layers, d_vocab]

# 5. 각 레이어별로 가장 확률이 높은 단어 TOP 1 추출
top_tokens = torch.argmax(logit_lens_logits, dim=-1)
decoded_tokens = [model.to_string(token) for token in top_tokens]

# 결과 출력
for i, token in enumerate(decoded_tokens):
    print(f"Layer {i:2}: 가장 유력한 예측 단어 -> '{token}'")

In [None]:
model

In [None]:
import torch
from transformer_lens import HookedTransformer
import pandas as pd

# 1. 모델 로드
model = HookedTransformer.from_pretrained("gpt2-small")

# 2. 분석할 프롬프트 설정 (Mary와 John 문제)
prompt = "When Mary and John went to the store, John gave a drink to"
logits, cache = model.run_with_cache(prompt)

# 3. 마지막 토큰 위치에서 각 레이어의 잔차 연결(Residual Stream) 값 추출
# [n_layers, pos, d_model] 형태의 데이터를 가져옵니다.
accumulated_residual, labels = cache.accumulated_resid(pos_slice=-1, return_labels=True)

# 4. 각 레이어의 결과물을 최종 출력(Logits)으로 변환
# model.unembed()는 내부 벡터를 우리가 읽을 수 있는 단어 점수로 바꿔줍니다.
logit_lens_logits = model.unembed(accumulated_residual) # [n_layers, d_vocab]

# 5. 각 레이어별로 가장 확률이 높은 단어 TOP 1 추출
top_tokens = torch.argmax(logit_lens_logits, dim=-1)
decoded_tokens = [model.to_string(token) for token in top_tokens]

# 결과 출력
for i, token in enumerate(decoded_tokens):
    print(f"Layer {i:2}: 가장 유력한 예측 단어 -> '{token}'")

In [1]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 32 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "openai-community/gpt2",
    max_seq_length = max_seq_length,
    load_in_4bit = False, # False for LoRA 16bit
    fast_inference = False, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.9, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = lora_rank*2, # *2 speeds up training
    use_gradient_checkpointing = "unsloth", # Reduces memory usage
    random_state = 3407,
)

KeyboardInterrupt: 

In [8]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 1024 # Can increase for longer reasoning traces
lora_rank = 16 # Larger rank = smarter, but slower

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = r"d:\models\openai-community\gpt2-medium",
    max_seq_length = max_seq_length,
    load_in_4bit = False, # False for LoRA 16bit
    fast_inference = False, # Enable vLLM fast inference
    max_lora_rank = lora_rank,
    gpu_memory_utilization = 0.9, # Reduce if out of memory
)

model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = [
        "c_attn", "attn.c_proj", "mlp.c_fc", "mlp.c_proj",
    ],
    lora_alpha = lora_rank*2, # *2 speeds up training
    use_gradient_checkpointing = "unsloth", # Reduces memory usage
    random_state = 3407,
)

==((====))==  Unsloth 2026.1.2: Fast Gpt2 patching. Transformers: 4.57.3.
   \\   /|    NVIDIA A10-24Q. Num GPUs = 1. Max memory: 23.937 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.9.1+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.5.1
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.
d:\models\openai-community\gpt2-medium does not have a padding token! Will use pad_token = <|endoftext|>.


In [None]:
model

In [9]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    try:
        instructions = examples["instruction"]
        inputs       = examples["input"]
        outputs      = examples["output"]
        texts = []
        for instruction, input, output in zip(instructions, inputs, outputs):
            # Must add EOS_TOKEN, otherwise your generation will go on forever!
            text = alpaca_prompt.format(instruction, input, output) + EOS_TOKEN
            texts.append(text)
    except Exception as e:
        print(f"Error processing examples: {e}")
        texts = []
    return { "text" : texts, }

from datasets import load_dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")
dataset = dataset.map(formatting_prompts_func,batched=True)
     

In [None]:
dataset[0]

In [None]:
column_names = dataset.column_names
if "text" not in column_names:
    print(f"에러: 'text' 필드를 찾을 수 없습니다. 현재 필드: {column_names}")

In [None]:
tokenizer.eos_token

In [None]:
# 1. 함수 수정 (tokenizer 인자를 추가로 받음)
def debugging_formatting_func(example, tokenizer): # tokenizer 추가
    text = example['text']
    
    if text is None:
        return {"text": ""} 
    
    try:
        # 인자로 받은 tokenizer 사용
        tokenizer(text, truncation=True, max_length=1024)
    except Exception as e:
        print(f"에러 발생 데이터: {text[:50]}...")
        raise e
    
    return example

print("--- 데이터셋 검증 시작 ---")

try:
    # 2. map 실행 시 fn_kwargs로 tokenizer 전달
    small_dataset = dataset.select(range(min(0000, len(dataset)))) 
    small_dataset.map(
        debugging_formatting_func, 
        fn_kwargs={"tokenizer": tokenizer}, # 여기서 전달!
        num_proc=40,
    )
    print("--- 100개 샘플 테스트 성공 ---")
    
except Exception as e:
    print(f"\n치명적 오류 발생: {e}")

--- 데이터셋 검증 시작 ---


Map (num_proc=40):   0%|          | 0/51760 [00:00<?, ? examples/s]

--- 100개 샘플 테스트 성공 ---


In [11]:
tokenizer

GPT2TokenizerFast(name_or_path='.\models\openai-community\gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [10]:
# 1. 밖에서 직접 토큰화 수행
def tokenize_function(examples, tokenizer):
    return tokenizer(examples["text"], truncation=True, max_length=128)

# num_proc을 명시적으로 1로 주어 밖에서 처리
dataset = dataset.map(tokenize_function,  fn_kwargs={"tokenizer": tokenizer},batched=True)

from trl import SFTConfig, SFTTrainer

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    # dataset_text_field="text",
    dataset_text_field=None, # 미리 토큰화했으므로 None으로 설정
    max_seq_length=max_seq_length,
    dataset_num_proc=1,
    packing=False,  # Can make training 5x faster for short sequences.
    args=SFTConfig(
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps=60,
        learning_rate=2e-4,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",  # Use TrackIO/WandB etc
    ),
)

Map (num_proc=1):   0%|          | 0/51760 [00:00<?, ? examples/s]

In [4]:

# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A10-24Q. Max memory = 23.937 GB.
0.463 GB of memory reserved.


In [11]:
trainer_stats = trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 6,291,456 of 361,114,624 (1.74% trained)


Step,Training Loss
1,3.5666
2,3.4445
3,3.5341
4,3.3923
5,3.6541
6,3.586
7,3.1797
8,3.0751
9,3.1229
10,2.9992


In [12]:

#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

NameError: name 'start_gpu_memory' is not defined

In [13]:

# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Continue the fibonnaci sequence.", # instruction
        "1, 1, 2, 3, 5, 8", # input
        "", # output - leave this blank for generation!
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\n1, 1, 2, 3, 5, 8\n\n### Instruction:\nWrite a response that appropriately completes the request.\n\n### Input:\n### Response:\n### Instruction:\nWrite a response that appropriately completes the request.\n\n### Input:\n### Response:\n### Instruction:\nWrite']

In [8]:
# 3. 입력 데이터 준비 (프롬프트 포맷팅)
inputs = tokenizer(
    [
        "When Mary and John went to the store, John gave a drink to"
    ], return_tensors = "pt").to("cuda")

# 4. 텍스트 생성
outputs = model.generate(**inputs, max_new_tokens = 128, use_cache = True)

# 5. 결과 디코딩
response = tokenizer.batch_decode(outputs)
print(response[0])

When Mary and John went to the store, John gave a drink to Mary and said, "Mary, I have a request for you."

"I have a request for you," Mary replied.

"I have a request for you," John replied.

"I have a request for you," Mary replied.

"I have a request for you," John replied.

"I have a request for you," Mary replied.

"I have a request for you," John replied.

"I have a request for you," Mary replied.

"I have a request for you," John replied.

"I have a request for you,"


In [None]:
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
    tokenizer,
    chat_template = "qwen3-instruct",
)

In [None]:
for name, module in model.named_modules():
    print(name)