<a href="https://colab.research.google.com/github/hypro2/hands-on-LLM-from-colab/blob/main/llama3_2_1b_LoRA_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets evaluate peft

Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.13.2-py3-none-any.whl

In [20]:
from datasets import load_dataset
from transformers import AutoTokenizer, LlamaForSequenceClassification

model_name = 'meta-llama/Llama-3.2-1B'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model =  LlamaForSequenceClassification.from_pretrained(model_name, num_labels=2,device_map='auto',torch_dtype="auto")

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
tokenizer(tokenizer.pad_token)

{'input_ids': [128000, 128001], 'attention_mask': [1, 1]}

In [29]:
model.config.pad_token_id = 128001

In [21]:
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
   

In [22]:
for parameter in model.parameters():
    parameter.requires_grad = False

for parameter in model.score.parameters():
    parameter.requires_grad = True

In [23]:
from peft import LoraConfig, get_peft_model

# LoRA 설정
lora_config = LoraConfig(
    r=8,  # Low-rank의 차원
    lora_alpha=16,  # LoRA alpha
    lora_dropout=0.1,  # Dropout 비율
    task_type="SEQ_CLS",  # 시퀀스 분류
    target_modules=["q_proj", "k_proj","v_proj","o_proj"],
)
model = get_peft_model(model, lora_config)

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-1B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 128256
}

In [7]:
def preprocess_data(example, tokenizer):
    return tokenizer(example["document"], truncation=True)

dataset = load_dataset("nsmc", trust_remote_code=True)

processed_dataset = dataset.map(
    lambda example: preprocess_data(example, tokenizer),
    batched=True,
    remove_columns=["id", "document"]
).rename_column("label", "labels")

print(dataset)
print(processed_dataset)
print(dataset["train"][0])
print(processed_dataset["train"][0])

Map:   0%|          | 0/150000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['id', 'document', 'label'],
        num_rows: 50000
    })
})
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 150000
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 50000
    })
})
{'id': '9976970', 'document': '아 더빙.. 진짜 짜증나네요 목소리', 'label': 0}
{'labels': 0, 'input_ids': [128000, 54059, 102519, 126015, 497, 118769, 49011, 250, 102249, 61415, 108231, 121279, 29102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [25]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# max_length_collator = DataCollatorWithPadding(
#     tokenizer=tokenizer,
#     padding="max_length"
# )
# max_length_dataloader = DataLoader(
#     processed_dataset["train"],
#     collate_fn=max_length_collator,
#     batch_size=4,
#     shuffle=False
# )
# max_length_iterator = iter(max_length_dataloader)
# max_lnegth_batch = next(max_length_iterator)
# print("max_length 패딩 입력 id shape :", max_lnegth_batch["input_ids"].shape)

longest_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="longest"
)
longest_dataloader = DataLoader(
    processed_dataset["train"],
    collate_fn=longest_collator,
    batch_size=4,
    shuffle=False
)
longest_iterator = iter(longest_dataloader)
longest_batch = next(longest_iterator)
print("longest 패딩 입력 id shape :", longest_batch["input_ids"].shape)

longest 패딩 입력 id shape : torch.Size([4, 26])


In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="text-classification",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    num_train_epochs=1,
    eval_steps=200,
    logging_steps=200,
    seed=42
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=longest_collator,
    train_dataset=processed_dataset["train"].select(range(10000)),
    eval_dataset=processed_dataset["test"].select(range(100))
)

trainer.train()

Step,Training Loss
200,0.6326


In [8]:
model.save_pretrained("text-classification-lora")

In [9]:
import torch

model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

text = "진짜 재밌었어요. 또 보러 갈거에요"
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs.to(device))
    print(outputs.logits)
    print(outputs.logits.argmax())

tensor([[0.1337, 0.4230]], device='cuda:0')
tensor(1, device='cuda:0')


In [None]:
import evaluate

yhat = trainer.predict(processed_dataset["test"])
predictions = yhat.predictions.argmax(axis=1)
references = yhat.label_ids

metric = evaluate.load("accuracy")
accuracy = metric.compute(predictions=predictions, references=references)
print(accuracy)

metric = evaluate.load("f1")
f1 = metric.compute(predictions=predictions, references=references)
print(f1)