In [1]:
!pip install peft

Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.12.0-py3-none-any.whl (296 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.12.0


In [3]:
# pip install accelerate
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from transformers import Trainer, BitsAndBytesConfig
import torch
import transformers
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
    TaskType
)
torch.random.manual_seed(0)

<torch._C.Generator at 0x7bfcf0365a90>

In [None]:
!pip list

Package                          Version
-------------------------------- ---------------------
absl-py                          1.4.0
accelerate                       0.33.0
aiohappyeyeballs                 2.4.0
aiohttp                          3.10.5
aiosignal                        1.3.1
alabaster                        0.7.16
albucore                         0.0.14
albumentations                   1.4.14
altair                           4.2.2
annotated-types                  0.7.0
anyio                            3.7.1
argon2-cffi                      23.1.0
argon2-cffi-bindings             21.2.0
array_record                     0.5.1
arviz                            0.18.0
asn1crypto                       1.5.1
astropy                          6.1.3
astropy-iers-data                0.2024.8.27.10.28.29
astunparse                       1.6.3
async-timeout                    4.0.3
atpublic                         4.1.0
attrs                            24.2.0
audioread             

**모델 및 토크나이저 생성**

In [5]:
model = transformers.AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3.5-mini-instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",

)
model.enable_input_require_grads()
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")

lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

trainable params: 1,572,864 || all params: 3,822,652,416 || trainable%: 0.0411


In [None]:
import os
import numpy as np

In [None]:
os.getcwd()

'/content'

In [None]:
train_file = open('/content/drive/MyDrive/AI_Practice/ratings_train.txt', 'r', encoding='utf-8')
test_file = open('/content/drive/MyDrive/AI_Practice/ratings_test.txt', 'r', encoding='utf-8')


In [None]:
lines = train_file.read().split('\n')
lines.pop(0)
lines.pop(-1)

train_sentences = []
train_labels = []

for line in lines:
  tks = line.split('\t')
  train_sentences.append(tks[1])
  train_labels.append(tks[2])

lines = test_file.read().split('\n')
lines.pop(0)
lines.pop(-1)

test_sentences = []
test_labels = []

for line in lines:
  tks = line.split('\t')
  test_sentences.append(tks[1])
  test_labels.append(tks[2])


In [None]:
total_num = 2000
max_length = 256
input_ids = np.zeros(shape=[total_num, max_length], dtype=np.int32)
attention_mask = np.zeros(shape=[total_num, max_length], dtype=np.int32)
labels = np.zeros(shape=[total_num, max_length], dtype=np.int32)

for i in range(total_num):
    instruction = f'''입력된 코멘트가 긍정인지 부정인지 예측하시오.

    input sentence:
    {train_sentences[i]}'''

    if train_labels[i] == '1':
      answer = '긍정적인 코멘트입니다.'
    else:
      answer = '부정적인 코멘트입니다.'

    messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": f"{instruction}"}
    ]

    source_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="np"
    )[0]
    target_ids = tokenizer(answer + tokenizer.eos_token, add_special_tokens=False, return_tensors='np')['input_ids'][0]

    if len(source_ids) + len(target_ids) >= max_length:
        continue

    input_ids[i, :len(source_ids)] = source_ids[:]
    input_ids[i, len(source_ids):len(source_ids) + len(target_ids)] = target_ids[:]
    attention_mask[i, :len(source_ids) + len(target_ids)] = [1] * (len(source_ids) + len(target_ids))
    labels[i, :len(source_ids)] = [-100] * (len(source_ids))
    labels[i, len(source_ids):len(source_ids) + len(target_ids)] = target_ids[:]

np.save('input_ids', input_ids[:])
np.save('attention_mask', attention_mask[:])
np.save('labels', labels[:])


In [None]:
from torch.utils.data import Dataset
from dataclasses import dataclass, field
import transformers
from typing import Optional, Dict, Sequence, List

IGNORE_INDEX = -100


In [None]:
class SupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning."""

    def __init__(self, tokenizer: transformers.PreTrainedTokenizer):
        super(SupervisedDataset, self).__init__()
        self.data_index = 0

        self.input_ids = np.load(f'input_ids.npy')
        self.attention_mask = np.load(f'attention_mask.npy')
        self.labels = np.load(f'labels.npy')

        self.i = 0

        print(len(self.input_ids))
        self.max_num = self.input_ids.shape[0]

    def __len__(self):
        return int(self.input_ids.shape[0])

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:

        input_ids = torch.tensor(self.input_ids[i], dtype=torch.long)
        attention_mask = torch.tensor(self.attention_mask[i], dtype=torch.long)
        labels = torch.tensor(self.labels[i], dtype=torch.long)
        labels[attention_mask == 0] = IGNORE_INDEX

        return dict(input_ids=input_ids, attention_mask=attention_mask, labels=labels)


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, attention_mask, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "attention_mask", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        attention_mask = torch.nn.utils.rnn.pad_sequence(attention_mask, batch_first=True,
                                                         padding_value=self.tokenizer.pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=attention_mask,
        )


def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_args) -> Dict:
    """Make dataset and collator for supervised fine-tuning."""
    train_dataset = SupervisedDataset(tokenizer=tokenizer)
    data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    print('dataset length:', len(train_dataset))
    return dict(train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)

@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    model_max_length: int = field(
        default=512,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )



In [None]:
training_args = TrainingArguments(
    'nsmc_phi',
    optim="paged_adamw_32bit",
    per_device_train_batch_size=64,
    gradient_accumulation_steps=2,
    logging_steps=1,
    save_strategy='steps',
    save_steps=100,
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=1e-1,
    warmup_steps=10,
    bf16=True,
    fp16=False,
    gradient_checkpointing=True,
    save_total_limit=3,
    log_level='debug',
)

data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=None)

trainer = Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)
trainer.train()
model.save_pretrained(training_args.output_dir)


2000
dataset length: 2000


Using auto half precision backend
Currently training with a batch size of: 2


[2024-09-10 09:55:34,942] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


***** Running training *****
  Num examples = 2,000
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 8
  Total optimization steps = 375
  Number of trainable parameters = 1,572,864
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,3.4746
2,3.4152
3,3.4577
4,3.5223
5,3.5501
6,3.467
7,3.4877
8,3.4631
9,3.4811
10,3.4475


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...

KeyboardInterrupt: 

In [None]:
model = model.merge_and_unload()

In [None]:
# 일부 데이터에 대해서만 평가(실행 시간을 줄이기 위함)
total_num = 200

cor = 0
count = 0

for i in range(total_num):
  instruction = f'''입력된 코멘트가 긍정인지 부정인지 예측하시오.

    input sentence:
    {test_sentences[i]}'''

  messages = [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": f"{instruction}"}
    ]

  input_ids = tokenizer.apply_chat_template(
      messages,
      add_generation_prompt=True,
      return_tensors="pt"
  ).to(model.device)

  terminators = [
    tokenizer.eos_token_id,
    tokenizer.convert_tokens_to_ids("<|endoftext|>")
  ]

  # generate에는 다양한 생성 옵션을 추가할 수 있음
  outputs = model.generate(
      input_ids,
      max_new_tokens=32,
      eos_token_id=terminators,
      do_sample=True,
      temperature=0.6,
      top_p=0.9,
  )

  prediction = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

  if prediction.find('긍정') != -1:
    if test_labels[i] == '1':
      cor += 1
  if prediction.find('부정') != -1:
    if test_labels[i] == '0':
      cor += 1
  count += 1
  print(prediction)

print(cor / count)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
  return fn(*args, **kwargs)


The given input sentence "굳 ㅋ" appears to be a combination of Korean characters and a popular emoticon "ㅋ" which
The input sentence "GDNTOPCLASSINTHECLUB" appears to be a positive statement. The use of the term "GDNTOP
The given input sentence can be analyzed as having both positive and negative elements, but overall, it leans more towards a negative sentiment.

Here'


KeyboardInterrupt: 