In [1]:
pip install transformers datasets bitsandbytes peft accelerate scipy

You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import torch

from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, DataCollatorForLanguageModeling

In [3]:
dataset = load_dataset('bogeumkim/emotion_cls')
dataset

DatasetDict({
    train: Dataset({
        features: ['inputs', 'labels'],
        num_rows: 19670
    })
})

In [4]:
dataset = dataset.map(
    lambda x: {'text': f"### 질문: {x['inputs']}\n\n### 감정: {x['labels']}<|endoftext|>" }
)

In [5]:
model_id = "EleutherAI/polyglot-ko-1.3b"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
dataset = dataset.map(lambda samples: tokenizer(samples["text"], padding=True, truncation=True, max_length=128), batched=True, remove_columns=['inputs', 'labels'])

In [7]:
dataset['train'][25]['text']

'### 질문: 우울한 상태에서 나도 모르게 화를 내버려\n\n### 감정: 우울/침울<|endoftext|>'

In [8]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["query_key_value"], 
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, config)
print_trainable_parameters(model)

trainable params: 1572864 || all params: 729403392 || trainable%: 0.21563705588032142


In [11]:
dataset['train']

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 19670
})

In [12]:
tokenizer.pad_token = tokenizer.eos_token

trainer = Trainer(
    model=model,
    train_dataset=dataset["train"],
    args=TrainingArguments(
        per_device_train_batch_size=16,
        gradient_accumulation_steps=1,
        #max_steps=50, ## 초소량만 학습: 50 step만 학습. 약 4분정도 걸립니다.
        learning_rate=3e-4,
        num_train_epochs=3,
        fp16=True,
        logging_steps=300,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [13]:
trainer.train()

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
300,1.6528
600,1.4158
900,1.3901
1200,1.369
1500,1.3207
1800,1.316
2100,1.2984
2400,1.3038
2700,1.2624
3000,1.24


TrainOutput(global_step=3690, training_loss=1.3344784206814235, metrics={'train_runtime': 5003.7889, 'train_samples_per_second': 11.793, 'train_steps_per_second': 0.737, 'total_flos': 1.4809892170235904e+16, 'train_loss': 1.3344784206814235, 'epoch': 3.0})

In [14]:
model.eval()
model.config.use_cache = True

In [28]:
def gen(x):
    gened = model.generate(
        **tokenizer(
            f"### 질문: {x}\n\n### 감정:", 
            return_tensors='pt', 
            return_token_type_ids=False
        ), 
        max_new_tokens=3,
        temperature=0.001,
        do_sample=True,
        eos_token_id=2,
    )
    print(tokenizer.decode(gened[0]))

In [29]:
gen('회사에서 중요한 프로젝트를 혼자 하게 됐는데 솔직히 두렵고 무서워.')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


### 질문: 회사에서 중요한 프로젝트를 혼자 하게 됐는데 솔직히 두렵고 무서워.

### 감정: 불안/걱정


In [30]:
gen('친구들이 요즘 나를 따돌려. 내가 혼자라는 게 너무 슬퍼.')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


### 질문: 친구들이 요즘 나를 따돌려. 내가 혼자라는 게 너무 슬퍼.

### 감정: 고독감/


In [31]:
gen('선생님은 그래도 나를 이해해주실 줄 알았는데 정말 실망했어.')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


### 질문: 선생님은 그래도 나를 이해해주실 줄 알았는데 정말 실망했어.

### 감정: 배신감/분노


In [36]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
model.push_to_hub('bogeumkim/polyglot-1.3b-qlora-emotion-classification')

CommitInfo(commit_url='https://huggingface.co/bogeumkim/polyglot-1.3b-qlora-emotion-classification/commit/cf1730ab7935928131c895e3f574e172e6a4ed88', commit_message='Upload model', commit_description='', oid='cf1730ab7935928131c895e3f574e172e6a4ed88', pr_url=None, pr_revision=None, pr_num=None)