<a href="https://colab.research.google.com/github/jinseriouspark/polaris-llm/blob/main/gemma_finetuning_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Instruction Fine-tuning

- 참고자료 : https://github.com/tsdata/langchain-ollama/blob/main/006_gemma_peft_qlora_colab/gemma_finetuning_koalpaca.ipynb

In [1]:
# 데이터셋 다운로드
!pip install -U datasets==2.17.0



In [None]:
# Log in to the Hugging Face Hub from the CLI; the command prompts for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 

In [None]:
from datasets import load_dataset

# Fetch the ko-alpaca dataset from the Hugging Face Hub, then print its summary.
dataset = load_dataset(path="royboy0416/ko-alpaca")
print(dataset)

In [None]:
# Inspect one raw record (index 3) of the train split.
dataset['train'][3]

## Gemma dataset formatting

- 참고자료 https://ai.google.dev/gemma/docs/formatting?hl=ko

```
<start_of_turn>user
knock knock<end_of_turn>
<start_of_turn>model
who is there<end_of_turn>
<start_of_turn>user
Gemma<end_of_turn>
<start_of_turn>model
Gemma who?<end_of_turn>
```

In [None]:
def gemma_formatting(example):
  """Render one instruction record as a single Gemma chat-formatted string.

  Args:
    example: mapping with 'instruction', 'input' and 'output' text fields.
      'input' may be empty or None when the instruction needs no extra context.

  Returns:
    A dict with the single key 'prompt'. Its value is the user turn (the
    instruction, plus the optional input on its own line) followed by the
    model turn (the output).
  """
  # The optional input goes on its own line below the instruction.
  user_text = example['instruction']
  if example['input']:
    user_text = f"{user_text}\n{example['input']}"

  # Close every turn with <end_of_turn> directly after its text, the same way
  # in both the with-input and without-input cases.
  text = (
      f"<start_of_turn>user\n{user_text}<end_of_turn>\n"
      f"<start_of_turn>model\n{example['output']}<end_of_turn>"
  )
  return {'prompt': text}

# Apply gemma_formatting to every example; the returned 'prompt' field is added as a column.

dataset = dataset.map(gemma_formatting)

In [None]:
# Re-inspect the same record; it should now also carry the 'prompt' field.
dataset['train'][3]

## model load and tuning

1. model loading : gemma-7b load & instruction fine-tuning
2. evaluation : TODO — 정량 평가 방법 미정. 현재는 fine-tuning 전/후 응답을 직접 비교해 확인한다.

In [None]:
!pip install -qU transformers==4.38.0 accelerate==0.27.1 bitsandbytes==0.42.0 peft==0.8.2 trl==0.7.10

In [None]:
# Show the GPU(s) visible to this runtime and their current memory usage.
!nvidia-smi

In [None]:
import torch
import pandas as pd
import numpy as np
import warnings
import json
import time

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer

from huggingface_hub import notebook_login


In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook.
notebook_login()

In [None]:
 # Hugging Face Hub id of the checkpoint that the following cells load and fine-tune.
 model_id = 'google/gemma-7b-it'

In [None]:
# Tokenizer for the checkpoint, created with add_eos_token enabled.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

# 4-bit NF4 quantization settings with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized causal LM with every module mapped to device 0.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    quantization_config=bnb_config,
)

In [None]:
# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenize the formatted 'prompt' column in batches.
# The call must be on the `dataset` object; `datasets` is never bound to a name
# in this notebook (only `load_dataset` is imported from that package).
dataset = dataset.map(lambda samples: tokenizer(samples['prompt']),
                      batched = True)
# Hold out 20% of the train split for evaluation; the fixed seed keeps the
# split reproducible across runs.
dataset = dataset['train'].train_test_split(test_size = 0.2, seed = 42)
dataset

In [None]:
# Bind the two halves of the split to the names used by the trainer below.
train_data, test_data = dataset['train'], dataset['test']

In [None]:
# Print the first training record to check the formatted and tokenized fields.
print(train_data[0])

In [None]:
def get_completion(query: str, model, tokenizer, max_new_tokens: int = 256) -> str:
  """Generate a reply to `query` using the Gemma chat turn format.

  Args:
    query: user message; inserted verbatim, so braces in it are safe.
    model: causal LM exposing `.device` and `.generate(**inputs, ...)`.
    tokenizer: tokenizer matching `model`; called to encode the prompt and
      used to decode the generated ids.
    max_new_tokens: upper bound on the number of newly generated tokens.

  Returns:
    The decoded first generated sequence with special tokens skipped.
    The prompt tokens are part of that sequence.
  """
  # Build the prompt on explicit "\n" boundaries. An indented triple-quoted
  # template would leak source indentation into the prompt and put
  # "<start_of_turn>" and "model" on different lines.
  prompt = f"<start_of_turn>user\n{query}<end_of_turn>\n<start_of_turn>model\n"
  encoded = tokenizer(prompt, return_tensors = 'pt', add_special_tokens = True)
  # Send the inputs to whichever device the model lives on.
  model_inputs = encoded.to(model.device)
  generated_ids = model.generate(**model_inputs, max_new_tokens = max_new_tokens)
  return tokenizer.decode(generated_ids[0], skip_special_tokens = True)

In [None]:
# Baseline: response of the base model before any fine-tuning.
result = get_completion("회사 때려치고 싶어. 그만둘까?", model, tokenizer)
print(result)

In [None]:
# Release cached, unused GPU memory before training starts.
torch.cuda.empty_cache()

# LoRA adapter settings: rank 32, 5% dropout, applied to the listed projection modules.
lora_config = LoraConfig(
    r = 32,
    target_modules = ['o_proj','q_proj','up_proj','v_proj','k_proj','down_proj','gate_proj'],
    lora_dropout = 0.05,
    task_type = 'CAUSAL_LM',
    )
# Wrap the loaded model with the LoRA adapters.
model = get_peft_model(model, lora_config)

# Supervised fine-tuning on the 'prompt' text column.
trainer = SFTTrainer(
    model = model,
    train_dataset = train_data,
    eval_dataset = test_data,
    dataset_text_field = 'prompt',
    peft_config = lora_config,
    args = TrainingArguments(
        per_device_train_batch_size = 1,
        # 1 sample per step x 4 accumulation steps -> effective batch of 4 per device.
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 100,
        learning_rate = 2e-4,
        # NOTE(review): fp16 is used here while the 4-bit compute dtype set at
        # model load is bfloat16 — confirm this combination is intended.
        fp16 = True,
        logging_steps = 10,
        output_dir = 'outputs',
        optim = 'paged_adamw_8bit'
    ),
    # mlm=False selects causal (next-token) language-modeling labels.
    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm = False),
)

trainer.train()

In [None]:
# After fine-tuning: same prompt as the baseline cell, for a before/after comparison.
result = get_completion("회사 때려치고 싶어. 그만둘까?", model, tokenizer)
print(result)

In [None]:
# After fine-tuning, second prompt.
result2 = get_completion("나는 스타트업을 할거야. 말리지마", model, tokenizer)
print(result2)

# Save the trainer's model to a local directory named by `new_model`.
new_model = 'gemma-7b-it-koalpaca-fuintuned'
trainer.model.save_pretrained(new_model)