## 모델 파인튜닝

### 데이터셋

In [None]:
# Print the notebook's current working directory; the relative paths used in
# the following cells (../input/..., ./datasets) are resolved against it.
!pwd

In [None]:
# List the contents of the attached input dataset directory before copying from it.
!ls -la ../input/test-train-datasets/datasets

In [None]:
!mkdir datasets

In [None]:
cp ../input/test-train-datasets/datasets/train.csv ./datasets

## 모델 관련

In [None]:
# Install the pinned autotrain-advanced release that provides the `autotrain`
# CLI used in the training cell; -qqq suppresses pip's output.
!pip install autotrain-advanced==0.7.77 -qqq

### 기존 모델을 새로운 데이터셋으로 튜닝

In [None]:
base_model = 'beomi/Yi-Ko-6B' # original (base) model hosted on Hugging Face
finetuned_model = 'yi-ko-6b-text2sql' # name of the model after fine-tuning

# Fine-tune the base model with the `autotrain llm` CLI. IPython substitutes
# {base_model} and {finetuned_model} with the Python variables above before
# the command is handed to the shell.
#
# Main options (list kept from the original notes):
# --model {base_model} \
# --project-name {finetuned_model} \
# --data-path
# --text-column text \
# (the command below actually passes `--data-path datasets/` and
#  `--text-column prompt`)

# NOTE(review): the original note labelled --use-peft as "the quantization
# option"; in the command below quantization is selected separately via
# `--quantization int4` — confirm against the autotrain CLI docs for 0.7.77.
!autotrain llm \
--train \
--model {base_model} \
--project-name {finetuned_model} \
--data-path datasets/ \
--text-column prompt \
--lr 2e-4 \
--batch-size 2 \
--epochs 1 \
--block-size 1024 \
--warmup-ratio 0.1 \
--lora-r 16 \
--lora-alpha 32 \
--lora-dropout 0.05 \
--weight-decay 0.01 \
--gradient-accumulation 8 \
--mixed-precision fp16 \
--use-peft \
--quantization int4 \
--trainer sft

### 기존 참조 모델과 튜닝 모델 머지

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, PeftModel

# Hub id of the base checkpoint and the name under which the fine-tuned
# adapter is loaded below.
base_model = 'beomi/Yi-Ko-6B'
finetuned_model = 'yi-ko-6b-text2sql'

# Keep the checkpoint id in its own variable: `base_model` is rebound to the
# loaded model object below, but the id is still needed for the tokenizer.
# (Previously `model_name` was never assigned, so this cell raised NameError
# on a fresh kernel.)
model_name = base_model

# device_map = {"": 0}
device_map = 'auto'

# Load the base model in fp16, then merge the LoRA adapter weights into it.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, finetuned_model)
# merge_and_unload() returns the base model with the adapter weights merged in.
model = model.merge_and_unload()

# Tokenizer setup: load the base checkpoint's tokenizer, reuse EOS as the
# padding token and pad on the right.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

model-00001-of-00005.safetensors:   0%|          | 0.00/2.97G [00:00<?, ?B/s]

model-00002-of-00005.safetensors:   0%|          | 0.00/2.93G [00:00<?, ?B/s]

model-00003-of-00005.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

model-00004-of-00005.safetensors:   0%|          | 0.00/2.86G [00:00<?, ?B/s]

model-00005-of-00005.safetensors:   0%|          | 0.00/643M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/9.51k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.28M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

In [3]:
import os


In [4]:
# Upload the fine-tuned model to the Hugging Face Hub under the repo name
# held in `finetuned_model`.
# NOTE(review): use_temp_dir=False presumably makes transformers save the files
# into a local directory named like the repo instead of a temporary one —
# confirm against the push_to_hub documentation.
model.push_to_hub(finetuned_model, use_temp_dir=False)

model-00001-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shim-jh/yi-ko-6b-text2sql/commit/b03734a2c6c9dca5e69b4da3a8b485e01dacc92e', commit_message='Upload LlamaForCausalLM', commit_description='', oid='b03734a2c6c9dca5e69b4da3a8b485e01dacc92e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shim-jh/yi-ko-6b-text2sql', endpoint='https://huggingface.co', repo_type='model', repo_id='shim-jh/yi-ko-6b-text2sql'), pr_revision=None, pr_num=None)

In [5]:
# Upload the tokenizer that goes with the fine-tuned model to the same
# Hugging Face Hub repo name (`finetuned_model`).
# NOTE(review): use_temp_dir=False presumably saves into a local directory
# named like the repo rather than a temporary one — confirm against the docs.
tokenizer.push_to_hub(finetuned_model, use_temp_dir=False)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shim-jh/yi-ko-6b-text2sql/commit/627fb94b39d8ec9f72cd0a20789c72a4fb0be181', commit_message='Upload tokenizer', commit_description='', oid='627fb94b39d8ec9f72cd0a20789c72a4fb0be181', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shim-jh/yi-ko-6b-text2sql', endpoint='https://huggingface.co', repo_type='model', repo_id='shim-jh/yi-ko-6b-text2sql'), pr_revision=None, pr_num=None)