In [None]:
# import
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from datasets import load_dataset
import transformers
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, pipeline
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead
from copy import deepcopy
from transformers import AutoTokenizer
import pandas as pd
import argparse
import copy
import logging
import json
from dataclasses import dataclass, field

## Accelerate
from accelerate import Accelerator

# About tqdm: https://github.com/tqdm/tqdm/#ipython-jupyter-integration
from tqdm.auto import tqdm, trange

# HuggingFace peft 라이브러리
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType
from transformers import AdamW


import numpy as np
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
    """Write the trainer's model weights to ``output_dir`` as CPU tensors.

    The state dict is always fetched from ``trainer.model``. It is copied to
    CPU and handed to ``trainer._save`` only when ``trainer.args.should_save``
    is truthy; otherwise nothing is written.

    Args:
        trainer: Trainer whose ``model``, ``args.should_save`` and ``_save``
            are used.
        output_dir: Directory forwarded to ``trainer._save``.
    """
    weights = trainer.model.state_dict()
    if not trainer.args.should_save:
        return

    # Move every entry to CPU before saving.
    weights_on_cpu = {}
    for name, tensor in list(weights.items()):
        weights_on_cpu[name] = tensor.cpu()
    del weights  # drop the reference to the original mapping before saving
    trainer._save(output_dir, state_dict=weights_on_cpu)  # noqa

In [None]:
def set_seed(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), and switches cuDNN to deterministic mode so
    repeated runs produce the same results.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    # Local import: `random` is not part of the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one (the model is loaded with
    # device_map='auto'). This is a silent no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. The interpreter reads PYTHONHASHSEED
    # at start-up, so this only takes effect in processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
## Set Seed
# CONFIG is only created in a later cell, so reading CONFIG['seed'] here raised
# NameError on a fresh kernel (Restart & Run All). Seed from a local constant
# instead; keep it equal to CONFIG['seed'].
SEED = 42
set_seed(SEED)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


## Model preparation
model_id = "EleutherAI/polyglot-ko-12.8b"  # original note: safetensors-converted repo

# 4-bit NF4 quantization with double quantization and bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized causal LM; device placement is left to device_map='auto'.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map='auto',
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(model_id, model_max_length=512, padding_side="left")

# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
print(tokenizer)



In [None]:
# data config
IGNORE_INDEX = -100  # written over prompt positions in labels and used as label padding
DEFAULT_EOS_TOKEN = "</s>"

# Preamble shared verbatim by both prompt templates (ends with a newline).
_PROMPT_PREAMBLE = (
    "아래는 작업을 설명하는 명령어와 추가 컨텍스트를 제공하는 입력이 짝을 이루는 예제입니다."
    "요청을 적절히 완료하는 응답을 작성하세요.\n"
)

# Templates filled with str.format_map: "prompt_input" expects the keys
# `instruction` and `input`, "prompt_no_input" only `instruction`.
PROMPT_DICT = {
    "prompt_input": _PROMPT_PREAMBLE
    + " ### 명령어:'''{instruction}'''\n###  입력:'''{input}'''\n\n### 응답:",
    "prompt_no_input": _PROMPT_PREAMBLE
    + " ### 명령어:'''{instruction}'''\n\n### 응답:",
}

In [None]:
## prepare data
from typing import Optional, Dict, Sequence

class SFT_dataset(Dataset):
    """Supervised fine-tuning (SFT) dataset; original implementation credited to wygo.

    Every record of a JSON file is rendered into a prompt ("source") with the
    templates in PROMPT_DICT and paired with its ``output`` text followed by
    the tokenizer's EOS token ("target"). The model input is the tokenized
    ``source + target``; ``labels`` is a copy of it in which the source
    positions are overwritten with IGNORE_INDEX.

    Attributes:
        input_ids: List of 1-D token-id tensors, one per record.
        labels: List of 1-D tensors aligned with ``input_ids``.
    """

    def __init__(self, data_path_1_SFT: str, tokenizer: transformers.PreTrainedTokenizer, verbose=False):
        """Load, render and tokenize every record found in ``data_path_1_SFT``.

        Args:
            data_path_1_SFT: Path to a JSON file containing a list of dicts.
                Each dict must provide ``output`` and whatever keys the chosen
                prompt template formats (``instruction`` and, when present and
                non-empty, ``input``).
            tokenizer: Tokenizer used for encoding; its ``eos_token`` is
                appended to every target.
            verbose: When true, print the first record and the first
                source/target pair.

        Raises:
            KeyError: If a record lacks ``output`` or a key required by the
                prompt template.
        """
        super(SFT_dataset, self).__init__()
        logging.warning("Loading data...")

        # Record keys read from each example.
        pattern_input = 'input'
        pattern_output = 'output'

        ############################################################
        ## load dataset (utf-8-sig also accepts files starting with a BOM)
        with open(data_path_1_SFT, "r", encoding='utf-8-sig') as json_file:
            list_data_dict = json.load(json_file)
        # Guard on non-empty data so verbose mode cannot fail on an empty file.
        if verbose and list_data_dict:
            print('## data check ##')
            print((list_data_dict[0]))

        ############################################################
        ## build the prompt (source) and answer (target) strings
        prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]

        sources = []
        for example in list_data_dict:
            # A missing, empty or null `input` selects the template without an
            # input section (a null used to be rendered as the text "None").
            has_input = example.get(pattern_input) not in ("", None)
            template = prompt_input if has_input else prompt_no_input
            sources.append(template.format_map(example))

        targets = [f"{example[pattern_output]}{tokenizer.eos_token}" for example in list_data_dict]

        if verbose:
            if sources:
                print(("source:", sources[0]))
                print(("target:", targets[0]))
            print("Tokenizing inputs... This may take some time...")

        ############################################################
        # Same scheme as KoAlpaca's preprocess():
        # https://github.com/Beomi/KoAlpaca/blob/04704348d58b8b1c2e2638d6437a04b4e8ba1823/train.py#L124
        examples = [s + t for s, t in zip(sources, targets)]

        sources_tokenized = self._tokenize_fn(sources, tokenizer)  # source only, for its length
        examples_tokenized = self._tokenize_fn(examples, tokenizer)  # source + target

        # The input is source + target, but only the target part keeps real labels.
        input_ids = examples_tokenized["input_ids"]
        labels = copy.deepcopy(input_ids)
        for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
            label[:source_len] = IGNORE_INDEX  # overwrite the source part

        self.input_ids = input_ids
        self.labels = labels
        logging.warning("Loading data done!!: %d"%(len(self.labels)))

    def _tokenize_fn(self, strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
        """Tokenize each string on its own and report the token count of each.

        Args:
            strings: Texts to encode, one tokenizer call per text.
            tokenizer: Callable tokenizer; texts are truncated to
                ``tokenizer.model_max_length``.

        Returns:
            Dict with ``input_ids``/``labels`` (the same list of 1-D tensors)
            and ``input_ids_lens``/``labels_lens`` (the same list of ints).
        """
        tokenized_list = [
            tokenizer(
                text,
                return_tensors="pt",
                padding="longest",
                max_length=tokenizer.model_max_length,
                truncation=True,
            )
            for text in strings
        ]
        input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
        # Each text is encoded alone, so "longest" padding adds no pad tokens
        # and the length is simply the number of ids. Counting ids that differ
        # from pad_token_id undercounts whenever pad == eos and the text
        # contains EOS, and fails when the tokenizer has no pad token.
        input_ids_lens = labels_lens = [int(ids.shape[0]) for ids in input_ids]
        return dict(
            input_ids=input_ids,
            labels=labels,
            input_ids_lens=input_ids_lens,
            labels_lens=labels_lens,
        )

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return the ``input_ids`` and ``labels`` tensors of example ``i``."""
        return dict(input_ids=self.input_ids[i], labels=self.labels[i])


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Right-pads ``input_ids`` with the tokenizer's pad id and ``labels`` with
    IGNORE_INDEX, and builds a boolean attention mask that is True on every
    real (non-padding) position.
    """

    # Only `pad_token_id` is read from the tokenizer.
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into one batch.

        Args:
            instances: Examples, each a dict with 1-D ``input_ids`` and
                ``labels`` tensors.

        Returns:
            Dict with ``input_ids``, ``labels`` and ``attention_mask``, each of
            shape (batch, longest sequence in the batch).
        """
        sequences = [instance["input_ids"] for instance in instances]
        label_sequences = [instance["labels"] for instance in instances]

        input_ids = torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(label_sequences, batch_first=True, padding_value=IGNORE_INDEX)

        # Build the mask from the true sequence lengths. Comparing against
        # pad_token_id would also mask real EOS tokens when pad == eos.
        lengths = torch.tensor([seq.shape[0] for seq in sequences], device=input_ids.device)
        positions = torch.arange(input_ids.shape[1], device=input_ids.device)
        attention_mask = positions.unsqueeze(0) < lengths.unsqueeze(1)

        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=attention_mask,
        )

    

# Build the train and eval datasets (in that order) and the batch collator.
train_dataset, eval_dataset = (
    SFT_dataset(data_path_1_SFT=json_path, tokenizer=tokenizer, verbose=True)
    for json_path in ("SFT_all_train.json", 'SFT_all_eval.json')
)
data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)

# Sanity check on the first training example, then the dataset sizes.
first_input_ids = train_dataset.input_ids[0]
first_labels = train_dataset.labels[0]
print('input : %s' % first_input_ids)
print('output: %s' % first_labels)
print(tokenizer.decode(first_input_ids))
print(len(train_dataset), len(eval_dataset))

# Model

In [None]:
# QLoRA
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then pass the quantized model through peft's
# k-bit training preparation. `model` is rebound here, so later cells depend on
# this cell having run exactly once after the model was loaded.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# LoRA adapter settings: rank-16 adapters on the "query_key_value" modules.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["query_key_value"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)


In [None]:
# Attach the adapters described by lora_config to the prepared model and print
# the trainable-parameter summary. `model` is rebound to the peft model that
# the Trainer receives later.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:

# Run-wide settings. In the visible cells, TrainingArguments reads
# 'learning_rate', 'weight_decay', 'n_accumulate', 'max_grad_norm' and 'seed';
# the whole dict is also passed to wandb.init as the run config.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"

CONFIG = {
    # model and paths
    "model": "EleutherAI/polyglot-ko-12.8b",  # alternative noted by the author: 'gogamza/kobart-base-v1'
    "model_save": "output/polyglot-ko-12.8b",
    "base_path": "./",
    # optimisation and experiment bookkeeping
    "learning_rate": 2e-4,
    "seed": 42,
    "try": "injection",
    "ratio": 0.95,
    "n_sentences": 10,
    # batching and sequence lengths
    "train_batch_size": 1,
    "valid_batch_size": 1,
    "max_length": 512,
    "target_max_length": 100,
    # scheduler, regularisation and clipping
    "scheduler": "CosineAnnealingLR",
    "min_lr": 1e-6,
    "max_grad_norm": 1000,
    "T_max": 500,
    "weight_decay": 1e-6,
    "n_accumulate": 2,
    "grad_clipping": True,
    # runtime device
    "device": torch.device(_device_name),
}



In [None]:
## 학습 (10min)
import gc
from transformers import EarlyStoppingCallback

# Release unreferenced Python objects and cached CUDA memory before training.
gc.collect()
torch.cuda.empty_cache()
# training_args can be tuned; see https://github.com/Beomi/KoAlpaca/blob/main/train.sh

training_args = TrainingArguments(
    output_dir="./output/SFT_all",  # the output directory
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=2,  # number of training epochs
    per_device_train_batch_size=4,  # batch size for training
    per_device_eval_batch_size=4,  # batch size for evaluation
    learning_rate=CONFIG['learning_rate'],
    weight_decay=CONFIG['weight_decay'],
    gradient_accumulation_steps=CONFIG['n_accumulate'],
    max_grad_norm=CONFIG['max_grad_norm'],
    seed=CONFIG['seed'],
    load_best_model_at_end=True,
    # The best checkpoint is chosen by evaluation loss, where LOWER is better.
    # The previous greater_is_better=True made the Trainer restore the
    # checkpoint with the highest eval loss.
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    do_eval=True,
    evaluation_strategy='epoch',
    save_total_limit=1,
    report_to="wandb",
    save_strategy="epoch",
    optim="adafactor",
    # NOTE(review): the 4-bit compute dtype is bfloat16 while mixed precision
    # here is fp16 -- confirm this combination is intended.
    fp16=True,
    # resume_from_checkpoint was dropped: Trainer does not read it from
    # TrainingArguments. To resume, call trainer.train(resume_from_checkpoint=True).
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

# Disable the generation cache for training; it is switched back on after training.
model.config.use_cache = False

In [None]:
import wandb
# Start the Weights & Biases run; the CONFIG dict is stored as the run config.
run = wandb.init(
    project='polyglot-all',
    name="ngram",
    job_type='Train',
    config=CONFIG,
    anonymous='must',
)

In [None]:
# Fine-tune the model.
trainer.train()
## eval
# Keep and show the evaluation metrics instead of discarding them.
eval_metrics = trainer.evaluate()
print(eval_metrics)
# Training is done: turn the generation cache back on for inference.
model.config.use_cache = True

In [None]:
# Save the trained model and the tokenizer side by side, then close the wandb run.
save_dir = './output/SFT_aihub'
trainer.model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
run.finish()