In [None]:
#====================================================================================================
# LoRA 파인튜닝 예제1
# - bigscience/bloom-3b 모델을 이용하여 PEFT(Parameter-Efficient Fine-tuning)->LoRA(Low-Rank Adaptation) 기법으로 파인튜닝하는 예시
# - 설명 : https://medium.com/@jeremyarancio/fine-tune-an-llm-on-your-personal-data-create-a-the-lord-of-the-rings-storyteller-6826dd614fa9
#
# 참조 DOC : https://github.com/jeremyarancio/llm-rpg/tree/main
# 참고소스: https://github.com/jeremyarancio/llm-rpg/blob/main/llm/training.py
#
# 참고소스: https://github.com/Beomi/KoAlpaca
# Huggingface
# 참조 DOC: https://huggingface.co/docs/peft/index
# 참조소스: https://github.com/huggingface/peft
#
# package 설치 
# peft: pip install peft
# load_in_8bit : pip install -i https://test.pypi.org/simple/ bitsandbytes-cudaXXX  (XXX는 CUDA version (e.g. 11.6 = 116))
# transformers 4.27.1 이상으로 업데이트 : pip install -U transformers[pytorch]
# dispatch_model() got an unexpected keyword argument 'offload_index' 오류 => accelerate 업데이트 : pip install -U accelerate
# module 'bitsandbytes.nn' has no attribute 'Linear8bitLt' => pip install bitsandbytes==0.37.2
# 'MatmulLtState' object has no attribute 'memory_efficient_backward' 오류 => bitsandbytes 버전 0.37.2 설치 : pip install bitsandbytes==0.37.2
#====================================================================================================

import torch
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler 
import numpy as np
import pandas as pd
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast, AutoModelForCausalLM, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

from tqdm.notebook import tqdm
import os
import time
from myutils import GPU_info, seed_everything, mlogging, SaveBERTModel, AccuracyForMLM

# Input: the model identifier / path handed to from_pretrained() further down
#model_path='../data11/model/LLM/bigscience/bloomz-7b1-mt/'
model_path='beomi/KoAlpaca'

# GPU_info() comes from the project-local myutils module; its return value is
# used later in this file as the target of `.to(device)` for input tensors.
device = GPU_info()
print(device)

# Set the random seed
seed_everything(111)

# Set up logging
logger =  mlogging(loggername="KoAlpaca", logfilename="../log/KoAlpaca")

In [None]:
# Load the tokenizer and the causal-LM model from model_path
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

In [None]:
# Save the model
### Save the full model
TMP_OUT_PATH = '../data11/model/LLM/beomi/KoAlpaca/'
os.makedirs(TMP_OUT_PATH, exist_ok=True)
#torch.save(model, OUTPATH + 'pytorch_model.bin') 
# Saving with save_pretrained produces two files: config.json and pytorch_model.bin
model.save_pretrained(TMP_OUT_PATH)

# Save the tokenizer files (vocab) into the same directory as the model
VOCAB_PATH = TMP_OUT_PATH
tokenizer.save_pretrained(VOCAB_PATH)
print(f'==> save_model : {TMP_OUT_PATH}')

In [None]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# 모델 정의 하고, embedding size를 tokenizer 사이즈만큼 조정
from transformers import AutoModelForCausalLM
from torch import float32, nn, exp


# 참고 소스 : https://github.com/jeremyarancio/llm-rpg/blob/main/llm/training_utils.py
class CastOutputToFloat(nn.Sequential):
    """Sequential container whose forward output is converted to float32."""

    def forward(self, x):
        # Run the wrapped modules as usual, then cast the result to fp32.
        result = super().forward(x)
        return result.to(float32)

    
def prepare_model(model):
    """Freeze a model and apply the tweaks used before attaching adapters.

    Every parameter gets ``requires_grad = False``; 1-D parameters are
    additionally converted to float32. Gradient checkpointing and input
    gradients are then enabled on the model, and ``model.lm_head`` is wrapped
    in ``CastOutputToFloat``.

    Args:
        model: Model object exposing ``parameters()``,
            ``gradient_checkpointing_enable()``,
            ``enable_input_require_grads()`` and an ``lm_head`` attribute.

    Returns:
        The same ``model`` object, modified in place.
    """
    for weight in model.parameters():
        # Freeze the base model - the adapters are trained later.
        weight.requires_grad = False
        if weight.ndim != 1:
            continue
        # Small 1-D parameters (e.g. layernorm) go to fp32 for stability.
        weight.data = weight.data.to(float32)

    # Reduce the number of stored activations.
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()

    wrapped_head = CastOutputToFloat(model.lm_head)
    model.lm_head = wrapped_head
    return model


def print_trainable_parameters(model):
    """
    Build a summary of the trainable vs. total parameter counts of a model.

    Despite its name, this function does not print anything; it returns the
    summary string so the caller can print, log or display it.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, param)`` pairs, where each ``param`` provides ``numel()``
            and a ``requires_grad`` flag.

    Returns:
        str: ``"trainable params: N || all params: M || trainable%: P"``,
        where ``P`` is ``100 * N / M`` (``0.0`` when the model has no
        parameters at all).
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    return f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"

# NOTE: the triple-quoted block below is a bare string literal holding disabled
# code kept for reference; it is not assigned to anything.
'''
#model = GPT2LMHeadModel.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
  model_path, revision='KoGPT6B-ryan1.5b-float16',  # or float32 version: revision=KoGPT6B-ryan1.5b
  pad_token_id=tokenizer.eos_token_id,
  torch_dtype='auto', low_cpu_mem_usage=True
).to(device=device, non_blocking=True)

model.resize_token_embeddings(len(tokenizer))

model.to(device)
'''

# Load the model (requested with load_in_8bit=True and device_map="auto")
model = AutoModelForCausalLM.from_pretrained(model_path, return_dict=True, load_in_8bit=True, device_map="auto")

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the model's repr.
model

In [None]:
# Hook the model up to LoRA
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType

# Freeze the base weights and apply the pre-training tweaks from prepare_model().
model = prepare_model(model)

# LoRA hyper-parameters handed to peft.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # set this for CLM or Seq2Seq
)

#peft_config = LoraConfig(task_type=TaskType.CAUSAL_LM, inference_mode=False, r=16, lora_alpha=32, lora_dropout=0.1)

# Wrap the prepared model with the LoRA configuration.
model = get_peft_model(model, lora_config)

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the wrapped model's repr.
model

In [None]:
# The function returns the summary string rather than printing it, so the
# value is only visible as this notebook cell's output.
print_trainable_parameters(model)

In [None]:
# Build the dataset.
# => every line break becomes '.' + the tokenizer's EOS token
#    (the original note shows the result as '.</s>').
# NOTE: this import rebinds the name `Dataset`, which was also imported from
# torch.utils.data at the top of the file.
from datasets import Dataset
#corpus_path = "../data11/korpora/kowiki_20190620/wiki_20190620_small.txt"
corpus_path = "../data11/ai_hub/vs1/IT.과학.txt"

# Explicit encoding so the Korean corpus decodes the same way regardless of
# the platform's locale default.
# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
with open(corpus_path, 'r', encoding='utf-8') as f:
    # Keep only lines longer than one character (this drops lines that are
    # just '\n'); join once instead of growing a string with += in a loop.
    texts = "".join(line for line in f if len(line) > 1)

# Replace each '\n' (newline) with '.' + EOS; the corpus becomes one long string.
texts = texts.replace("\n", "." + tokenizer.eos_token)

# Create the dataset: a single row holding the whole corpus in the 'text' column.
dataset = Dataset.from_dict({'text': [texts]})

In [None]:
from typing import Callable, Mapping

# tokenizer 처리 함수
def tokenize(element: Mapping, tokenizer: Callable, context_length: int) -> dict:
    """Tokenize a batch of texts and keep only the full-length chunks.

    Args:
        element: Batch mapping; its ``'text'`` entry is passed to ``tokenizer``.
        tokenizer: Callable invoked with ``truncation=True``,
            ``return_overflowing_tokens=True``, ``return_length=True`` and
            ``max_length=context_length``. Its result must provide the
            ``'length'`` and ``'input_ids'`` entries.
        context_length: Maximum chunk length requested from the tokenizer, and
            the exact length a chunk must have to be kept.

    Returns:
        dict: ``{"input_ids": [...]}`` holding the ``input_ids`` of every chunk
        whose reported length equals ``context_length``. Shorter chunks are
        dropped.
    """
    encoded = tokenizer(
        element['text'],
        truncation=True,
        return_overflowing_tokens=True,
        return_length=True,
        max_length=context_length,
    )
    # Drop chunks that are shorter than context_length.
    full_chunks = [
        input_ids
        for length, input_ids in zip(encoded['length'], encoded['input_ids'])
        if length == context_length
    ]
    return {"input_ids": full_chunks}

# Build the tokenized dataset
context_length = 2048  # sequence length (set to the maximum length the model can accept)
test_size = 0.001
# The original columns are removed, so only what tokenize() returns remains.
tokenized_dataset = dataset.map(tokenize, batched=True, fn_kwargs={'tokenizer': tokenizer, 'context_length': context_length},
                                         remove_columns=dataset.column_names)

# Shuffle and split into train / test parts.
tokenized_dataset_dict = tokenized_dataset.train_test_split(test_size=test_size, shuffle=True)

print(f'len(tokenized_dataset):{len(tokenized_dataset)}')
# NOTE(review): this len() is taken on the split result, so it presumably
# counts the splits rather than the rows - confirm this is the intended figure.
print(f'len(tokenized_dataset_dict):{len(tokenized_dataset_dict)}')


In [None]:
# Set the training parameters
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
output_dir = './output'

training_args = TrainingArguments(
    per_device_train_batch_size=1,
    num_train_epochs=3,
    gradient_accumulation_steps=1,
    warmup_steps=100,
    weight_decay=0.1,
    learning_rate=3e-4, 
    fp16=True,
    logging_steps=1000,    # number of steps between log outputs during training (around 10000 for large runs, around 100 for small ones)
    output_dir=output_dir
)

# Only the 'train' split is given to the Trainer; no evaluation dataset is passed.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset_dict['train'],
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

In [None]:
# Start training
# use_cache is switched off for the duration of training and back on afterwards.
model.config.use_cache = False  # silence warnings
trainer.train()
model.config.use_cache = True

In [None]:
# Save after training has finished
import os

OUT_PATH = '../data11/model/LLM/beomi/KoAlpaca-Polyglot-5.8B-LoRA-qa-1/'
os.makedirs(OUT_PATH, exist_ok=True) # create the folder

# Writes adapter_config.json and the adapter weights file (the original note
# says adapter_config.bin; presumably adapter_model.bin, the file handled below).
model.save_pretrained(OUT_PATH)
 
# Actual model files (pytorch_model.bin, training_args.bin)
# NOTE(review): the two inline notes below are translated from the original -
# verify them against the Trainer.save_state / Trainer.save_model documentation.
trainer.save_state()  # (original note) saves all of the model's weights, learning rate, algorithm, etc.
trainer.save_model(output_dir=OUT_PATH) # (original note) saves only the model weights

# Rename pytorch_model.bin to adapter_model.bin.
# => (original note) required so the result can be loaded through huggingface.
# Any existing adapter_model.bin in OUT_PATH is deleted first.
if os.path.exists(os.path.join(OUT_PATH, 'adapter_model.bin')):
    os.remove(os.path.join(OUT_PATH, 'adapter_model.bin'))
    print(f'remove adapter_model.bin')

# If pytorch_model.bin is absent, nothing is renamed.
if os.path.exists(os.path.join(OUT_PATH, 'pytorch_model.bin')):
    os.rename(os.path.join(OUT_PATH, 'pytorch_model.bin'), os.path.join(OUT_PATH, 'adapter_model.bin'))
    print(f'rename pytorch_model.bin')

In [None]:
# 여기서부터는 저장된 모델 평가하는 코드임.

In [None]:
# 여기서부터는 저장된 모델 평가하는 코드임.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

# Import the model
# NOTE(review): this adapter directory differs from the OUT_PATH written by the
# save cell earlier in this file - confirm which adapter should be evaluated.
hf_repo = '../data11/model/LLM/quantumaikr/KoreanLM_LoRA_1/'

config = PeftConfig.from_pretrained(hf_repo)
# Override the base-model location read from the adapter config with a local directory.
config.base_model_name_or_path = '../data11/model/LLM/quantumaikr/KoreanLM/'

# Base model and tokenizer both come from the overridden base-model path.
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the LoRA model
model = PeftModel.from_pretrained(model, hf_repo)

model.eval()

In [None]:
from myutils import GPU_info
device = GPU_info()

# NOTE: the triple-quoted block below is a bare string literal (an alternative
# context-plus-question prompt kept for reference); it is not assigned to anything.
'''
수학은 기원전 600년 경에 살았던 탈레스로부터 시작됐다.
하지만 탈레스가 태어나기 전에도 수학을 연구한 사람이 있을 수도 있기 때문에, 인류의 역사와 더불어 시작되었다고 할 수 있다.
교역•분배•과세 등의 인류의 사회 생활에 필요한 모든 계산을 수학이 담당해 왔고, 농경 생활에 필수적인 천문 관측과 달력의 제정, 토지의 측량 또한 수학이 직접적으로 관여한 분야이다.
고대 수학을 크게 발전시킨 나라로는 이집트, 인도, 그리스, 중국 등이 있다.
그 중에서도 그리스는 처음으로 수학의 방정식에서 변수를 문자로 쓴 나라이다.

위 내용을 바탕으로 아래 질문에 대해 답변해 주세요
질문 : 고대 수학을 가장 크게 발전시킨 곳은?
'''
# The prompt actually sent to the model.
prompt ="모코엠시스 라는 IT 회사에 대해 간략히 설명해줘"

## Generate
# NOTE(review): temperature is passed together with do_sample=False;
# presumably it has no effect when sampling is disabled - confirm.
max_new_tokens = 200
temperature = 0.5
do_sample = False

# Generate text
inputs = tokenizer(prompt, return_tensors="pt")
# Only input_ids is forwarded to generate(); the other tokenizer outputs are not used.
input_ids = inputs["input_ids"].to(device)

tokens = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=do_sample,
            eos_token_id=tokenizer.eos_token_id,
            early_stopping=True
        )

# Decode the first (only) returned sequence back to text.
print(tokenizer.decode(tokens[0]))

In [None]:
# 원본 모델 평가 해봄

In [None]:
# Load the original model
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel

# Import the model
# NOTE: this rebinds the module-level model_path set at the top of the file.
model_path = '../data11/model/LLM/quantumaikr/KoreanLM/'

# Stored under separate names (model1 / tokenizer1), so the existing
# `model` / `tokenizer` variables are not overwritten.
model1 = AutoModelForCausalLM.from_pretrained(model_path, return_dict=True, load_in_8bit=True, device_map='auto')
tokenizer1 = AutoTokenizer.from_pretrained(model_path)

model1.eval()

In [None]:
from myutils import GPU_info
device = GPU_info()

# Prompt sent to the original model (model1).
prompt ="모코엠시스 라는 IT 회사에 대해 간략히 설명해줘"

## Generate
# NOTE(review): temperature is passed together with do_sample=False;
# presumably it has no effect when sampling is disabled - confirm.
max_new_tokens = 200
temperature = 0.5
do_sample = False

# Generate text
inputs = tokenizer1(prompt, return_tensors="pt")
# Only input_ids is forwarded to generate(); the other tokenizer outputs are not used.
input_ids = inputs["input_ids"].to(device)

tokens1 = model1.generate(
            input_ids=input_ids,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=do_sample,
            eos_token_id=tokenizer1.eos_token_id,
            early_stopping=True
        )

# Decode the first (only) returned sequence back to text.
print(tokenizer1.decode(tokens1[0]))