In [1]:
import torch

# Query the (major, minor) compute capability of the current CUDA device
# and display it as the cell result.
device_capability = torch.cuda.get_device_capability()
major_version, minor_version = device_capability
device_capability

(9, 0)

In [2]:
# Print the pip command that installs the unsloth extra matching this
# environment's torch version, CUDA version and GPU generation.
try:
    import torch
except ImportError as err:
    # Catch only the missing-package case; a bare `except` would also hide
    # KeyboardInterrupt and unrelated failures raised while importing torch.
    raise ImportError("Install torch via `pip install torch`") from err
from packaging.version import Version as V

# torch reports versions such as "2.1.1+cu121". Under PEP 440 a local version
# label sorts *after* the plain release, so comparing the raw version makes
# "2.1.1+cu121 <= 2.1.1" false and selects the extra for the next release.
# Compare on the public part (local label stripped) instead.
v = V(V(torch.__version__).public)
cuda = str(torch.version.cuda)
# Compute capability major version 8 or higher selects the "-ampere" variant.
is_ampere = torch.cuda.get_device_capability()[0] >= 8
if cuda not in ("12.1", "11.8"):
    raise RuntimeError(f"CUDA = {cuda} not supported!")

# Map the torch release onto the template of the matching unsloth extra.
if v <= V('2.1.0'):
    raise RuntimeError(f"Torch = {v} too old!")
elif v <= V('2.1.1'):
    x = 'cu{}{}-torch211'
elif v <= V('2.1.2'):
    x = 'cu{}{}-torch212'
elif v < V('2.3.0'):
    x = 'cu{}{}-torch220'
elif v < V('2.4.0'):
    x = 'cu{}{}-torch230'
elif v < V('2.5.0'):
    x = 'cu{}{}-torch240'
else:
    raise RuntimeError(f"Torch = {v} too new!")

# First slot: CUDA version without the dot; second slot: "-ampere" or empty.
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"')

In [None]:
!
!pip install --upgrade torch torchvision torchaudio

In [None]:
import configparser

# Read the Hugging Face Hub access token from a config file kept outside the
# notebook.
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means the file is missing
# or unreadable. Fail here with a clear message instead of a later KeyError.
if not config.read('../config.ini'):
    raise FileNotFoundError("Could not read '../config.ini'")
api_key = config['HUGGINGFACEHUB']['API_TOKEN']
# Never echo the token itself: cell outputs are saved with the notebook and
# would leak the credential to anyone the notebook is shared with.
print(f"Hugging Face token loaded ({len(api_key)} characters).")

In [None]:
# Model identifier passed to FastLanguageModel.from_pretrained as the model to fine-tune.
base_model = "yanolja/EEVE-Korean-Instruct-10.8B-v1.0"
# Repository name used when pushing the merged model to the Hub (and, with a
# "-gguf" suffix, when pushing the GGUF files).
huggingface_repo = "EEVE-Korean-AIDOK-10.8B"

In [None]:
from unsloth import FastLanguageModel
import torch

# Loading options; `max_seq_length` is reused later when the trainer is built.
max_seq_length, dtype, load_in_4bit = 4096, None, True

# Gather the keyword arguments first so the load call itself stays short.
load_kwargs = dict(
    model_name=base_model,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=api_key,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

In [13]:
# Names of the modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters (rank 32, alpha 64, dropout 0.05).
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=32,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=123,
    use_rslora=False,
    loftq_config=None,
)

In [14]:
from datasets import load_dataset

# End-of-sequence token taken from the tokenizer. It is appended to every
# formatted training example in formatting_prompts_func.
EOS_TOKEN = tokenizer.eos_token

# Alpaca-style prompt template (a format string, not a function) with two
# positional slots: the instruction, then the response.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""


def formatting_prompts_func(examples):
    """Build the training text for a batch of examples.

    Each instruction/output pair is rendered through ``alpaca_prompt`` and
    terminated with ``EOS_TOKEN``.

    Args:
        examples: Batch mapping with parallel "instruction" and "output"
            sequences.

    Returns:
        dict with a single "text" key holding one formatted string per pair.
    """
    pairs = zip(examples["instruction"], examples["output"])
    return {"text": [alpaca_prompt.format(*pair) + EOS_TOKEN for pair in pairs]}


# Load the training split and add the formatted "text" column in one batched
# pass over the data.
dataset = load_dataset("KB8407/DOKDO", split="train").map(
    formatting_prompts_func,
    batched=True,
)

# Show the first formatted example as a sanity check.
print(dataset[0])

In [15]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Pad on the right-hand side for training.
tokenizer.padding_side = "right"

# Use bf16 when the GPU supports it, otherwise fall back to fp16.
use_bf16 = torch.cuda.is_bf16_supported()

# NOTE(review): eval_dataset is the same object as train_dataset, so the
# reported validation loss is measured on the training data itself. Switch to
# a held-out split if one is available.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # `num_train_epochs=3` was removed: a positive `max_steps` overrides
        # it, so it had no effect and only triggered the warning
        # "max_steps is given, it will override any value given in
        # num_train_epochs".
        max_steps=100,
        do_eval=True,
        evaluation_strategy="steps",
        logging_steps=1,
        learning_rate=2e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="cosine",
        seed=2024,
        output_dir="outputs",
    ),
)



Map (num_proc=2):   0%|          | 0/16 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [16]:
# Snapshot of the GPU memory state before training starts.
BYTES_PER_GIB = 1024 ** 3

gpu_stats = torch.cuda.get_device_properties(0)

# Peak memory reserved so far, in GB (baseline for the post-training report).
start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GIB, 3)

# Total memory of the device, in GB.
max_memory = round(gpu_stats.total_memory / BYTES_PER_GIB, 3)

# Report the device name, its total memory and the reserved baseline.
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA H100 80GB HBM3. Max memory = 79.109 GB.
11.514 GB of memory reserved.


In [17]:
# Run the fine-tuning loop. The returned object's `metrics` dict is read in
# the following cell to report the training runtime.
trainer_stats = trainer.train()

Step,Training Loss,Validation Loss
1,3.1105,2.937336
2,2.8084,2.871062
3,3.0063,2.561022
4,2.4826,2.150073
5,2.2397,1.732249
6,1.672,1.332531
7,1.2923,1.12144
8,1.2058,0.961627
9,0.9195,0.802066
10,0.8666,0.640975


In [18]:
# Final memory and timing statistics for the training run.

# Peak reserved GPU memory, in GB.
used_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)

# Growth of the peak reserved memory relative to the pre-training snapshot.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)

# Both figures as a percentage of the device's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Runtime in seconds and minutes, followed by the memory figures.
report_lines = [
    f"{trainer_stats.metrics['train_runtime']} seconds used for training.",
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.",
    f"Peak reserved memory = {used_memory} GB.",
    f"Peak reserved memory for training = {used_memory_for_lora} GB.",
    f"Peak reserved memory % of max memory = {used_percentage} %.",
    f"Peak reserved memory for training % of max memory = {lora_percentage} %.",
]
for report_line in report_lines:
    print(report_line)

176.7808 seconds used for training.
2.95 minutes used for training.
Peak reserved memory = 12.986 GB.
Peak reserved memory for training = 1.472 GB.
Peak reserved memory % of max memory = 16.415 %.
Peak reserved memory for training % of max memory = 1.861 %.


In [24]:
from transformers import StoppingCriteria, StoppingCriteriaList


class StopOnToken(StoppingCriteria):
    """Stopping criterion that ends generation once a given token is produced."""

    def __init__(self, stop_token_id):
        """Store the id of the token that should terminate generation."""
        self.stop_token_id = stop_token_id

    def __call__(self, input_ids, scores, **kwargs):
        """Return True when the newest token of the first sequence is the stop token.

        Args:
            input_ids: Token ids of the sequences so far (prompt included),
                indexed as ``input_ids[sequence][position]``.
            scores: Unused; part of the stopping-criteria call signature.

        Returns:
            bool: whether generation should stop.
        """
        # This is called after every generated token, so checking the last
        # position is enough. Scanning the whole sequence, as before, costs
        # O(length) per step and stops immediately whenever the *prompt*
        # already contains the stop token.
        return bool(input_ids[0][-1] == self.stop_token_id)


# Stop on the tokenizer's own end-of-sequence token, the same token that is
# appended to every training example through EOS_TOKEN. The previous
# hard-coded "<|end_of_text|>" is a Llama-3 special token and is not
# guaranteed to exist in this tokenizer's vocabulary. If it is missing,
# encoding the literal yields several sub-tokens, and taking element [0]
# registers an unrelated token id as the stop signal.
stop_token = tokenizer.eos_token
stop_token_id = tokenizer.eos_token_id

stopping_criteria = StoppingCriteriaList([StopOnToken(stop_token_id)])

In [25]:
from transformers import TextStreamer

# Switch the model to unsloth's inference mode (per the original note, this
# makes inference about 2x faster).
FastLanguageModel.for_inference(model)

# Render the prompt with an empty response slot for the model to complete.
prompt = alpaca_prompt.format("독도의용수비대는 뭐야?", "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Stream the generated text to the cell output while it is produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=4096,
    stopping_criteria=stopping_criteria,
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
테디노트 유튜브 채널에 대해 알려주세요.

### Response:


테디노트(TeddyNote)는 데이터 분석, 머신러닝, 딥러닝 등의 주제를 다루는 유튜브 채널입니다. 이 채널을 운영하는 이경록님은 데이터 분석과 인공지능에 대한 다양한 강의를 제공하며, 초보자도 쉽게 따라할 수 있도록 친절하게 설명합니다.<|end_of_text|>


In [28]:
# Render the prompt with an empty response slot for the model to complete.
prompt = alpaca_prompt.format("독도는 어디있어?", "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

# Stream the generated text to the cell output while it is produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=4096,
    stopping_criteria=stopping_criteria,
)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|>Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
랭체인 튜토리얼 공부할만한 사이트는?

### Response:
테디노트의 LangChain 튜토리얼은 초보자도 쉽게 따라할 수 있도록 친절하게 설명합니다. 링크: https://notebook.ai/_learn/langchain<|end_of_text|>


In [31]:
# Save the model into the local directory "EEVE-Korean-Instruct-10.8B-v1.0".
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this
# presumably writes only the LoRA adapter weights (and no tokenizer) — confirm.
model.save_pretrained("EEVE-Korean-Instruct-10.8B-v1.0")

In [16]:
# Merge the LoRA weights into the base model and save the result locally in
# 16-bit. The output goes to a dedicated directory rather than `base_model`:
# a local folder whose relative path equals the Hub id ("yanolja/...") would
# be picked up by any later `from_pretrained(base_model)` call in this working
# directory, silently loading the fine-tuned weights instead of the original
# checkpoint.
merged_output_dir = f"{huggingface_repo}-merged"
model.save_pretrained_merged(
    merged_output_dir,
    tokenizer,
    save_method="merged_16bit",  # "merged_4bit", "merged_4bit_forced", "merged_16bit", "lora"
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 156.76 out of 221.18 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 112.18it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


In [18]:
# Upload the merged 16-bit model to the Hugging Face Hub under
# `huggingface_repo`, authenticating with the token read from the config file.
model.push_to_hub_merged(
    huggingface_repo,
    tokenizer,
    save_method="merged_16bit",  # "merged_4bit", "merged_4bit_forced", "merged_16bit", "lora"
    token=api_key,
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 156.73 out of 221.18 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 131.65it/s]


Unsloth: Saving to organization with address teddylee777/Llama-3-Open-Ko-8B-Instruct-teddynote
Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving to organization with address teddylee777/Llama-3-Open-Ko-8B-Instruct-teddynote
Unsloth: Uploading all files... Please wait...


model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/None/Llama-3-Open-Ko-8B-Instruct-teddynote


In [None]:
# Quantization methods passed to the GGUF save and upload calls.
quantization_method = ["q8_0", "q4_k_m", "q5_k_m"]

In [75]:
# Export the model to GGUF under ./EEVE-Korean-AIDOK-10.8B using the
# quantization methods listed in `quantization_method`.
model.save_pretrained_gguf(
    "./EEVE-Korean-AIDOK-10.8B",
    tokenizer=tokenizer,
    quantization_method=quantization_method,
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 62.71 out of 83.48 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 69.82it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GUUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to f16 will take 20 minutes.
 "-____-"     In total, you will have to wait around 26 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at ./content/drive/MyDrive/90_HuggingFace/Llama-3-Open-Ko-8B-Instruct-teddynote into f16 GGUF format.
The output location will be ././content/drive/MyDrive/90_HuggingFace/Llama-3-Open-Ko-8B-Instruct-teddynote-unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: Llama-3-Open-Ko-8B-Instruct-teddynote
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 81

In [20]:
# Upload the GGUF export to the Hugging Face Hub, into a repository named
# after `huggingface_repo` with a "-gguf" suffix.
model.push_to_hub_gguf(
    huggingface_repo + "-gguf",
    tokenizer,
    quantization_method=quantization_method,
    token=api_key,
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 177.53 out of 221.18 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 131.00it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.
Unsloth: We must use f16 for non Llama and Mistral models.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GUUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to q8_0 will take 20 minutes.
 "-____-"     In total, you will have to wait around 26 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at Llama-3-Open-Ko-8B-Instruct-teddynote-gguf into f16 GGUF format.
The output location will be ./Llama-3-Open-Ko-8B-Instruct-teddynote-gguf-unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: Llama-3-Open-Ko-8B-Instruct-teddynote-gguf
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 8192
INFO:hf-to-gguf:gguf: embedding length = 4096
INFO:hf-to-gguf:gguf: feed forward length = 14336
INFO:hf-to-gguf:gguf: head count = 32
INFO:hf-to-gguf:gguf: key-value h

Llama-3-Open-Ko-8B-Instruct-teddynote-gguf-unsloth.Q8_0.gguf:   0%|          | 0.00/8.54G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/teddylee777/Llama-3-Open-Ko-8B-Instruct-teddynote-gguf
