# GPU Check

In [1]:
import os
import torch

# GPU selection for this process (multi-GPU setup)
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  # Number GPU devices by PCI bus ID
os.environ["CUDA_VISIBLE_DEVICES"]= "0,1,2,3"  # Make GPUs 0, 1, 2 and 3 visible (the recorded run reports 4 GPUs)

In [2]:
# Pick the compute device, falling back to CPU when CUDA is not available.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

print('Device:', device)
# torch.cuda.current_device() raises on a machine without CUDA, so it is only
# queried when a GPU is actually present; otherwise None is reported.
print('Current cuda device:', torch.cuda.current_device() if cuda_available else None)
print('Count of using GPUs:', torch.cuda.device_count())

Device: cuda
Current cuda device: 0
Count of using GPUs: 4


# Dataset

In [3]:
import pandas as pd
from datasets import Dataset

In [6]:
# Locations used by this run (absolute paths on the training host).
ROOT_DIR = "/home/laststar/data/model/rag-service"  # parent directory of the per-run output folders
TRAIN_DATA_PATH = "/home/laststar/source/rag-service/data/dataset_v0.2.csv"  # training CSV

In [7]:
# Read the training CSV into a DataFrame and preview the first rows.
# The recorded preview shows two columns: "Unnamed: 0" (the index saved into the
# CSV) and "text" (one "<s>[INST] ... [/INST] ..." formatted sample per row).
df = pd.read_csv(TRAIN_DATA_PATH)
df.head()

Unnamed: 0,text
0,<s>[INST] 너는 누구니? [/INST] 저는 라임에스엔씨의 AI 안내 챗봇 ...
1,<s>[INST] 당신은 누구입니까? [/INST] 저는 라임에스엔씨의 AI 안내 ...
2,<s>[INST] 당신은 어떤 역할을 하고 있나요? [/INST] 저는 라임에스엔씨...
3,<s>[INST] 너는 어떤 존재야? [/INST] 저는 라임에스엔씨의 AI 안내 ...
4,<s>[INST] 당신은 누구신가요? [/INST] 저는 라임에스엔씨의 AI 안내 ...


In [8]:
# Build the training dataset from the "text" column only. The CSV carries its
# saved index as an "Unnamed: 0" column, which is not training data, and
# preserve_index=False keeps the DataFrame index from being added as a column.
dataset = Dataset.from_pandas(df[["text"]], preserve_index=False)

# Train

In [9]:
from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging
)

from peft import LoraConfig
from trl import SFTTrainer

In [10]:
import os
from datetime import datetime

# Every run gets its own output directory, named after the moment this cell ran.
timestamp = format(datetime.now(), "%Y%m%d_%H%M%S")
current_dir = "/".join((ROOT_DIR, timestamp))

print(current_dir)

/home/laststar/data/model/rag-service/20241105_093300


In [11]:
# Hugging Face model id of the checkpoint that gets fine-tuned.
base_model = "beomi/Llama-3-Open-Ko-8B"

# Per-run output locations, all directly under current_dir.
# NOTE: "apdater" is a misspelling of "adapter"; the variable name is kept
# because later cells refer to it.
save_apdater_model_dir, save_model_dir, save_output_dir = (
    f"{current_dir}/{leaf}" for leaf in ("adapter", "model", "result")
)

# Echo the configuration, one "label value" line per entry.
print(
    *(
        f"{label} {value}"
        for label, value in (
            ("base model : ", base_model),
            ("adapter dir : ", save_apdater_model_dir),
            ("model dir : ", save_model_dir),
            ("output dir : ", save_output_dir),
        )
    ),
    sep="\n",
)

base model :  beomi/Llama-3-Open-Ko-8B
adapter dir :  /home/laststar/data/model/rag-service/20241105_093300/adapter
model dir :  /home/laststar/data/model/rag-service/20241105_093300/model
output dir :  /home/laststar/data/model/rag-service/20241105_093300/result


In [12]:
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    atten_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    atten_implementation = "eager"
    torch_dtype = torch.float16

# QLoRA config
quant_config = BitsAndBytesConfig(
    load_in_8bit = True,
    bnb_8bit_quant_type="nf4",
    bnb_8bit_compute_dtype=torch_dtype,
    bnb_8bit_use_double_quant=False
)

Unused kwargs: ['bnb_8bit_quant_type', 'bnb_8bit_compute_dtype', 'bnb_8bit_use_double_quant']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [13]:
# Load the base checkpoint for training with the quantization settings in
# quant_config. device_map="auto" leaves device placement to the library
# (presumably spreading layers over the visible GPUs — confirm).
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [15]:
# Turn off the model's use_cache flag for the training run.
model.config.use_cache = False
# NOTE(review): presumably the tensor-parallel degree used when the checkpoint
# was pretrained; 1 would mean no tensor-parallel slicing — confirm.
model.config.pretraining_tp = 1

In [16]:
# Load the tokenizer published with the base model.
# NOTE(review): trust_remote_code=True permits code shipped in the model
# repository to run locally; confirm it is actually required for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

In [20]:
# Use the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [21]:
# LoRA adapter settings for causal-LM fine-tuning.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    # Adapters are attached to the four attention projection modules.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    # Adapter rank, scaling numerator and dropout.
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    # Bias terms are not trained.
    bias="none",
)

In [22]:
# Trainer hyper-parameters, grouped by concern.
training_params = TrainingArguments(
    # --- output, checkpoints and logging ---
    output_dir=save_output_dir,
    report_to="tensorboard",
    logging_steps=25,
    save_steps=25,
    # --- run length and batching ---
    num_train_epochs=15,
    max_steps=-1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # --- optimizer and learning-rate schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    # NOTE(review): with the plain "constant" scheduler this warmup ratio is
    # presumably ignored ("constant_with_warmup" is the warming-up variant) — confirm.
    warmup_ratio=0.03,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # --- mixed precision (both switched off) ---
    fp16=False,
    bf16=False,
)

In [23]:
# Supervised fine-tuning trainer: wraps the loaded model with the LoRA config
# and trains on the "text" column of the dataset, without packing.
# NOTE(review): the recorded output warns that some of these arguments are
# deprecated on SFTTrainer and should be supplied through SFTConfig instead.
trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    peft_config = peft_params,
    dataset_text_field = "text",
    max_seq_length = None,
    tokenizer = tokenizer,
    args = training_params,
    packing = False
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/260 [00:00<?, ? examples/s]

In [24]:
from datetime import datetime
# Wall-clock timestamp, recorded just before training is started.
print("current time:", format(datetime.now(), "%Y-%m-%d %H:%M:%S"))

current time: 2024-11-05 09:40:09


In [25]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()



Step,Training Loss
25,1.957
50,1.4401
75,1.1768
100,0.911
125,0.6499
150,0.567
175,0.4212
200,0.4632
225,0.3327
250,0.3406




TrainOutput(global_step=975, training_loss=0.34753063030731984, metrics={'train_runtime': 1231.1806, 'train_samples_per_second': 3.168, 'train_steps_per_second': 0.792, 'total_flos': 1.152604555542528e+16, 'train_loss': 0.34753063030731984, 'epoch': 15.0})

In [26]:
from datetime import datetime
# Wall-clock timestamp, recorded right after training has finished.
print("current time:", format(datetime.now(), "%Y-%m-%d %H:%M:%S"))

current time: 2024-11-05 10:00:42


In [27]:
# Save the trained model to the adapter directory; a later cell loads this
# directory as a PEFT adapter on top of a freshly loaded base model.
trainer.save_model(save_apdater_model_dir)

In [28]:
from peft import PeftModel

In [29]:
# Reload the base checkpoint in float16, without quantization_config, as the
# target for merging the adapter.
post_model = AutoModelForCausalLM.from_pretrained(
    base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [30]:
# Attach the saved adapter to the float16 base model, then merge it into the
# base weights and unload the PEFT wrapper.
# Note: this rebinds `model`, which previously referred to the quantized
# model used for training.
model = PeftModel.from_pretrained(post_model, save_apdater_model_dir, device_map='auto', torch_dtype=torch.float16)
model = model.merge_and_unload()

In [31]:
# Write the merged model and its tokenizer into the same directory, so that
# directory can be handed to the GGUF converter as a self-contained checkpoint.
model.save_pretrained(save_model_dir)
tokenizer.save_pretrained(save_model_dir)

('/home/laststar/data/model/rag-service/20241105_093300/model/tokenizer_config.json',
 '/home/laststar/data/model/rag-service/20241105_093300/model/special_tokens_map.json',
 '/home/laststar/data/model/rag-service/20241105_093300/model/tokenizer.json')

In [32]:
# Sanity check: list state-dict entries whose name contains 'SCB'
# (presumably leftover bitsandbytes 8-bit buffers — confirm).
# The recorded run returned an empty list for the merged model.
[key for key in model.state_dict() if 'SCB' in key]

[]

# Build

In [33]:
# Local llama.cpp checkout; the convert_hf_to_gguf.py script and the
# llama-quantize / llama-cli binaries used below are taken from this directory.
LLAMA_CPP_DIR = "/home/laststar/framework/llama.cpp"

In [34]:
!mkdir $current_dir/quantized_model/
!ls -al $current_dir

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


total 24
drwxrwxr-x  6 laststar laststar 4096 11월  5 10:01 .
drwxrwxr-x  6 laststar laststar 4096 11월  5 09:40 ..
drwxrwxr-x  2 laststar laststar 4096 11월  5 10:00 adapter
drwxrwxr-x  2 laststar laststar 4096 11월  5 10:01 model
drwxrwxr-x  2 laststar laststar 4096 11월  5 10:01 quantized_model
drwxrwxr-x 42 laststar laststar 4096 11월  5 10:00 result


In [35]:
# Convert the merged Hugging Face checkpoint to a single FP16 GGUF file with
# llama.cpp's converter script.
!python $LLAMA_CPP_DIR/convert_hf_to_gguf.py $save_model_dir --outtype f16 --outfile $current_dir/quantized_model/FP16.gguf

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO:hf-to-gguf:Loading model: model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.float16 --> F16, shape = {4096, 128256}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.float16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.float16 --> F16, shape = {14336, 4096}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.float16 --> F16, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.float16 --> F16, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.float16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.float16 --> F16, shape = {4096, 1024}
INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.float16 --> F16, shape = {4096, 4

In [36]:
import os
import subprocess

# Directory holding FP16.gguf and the quantized outputs. The trailing slash is
# kept on purpose: another cell builds a file path from this value by plain
# string concatenation.
quantized_path = f'{current_dir}/quantized_model/'
# llama.cpp quantization type names; one output file is produced per entry.
methods = ['q5_k_m']

fp16_gguf = os.path.join(quantized_path, "FP16.gguf")

for m in methods:
    # Output file, e.g. .../quantized_model/Q5_K_M.gguf. os.path.join avoids the
    # "quantized_model//..." double slash the previous string formatting produced.
    qtype = os.path.join(quantized_path, f"{m.upper()}.gguf")
    # An argument list is used instead of a shell string, so paths are never
    # re-parsed by a shell. check=True raises CalledProcessError when
    # llama-quantize fails instead of silently ignoring its exit status.
    subprocess.run(
        [os.path.join(LLAMA_CPP_DIR, "llama-quantize"), fp16_gguf, qtype, m],
        check=True,
    )

ggml_cuda_init: GGML_CUDA_FORCE_MMQ:    no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 4 CUDA devices:
  Device 0: Tesla V100-DGXS-32GB, compute capability 7.0, VMM: yes
  Device 1: Tesla V100-DGXS-32GB, compute capability 7.0, VMM: yes
  Device 2: Tesla V100-DGXS-32GB, compute capability 7.0, VMM: yes
  Device 3: Tesla V100-DGXS-32GB, compute capability 7.0, VMM: yes
main: build = 3870 (841713e1)
main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
main: quantizing '/home/laststar/data/model/rag-service/20241105_093300/quantized_model//FP16.gguf' to '/home/laststar/data/model/rag-service/20241105_093300/quantized_model//Q5_K_M.gguf' as Q5_K_M
llama_model_loader: loaded meta data with 27 key-value pairs and 291 tensors from /home/laststar/data/model/rag-service/20241105_093300/quantized_model//FP16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_l


main: quantize time = 46680.04 ms
main:    total time = 46680.04 ms


In [37]:
# Print (not run) a llama-cli command line for chatting with the quantized
# model interactively, so it can be copied into a terminal.
print(f'{LLAMA_CPP_DIR}/llama-cli -m {quantized_path}Q5_K_M.gguf -n 90 --repeat_penalty 1.0 --color -i -r "User:" -f {LLAMA_CPP_DIR}/prompts/chat-with-bob.txt')

/home/laststar/framework/llama.cpp/llama-cli -m /home/laststar/data/model/rag-service/20241105_093300/quantized_model/Q5_K_M.gguf -n 90 --repeat_penalty 1.0 --color -i -r "User:" -f /home/laststar/framework/llama.cpp/prompts/chat-with-bob.txt


In [38]:
# NOTE(review): ad-hoc inspection of the trainer's arguments; looks like
# leftover debugging — consider removing before sharing the notebook.
trainer.args



In [39]:
# NOTE(review): displays only the trainer object's repr; looks like leftover
# debugging — consider removing before sharing the notebook.
trainer

<trl.trainer.sft_trainer.SFTTrainer at 0x7f9415782410>

In [40]:
# NOTE(review): dumps every attribute name of the trainer, producing a very
# long output; looks like leftover debugging — consider removing.
dir(trainer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_activate_neftune',
 '_add_sm_patterns_to_gitignore',
 '_created_lr_scheduler',
 '_deactivate_neftune',
 '_evaluate',
 '_finish_current_push',
 '_fsdp_qlora_plugin_updates',
 '_gather_and_numpify',
 '_get_collator_with_removed_columns',
 '_get_eval_sampler',
 '_get_learning_rate',
 '_get_output_dir',
 '_get_train_sampler',
 '_globalstep_last_logged',
 '_hp_search_setup',
 '_inner_training_loop',
 '_load_best_model',
 '_load_callback_state',
 '_load_from_checkpoint',
 '_load_optimizer_and_scheduler',
 '_load_rng_state',
 '_loggers_initialized',
 '_maybe_log_save_evaluate',
 '_memory_tracker',
 '_move_mode