# Before running this code, enter `set CUDA_VISIBLE_DEVICES=0` at the virtual-environment prompt.

In [1]:
import os
import pandas as pd
from datasets import Dataset,load_dataset
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from huggingface_hub import notebook_login, login
import gc
import transformers
import time

# Constants
# Hub id of the base checkpoint and the name given to the fine-tuned model.
base_model_name = "meta-llama/Llama-2-7b-chat-hf"
new_model = "llama-2-7b-manual_automation_final_test"
# Local directory the merge / load steps read from and write to.
path = 'C:\\Users\\khu\\Desktop\\manual\\automation\\'+new_model
# Hugging Face access token, read from the environment so that no secret is
# committed with the code. When HF_TOKEN is unset this is None.
# NOTE(review): a token was previously hard-coded on this line; treat that
# token as compromised and revoke it on the Hugging Face settings page.
token = os.environ.get("HF_TOKEN")

# LoRA settings (passed to LoraConfig as r / lora_alpha / lora_dropout).
lora_r = 64
lora_alpha = 16
lora_dropout = 0.1

# bitsandbytes quantization settings (passed to BitsAndBytesConfig).
use_4bit = True
bnb_4bit_compute_dtype = "float16"  # resolved with getattr(torch, ...)
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# TrainingArguments settings.
num_train_epochs = 1
fp16 = False
bf16 = False
per_device_train_batch_size = 4
# NOTE(review): per_device_eval_batch_size and gradient_checkpointing are
# defined here but are not passed to TrainingArguments anywhere in this file.
per_device_eval_batch_size = 4
gradient_accumulation_steps = 1
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0
logging_steps = 25

# SFTTrainer settings.
max_seq_length = 1024
packing = False

# Device placement handed to from_pretrained(device_map=...).
device_map = {"": 0}

# Functions
def load_and_preprocess_data(file_path):
    """Load an Excel sheet into a Hugging Face ``Dataset``.

    Rows containing any missing value are dropped before conversion. If the
    conversion adds an ``__index_level_0__`` column, that column is removed
    again.

    Args:
        file_path: Location of the Excel file handed to ``pd.read_excel``.

    Returns:
        The resulting ``Dataset``.
    """
    frame = pd.read_excel(file_path).dropna()
    converted = Dataset.from_pandas(frame)

    index_column = "__index_level_0__"
    if index_column not in converted.column_names:
        return converted
    return converted.remove_columns([index_column])


def fine_tune_llama_model(dataset, output_dir="./results_final_test", base_model_name=base_model_name):
    """Fine-tune the base model with LoRA on a quantized checkpoint.

    The base model is loaded with the module-level bitsandbytes settings,
    wrapped by ``SFTTrainer`` with a LoRA configuration, trained, and the
    resulting adapter is written to the module-level ``path``. That is the
    same directory ``reload_and_merge_model`` loads the adapter from, so the
    two steps agree regardless of the current working directory.

    Args:
        dataset: Training dataset given to ``SFTTrainer``; the trainer is
            configured with ``dataset_text_field="text"``, so a ``text``
            column is expected.
        output_dir: Directory passed to ``TrainingArguments(output_dir=...)``.
        base_model_name: Model identifier passed to ``from_pretrained`` for
            both the model and the tokenizer.

    Returns:
        None. The trained adapter is saved to ``path`` as a side effect.
    """
    compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=use_4bit,
        bnb_4bit_quant_type=bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=use_nested_quant,
    )

    # Informational only: print a hint when the GPU's major compute capability
    # is 8 or higher. The bf16 setting itself is not changed here.
    if compute_dtype == torch.float16 and use_4bit:
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:
            print("=" * 80)
            print("Your GPU supports bfloat16: accelerate training with bf16=True")
            print("=" * 80)

    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        quantization_config=bnb_config,
        device_map=device_map,
        use_auth_token=token,
    )
    # Turn off the generation cache flag on the model config for training.
    model.config.use_cache = False
    # NOTE(review): presumably disables the tensor-parallel pretraining code
    # path - confirm against the Llama configuration documentation.
    model.config.pretraining_tp = 1

    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True, use_auth_token=token)
    # No dedicated padding token is configured here, so EOS is reused for
    # padding, applied on the right-hand side.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    peft_config = LoraConfig(
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        r=lora_r,
        bias="none",
        task_type="CAUSAL_LM",
    )

    training_arguments = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        optim=optim,
        save_steps=save_steps,
        logging_steps=logging_steps,
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        fp16=fp16,
        bf16=bf16,
        max_grad_norm=max_grad_norm,
        max_steps=max_steps,
        warmup_ratio=warmup_ratio,
        group_by_length=group_by_length,
        lr_scheduler_type=lr_scheduler_type,
        report_to="tensorboard",
    )

    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        tokenizer=tokenizer,
        args=training_arguments,
        packing=packing,
    )

    trainer.train()

    # Save the adapter where reload_and_merge_model() looks for it. Saving to
    # the bare relative name `new_model` only matched `path` when the process
    # happened to be started from the automation directory.
    trainer.model.save_pretrained(path)

    
def run_tensorboard(log_dir="results/runs"):
    """Load the TensorBoard extension and open it on ``log_dir``.

    Relies on ``get_ipython()``, so it is only usable inside an
    IPython/Jupyter session.

    Args:
        log_dir: Directory given to the ``%tensorboard --logdir`` magic.
    """
    shell = get_ipython()
    shell.run_line_magic('load_ext', 'tensorboard')
    shell.run_line_magic('tensorboard', f'--logdir {log_dir}')


def reload_and_merge_model(
    trust_remote_code=True
):
    """Merge the saved LoRA adapter into a float16 base model and save both.

    The base checkpoint named by ``base_model_name`` is loaded in float16,
    the adapter stored under ``path`` is applied and merged, and the merged
    model plus the tokenizer are written back to ``path``.

    Args:
        trust_remote_code: Forwarded to ``AutoTokenizer.from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple holding the merged model and the
        tokenizer that was saved next to it.
    """
    base_kwargs = dict(
        low_cpu_mem_usage=True,
        return_dict=True,
        torch_dtype=torch.float16,
        device_map=device_map,
        use_auth_token=token,
    )
    fp16_base = AutoModelForCausalLM.from_pretrained(base_model_name, **base_kwargs)

    # Attach the adapter, fold its weights into the base model, then persist.
    merged = PeftModel.from_pretrained(fp16_base, path).merge_and_unload()
    merged.save_pretrained(path)

    tok = AutoTokenizer.from_pretrained(
        base_model_name,
        trust_remote_code=trust_remote_code,
        use_auth_token=token,
    )
    tok.pad_token = tok.eos_token
    tok.padding_side = "right"
    tok.save_pretrained(path)

    return merged, tok


def push_model_and_tokenizer_to_hub(model_path, model=None, tokenizer=None):
    """Log in to the Hugging Face Hub and push a model and its tokenizer.

    Args:
        model_path: Passed to ``push_to_hub`` as its first argument, and used
            as the location to load from when no object is supplied.
        model: Model to push. When omitted, a module-level ``model`` is used
            if one exists; otherwise the model is loaded from ``model_path``.
        tokenizer: Tokenizer to push, resolved the same way as ``model``.

    Returns:
        None.
    """
    # Interactive login; credentials must never be written into this file.
    notebook_login()

    module_globals = globals()

    # Earlier versions silently relied on module-level `model` / `tokenizer`
    # names and raised NameError when they were absent. Keep honouring those
    # names for existing notebook sessions, but fall back to loading from disk.
    if model is None:
        model = module_globals.get("model")
    if model is None:
        model = AutoModelForCausalLM.from_pretrained(model_path)

    if tokenizer is None:
        tokenizer = module_globals.get("tokenizer")
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_path)

    model.push_to_hub(model_path, use_temp_dir=False)
    tokenizer.push_to_hub(model_path, use_temp_dir=False)

    
def empty_cache():
    """Drop leftover module-level model objects and release cached GPU memory.

    Removes the module-level names ``model``, ``pipe`` and ``trainer`` when
    they exist, runs the garbage collector, and asks PyTorch to release its
    cached CUDA memory. Names that are not defined are ignored.

    Returns:
        None.
    """
    # The names live in module scope, not in this function's scope, so they
    # have to be removed from globals(); mutating the dict returned by
    # locals() inside a function never unbinds anything.
    module_globals = globals()
    for name in ('model', 'pipe', 'trainer'):
        module_globals.pop(name, None)

    # Two passes are kept from the original code - presumably so that objects
    # released by the first pass are also reclaimed.
    gc.collect()
    gc.collect()
    torch.cuda.empty_cache()

    
def model_load():
    """Return the causal-LM model loaded from the module-level ``path``."""
    return AutoModelForCausalLM.from_pretrained(path)


def tokenizer_load():
    """Return the tokenizer loaded from the module-level ``path``."""
    return AutoTokenizer.from_pretrained(path)


def model_use(model, tokenizer, prompt):
    """Generate a reply to ``prompt`` and print it with the elapsed time.

    A text-generation pipeline is built around ``model`` and ``tokenizer``;
    the prompt is wrapped in ``[INST] ... [/INST]`` markers before it is
    sent through the pipeline.

    Args:
        model: Model object handed to ``transformers.pipeline``.
        tokenizer: Tokenizer object handed to ``transformers.pipeline``.
        prompt: User text to place between the instruction markers.

    Returns:
        None. The generated text and the timing are printed.
    """
    generator = transformers.pipeline(
        'text-generation',
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.float32,
        device_map='auto',
        max_length=200,
    )

    # Only the generation call itself is timed, not the pipeline construction.
    started_at = time.time()
    outputs = generator(f"[INST] {prompt} [/INST]")
    finished_at = time.time()

    print(outputs[0]['generated_text'])
    print(f"Time taken: {finished_at - started_at:.4f} seconds")
    
# Main
if __name__ == "__main__":
    # End-to-end run: load data -> fine-tune -> merge adapter -> reload -> query.

    # Read the training spreadsheet and convert it to a Dataset.
    dataset = load_and_preprocess_data('C:\\Users\\khu\\Desktop\\manual\\data\\processed_ver3.xlsx')
    # Train the LoRA adapter; trainer output goes to ./results_automation_test.
    fine_tune_llama_model(dataset , output_dir="./results_automation_test")
    # Optional, IPython/Jupyter only: open TensorBoard on the training logs.
    # run_tensorboard(log_dir="results/runs")
    # Run garbage collection and release cached CUDA memory before the base
    # model is loaded again for merging.
    empty_cache()
    # Merge the adapter into the float16 base model and save it under `path`;
    # the returned (model, tokenizer) pair is not kept here.
    reload_and_merge_model()
    # Optional: publish the merged model and tokenizer to the Hugging Face Hub.
    # push_model_and_tokenizer_to_hub(model_path=path)
    # Reload the merged model and tokenizer from `path` as module-level names.
    model = model_load()
    tokenizer = tokenizer_load()
    # Smoke test: ask one question and print the answer with its timing.
    model_use(model, tokenizer, 'What should I do if warning number 3003 is raised?')



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin C:\Users\khu\miniconda3\envs\manual\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll
CUDA SETUP: CUDA runtime path found: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.7\bin\cudart64_110.dll
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary C:\Users\khu\miniconda3\envs\manual\lib\site-packages\bitsandbytes\libbitsandbytes_cuda117.dll...


  warn(msg)
  warn(msg)
  warn(msg)


Your GPU supports bfloat16: accelerate training with bf16=True




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Map:   0%|          | 0/8577 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
25,3.8901
50,2.921
75,2.0272
100,1.6169
125,1.4482
150,1.3738
175,1.3246
200,1.3605
225,1.3157
250,1.3794


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Time taken: 51.0937 seconds
