In [None]:
# 필요한 라이브러리 설치
!pip install bitsandbytes transformers peft accelerate trl datasets

In [None]:
# 필요한 모듈 임포트
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from transformers import LlamaForCausalLM
from peft import LoraConfig, prepare_model_for_kbit_training
from datasets import Dataset
import pandas as pd
from trl import SFTTrainer
import warnings

# 경고 메시지 무시
warnings.filterwarnings("ignore")

In [None]:
# Mount Google Drive so that files stored there can be accessed from this notebook.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
# Load the fine-tuning data; the 'prompt' column of this CSV is used below.
train = pd.read_csv('/content/drive/MyDrive/재정정보 AI 검색 알고리즘 경진대회/pymupdf4llm/finetuning_prompt_cerebro.csv')

# Model and tokenizer setup.
model_id = "I-BRICKS/Cerebro_BM_solar_v01"

# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = LlamaForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation='eager',
    trust_remote_code=True
)

# Build the dataset: a single "text" column holding the prompts.
dataset = Dataset.from_dict({"text": train['prompt'].values})

# Shuffle with a fixed seed so the example order is identical on every
# Restart & Run All (an unseeded shuffle gives a different order each run).
SHUFFLE_SEED = 42
dataset = dataset.shuffle(seed=SHUFFLE_SEED)

# Prepare the quantized model for k-bit training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA hyperparameters, kept in a plain dict so they are easy to inspect and tweak.
lora_config_dict = dict(
    lora_r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    lora_bias="none",
    lora_task_type="CAUSAL_LM",
    target_modules=["q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj", "v_proj"],
)

# Translate the dict entries into the keyword names LoraConfig expects.
peft_config = LoraConfig(
    r=lora_config_dict['lora_r'],
    lora_alpha=lora_config_dict['lora_alpha'],
    lora_dropout=lora_config_dict['lora_dropout'],
    target_modules=lora_config_dict['target_modules'],
    bias=lora_config_dict['lora_bias'],
    task_type=lora_config_dict['lora_task_type'],
)

In [None]:
# Training hyperparameters. Every key is spelled exactly like the
# TrainingArguments keyword it is forwarded to below.
train_param = dict(
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    optim="adamw_torch",
    save_steps=300,
    logging_steps=10,
    learning_rate=0.0002,
    weight_decay=0.01,
    max_grad_norm=1,
    warmup_ratio=0.06,
    group_by_length=False,
    lr_scheduler_type='cosine',
)

# Checkpoints and the final model are written under output_dir;
# all remaining settings come straight from train_param.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/재정정보 AI 검색 알고리즘 경진대회/pymupdf4llm/finetune/",
    **train_param,
)

# Build the supervised fine-tuning trainer (training itself is started separately).
trainer = SFTTrainer(
    # What to train and how.
    model=model,
    peft_config=peft_config,
    args=training_args,
    # What to train on: the "text" column of the dataset, one example per row (no packing).
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    packing=False,
)

In [None]:
# Turn off the model's use_cache flag before training.
model.config.use_cache = False

# Start a fresh run (no checkpoint resume), then write the trained model
# to the same directory the training arguments point at.
trainer.train(resume_from_checkpoint=False)
save_dir = training_args.output_dir
trainer.model.save_pretrained(save_dir)