In [1]:
import os
from glob import glob
from tqdm import tqdm
import torch
from datasets import load_dataset
import pytorch_lightning as pl
from rouge import Rouge # 모델의 성능을 평가하기 위한 라이브러리입니다.

from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, DefaultDataCollator
from transformers import EarlyStoppingCallback
from peft import get_peft_model, LoraConfig, TaskType

import wandb # 모델 학습 과정을 손쉽게 Tracking하고, 시각화할 수 있는 라이브러리입니다.

import os


In [2]:
# Authenticate this kernel session with Weights & Biases; the training cell
# later opens a run with wandb.init(), which needs a logged-in session.
import wandb
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33myoohyeonji12[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
class Preprocess:
    """Turns raw dialogue/summary records into lists of EOS-terminated strings."""

    def __init__(self, eos_token: str):
        # Marker appended to every dialogue and summary produced by make_input().
        self.eos_token = eos_token

    def make_input(self, dataset, is_test=False):
        """Collect model-ready text from an iterable of records.

        Args:
            dataset: Iterable of mappings. Each must provide ``'dialogue'`` and,
                unless ``is_test`` is true, ``'summary'``.
            is_test: When true, summaries are not read at all.

        Returns:
            A ``(dialogues, summaries)`` tuple of string lists; ``summaries`` is
            ``None`` when ``is_test`` is true. Every value is passed through
            ``str()`` and suffixed with ``self.eos_token``.
        """
        terminator = self.eos_token
        wants_targets = not is_test
        dialogues, summaries = [], []

        for record in tqdm(dataset):
            dialogues.append(str(record['dialogue']) + terminator)
            if wants_targets:
                summaries.append(str(record['summary']) + terminator)

        return dialogues, (summaries if wants_targets else None)


In [3]:
def load_tokenizer_and_model(config):
    """Load the tokenizer and the 4-bit quantized causal LM named in ``config``.

    Args:
        config: Mapping providing ``'model_name'`` (checkpoint id or path) and
            ``'special_tokens'`` (strings registered as additional special tokens).

    Returns:
        ``(model, tokenizer)``. The tokenizer has the extra special tokens
        registered and uses its EOS token for padding; the model's token
        embeddings are resized to ``len(tokenizer)``.
    """
    checkpoint = config['model_name']

    # Quantization settings: 4-bit NF4 weights, double quantization, float16 compute dtype.
    quantization = BitsAndBytesConfig(
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        load_in_4bit=True,
    )

    # Load the tokenizer first, then the quantized model.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        quantization_config=quantization,
    )

    # Register the extra special tokens and grow the embedding matrix to match.
    extra_tokens = {'additional_special_tokens': config['special_tokens']}
    tokenizer.add_special_tokens(extra_tokens)
    model.resize_token_embeddings(len(tokenizer))

    # Padding token: reuse EOS.
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer



import random

from torch.utils.data import TensorDataset

class CustomTensorDataset(TensorDataset):
    """TensorDataset whose items are dicts instead of positional tuples.

    The first three wrapped tensors are exposed, in order, under the keys
    ``input_ids``, ``attention_mask`` and ``labels``.
    """

    def __getitem__(self, index):
        row = super().__getitem__(index)
        field_names = ("input_ids", "attention_mask", "labels")
        # Positional lookup keeps the original contract: fewer than three
        # tensors raises IndexError, extra tensors are ignored.
        return {name: row[position] for position, name in enumerate(field_names)}


def prepare_data(config, tokenizer):
    """Load the CSV splits, tokenize them and wrap them as tensor datasets.

    Args:
        config: Run configuration. Reads ``data_files``, ``max_input_length``
            and ``max_target_length``. Optional keys:
            ``eval_sample_ratio`` (fraction of the validation split to keep,
            default ``0.18``) and ``seed`` (seed for the validation sampling,
            default ``42``; pass ``None`` for a non-reproducible draw).
        tokenizer: Tokenizer used to encode dialogues and summaries.

    Returns:
        ``(train_dataset, eval_subset)``: a ``CustomTensorDataset`` over the
        whole training split and a list of example dicts sampled from the
        validation split. In every example ``input_ids``/``attention_mask``
        encode the dialogue and ``labels`` holds the separately tokenized
        summary.
    """
    # Append the tokenizer's own EOS marker so the suffix always maps to the
    # real EOS id; fall back to the previous literal when none is defined.
    eos_token = getattr(tokenizer, 'eos_token', None) or '</s>'
    preprocessor = Preprocess(eos_token=eos_token)

    dataset_dict = load_dataset('csv', data_files=config['data_files'])

    def encode(texts, max_length):
        return tokenizer(texts, truncation=True, max_length=max_length,
                         return_tensors='pt', padding=True)

    def build(split):
        dialogues, summaries = preprocessor.make_input(dataset_dict[split], is_test=False)
        encoded_inputs = encode(dialogues, config['max_input_length'])
        encoded_targets = encode(summaries, config['max_target_length'])
        # NOTE(review): `labels` is NOT token-aligned with `input_ids` (different
        # text, different length). Confirm that the collator used for training
        # turns each pair into a proper causal-LM sequence.
        return CustomTensorDataset(encoded_inputs['input_ids'],
                                   encoded_inputs['attention_mask'],
                                   encoded_targets['input_ids'])

    train_dataset = build('train')
    eval_dataset = build('validation')

    # Evaluate on a fixed random fraction of the validation split (18% by
    # default) so that eval losses are comparable between steps and runs.
    ratio = config.get('eval_sample_ratio', 0.18)
    total = len(eval_dataset)
    # Keep at least one example whenever the split is non-empty.
    sample_size = min(total, max(1, int(total * ratio))) if total else 0

    # A private generator keeps the draw reproducible without touching the
    # global `random` state; only the sampled examples are materialized.
    rng = random.Random(config.get('seed', 42))
    chosen = rng.sample(range(total), sample_size)
    eval_subset = [eval_dataset[position] for position in chosen]

    return train_dataset, eval_subset



In [4]:
class _PromptSummaryCollator:
    """Builds causal-LM batches from separately tokenized prompt/summary pairs.

    Each feature is a mapping with ``input_ids`` (prompt, possibly padded),
    an optional ``attention_mask`` for the prompt, and ``labels`` (summary
    token ids, possibly padded and wrapped in BOS/EOS). For every feature the
    collator emits one sequence ``prompt + summary + EOS`` whose labels are
    ``-100`` on the prompt and on padding, so only summary tokens (and the
    closing EOS) contribute to the loss. Batches are right-padded to the
    longest sequence in the batch.
    """

    IGNORE_INDEX = -100  # label value excluded from the language-modeling loss

    def __init__(self, pad_token_id, eos_token_id, bos_token_id=None):
        self.eos_token_id = eos_token_id
        # Ids stripped from the raw summary before it is re-terminated. Pad and
        # EOS may share an id, so padding cannot be told apart from EOS there.
        self._drop_ids = [
            token_id
            for token_id in (pad_token_id, eos_token_id, bos_token_id)
            if token_id is not None
        ]
        if pad_token_id is not None:
            self._fill_id = pad_token_id
        else:
            self._fill_id = eos_token_id if eos_token_id is not None else 0

    def __call__(self, features):
        sequences, targets = [], []
        for feature in features:
            prompt = torch.as_tensor(feature["input_ids"], dtype=torch.long)
            prompt_mask = feature.get("attention_mask")
            if prompt_mask is not None:
                # Drop prompt padding regardless of the padding side.
                prompt = prompt[torch.as_tensor(prompt_mask).bool()]

            summary = torch.as_tensor(feature["labels"], dtype=torch.long)
            keep = summary >= 0
            for token_id in self._drop_ids:
                keep &= summary != token_id
            summary = summary[keep]
            if self.eos_token_id is not None:
                summary = torch.cat([summary, summary.new_tensor([self.eos_token_id])])

            sequences.append(torch.cat([prompt, summary]))
            targets.append(torch.cat([torch.full_like(prompt, self.IGNORE_INDEX), summary]))

        width = max(sequence.numel() for sequence in sequences)
        shape = (len(sequences), width)
        input_ids = torch.full(shape, self._fill_id, dtype=torch.long)
        attention_mask = torch.zeros(shape, dtype=torch.long)
        labels = torch.full(shape, self.IGNORE_INDEX, dtype=torch.long)
        for row, (sequence, target) in enumerate(zip(sequences, targets)):
            length = sequence.numel()
            input_ids[row, :length] = sequence
            attention_mask[row, :length] = 1
            labels[row, :length] = target

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


def load_trainer(config, model, tokenizer, train_dataset, eval_dataset):
    """Wrap ``model`` with LoRA adapters and build the Hugging Face ``Trainer``.

    Args:
        config: Run configuration. Reads ``output_dir``, ``num_train_epochs``,
            ``learning_rate``, ``train_batch_size``, ``eval_batch_size``,
            ``warmup_ratio``, ``weight_decay``, ``lr_scheduler_type``,
            ``logging_dir`` and ``wandb`` (``project``/``name``). Optional keys:
            ``lora`` (dict overriding ``r``, ``lora_alpha``, ``target_modules``,
            ``lora_dropout``), ``logging_steps`` (default 100) and
            ``gradient_accumulation_steps`` (default 4).
        model: Quantized base model that receives the LoRA adapters.
        tokenizer: Tokenizer matching ``model``; supplies pad/EOS/BOS ids.
        train_dataset: Training examples (dicts with ``input_ids``,
            ``attention_mask`` and ``labels``).
        eval_dataset: Evaluation examples in the same format.

    Returns:
        A configured ``Trainer``. As a side effect a Weights & Biases run is
        started and ``WANDB_LOG_MODEL``/``WANDB_WATCH`` are set in the
        environment.
    """
    # QLoRA settings: low-rank adapters on the attention query/value projections.
    # NOTE(review): only `target_modules` get adapters; embedding rows of any
    # special tokens added to the tokenizer are not trained by this setup —
    # confirm that this is intended.
    lora_options = config.get('lora', {})
    lora_config = LoraConfig(
        r=lora_options.get('r', 4),
        lora_alpha=lora_options.get('lora_alpha', 32),
        target_modules=lora_options.get('target_modules', ["q_proj", "v_proj"]),
        lora_dropout=lora_options.get('lora_dropout', 0.1),
        task_type=TaskType.CAUSAL_LM  # autoregressive language model
    )

    lora_model = get_peft_model(model, lora_config)

    training_args = TrainingArguments(
        output_dir=config['output_dir'],
        overwrite_output_dir=True,
        num_train_epochs=config['num_train_epochs'],
        learning_rate=config['learning_rate'],
        per_device_train_batch_size=config['train_batch_size'],
        per_device_eval_batch_size=config['eval_batch_size'],
        warmup_ratio=config['warmup_ratio'],
        weight_decay=config['weight_decay'],
        lr_scheduler_type=config['lr_scheduler_type'],
        evaluation_strategy='steps',
        save_strategy='steps',
        fp16=True,
        logging_dir=config['logging_dir'],
        logging_strategy='steps',
        logging_steps=config.get('logging_steps', 100),
        dataloader_pin_memory=True,
        load_best_model_at_end=True,
        optim="paged_adamw_8bit",
        gradient_accumulation_steps=config.get('gradient_accumulation_steps', 4)
    )

    # Ask the W&B integration to upload checkpoints and to skip gradient
    # watching. Set before the run and the Trainer are created so both see it.
    os.environ["WANDB_LOG_MODEL"] = "true"
    os.environ["WANDB_WATCH"] = "false"

    wandb.init(
        project=config['wandb']['project'],
        name=config['wandb']['name'],
    )

    early_stopping_callback = EarlyStoppingCallback(
        early_stopping_patience=2,
        early_stopping_threshold=0.01
    )

    # DataCollatorForLanguageModeling(mlm=False) replaces `labels` with a copy
    # of `input_ids`, which would discard the summaries and train the model to
    # reproduce dialogues. This collator instead joins prompt and summary and
    # restricts the loss to the summary tokens.
    data_collator = _PromptSummaryCollator(
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
    )

    trainer = Trainer(
        model=lora_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        callbacks=[early_stopping_callback],
        data_collator=data_collator,
    )

    return trainer
# Run configuration shared by the loading, training and evaluation helpers.
# The next cell assigns `config` again; that later assignment is the one in
# effect when main() runs.
config = dict(
    model_name='beomi/OPEN-SOLAR-KO-10.7B',
    # Placeholder tags used in the dialogues; registered as special tokens.
    special_tokens=[
        '#Person3#', '#Person1#', '#CardNumber#', '#Person4#',
        '#SSN#', '#CarNumber#', '#Address#', '#DateOfBirth#',
        '#Person5#', '#PassportNumber#', '#PhoneNumber#', '#Person7#',
        '#Email#', '#Person6#', '#Person#', '#Person2#',
    ],
    max_input_length=512,
    max_target_length=100,
    data_files=dict(
        train=os.path.abspath('../data/train.csv'),
        validation=os.path.abspath('../data/dev.csv'),
    ),
    output_dir='./results',
    num_train_epochs=2,
    learning_rate=3e-5,
    train_batch_size=1,
    eval_batch_size=1,
    warmup_ratio=0.05,
    weight_decay=0.001,
    lr_scheduler_type='linear',
    logging_dir='./logs',
    # Token strings blanked out of decoded text before scoring.
    inference=dict(remove_tokens=['</s>', '<pad>']),
    wandb=dict(project="solar_api", name="4bits_QLoRA"),
)


In [5]:
# Run configuration shared by the loading, training and evaluation helpers.
config = {
    'model_name': 'beomi/OPEN-SOLAR-KO-10.7B',
    # Placeholder tags used in the dialogues; registered as special tokens.
    'special_tokens': [
        '#Person3#', '#Person1#', '#CardNumber#', '#Person4#', '#SSN#',
        '#CarNumber#', '#Address#', '#DateOfBirth#', '#Person5#', 
        '#PassportNumber#', '#PhoneNumber#', '#Person7#', '#Email#', 
        '#Person6#', '#Person#', '#Person2#'
    ],
    'max_input_length': 512,
    'max_target_length': 100,
    # Total token budget (prompt + generated summary) for generation, i.e.
    # max_input_length + max_target_length. This key was previously missing,
    # so any lookup of config['max_length'] raised KeyError.
    'max_length': 612,
    'data_files': {
        'train': os.path.abspath('../data/train.csv'),
        'validation': os.path.abspath('../data/dev.csv')
    },
    'output_dir': './results',
    'num_train_epochs': 2,
    'learning_rate': 3e-5,
    'train_batch_size': 1,
    'eval_batch_size': 1,
    'warmup_ratio': 0.05,
    'weight_decay': 0.001,
    'lr_scheduler_type': 'linear',
    'logging_dir': './logs',
    'inference': {
        # Token strings blanked out of decoded text before scoring.
        'remove_tokens': ['</s>', '<pad>']
    },
    "wandb": {
        "project": "solar_api",
        "name": "4bits_QLoRA"
    },
}


In [6]:
def main():
    """Fine-tune the model, then summarize the evaluation subset and report ROUGE.

    Reads the module-level ``config``. After training, each evaluation example
    is summarized one at a time: the un-padded dialogue tokens are the prompt,
    and only the newly generated tokens are decoded as the prediction.

    Returns:
        The averaged ROUGE scores (dict), or ``None`` when no non-empty
        prediction/reference pair was available for scoring.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('-'*10, f'device : {device}', '-'*10,)
    print(torch.__version__)

    model, tokenizer = load_tokenizer_and_model(config)
    print('-'*10,"tokenizer special tokens : ",tokenizer.special_tokens_map,'-'*10)
    train_dataset, eval_dataset = prepare_data(config, tokenizer)

    trainer = load_trainer(config, model, tokenizer, train_dataset, eval_dataset)
    trainer.train()

    # Evaluation: generate with the model the trainer ended up with, in eval
    # mode so dropout is disabled.
    eval_model = trainer.model
    eval_model.eval()

    # BOS/EOS/pad ids are dropped before decoding; the task's own special
    # tokens (e.g. #Person1#) have other ids and are kept.
    structural_ids = {
        token_id
        for token_id in (tokenizer.pad_token_id, tokenizer.eos_token_id, tokenizer.bos_token_id)
        if token_id is not None
    }

    def decode(token_ids):
        kept = [tid for tid in token_ids.tolist() if tid >= 0 and tid not in structural_ids]
        return tokenizer.decode(kept)

    all_preds = []
    all_labels = []

    for batch in tqdm(trainer.eval_dataset, desc="Evaluating"):
        prompt_ids = batch["input_ids"]
        prompt_mask = batch.get("attention_mask")
        if prompt_mask is not None:
            # Strip padding so a single un-padded prompt is fed to generate().
            prompt_ids = prompt_ids[prompt_mask.bool()]
        # generate() expects a (batch, sequence) tensor.
        prompt_ids = prompt_ids.unsqueeze(0).to(device)

        with torch.no_grad():
            outputs = eval_model.generate(
                input_ids=prompt_ids,
                attention_mask=torch.ones_like(prompt_ids),
                # Budget for new tokens only; `max_length` would also count the prompt.
                max_new_tokens=config['max_target_length'],
                pad_token_id=tokenizer.pad_token_id,
            )

        # The output starts with the prompt; keep only the continuation.
        generated_ids = outputs[0, prompt_ids.shape[1]:]

        # append, not extend: extend() would split each string into characters.
        all_preds.append(decode(generated_ids))
        all_labels.append(decode(batch["labels"]))

    # Remove the configured unwanted tokens so they do not distort the scores.
    remove_tokens = config['inference']['remove_tokens']

    def clean(sentence):
        for token in remove_tokens:
            sentence = sentence.replace(token, " ")
        return sentence.strip()

    replaced_predictions = [clean(sentence) for sentence in all_preds]
    replaced_labels = [clean(sentence) for sentence in all_labels]

    # Show up to three prediction/reference pairs.
    for prediction, reference in list(zip(replaced_predictions, replaced_labels))[:3]:
        print('-'*150)
        print(f"PRED: {prediction}")
        print(f"GOLD: {reference}")

    # Compute the evaluation metric. Rouge raises on empty text, so such
    # pairs are left out and reported instead.
    scorable = [
        (prediction, reference)
        for prediction, reference in zip(replaced_predictions, replaced_labels)
        if prediction and reference
    ]
    skipped = len(replaced_predictions) - len(scorable)
    if skipped:
        print(f"Skipped {skipped} pair(s) with an empty prediction or reference.")
    if not scorable:
        print("No non-empty prediction/reference pairs to score.")
        return None

    hypotheses, references = (list(column) for column in zip(*scorable))
    rouge = Rouge()
    scores = rouge.get_scores(hypotheses, references, avg=True)
    print(scores)
    return scores


In [None]:
# Entry point: run the full train-and-evaluate pipeline defined in main()
# when this code is executed as the top-level program.
if __name__ == "__main__":
    main()