In [None]:
from rouge_score import rouge_scorer, scoring
import datasets

from peft import LoraConfig, get_peft_model, PeftModel, TaskType
import pandas as pd
import os
import re
import json
import yaml
from glob import glob
from tqdm import tqdm
from pprint import pprint
import torch
import pytorch_lightning as pl
from rouge import Rouge 
import numpy as np

from torch.utils.data import Dataset , DataLoader
from transformers import AutoTokenizer, BartForConditionalGeneration, BartConfig, AutoConfig, AutoModelForCausalLM, GPT2LMHeadModel
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import Trainer, TrainingArguments
from transformers import EarlyStoppingCallback
from transformers import DataCollatorForLanguageModeling
import wandb 

In [None]:
# The configuration cell below reads special-token strings from the tokenizer, so load it first.
tokenizer = AutoTokenizer.from_pretrained("cateto/korean-gpt-neox-125M")

In [None]:
# Experiment configuration: paths, tokenizer limits, Trainer hyper-parameters,
# wandb settings and inference options. It is serialized to YAML further down.
config_data = {
    "general": {
        "data_path": "../data/", # directory holding the CSV splits; adjust to your environment
        "model_name": 'cateto/korean-gpt-neox-125M', # name of the pretrained model to load
        "output_dir": "/data/ephemeral/home/checkpoint/decoder_only_1/" # where checkpoints / final outputs are written
    },
    "tokenizer": {
        "encoder_max_len": 512,
        "decoder_max_len": 100,
        "bos_token": f"{tokenizer.bos_token}",
        "eos_token": f"{tokenizer.eos_token}",
        # Registered as special tokens so these markers are never split during tokenization
        "special_tokens": ['#Person1#', '#Person2#', '#Person3#', '#Person4#', '#Person5#', '#Person6#', '#Person7#','#PhoneNumber#', '#Address#', '#PassportNumber#', '#CarNumber#', '#SSN#', '#CardNumber#', '#Email#', '#DateOfBirth#']
    },
    "training": {
        "overwrite_output_dir": True,
        "num_train_epochs": 20,
        "learning_rate": 1e-5,
        "per_device_train_batch_size": 8,
        "per_device_eval_batch_size": 1,
        "warmup_ratio": 0.1,
        "weight_decay": 0.01,
        "lr_scheduler_type": 'cosine',
        "optim": 'adamw_torch',
        "gradient_accumulation_steps": 1,
        "evaluation_strategy": 'epoch',
        "save_strategy": 'epoch',
        "save_total_limit": 3,
        "fp16": True,
        "load_best_model_at_end": True,
        "seed": 42,
        "logging_dir": "./logs",
        "logging_strategy": "epoch",
        # "predict_with_generate": True,
        "generation_max_length": 100,
        "do_train": True,
        "do_eval": True,
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.001,
        "report_to": "wandb" # (optional) only relevant when logging to wandb
    },
    # (optional) placeholders; overridden after the config is reloaded
    "wandb": {
        "entity": "wandb_repo",
        "project": "project_name",
        "name": "run_name"
    },
    "inference": {
        "ckt_path": "model ckt path", # placeholder: path of the trained checkpoint to load for inference
        "result_path": "./prediction/",
        "no_repeat_ngram_size": 2,
        "early_stopping": True,
        "generate_max_length": 100,
        "num_beams": 2,
        "batch_size" : 32,
        # Tokens stripped from generated text before scoring. Unset tokens are skipped:
        # formatting a missing token (None) would otherwise add the literal string
        # "None" to this list and delete that word from real predictions.
        "remove_tokens": ['<usr>'] + [
            str(token)
            for token in (tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token)
            if token is not None
        ]
    }
}

In [None]:
# Persist the model / training configuration to a YAML file.
config_path = "./config2_decoder_only1.yaml"
# allow_unicode=True writes non-ASCII characters verbatim, so the file encoding is
# pinned to UTF-8 instead of depending on the platform default.
with open(config_path, "w", encoding="utf-8") as file:
    yaml.dump(config_data, file, allow_unicode=True)

In [None]:
# Reload the configuration from YAML; the rest of the notebook works on `loaded_config`.
config_path = "./config2_decoder_only1.yaml"

# YAML files are UTF-8 by convention; decode explicitly rather than with the locale default.
with open(config_path, "r", encoding="utf-8") as file:
    loaded_config = yaml.safe_load(file)

pprint(loaded_config)

In [None]:
# (Optional) replace the placeholder wandb settings with the real entity, run name and project.
loaded_config['wandb'].update(
    entity="jungminseo",
    name="Decoder_only_v1_model_cateto/korean-gpt-neox-125M",
    project="NLP_Dialogue_Summarization",
)

In [None]:
# Display the current wandb settings as a quick sanity check.
loaded_config['wandb']

In [None]:
# Preprocessing helper: loads the CSV splits as DataFrames and builds the
# decoder-only (GPT-style) model inputs and labels.
class Preprocess:
    """Turn dialogue/summary rows into token-id sequences for a causal language model.

    Training layout:  ``<pad> .. d_1 .. d_n <bos> s_1 .. s_m <eos> <pad> ..``
    Inference layout: ``<pad> .. d_1 .. d_n <bos>``
    """

    def __init__(self,
            bos_token: str,
            eos_token: str,
        ) -> None:
        # Kept for callers; make_input reads the ids from the tokenizers it is given.
        self.bos_token = bos_token
        self.eos_token = eos_token

    @staticmethod
    def make_set_as_df(file_path, is_train = True):
        """Read ``file_path`` as CSV and keep only the columns the experiment uses.

        Args:
            file_path: path of the CSV file.
            is_train: when True keep ``fname``, ``dialogue`` and ``summary``;
                otherwise keep only ``fname`` and ``dialogue``.

        Returns:
            A DataFrame restricted to the selected columns.
        """
        df = pd.read_csv(file_path)
        wanted = ['fname', 'dialogue', 'summary'] if is_train else ['fname', 'dialogue']
        return df[wanted]

    @staticmethod
    def _encode_dialogues(dialogue_text, doc_tokenizer, doc_max_length):
        """Tokenize each dialogue to ``doc_max_length - 1`` ids and append the BOS id that marks where the summary starts."""
        return [
            doc_tokenizer(text, padding = 'max_length', truncation=True, max_length=doc_max_length-1, add_special_tokens=True)['input_ids']
            + [doc_tokenizer.bos_token_id]
            for text in dialogue_text.values
        ]

    def make_input(self, dataset, doc_tokenizer, sum_tokenizer, doc_max_length, sum_max_len, is_test = False):
        """Build ``input_ids`` and ``labels`` for a decoder-only model.

        Args:
            dataset: DataFrame with a ``dialogue`` column, plus ``summary`` unless ``is_test``.
            doc_tokenizer: tokenizer used for dialogues (the caller is expected to
                configure it for left padding so the dialogue ends right before BOS).
            sum_tokenizer: tokenizer used for summaries.
            doc_max_length: length of the dialogue part, including the trailing BOS.
            sum_max_len: length of the summary part, including EOS and padding.
            is_test: build inference inputs (dialogue + BOS only) when True.

        Returns:
            Dict with ``input_ids`` and ``labels`` as lists of id lists. Label
            positions that must not contribute to the loss hold -100.
        """
        dialogue = self._encode_dialogues(dataset['dialogue'], doc_tokenizer, doc_max_length)

        if is_test:
            # Inference data has no 'summary' column (see make_set_as_df), so it is not
            # read here; generation starts from the BOS appended to each dialogue.
            # <pad> <pad> d_1 d_2 d_3 ... d_n <bos>
            labels = [[-100] * sum_max_len for _ in dialogue]

            out = {"input_ids": dialogue, "labels": labels}
            print("inference을 위한 데이터에서 tokenizing 된 input 형태")
            print(dialogue[-1])
            print(doc_tokenizer.convert_ids_to_tokens(dialogue[-1]))
            return out

        eos_id = sum_tokenizer.eos_token_id
        pad_id = sum_tokenizer.pad_token_id

        summary_inputs = []
        summary_labels = []
        for text in dataset['summary'].values:
            # Truncate to sum_max_len - 1 and append EOS afterwards, so a long summary
            # can never push the end-of-sequence marker out of the window.
            body = sum_tokenizer(text, truncation=True, max_length=sum_max_len - 1, add_special_tokens=True)['input_ids']
            target = list(body) + [eos_id]
            pad_count = sum_max_len - len(target)
            summary_inputs.append(target + [pad_id] * pad_count)
            # Mask padding by position rather than by comparing against the pad id;
            # EOS then stays a training target even when pad and EOS share an id.
            summary_labels.append(target + [-100] * pad_count)

        # Dialogue and summary are concatenated into a single training sequence:
        # <pad> <pad> d_1 d_2 d_3 ... d_n <bos> s_1 s_2 ... s_m <eos> <pad> ...
        tokenized_sentences = [doc_ids + sum_ids for doc_ids, sum_ids in zip(dialogue, summary_inputs)]
        # The dialogue part (including BOS) is excluded from the loss.
        labels = [[-100] * len(doc_ids) + sum_ids for doc_ids, sum_ids in zip(dialogue, summary_labels)]
        out = {"input_ids": tokenized_sentences, "labels": labels}

        print("학습을 위한 데이터에서 tokenizing 된 input 형태")
        print(tokenized_sentences[-1])
        print(doc_tokenizer.convert_ids_to_tokens(tokenized_sentences[-1]))
        print("학습을 위한 데이터에서 label의 형태")
        print(labels[-1])

        return out


In [None]:
# Dataset used for training.
class DatasetForTrain(Dataset):
    """Serve pre-tokenized ``input_ids`` / ``labels`` lists as tensors for training."""

    def __init__(self, dialogue, tokenizer):
        self.dataset = dialogue
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataset["input_ids"])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of ``input_ids``, ``labels`` and ``attention_mask``."""
        token_ids = torch.LongTensor(self.dataset["input_ids"][idx])
        target_ids = torch.LongTensor(self.dataset["labels"][idx])
        # Positions holding the tokenizer's pad id are masked out of attention.
        not_padding = token_ids.ne(self.tokenizer.pad_token_id)
        return {
            "input_ids": token_ids,
            "labels": target_ids,
            "attention_mask": not_padding,
        }

# Dataset used for validation.
class DatasetForVal(Dataset):
    """Serve pre-tokenized ``input_ids`` / ``labels`` lists as tensors for validation."""

    def __init__(self, dialogue, tokenizer):
        self.tokenizer = tokenizer
        self.dataset = dialogue

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of ``input_ids``, ``labels`` and ``attention_mask``."""
        example = {
            name: torch.LongTensor(self.dataset[name][idx])
            for name in ("input_ids", "labels")
        }
        # The mask is True wherever the input id differs from the tokenizer's pad id.
        example["attention_mask"] = example["input_ids"].ne(self.tokenizer.pad_token_id)
        return example

    def __len__(self):
        rows = self.dataset["input_ids"]
        return len(rows)

# Dataset used for inference on the test split.
class DatasetForInference(Dataset):
    """Pair each row of the tokenized inputs with its sample ID."""

    def __init__(self, encoder_input, test_id, len):
        self.encoder_input = encoder_input
        self.test_id = test_id
        self.len = len  # dataset size supplied by the caller

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        """Return detached copies of row ``idx`` of every input tensor, plus its ``ID``."""
        item = {}
        for key, val in self.encoder_input.items():
            item[key] = val[idx].clone().detach()
        item['ID'] = self.test_id[idx]
        return item


In [None]:
# Build the fully tokenized train / validation datasets that are fed to the model.
def prepare_train_dataset(config, preprocessor, data_path, doc_tokenizer, sum_tokenizer, doc_max_len, sum_max_len):
    """Load ``train.csv`` and ``dev_50.csv`` from ``data_path`` and tokenize them.

    Args:
        config: accepted for interface compatibility; not read by this function.
        preprocessor: object providing ``make_set_as_df`` and ``make_input``.
        data_path: directory containing the two CSV files.
        doc_tokenizer: tokenizer for dialogues; also handed to the dataset wrappers.
        sum_tokenizer: tokenizer for summaries.
        doc_max_len: maximum dialogue length passed to ``make_input``.
        sum_max_len: maximum summary length passed to ``make_input``.

    Returns:
        ``(train_dataset, val_dataset)`` as ``DatasetForTrain`` / ``DatasetForVal``.
    """
    wide_rule = '-'*150

    # One DataFrame per split.
    train_data = preprocessor.make_set_as_df(os.path.join(data_path,'train.csv'))
    val_data = preprocessor.make_set_as_df(os.path.join(data_path,'dev_50.csv'))

    # Print the first example of each split as a sanity check.
    print(wide_rule)
    print(f'train_data:\n {train_data["dialogue"][0]}')
    print(f'train_label:\n {train_data["summary"][0]}')

    print(wide_rule)
    print(f'val_data:\n {val_data["dialogue"][0]}')
    print(f'val_label:\n {val_data["summary"][0]}')

    # Tokenize both splits with identical settings.
    tokenize_args = (doc_tokenizer, sum_tokenizer, doc_max_len, sum_max_len)
    tokenized_train = preprocessor.make_input(train_data, *tokenize_args)
    tokenized_val = preprocessor.make_input(val_data, *tokenize_args)
    print('-'*10, 'Load data complete', '-'*10,)

    # Wrap the token-id lists in torch Datasets.
    dataset_pair = (
        DatasetForTrain(tokenized_train, doc_tokenizer),
        DatasetForVal(tokenized_val, doc_tokenizer),
    )

    print('-'*10, 'Make dataset complete', '-'*10,)
    return dataset_pair

In [None]:
def compute_metrics(config, tokenizer, pred):
    """Compute ROUGE-1/2/L F1 (in percent) for the summary part of each sequence.

    Args:
        config: configuration dict; ``config['tokenizer']['encoder_max_len']`` is the
            length of the dialogue prefix (including the trailing BOS).
        tokenizer: tokenizer providing ``batch_decode`` and ``pad_token_id``.
        pred: evaluation output with ``predictions`` (logits, or a tuple whose first
            element is the logits) and ``label_ids`` (-100 on ignored positions).

    Returns:
        Dict with keys ``Rouge-1``, ``Rouge-2`` and ``Rouge-L``.
    """
    # The `rouge` package splits on whitespace, which works for Korean text. The
    # `rouge_score` backend behind datasets.load_metric("rouge") drops every
    # non-ASCII character, and load_metric itself is gone from recent `datasets`.
    from rouge import Rouge

    labels = pred.label_ids

    # pred.predictions may or may not be a tuple.
    logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    preds = logits.argmax(-1)

    dialogue_max_len = config['tokenizer']['encoder_max_len']
    # A causal LM's output at position i is its guess for token i + 1, so the
    # predictions are shifted one step left before being compared with the labels.
    start = max(dialogue_max_len - 1, 0)
    pred_ids = preds[:, :-1][:, start:]
    gold_ids = labels[:, 1:][:, start:]

    # Ignored label positions (padding after EOS) are blanked on both sides so that
    # whatever the model emits there is not decoded into the prediction text.
    ignored = gold_ids == -100
    gold_ids = np.where(ignored, tokenizer.pad_token_id, gold_ids)
    pred_ids = np.where(ignored, tokenizer.pad_token_id, pred_ids)

    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(gold_ids, skip_special_tokens=True)

    # Show up to three samples; small evaluation sets must not raise IndexError.
    for sample_pred, sample_gold in list(zip(decoded_preds, decoded_labels))[:3]:
        print('-'*150)
        print(f"PRED: {sample_pred}")
        print(f"GOLD: {sample_gold}")

    # ROUGE is scored per sample so one degenerate prediction cannot abort evaluation.
    scorer = Rouge()
    totals = {'rouge-1': 0.0, 'rouge-2': 0.0, 'rouge-l': 0.0}
    for hypothesis, reference in zip(decoded_preds, decoded_labels):
        if not hypothesis.strip() or not reference.strip():
            continue  # an empty side scores 0
        try:
            sample_scores = scorer.get_scores(hypothesis, reference)[0]
        except ValueError:
            # Raised by `rouge` when no sentence is left to score (e.g. only dots).
            continue
        for name in totals:
            totals[name] += sample_scores[name]['f']

    sample_count = max(len(decoded_preds), 1)
    return {
        'Rouge-1': round(totals['rouge-1'] / sample_count * 100, 4),
        'Rouge-2': round(totals['rouge-2'] / sample_count * 100, 4),
        'Rouge-L': round(totals['rouge-l'] / sample_count * 100, 4)
    }


In [None]:
# Define the Trainer and its arguments for training.
def load_trainer_for_train(config,generate_model,tokenizer, train_inputs_dataset,val_inputs_dataset):
    """Create a ``Trainer`` configured from ``config``.

    Args:
        config: configuration dict with ``general``, ``training`` and ``wandb`` sections.
        generate_model: the causal language model to fine-tune.
        tokenizer: tokenizer passed to the Trainer and to ``compute_metrics``.
        train_inputs_dataset: training dataset whose items already carry
            ``input_ids``, ``labels`` and ``attention_mask`` tensors.
        val_inputs_dataset: evaluation dataset with the same item layout.

    Returns:
        The configured ``Trainer`` (training has not been started).
    """
    # Function-scope import keeps this fix self-contained.
    from transformers import default_data_collator

    # The dataset items already hold fixed-length tensors, including labels with the
    # dialogue masked to -100. DataCollatorForLanguageModeling(mlm=False) would
    # replace those labels with a copy of input_ids, so the loss would also be taken
    # over the dialogue. default_data_collator only stacks the tensors it receives.
    data_collator = default_data_collator

    print('-'*10, 'Make training arguments', '-'*10,)
    # set training args
    training_args = TrainingArguments(
                output_dir=config['general']['output_dir'], # model output directory
                overwrite_output_dir=config['training']['overwrite_output_dir'],
                num_train_epochs=config['training']['num_train_epochs'],  # total number of training epochs
                learning_rate=config['training']['learning_rate'], # learning_rate
                per_device_train_batch_size=config['training']['per_device_train_batch_size'], # batch size per device during training
                per_device_eval_batch_size=config['training']['per_device_eval_batch_size'],# batch size for evaluation
                warmup_ratio=config['training']['warmup_ratio'],  # fraction of training steps used for learning-rate warmup
                weight_decay=config['training']['weight_decay'],  # strength of weight decay
                lr_scheduler_type=config['training']['lr_scheduler_type'],
                optim =config['training']['optim'],
                gradient_accumulation_steps=config['training']['gradient_accumulation_steps'],
                evaluation_strategy=config['training']['evaluation_strategy'], # evaluation strategy to adopt during training
                save_strategy =config['training']['save_strategy'],
                save_total_limit=config['training']['save_total_limit'], # number of total save model.
                fp16=config['training']['fp16'],
                load_best_model_at_end=config['training']['load_best_model_at_end'], # reload the best checkpoint when training ends
                seed=config['training']['seed'],
                logging_dir=config['training']['logging_dir'], # directory for storing logs
                logging_strategy=config['training']['logging_strategy'],
                # predict_with_generate=config['training']['predict_with_generate'], #To use BLEU or ROUGE score
                # generation_max_length=config['training']['generation_max_length'],
                do_train=config['training']['do_train'],
                do_eval=config['training']['do_eval'],
                report_to=config['training']['report_to'] # (optional) wandb
            )

    # (Optional) start the wandb run.
    wandb.init(
        entity=config['wandb']['entity'],
        project=config['wandb']['project'],
        name=config['wandb']['name'],
    )

    # (Optional) environment variables asking wandb to store model checkpoints
    # and to skip gradient/parameter watching.
    os.environ["WANDB_LOG_MODEL"]="true"
    os.environ["WANDB_WATCH"]="false"

    # Early stopping.
    MyCallback = EarlyStoppingCallback(
        early_stopping_patience=config['training']['early_stopping_patience'],
        early_stopping_threshold=config['training']['early_stopping_threshold']
    )
    print('-'*10, 'Make training arguments complete', '-'*10,)
    print('-'*10, 'Make trainer', '-'*10,)

    # Build the Trainer.
    trainer = Trainer(
        model=generate_model, # model to be trained
        args=training_args,
        train_dataset=train_inputs_dataset,
        eval_dataset=val_inputs_dataset,
        compute_metrics = lambda pred: compute_metrics(config, tokenizer,pred),
        tokenizer = tokenizer,
        data_collator=data_collator,
        callbacks = [MyCallback]
    )
    print('-'*10, 'Make trainer complete', '-'*10,)

    return trainer

In [None]:
# Disabled alternative of load_tokenizer_and_model_for_train that adds quantization and
# LoRA. It is kept as a bare string literal, so none of it executes.
# NOTE(review): the snippet is labelled QLoRA but configures 8-bit loading
# (load_in_8bit=True) — confirm the intended quantization before enabling it.
"""
QLoRA 적용시


from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

def load_tokenizer_and_model_for_train(config, device):
    print('-'*10, 'Load tokenizer & model', '-'*10,)
    print('-'*10, f'Model Name : {config["general"]["model_name"]}', '-'*10,)
    model_name = config['general']['model_name']

    # model의 hyperparameter를 setting
    model_config = AutoConfig.from_pretrained(model_name)

    # Tokenizer를 불러옵니다.
    doc_tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
    sum_tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="right")

    # 양자화 설정 (8비트)
    quantization_config = BitsAndBytesConfig(load_in_8bit=True)

    # 사전 학습된 모델을 양자화 설정과 함께 불러옵니다.
    generate_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=model_config,
        quantization_config=quantization_config  # 양자화 설정 적용
    )

    # Special tokens를 추가
    special_tokens_dict = {'additional_special_tokens': config['tokenizer']['special_tokens']}
    doc_tokenizer.add_special_tokens(special_tokens_dict)
    sum_tokenizer.add_special_tokens(special_tokens_dict)

    # 사전에 special token을 추가했으므로 재구성 해줍니다.
    generate_model.resize_token_embeddings(len(doc_tokenizer))

    # 모델의 파라미터를 Freeze합니다.
    for param in generate_model.parameters():
        param.requires_grad = False

    # LoRA 설정
    lora_config = LoraConfig(
        r=8,  # rank
        lora_alpha=16,
        lora_dropout=0.05,
        inference_mode=False,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # LoRA를 모델에 적용합니다.
    generate_model = get_peft_model(generate_model, lora_config)

    # 모델을 디바이스에 할당합니다.
    generate_model.to(device)

    print(generate_model.config)
    print('-'*10, 'Load tokenizer & model complete', '-'*10,)

    return generate_model, doc_tokenizer, sum_tokenizer

"""

In [None]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

def load_tokenizer_and_model_for_train(config, device):
    """Load the causal LM and two tokenizers (left- and right-padding) for training.

    Args:
        config: configuration dict; reads ``general.model_name`` and
            ``tokenizer.special_tokens``.
        device: device the model is moved to.

    Returns:
        ``(generate_model, doc_tokenizer, sum_tokenizer)``. ``doc_tokenizer`` pads on
        the left and ``sum_tokenizer`` on the right.
    """
    model_name = config['general']['model_name']
    print('-'*10, 'Load tokenizer & model', '-'*10,)
    print('-'*10, f'Model Name : {config["general"]["model_name"]}', '-'*10,)

    # Model hyper-parameters as published with the checkpoint.
    model_config = AutoConfig.from_pretrained(model_name)

    # Two tokenizers with the same vocabulary that differ only in padding side.
    doc_tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
    sum_tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="right")

    extra_tokens = {'additional_special_tokens': config['tokenizer']['special_tokens']}
    for tok in (doc_tokenizer, sum_tokenizer):
        # Register the dataset's placeholder markers as special tokens.
        tok.add_special_tokens(extra_tokens)
        # Add a padding token when the tokenizer does not define one.
        if tok.pad_token is None:
            tok.add_special_tokens({'pad_token': '[PAD]'})

    generate_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=model_config
    )
    # Tokens were added above, so the embedding matrix is resized to the new vocabulary size.
    generate_model.resize_token_embeddings(len(doc_tokenizer))

    generate_model.to(device)

    print(generate_model.config)
    print('-'*10, 'Load tokenizer & model complete', '-'*10,)

    return generate_model, doc_tokenizer, sum_tokenizer


In [None]:
def main(config):
    """Run the full training pipeline: load model and data, build the Trainer, train.

    Args:
        config: configuration dict with ``general``, ``tokenizer``, ``training`` and
            ``wandb`` sections.
    """
    # Select the device.
    device = torch.device('cuda:0' if torch.cuda.is_available()  else 'cpu')
    print('-'*10, f'device : {device}', '-'*10,)
    print(torch.__version__)

    # Load the model and the tokenizers.
    generate_model , doc_tokenizer, sum_tokenizer = load_tokenizer_and_model_for_train(config,device)
    # Report the tokenizer that is actually used for training (it carries the added
    # special tokens) instead of relying on the notebook-level `tokenizer` global.
    print('-'*10,"tokenizer special tokens : ",doc_tokenizer.special_tokens_map,'-'*10)

    # Build the training and validation datasets.
    preprocessor = Preprocess(config['tokenizer']['bos_token'], config['tokenizer']['eos_token']) # decoder_start_token: str, eos_token: str
    data_path = config['general']['data_path']
    train_inputs_dataset, val_inputs_dataset = prepare_train_dataset(config,preprocessor, data_path, doc_tokenizer, sum_tokenizer, config['tokenizer']['encoder_max_len'], config['tokenizer']['decoder_max_len'])

    try:
        # Build the Trainer (this also starts the wandb run) and train.
        trainer = load_trainer_for_train(config, generate_model, doc_tokenizer, train_inputs_dataset,val_inputs_dataset)
        trainer.train()
    finally:
        # (Optional) close the wandb run even when training fails or is interrupted.
        wandb.finish()

In [None]:
# Entry point: run the training pipeline with the configuration reloaded from YAML.
if __name__ == "__main__":
    main(loaded_config)