In [1]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import datasets
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoModel, AutoTokenizer
import pandas as pd

import os
import sys
import json
from tqdm import tqdm
from datetime import datetime
import argparse
from omegaconf import OmegaConf

  from .autonotebook import tqdm as notebook_tqdm
2024-11-14 09:30:08.208001: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-14 09:30:08.233899: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def _load_sentence_pairs(json_path):
    '''
    Read one corpus JSON file and pull out its sentence pairs.

    The file is expected to hold a top-level ``'data'`` list whose items each
    carry ``item['annotation']['err_sentence']`` and
    ``item['annotation']['cor_sentence']``.

    Args:
        json_path (str): Path of the JSON file to read.
    Returns:
        dict: ``{'err_sentence': [...], 'cor_sentence': [...]}`` with one entry
            per item of ``'data'``, in file order.
    '''
    # Decode explicitly as UTF-8 instead of relying on the locale default:
    # JSON interchange files are UTF-8, and a non-UTF-8 locale would otherwise
    # corrupt or reject non-ASCII sentences.
    with open(json_path, 'r', encoding='utf-8') as f:
        records = json.load(f)['data']
    return {
        'err_sentence': [record['annotation']['err_sentence'] for record in records],
        'cor_sentence': [record['annotation']['cor_sentence'] for record in records],
    }


def make_dataset(train_data_path, validation_data_path):
    '''
    Build a DatasetDict with 'train' and 'validation' splits from two JSON files.

    Each split exposes two columns, ``err_sentence`` and ``cor_sentence``.
    The number of loaded examples per split is printed as a side effect.

    Args:
        train_data_path (str): Path of the training JSON file.
        validation_data_path (str): Path of the validation JSON file.
    Returns:
        datasets.DatasetDict: Mapping of split name to ``datasets.Dataset``.
    '''
    split_paths = {
        'train': train_data_path,
        'validation': validation_data_path,
    }

    dataset_dict = {}
    for split_name, json_path in split_paths.items():
        columns = _load_sentence_pairs(json_path)
        print(f'{split_name} data :', len(columns['err_sentence']))
        dataset_dict[split_name] = datasets.Dataset.from_dict(columns, split=split_name)
    return datasets.DatasetDict(dataset_dict)

In [3]:
# Build the train/validation DatasetDict once at notebook level for inspection.
data = make_dataset(
    '/home/yjtech2/Desktop/yurim/LLM/DATA/train/맞춤법오류_자유게시판.json',
    '/home/yjtech2/Desktop/yurim/LLM/DATA/validation/맞춤법오류_자유게시판.json',
)

# Bare trailing expression so the notebook renders the DatasetDict summary.
data

train data : 14400
validation data : 1800


DatasetDict({
    train: Dataset({
        features: ['err_sentence', 'cor_sentence'],
        num_rows: 14400
    })
    validation: Dataset({
        features: ['err_sentence', 'cor_sentence'],
        num_rows: 1800
    })
})

In [4]:
def preprocess_function(df, tokenizer, src_col, tgt_col, max_length):
    '''
    Tokenize source and target text and attach the target ids as labels.

    Args:
        df (Mapping): Object indexable by column name. When used through
            ``Dataset.map(..., batched=True)`` this is a batch, i.e. each
            column holds a list of strings.
        tokenizer (AutoTokenizer): Model tokenizer; must accept the
            ``text_target`` keyword.
        src_col (str): Source column name.
        tgt_col (str): Target column name.
        max_length (int): Maximum token length; longer sequences are truncated.
    Returns:
        model_inputs: The tokenizer output for the source text, with an
            additional ``"labels"`` entry holding the ``input_ids`` of the
            tokenized target text.
    '''
    inputs = df[src_col]
    targets = df[tgt_col]

    model_inputs = tokenizer(
        inputs,
        max_length=max_length,
        truncation=True,
    )

    # `text_target=` is the supported way to tokenize labels; it replaces the
    # deprecated `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=max_length,
        truncation=True,
    )

    model_inputs["labels"] = labels['input_ids']
    return model_inputs

In [5]:
def _log_stage(message):
    '''Print ``message`` as a stage banner stamped with the current time.'''
    # The timestamp is taken at call time so every banner shows when that
    # stage actually started or finished.
    print(f'[{datetime.now()}] ====== {message} ======')


def train(config):
    '''
    Fine-tune a seq2seq model with ``Seq2SeqTrainer``.

    Steps: load the pretrained model and tokenizer, build the train/validation
    dataset, tokenize it with ``preprocess_function``, then run
    ``trainer.train()``. Progress banners are printed for each stage.

    Args:
        config: Attribute-style config (loaded from config.yaml). Reads
            ``pretrained_model_name``, ``src_col``, ``tgt_col``,
            ``max_length`` and every field forwarded to
            ``Seq2SeqTrainingArguments`` below.
    Returns:
        None
    '''
    # Load model and tokenizer
    _log_stage('Model Load Start')
    model = AutoModelForSeq2SeqLM.from_pretrained(config.pretrained_model_name)
    tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model_name)
    _log_stage('Model Load Finished')

    # Load data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    # load data and set data
    # NOTE(review): the data paths are hard-coded here rather than read from
    # `config` — consider moving them into the YAML file.
    _log_stage('Data Load Start')
    dataset = make_dataset(
        '/home/yjtech2/Desktop/yurim/LLM/DATA/train/맞춤법오류_자유게시판.json',
        '/home/yjtech2/Desktop/yurim/LLM/DATA/validation/맞춤법오류_자유게시판.json',
    )
    _log_stage('Data Load Finished')

    # data preprocessing
    _log_stage('Data Preprocessing Start')
    dataset_tokenized = dataset.map(
        lambda batch: preprocess_function(
            batch, tokenizer, config.src_col, config.tgt_col, config.max_length
        ),
        batched=True,
        batch_size=config.per_device_train_batch_size,
    )
    _log_stage('Data Preprocessing Finished')

    # set training arguments
    training_args = Seq2SeqTrainingArguments(
        output_dir=config.output_dir,
        learning_rate=config.learning_rate,
        per_device_train_batch_size=config.per_device_train_batch_size,
        per_device_eval_batch_size=config.per_device_eval_batch_size,
        num_train_epochs=config.num_train_epochs,
        fp16=config.fp16,
        weight_decay=config.weight_decay,
        do_eval=config.do_eval,
        evaluation_strategy=config.evaluation_strategy,
        warmup_ratio=config.warmup_ratio,
        log_level=config.log_level,
        logging_dir=config.logging_dir,
        logging_strategy=config.logging_strategy,
        logging_steps=config.logging_steps,
        eval_steps=config.eval_steps,
        save_strategy=config.save_strategy,
        save_steps=config.save_steps,
        save_total_limit=config.save_total_limit,
        load_best_model_at_end=config.load_best_model_at_end,
        metric_for_best_model=config.metric_for_best_model,
        greater_is_better=config.greater_is_better,
        dataloader_num_workers=config.dataloader_num_workers,
        group_by_length=config.group_by_length,
        report_to=config.report_to,
        ddp_find_unused_parameters=config.ddp_find_unused_parameters,
    )

    # set trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset_tokenized['train'],
        eval_dataset=dataset_tokenized['validation'],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # start train
    trainer.train()

In [6]:
if __name__ == '__main__':
    # Entry point: load the YAML config, prepare the environment, run train().

    # load config file
    config_file = "/home/yjtech2/Desktop/yurim/LLM/Pre_processing/remove_terminology/config/base-config.yaml"
    config = OmegaConf.load(config_file)

    # make save path: create the directory that training output is written to
    # (previously this created './', which always exists).
    os.makedirs(str(config.output_dir), exist_ok=True)

    # set device
    # os.environ only accepts str values; an unquoted YAML value such as -1
    # is loaded as an int, so cast explicitly.
    os.environ["CUDA_VISIBLE_DEVICES"] = str(config.CUDA_VISIBLE_DEVICES)
    os.environ['TOKENIZERS_PARALLELISM'] = 'true'

    _now_time = str(datetime.now())
    print(f'[{_now_time}] ========== Train Start ==========')

    # call main method
    print(f'DEVICE : {config.CUDA_VISIBLE_DEVICES}')
    print(f'MODEL NAME : {config.pretrained_model_name}')
    # NOTE(review): no paths are listed under this header because the data
    # paths are currently hard-coded inside train() rather than in the config.
    print('TRAIN FILE PATH :')
    print(f'SAVE PATH : {config.output_dir}')
    train(config)

    _now_time = str(datetime.now())
    print(f'[{_now_time}] ========== Train Finished ==========')

DEVICE : -1
MODEL NAME : gogamza/kobart-base-v2
TRAIN FILE PATH :
SAVE PATH : ./result/corraction


You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.
You passed along `num_labels=3` with an incompatible id to label map: {'0': 'NEGATIVE', '1': 'POSITIVE'}. The number of labels wil be overwritten to 2.


train data : 14400
validation data : 1800


Map: 100%|██████████| 14400/14400 [00:00<00:00, 39854.47 examples/s]
Map: 100%|██████████| 1800/1800 [00:00<00:00, 37803.93 examples/s]
The following columns in the training set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids, err_sentence, cor_sentence. If token_type_ids, err_sentence, cor_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 14400
  Num Epochs = 30
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 6750
  Number of trainable parameters = 123859968




You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
200,1.5644,0.567303
400,0.4278,0.436476
600,0.2345,0.425695
800,0.136,0.450843
1000,0.0888,0.464369
1200,0.0615,0.460905
1400,0.0456,0.475703
1600,0.0374,0.49377
1800,0.0314,0.489376
2000,0.0215,0.5015


The following columns in the evaluation set don't have a corresponding argument in `BartForConditionalGeneration.forward` and have been ignored: token_type_ids, err_sentence, cor_sentence. If token_type_ids, err_sentence, cor_sentence are not expected by `BartForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1800
  Batch size = 64
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than 

