## Pre-setting

In [1]:
import torch
import gc

from transformers import TrainingArguments, Trainer, EarlyStoppingCallback \
    , AutoModelForSequenceClassification, AutoTokenizer

from etc_plugin import delete_folder, check_exists_cuda, load_jsonl
from model_plugin import set_seed, extract_annotation_keys, define_datasets \
    , reshape_to_1d, compute_metrics

In [2]:
# Hugging Face checkpoint id used for both the tokenizer and the classifiers.
MODEL_NAME = "klue/roberta-base"

# Cumulative split fractions applied to the loaded records by position:
#   [0, TR_VL_SPLIT)            -> training slice
#   [TR_VL_SPLIT, VL_TS_SPLIT)  -> validation slice
# Records past VL_TS_SPLIT are not consumed in this section
# (presumably the held-out test split -- TODO confirm).
TR_VL_SPLIT = 0.6
VL_TS_SPLIT = 0.8

## Model

In [10]:
def aspect_trainer(tokenizer, main_category, train_aspect_klue_sets, validation_aspect_klue_sets):
    """Fine-tune one binary classifier per training set and keep only the best weights.

    For every index ``i`` a fresh ``MODEL_NAME`` checkpoint with two labels is
    trained on ``train_aspect_klue_sets[i]`` and evaluated after each epoch on
    ``validation_aspect_klue_sets[i]``. Because ``load_best_model_at_end`` is
    enabled, the weights with the best ``f1`` are in the model when training
    finishes; they are written to ``./{main_category}/model_aspect_{i}_best``.
    The intermediate checkpoint folder ``./{main_category}/model_aspect_{i}``
    is removed right after that save.

    Args:
        tokenizer: Tokenizer handed to the ``Trainer``.
        main_category: Category name used as the output sub-directory.
        train_aspect_klue_sets: Indexable collection of training datasets
            (presumably one per aspect -- confirm against ``reshape_to_1d``).
        validation_aspect_klue_sets: Validation datasets, index-aligned with
            the training datasets.

    Raises:
        ValueError: If there are fewer validation sets than training sets.

    NOTE(review): unlike ``sentiment_trainer`` the model embeddings are not
    resized here, so this relies on the aspect inputs never containing token
    ids beyond the base vocabulary -- confirm.
    """
    length = len(train_aspect_klue_sets)

    # Fail before any (long) training run instead of hitting an IndexError
    # part-way through the loop.
    if len(validation_aspect_klue_sets) < length:
        raise ValueError(
            f"expected at least {length} validation sets, "
            f"got {len(validation_aspect_klue_sets)}"
        )

    for i in range(length):
        output_dir = f'./{main_category}/model_aspect_' + str(i)

        tmp_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

        training_ars = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=5,
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            save_total_limit=5,
            save_strategy="epoch",
            learning_rate=1e-5,
            weight_decay=0.01,
            evaluation_strategy='epoch',
            metric_for_best_model='f1',
            load_best_model_at_end=True,
        )

        trainer = Trainer(
            model=tmp_model,
            args=training_ars,
            train_dataset=train_aspect_klue_sets[i],
            eval_dataset=validation_aspect_klue_sets[i],
            tokenizer=tokenizer,
            compute_metrics=lambda x: compute_metrics(val='aspect', p=x),
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
        )

        print("=============")
        print(i+1, ") Aspect Training ...")
        trainer.train()
        tmp_model.save_pretrained(output_dir + "_best")

        # The per-epoch checkpoints are no longer needed once the best weights
        # are saved; removing them here avoids a separate post-loop cleanup
        # and never touches a non-existent "model_aspect_-1" folder.
        delete_folder(output_dir)

        # GPU clean: the Trainer also references the model (and optimizer
        # state), so it must be dropped too or nothing is actually released
        # before the next model is built.
        del trainer, tmp_model
        gc.collect()
        torch.cuda.empty_cache()

In [12]:
def sentiment_trainer(tokenizer, num_added_toks, main_category, train_sentiment_klue_sets, validation_sentiment_klue_sets):
    """Fine-tune a single three-label classifier and keep only the best weights.

    A fresh ``MODEL_NAME`` checkpoint with three labels is loaded, its token
    embeddings are resized to match ``tokenizer``, and it is trained with
    per-epoch evaluation and early stopping. Because ``load_best_model_at_end``
    is enabled, the weights with the best ``f1`` are in the model when training
    finishes; they are written to ``./{main_category}/model_sentiment_best``.
    The intermediate checkpoint folder ``./{main_category}/model_sentiment`` is
    removed afterwards.

    Args:
        tokenizer: Tokenizer (with any extra special tokens already added)
            handed to the ``Trainer``.
        num_added_toks: Number of tokens added to ``tokenizer``. Kept for
            backward compatibility; the embedding size is now taken from
            ``len(tokenizer)``, which already includes every added token.
        main_category: Category name used as the output sub-directory.
        train_sentiment_klue_sets: Training dataset.
        validation_sentiment_klue_sets: Validation dataset.
    """
    output_dir = f'./{main_category}/model_sentiment'

    tmp_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
    # len(tokenizer) is the full vocabulary size including added tokens, which
    # is what resize_token_embeddings expects; vocab_size + num_added_toks
    # under-counts if the tokenizer already carried added tokens.
    tmp_model.resize_token_embeddings(len(tokenizer))

    training_ars = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=5,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        save_total_limit=5,
        save_strategy="epoch",
        learning_rate=1e-5,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        metric_for_best_model='f1',
        load_best_model_at_end=True,
    )

    trainer = Trainer(
        model=tmp_model,
        args=training_ars,
        train_dataset=train_sentiment_klue_sets,
        eval_dataset=validation_sentiment_klue_sets,
        tokenizer=tokenizer,
        compute_metrics=lambda x: compute_metrics(val='sentiment', p=x),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    print("=============")
    print("Sentiment Training ...")
    trainer.train()
    tmp_model.save_pretrained(output_dir + "_best")

    # Per-epoch checkpoints are no longer needed once the best weights are saved.
    delete_folder(output_dir)

    # GPU clean, mirroring aspect_trainer: this function runs once per
    # category, so release the model and Trainer before the next category
    # starts loading new models.
    del trainer, tmp_model
    gc.collect()
    torch.cuda.empty_cache()

## Main

In [None]:
# Driver: for each product category, load its preprocessed records, build the
# aspect / sentiment datasets and train the corresponding models.
if __name__ == '__main__':
    # Pick CUDA when available; the device is only passed to check_exists_cuda
    # here (presumably a diagnostic -- confirm in etc_plugin).
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    check_exists_cuda(device)
    
    # Presumably fixes the RNG seeds for reproducibility -- confirm in model_plugin.
    set_seed()
    main_categories = [
        '스킨케어',
        '헤어_바디케어',
        '메이크업_뷰티소품',
        '남성화장품'
    ]
    
    for main_category in main_categories:
        print(f"================={main_category}====================")
        jsonl_file_path = f"./preprocessed_data/{main_category}.jsonl"
        data = load_jsonl(jsonl_file_path)
    
        # Sorted annotation keys become the aspect names; each is prefixed with
        # the category as "<category>#<aspect>".
        result = extract_annotation_keys(data)
        original_aspects = sorted(result)
        category_with_original_aspects = [f'{main_category}#{aspect}' for aspect in original_aspects]
        
        data_len = len(data)
        
        # Positional split with no shuffling in this cell: the first
        # TR_VL_SPLIT fraction is training, the slice up to VL_TS_SPLIT is
        # validation. The remainder is not used in this section.
        trains = data[:int(data_len*TR_VL_SPLIT)]
        validations = data[int(data_len*TR_VL_SPLIT):int(data_len*VL_TS_SPLIT)]
        
        # A fresh tokenizer per category, because the special tokens added
        # below are category-specific.
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        
        special_tokens_dict = {
            'additional_special_tokens': category_with_original_aspects
        }
        
        # Return value of add_special_tokens; forwarded to sentiment_trainer
        # for sizing the embeddings.
        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
        
        train_ASP_datas, train_ASP_labels, train_SEN_data, train_SEN_labels = define_datasets(trains, main_category, category_with_original_aspects)
        validation_ASP_datas, validation_ASP_labels, validation_SEN_data, validation_SEN_labels = define_datasets(validations, main_category, category_with_original_aspects)
        
        # Flatten to 1-D: 'aspect' returns a list of datasets, 'sentiment' returns a single dataset
        train_aspect_klue_sets = reshape_to_1d('aspect', train_ASP_datas, train_ASP_labels, tokenizer)
        validation_aspect_klue_sets = reshape_to_1d('aspect', validation_ASP_datas, validation_ASP_labels, tokenizer)
        
        train_sentiment_klue_sets = reshape_to_1d('sentiment', train_SEN_data, train_SEN_labels, tokenizer)
        validation_sentiment_klue_sets = reshape_to_1d('sentiment', validation_SEN_data, validation_SEN_labels, tokenizer)
        
        # Aspect model train & test
        # NOTE(review): aspect_trainer does not resize the model embeddings,
        # although this tokenizer carries the extra special tokens -- confirm
        # the aspect inputs never contain those tokens.
        aspect_trainer(tokenizer, main_category, train_aspect_klue_sets, validation_aspect_klue_sets)
        
        # Sentiment model train & test
        sentiment_trainer(tokenizer, num_added_toks, main_category, train_sentiment_klue_sets, validation_sentiment_klue_sets)
        
        print("================================================")