In [1]:
import numpy as np
import torch
import torch.nn.functional as F
import os
import random
import gc

from tqdm import tqdm
from torch.utils.data import DataLoader
from sklearn.utils import shuffle
from transformers import TrainingArguments, Trainer, EvalPrediction, EarlyStoppingCallback \
    , AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from plugin import delete_folder, check_exists_cuda, load_jsonl

In [2]:
# Pretrained checkpoint used for every classifier in this notebook.
MODEL_NAME = 'klue/roberta-large'

# Aspect vocabulary.
# TODO: build this list automatically from the data instead of hard-coding it.
original_aspects = [
    '가격',
    '기능/효과',
    '디자인',
    '발림성',
    '보습력/수분감/쿨링감',
    '사용감',
    '성분',
    '용기',
    '용량',
    '유통기한',
    '윤기/피부(톤)',
    '자극성',
    '제품구성',
    '제형',
    '지속력/유지력',
    '편의성/활용성',
    '품질',
    '피부타입',
    '향',
    '흡수력',
]

# "<main category>#<aspect>" strings, one per aspect.
# TODO: take the main category as an input instead of hard-coding it here.
category_with_original_aspects = ['남성화장품#' + aspect for aspect in original_aspects]

# Sentiment strings indexed by class id: positive -> 0, negative -> 1, neutral -> 2.
sentiment_id_to_str = ['1', '-1', '0']
sentiment_str_to_id = {sentiment: class_id for class_id, sentiment in enumerate(sentiment_id_to_str)}

In [3]:
class klue_Dataset(torch.utils.data.Dataset):
    """
    Map-style dataset pairing tokenizer output with labels.

    `dataset` is a mapping from field name to an indexable tensor holding one
    row per sample (the surrounding notebook passes tokenizer encodings);
    `label` is an indexable sequence with one label per sample.

    __getitem__ returns a dict with a detached copy of row `idx` of every
    field plus a 'label' tensor; __len__ returns the number of labels.
    """
    def __init__(self, dataset, label):
        self.dataset = dataset
        self.label = label

    def __len__(self):
        return len(self.label)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.dataset.items():
            # Copy the row so callers cannot mutate the stored encodings.
            sample[field] = values[idx].clone().detach()
        sample['label'] = torch.tensor(self.label[idx])

        return sample

In [4]:
def set_seed(seed):
    """
    Fix the random seeds used by Python, NumPy and PyTorch.

    Args:
        seed: integer seed applied to `random`, `numpy.random`, `torch`
            (CPU and every CUDA device) and exported as PYTHONHASHSEED.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects child processes / future interpreters; the hash seed of the
    # running interpreter is already fixed at startup.
    os.environ["PYTHONHASHSEED"] = str(seed)

    torch.manual_seed(seed)
    # Seed all GPUs, not just the current one (silently ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False

In [5]:
def define_datasets(data, main_category): # called once per split (train / validation / test)
    """
    Build the aspect-detection and sentiment-classification samples for one split.

    Args:
        data: iterable of records; each record is indexed with 'raw_text'
            (the review text) and 'annotation' (a sequence of annotations,
            where annotation[0] is the aspect name and annotation[1] is the
            sentiment string used as a key of `sentiment_str_to_id`).
        main_category: category name; an annotation matches an aspect pair
            when f'{main_category}#' + annotation[0] equals the pair string.

    Returns:
        (ASP_datas, ASP_labels, SEN_data, SEN_labels) where
        - ASP_datas[i] / ASP_labels[i] hold every review and a 0/1 flag telling
          whether the i-th entry of `category_with_original_aspects` is
          annotated on it (shuffled together with random_state=42);
        - SEN_data / SEN_labels hold "review + ' ' + pair" strings and the
          sentiment id of the matching annotation (shuffled together with
          random_state=42).

    Note:
        The pairs come from the module-level `category_with_original_aspects`,
        so `main_category` must agree with the category used to build it for
        anything to match.
    """
    ASP_datas = [[] for _ in original_aspects]
    ASP_labels = [[] for _ in original_aspects]

    SEN_data = []
    SEN_labels = []

    for i, pair in enumerate(category_with_original_aspects):
        for datas in data:
            review = datas['raw_text']
            ASP_datas[i].append(review)

            for annotation in datas['annotation']:
                if f'{main_category}#' + annotation[0] == pair:
                    # Use the sentiment of the annotation that matched this
                    # aspect, not of whichever annotation happens to be last
                    # in the review. With duplicates, the first match wins.
                    ASP_labels[i].append(1)
                    SEN_data.append(review + " " + pair)
                    SEN_labels.append(sentiment_str_to_id[annotation[1]])
                    break
            else:
                # Loop finished without a match: aspect not mentioned.
                ASP_labels[i].append(0)

        ASP_datas[i], ASP_labels[i] = shuffle(ASP_datas[i], ASP_labels[i], random_state = 42)

    SEN_data, SEN_labels = shuffle(SEN_data, SEN_labels, random_state = 42)

    return ASP_datas, ASP_labels, SEN_data, SEN_labels

In [6]:
def reshape_to_1d(val, Datas, labels, tokenizer): # called once per split (train / validation / test)
    """
    Tokenize raw texts and wrap them, together with their labels, in klue_Dataset.

    Args:
        val: 'aspect' or 'sentiment'.
        Datas: for 'aspect', a sequence of text lists (one list per aspect);
            for 'sentiment', a single list of texts.
        labels: labels laid out like `Datas`.
        tokenizer: callable tokenizer; invoked with padding='max_length',
            truncation=True, max_length=256 and return_tensors="pt".

    Returns:
        'aspect'    -> list of klue_Dataset, one per (texts, labels) pair.
        'sentiment' -> a single klue_Dataset.

    Raises:
        ValueError: if `val` is neither 'aspect' nor 'sentiment' (instead of
            silently returning None).
    """
    if val not in ('aspect', 'sentiment'):
        raise ValueError(f"val must be 'aspect' or 'sentiment', got {val!r}")

    def _encode(texts):
        # Single place for the tokenization settings shared by both modes.
        return tokenizer(texts, return_tensors="pt", padding='max_length',
                         truncation=True, max_length=256, add_special_tokens=True)

    if val == 'aspect':
        # Iterate over the inputs themselves rather than over the length of a
        # module-level list, so the function only depends on its arguments.
        return [
            klue_Dataset(_encode(texts), aspect_labels)
            for texts, aspect_labels in zip(Datas, labels)
        ]

    return klue_Dataset(_encode(Datas), labels)

In [7]:
def compute_metrics(val, p: EvalPrediction):
    """
    Compute accuracy / precision / recall / F1 for one evaluation pass.

    Args:
        val: 'aspect' uses average='binary' (binary classification);
            'sentiment' uses average='weighted' (multi-class classification).
        p: evaluation output; `p.label_ids` are the gold labels and
            `p.predictions` the per-class scores (argmax over the last axis
            gives the predicted class).

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1'.

    Raises:
        ValueError: if `val` is neither 'aspect' nor 'sentiment'.
    """
    averages = {'aspect': 'binary', 'sentiment': 'weighted'}
    if val not in averages:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(f"val must be 'aspect' or 'sentiment', got {val!r}")
    average = averages[val]

    labels = p.label_ids
    preds = p.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average)
    acc = accuracy_score(labels, preds)

    return {'accuracy': acc, 'precision': precision, 'recall': recall, 'f1': f1}

In [8]:
def show_test_evaluation(val, infer_labels, infers):
    """
    Print test-set accuracy / precision / recall / F1 for the aspect or sentiment models.

    Args:
        val: 'aspect' evaluates one model per entry of
            `category_with_original_aspects` with average='binary';
            'sentiment' evaluates a single model with average='weighted'.
        infer_labels: infer_labels[x] is a list of per-batch label lists for
            model x; it is flattened before scoring.
        infers: infers[x] is the flat list of predictions for model x.

    Raises:
        ValueError: if `val` is neither 'aspect' nor 'sentiment'.
    """
    if val == 'aspect':
        length = len(category_with_original_aspects)
        average = 'binary'
    elif val == 'sentiment':
        length = 1
        average = 'weighted'
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError(f"val must be 'aspect' or 'sentiment', got {val!r}")

    for x in range(length):
        print(x, "th Test.....")
        # Labels were collected batch by batch; flatten to align with predictions.
        flat_labels = [label for batch_labels in infer_labels[x] for label in batch_labels]

        precision, recall, f1, _ = precision_recall_fscore_support(flat_labels, infers[x], average=average)
        acc = accuracy_score(flat_labels, infers[x])

        print("Accuracy: ", acc)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-score:", f1)

## Main

In [9]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): check_exists_cuda comes from the local `plugin` module; judging by the
# cell output it reports the device / GPU count — confirm.
check_exists_cuda(device)

set_seed(42)
main_categories = ['남성화장품', ]

# Trial run: only the first main category is processed.
main_category = main_categories[0]
data = load_jsonl(f'./preprocessed_data/{main_category}.jsonl')

data_len = len(data)

# Sequential 60% / 20% / 20% split into train / validation / test.
# The records are sliced in file order; no shuffling happens before the split.
trains = data[:int(data_len*0.6)]
validations = data[int(data_len*0.6):int(data_len*0.8)]
tests = data[int(data_len*0.8):]

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Register every "<category>#<aspect>" string as an additional special token.
special_tokens_dict = {
    'additional_special_tokens': category_with_original_aspects
}

# Return value of add_special_tokens; used later to size the sentiment model's embeddings.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Build aspect / sentiment samples and labels for each split.
train_ASP_datas, train_ASP_labels, train_SEN_data, train_SEN_labels = define_datasets(trains, main_category)
validation_ASP_datas, validation_ASP_labels, validation_SEN_data, validation_SEN_labels = define_datasets(validations, main_category)
test_ASP_datas, test_ASP_labels, test_SEN_data, test_SEN_labels = define_datasets(tests, main_category)

Device: <class 'torch.cuda.device'>
Count of using GPUs: 1
Current cuda device: 0


In [10]:
# Tokenize each split and wrap it in klue_Dataset.
# 'aspect' returns a list of datasets (one per aspect); 'sentiment' returns a single dataset.
train_aspect_klue_sets = reshape_to_1d('aspect', train_ASP_datas, train_ASP_labels, tokenizer)
validation_aspect_klue_sets = reshape_to_1d('aspect', validation_ASP_datas, validation_ASP_labels, tokenizer)
test_aspect_klue_sets = reshape_to_1d('aspect', test_ASP_datas, test_ASP_labels, tokenizer)

train_sentiment_klue_sets = reshape_to_1d('sentiment', train_SEN_data, train_SEN_labels, tokenizer)
validation_sentiment_klue_sets = reshape_to_1d('sentiment', validation_SEN_data, validation_SEN_labels, tokenizer)
test_sentiment_klue_sets = reshape_to_1d('sentiment', test_SEN_data, test_SEN_labels, tokenizer)

In [11]:
### aspect trainer
# Fine-tune one binary classifier per aspect. The model left in memory after
# training (best F1, via load_best_model_at_end) is saved to
# './model_aspect_<i>_best'; './model_aspect_<i>' only holds Trainer checkpoints.
for i in range(len(category_with_original_aspects)):
    tmp_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    output_dir = './model_aspect_' + str(i)

    # Drop the previous aspect's checkpoints to save disk space.
    # There is no previous aspect on the first iteration.
    if i > 0:
        pre_output_dir = './model_aspect_' + str(i-1)
        delete_folder(pre_output_dir)

    training_ars = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        save_total_limit=5,
        save_strategy = "epoch",
        learning_rate=1e-5,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        metric_for_best_model = 'f1',
        load_best_model_at_end = True,
    )

    trainer = Trainer(
        model=tmp_model,
        args=training_ars,
        train_dataset=train_aspect_klue_sets[i],
        eval_dataset=validation_aspect_klue_sets[i],
        tokenizer=tokenizer,
        compute_metrics = lambda x: compute_metrics(val='aspect', p=x),
        callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    )

    print("=============")
    print(i+1, ") Aspect Training ...")
    trainer.train()
    tmp_model.save_pretrained(output_dir + "_best")

    # GPU clean-up: the Trainer also references the model (and its optimizer
    # state), so it must be dropped too or the memory stays allocated until the
    # next iteration has already loaded a new model.
    del trainer
    del tmp_model
    gc.collect()
    torch.cuda.empty_cache()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


not exists:  ./model_aspect_-1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


1 ) Aspect Training ...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# The training loop only removes the checkpoints of the *previous* aspect,
# so the last aspect's checkpoint directory is cleaned up here.
final_output_dir = 'model_aspect_{}'.format(len(category_with_original_aspects) - 1)
delete_folder(final_output_dir)

In [None]:
### aspect evaluate tests
# Run every saved per-aspect model over its test dataset and collect
# predictions (infers[i]) and per-batch gold labels (infer_labels[i]).

infers = [[] for i in range(len(category_with_original_aspects))]
infer_labels = [[] for i in range(len(category_with_original_aspects))]

for i in range(len(category_with_original_aspects)):
    print("=============")
    print(i+1, ") Aspect Test ...")
    BEST_MODEL_NAME = './model_aspect_' + str(i) + "_best"

    model = AutoModelForSequenceClassification.from_pretrained(BEST_MODEL_NAME)
    model.to(device)
    dataloader = DataLoader(test_aspect_klue_sets[i], batch_size=4, shuffle=False)

    model.eval()
    output_pred = []
    output_prob = []
    labels = []

    # The loop variable is named `batch` (not `data`) so it does not overwrite
    # the notebook-level `data` holding the loaded dataset.
    for batch in tqdm(dataloader):
        with torch.no_grad():
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device)
            )
        logits = outputs[0]
        prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
        logits = logits.detach().cpu().numpy()
        result = np.argmax(logits, axis=-1)
        # Labels are stored one list per batch; show_test_evaluation flattens them.
        labels.append(batch['label'].tolist())

        output_pred.append(result)
        output_prob.append(prob)

    pred_answer, output_prob = np.concatenate(output_pred).tolist(), np.concatenate(output_prob, axis=0).tolist()

    infers[i].extend(pred_answer)
    infer_labels[i].extend(labels)

In [None]:
# Print per-aspect test scores (accuracy / precision / recall / F1, binary averaging)
# from the predictions and labels collected by the aspect test loop.
show_test_evaluation('aspect', infer_labels, infers)

In [None]:
### sentiment trainer
# Fine-tune a single 3-class sentiment classifier on "review + ' ' + pair" inputs.

output_dir = './model_sentiment'

tmp_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
# The sentiment inputs contain the "<category>#<aspect>" special tokens added to
# the tokenizer, so the embedding matrix must cover them. len(tokenizer) is the
# full vocabulary size including added tokens and does not depend on a counter
# computed in another cell.
tmp_model.resize_token_embeddings(len(tokenizer))

training_ars = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_total_limit=5,
    save_strategy = "epoch",
    learning_rate=1e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    metric_for_best_model = 'f1',
    load_best_model_at_end = True,
)

trainer = Trainer(
    model=tmp_model,
    args=training_ars,
    train_dataset=train_sentiment_klue_sets,
    eval_dataset=validation_sentiment_klue_sets,
    tokenizer=tokenizer,
    compute_metrics = lambda x: compute_metrics(val='sentiment', p=x),
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

print("=============")
print("Sentiment Training ...")
trainer.train()
# Saved separately from the Trainer checkpoints; this is the directory the
# sentiment test cell loads from.
tmp_model.save_pretrained(output_dir + "_best")

In [None]:
# Clean up the Trainer output directory used for sentiment training; the model kept
# for evaluation is loaded from './model_sentiment_best'.
# NOTE(review): delete_folder comes from the local `plugin` module — presumably it
# removes the directory tree; confirm.
delete_folder('./model_sentiment')

In [None]:
### sentiment evaluate tests
# Run the saved sentiment model over the test dataset and collect predictions
# and per-batch gold labels in the single-element lists `infers` / `infer_labels`.

print("=============")
print("Sentiment Test ...")
BEST_MODEL_NAME = './model_sentiment_best'

model = AutoModelForSequenceClassification.from_pretrained(BEST_MODEL_NAME)
model.to(device)
dataloader = DataLoader(test_sentiment_klue_sets, batch_size=4, shuffle=False)

model.eval()
output_pred = []
output_prob = []
labels = []

# The loop variable is named `batch` (not `data`) so it does not overwrite
# the notebook-level `data` holding the loaded dataset.
for batch in tqdm(dataloader):
    with torch.no_grad():
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            token_type_ids=batch['token_type_ids'].to(device)
        )
    logits = outputs[0]
    prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
    logits = logits.detach().cpu().numpy()
    result = np.argmax(logits, axis=-1)
    # Labels are stored one list per batch; show_test_evaluation flattens them.
    labels.append(batch['label'].tolist())

    output_pred.append(result)
    output_prob.append(prob)

pred_answer, output_prob = np.concatenate(output_pred).tolist(), np.concatenate(output_prob, axis=0).tolist()

# Wrapped in one-element lists to match the layout show_test_evaluation expects.
infers = [pred_answer]
infer_labels = [labels]

In [None]:
# Print sentiment test scores. The mode must be 'sentiment': that selects
# weighted averaging for the three classes and a single model, matching the
# one-element `infers` / `infer_labels` built by the sentiment test cell.
# ('aspect' would use binary averaging and loop over every aspect.)
show_test_evaluation('sentiment', infer_labels, infers)