In [1]:
from tqdm import tqdm
import numpy as np
import torch
import torch.nn.functional as F
import os
import random
import gc

from torch.utils.data import DataLoader
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score
from transformers import TrainingArguments, Trainer, EvalPrediction
from transformers import EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from plugin import delete_folder, check_exists_cuda, load_jsonl

In [2]:
# Hugging Face checkpoint used as the starting point for every classifier below.
MODEL_NAME = 'klue/roberta-large'

# Aspect names. TODO: build this list from the data instead of hard-coding it.
entity_property_pair = [
    '가격', '기능/효과', '디자인', '밀착력/접착력',
    '발림성', '보습력/수분감/쿨링감', '사용감',
    '색상', '성분', '용기', '용량', '유통기한',
    '윤기/피부(톤)', '자극성', '제품구성', '제형',
    '지속력/유지력', '편의성/활용성',
    '품질', '피부타입', '향', '흡수력'
]

# "<main category>#<aspect>" labels, in the same order as entity_property_pair.
rep_entity_property_pair = ['남성화장품#' + aspect for aspect in entity_property_pair]

# Polarity strings; the position in the list is the class id
# (pos '1' -> 0, neg '-1' -> 1, neu '0' -> 2).
polarity_id_to_name = ['1', '-1', '0']
polarity_name_to_id = {name: class_id for class_id, name in enumerate(polarity_id_to_name)}

In [3]:
class klue_Dataset(torch.utils.data.Dataset):
    """
    Map-style dataset that pairs a tokenizer encoding with its labels.

    Input: `dataset`, a mapping of field name -> tensor indexable by sample
           (presumably 'input_ids', 'token_type_ids', 'attention_mask' -- verify
           against the tokenizer in use), and `label`, an indexable label sequence.
    Output: one sample dict including a 'label' tensor (__getitem__) /
            the number of samples (__len__).
    """
    def __init__(self, dataset, label):
        self.dataset = dataset
        self.label = label

    def __len__(self):
        # The label sequence defines the dataset size.
        return len(self.label)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.dataset.items():
            # Copy the row so the stored encoding is never aliased by a batch.
            sample[field] = values[idx].clone().detach()
        sample['label'] = torch.tensor(self.label[idx])

        return sample

In [7]:
def set_seed(seed):
    """
    Fix the seed of every random number generator used in this notebook.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) and
    configures cuDNN for deterministic behaviour.

    Args:
        seed: integer seed value.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes);
    # the hash seed of the running interpreter is already fixed.
    os.environ["PYTHONHASHSEED"] = str(seed)

    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # works against the deterministic setting above, so it is disabled.
    torch.backends.cudnn.benchmark = False

In [8]:
def define_datasets(data, main_category): # called once per split (train / validation / test)
    """
    Build the aspect-detection (ACD) and polarity (POL) examples and labels.

    For every entry of `rep_entity_property_pair`, each sentence becomes one
    binary ACD example (label 1 if the sentence is annotated with that aspect,
    else 0). Every (sentence, aspect) hit additionally becomes one POL example
    whose text is "<sentence> <pair>" and whose label is the polarity id of the
    annotation that matched the aspect.

    Args:
        data: sequence of records read via record['raw_text'] and
            record['annotation']; each annotation is indexed as
            annotation[0] (aspect name) and annotation[1] (a key of
            `polarity_name_to_id`). It is iterated once per aspect.
        main_category: category prefix placed before '#' when comparing an
            annotation against a pair.

    Returns:
        (ACD_Datas, ACD_labels, POL_Datas, POL_labels) where the ACD values are
        lists with one shuffled list per aspect and the POL values are single
        shuffled lists.
    """
    # Size the per-aspect containers from the same list that is iterated below.
    num_pairs = len(rep_entity_property_pair)
    ACD_Datas = [[] for _ in range(num_pairs)]
    ACD_labels = [[] for _ in range(num_pairs)]

    POL_Datas = []
    POL_labels = []

    for idx, pair in enumerate(rep_entity_property_pair):
        for datas in data:
            sen = datas['raw_text']
            annos = datas['annotation']

            ACD_Datas[idx].append(sen)

            # Remember the polarity of the annotation that matches this aspect.
            # Taking the polarity of whichever annotation happened to be last
            # would mislabel sentences that mention several aspects.
            found = False
            matched_polarity = None
            for annotation in annos:
                entity_property = f'{main_category}#' + annotation[0]
                if entity_property == pair:
                    found = True
                    matched_polarity = annotation[1]
                    break

            if found:
                ACD_labels[idx].append(1)
                POL_Datas.append(sen + " " + pair)
                POL_labels.append(polarity_name_to_id[matched_polarity])
            else:
                ACD_labels[idx].append(0)

        # Same random_state for texts and labels keeps them aligned.
        ACD_Datas[idx], ACD_labels[idx] = shuffle(ACD_Datas[idx], ACD_labels[idx], random_state = 42)

    POL_Datas, POL_labels = shuffle(POL_Datas, POL_labels, random_state = 42)

    return ACD_Datas, ACD_labels, POL_Datas, POL_labels

In [9]:
def reshape_to_1d(val, Datas, labels, tokenizer): # called once per split (train / validation / test)
    """
    Tokenize texts and wrap them, together with their labels, in klue_Dataset.

    Args:
        val: 'aspect' or 'sentiment'.
        Datas: for 'aspect', a list with one list of texts per entry of
            `rep_entity_property_pair`; for 'sentiment', a single list of texts.
        labels: labels laid out like `Datas`.
        tokenizer: callable tokenizer returning PyTorch tensors.

    Returns:
        For 'aspect', a list of klue_Dataset (one per aspect);
        for 'sentiment', a single klue_Dataset.

    Raises:
        ValueError: if `val` is neither 'aspect' nor 'sentiment'.
    """
    if val == 'aspect':
        klue_sets = []

        for i in range(len(rep_entity_property_pair)):
            # Every sample is padded/truncated to exactly 256 tokens.
            tok_sen = tokenizer(Datas[i], return_tensors="pt", padding='max_length',
                                truncation=True, max_length=256, add_special_tokens=True)

            klue_sets.append(klue_Dataset(tok_sen, labels[i]))

        return klue_sets

    elif val == 'sentiment':
        pol_tok_sen = tokenizer(Datas, return_tensors="pt", padding='max_length',
                                truncation=True, max_length=256, add_special_tokens=True)

        POL_klue_sets = klue_Dataset(pol_tok_sen, labels)

        return POL_klue_sets

    # Fail loudly instead of handing None to the Trainer / DataLoader.
    raise ValueError(f"val must be 'aspect' or 'sentiment', got {val!r}")

In [10]:
def compute_metrics(val, p: EvalPrediction):
    """
    Compute accuracy, F1, precision and recall for a Trainer evaluation step.

    Input:
      val: 'aspect'    -> average = 'binary'   (binary classification)
           'sentiment' -> average = 'weighted' (multi-class classification)
      p:   object exposing `label_ids` and `predictions`; the predicted class
           is the argmax over the last axis of `predictions`.
    Output:
      dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    Raises:
      ValueError if `val` is neither 'aspect' nor 'sentiment'.
    """
    if val == 'aspect':
        average = 'binary'
    elif val == 'sentiment':
        average = 'weighted'
    else:
        # Previously `average` stayed unbound and failed later with an
        # unrelated-looking UnboundLocalError.
        raise ValueError(f"val must be 'aspect' or 'sentiment', got {val!r}")

    labels = p.label_ids
    preds = p.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average)
    acc = accuracy_score(labels, preds)

    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

In [11]:
def show_test_evaluation(val, infer_labels, infers):
    """
    Print accuracy, precision, recall and F1 on the test predictions.

    Args:
        val: 'aspect' (one evaluation per entry of `rep_entity_property_pair`,
            binary averaging) or 'sentiment' (a single evaluation, weighted
            averaging).
        infer_labels: infer_labels[x] is a list of per-batch label lists for
            evaluation x; the batches are flattened before scoring.
        infers: infers[x] is the flat list of predictions for evaluation x.

    Raises:
        ValueError: if `val` is neither 'aspect' nor 'sentiment'.
    """
    if val == 'aspect':
        length = len(rep_entity_property_pair)
        average = 'binary'
    elif val == 'sentiment':
        length = 1
        average = 'weighted'
    else:
        # Previously `length` / `average` stayed unbound for any other value.
        raise ValueError(f"val must be 'aspect' or 'sentiment', got {val!r}")

    for x in range(0, length):
        print(x, "th Test.....")
        # Flatten the per-batch label lists into one list aligned with infers[x].
        labelss = [label for batch_labels in infer_labels[x] for label in batch_labels]

        print(len(labelss), len(infers[x]))
        precision, recall, f1, _ = precision_recall_fscore_support(labelss, infers[x], average=average)
        acc = accuracy_score(labelss, infers[x])

        print("Accuracy: ", acc)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1-score:", f1)

## Main

In [12]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
check_exists_cuda(device)

set_seed(42)
main_categories = ['남성화장품']

# Single-category run: only the first entry is used.
main_category = main_categories[0]
data = load_jsonl(f'./preprocessed_data/{main_category}.jsonl')

# Fixed positional split of the loaded records into train / validation / test.
trains, validations, tests = data[:2823], data[2823:3763], data[3763:]

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Register every "<category>#<aspect>" string as an additional special token;
# the returned count is needed later to size the sentiment model's embeddings.
special_tokens_dict = {'additional_special_tokens': rep_entity_property_pair}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Build aspect-detection and polarity examples separately for each split.
(train_ACD_Datas, train_ACD_labels,
 train_POL_Datas, train_POL_labels) = define_datasets(trains, main_category)
(validation_ACD_Datas, validation_ACD_labels,
 validation_POL_Datas, validation_POL_labels) = define_datasets(validations, main_category)
(test_ACD_Datas, test_ACD_labels,
 test_POL_Datas, test_POL_labels) = define_datasets(tests, main_category)

Device: <class 'torch.cuda.device'>
Count of using GPUs: 1
Current cuda device: 0


In [17]:
# Convert each split into datasets: 'aspect' returns a list (one dataset per
# aspect), 'sentiment' returns a single dataset.
train_aspect_klue_sets = reshape_to_1d(
    val='aspect', Datas=train_ACD_Datas, labels=train_ACD_labels, tokenizer=tokenizer)
validation_aspect_klue_sets = reshape_to_1d(
    val='aspect', Datas=validation_ACD_Datas, labels=validation_ACD_labels, tokenizer=tokenizer)
test_aspect_klue_sets = reshape_to_1d(
    val='aspect', Datas=test_ACD_Datas, labels=test_ACD_labels, tokenizer=tokenizer)

train_sentiment_klue_sets = reshape_to_1d(
    val='sentiment', Datas=train_POL_Datas, labels=train_POL_labels, tokenizer=tokenizer)
validation_sentiment_klue_sets = reshape_to_1d(
    val='sentiment', Datas=validation_POL_Datas, labels=validation_POL_labels, tokenizer=tokenizer)
test_sentiment_klue_sets = reshape_to_1d(
    val='sentiment', Datas=test_POL_Datas, labels=test_POL_labels, tokenizer=tokenizer)

In [None]:
### aspect trainer
# Fine-tune one binary classifier per aspect; each best model is saved to
# './ABSA<i>_best'.
for i in range(len(rep_entity_property_pair)):
    tmp_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    odir = './ABSA' + str(i)

    # Drop the previous aspect's checkpoint directory; its best weights were
    # already saved separately. There is no previous directory on the first pass.
    if i > 0:
        pre_odir = './ABSA' + str(i-1)
        delete_folder(pre_odir)

    training_ars = TrainingArguments(
        output_dir=odir,
        num_train_epochs=10,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        save_total_limit=5,
        save_strategy = "epoch",
        learning_rate=1e-5,
        weight_decay=0.01,
        evaluation_strategy='epoch',
        metric_for_best_model = 'f1',
        load_best_model_at_end = True,
    )

    trainer = Trainer(
        model=tmp_model,
        args=training_ars,
        train_dataset=train_aspect_klue_sets[i],
        eval_dataset=validation_aspect_klue_sets[i],
        tokenizer=tokenizer,
        compute_metrics = lambda x: compute_metrics(val='aspect', p=x),
        callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    )

    print(i, "th Trarining.... ========================================")
    trainer.train()
    tmp_model.save_pretrained(odir + "_best")

    # GPU clean-up. The Trainer was built with model=tmp_model and still
    # references it, so both names must be dropped before the cache can be freed.
    del trainer
    del tmp_model
    gc.collect()
    torch.cuda.empty_cache()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


not exists:  ./ABSA-1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.




Epoch,Training Loss,Validation Loss


In [None]:
# The training loop only removes the previous aspect's checkpoint directory,
# so the directory of the last aspect is removed here.
final_odir = f'ABSA{len(rep_entity_property_pair) - 1}'
delete_folder(final_odir)

In [None]:
### aspect evaluate tests
# Run every saved per-aspect model over its test set.
# infers[i]       -> flat list of predicted class ids for aspect i
# infer_labels[i] -> list of per-batch label lists for aspect i

infers = [[] for _ in range(len(rep_entity_property_pair))]
infer_labels = [[] for _ in range(len(rep_entity_property_pair))]

for i in range(len(rep_entity_property_pair)):
    print(i, "th Test.... ========================================")
    BEST_MODEL_NAME = './ABSA' + str(i) + "_best"

    model = AutoModelForSequenceClassification.from_pretrained(BEST_MODEL_NAME)
    model.to(device)
    # shuffle=False keeps predictions aligned with the collected labels.
    dataloader = DataLoader(test_aspect_klue_sets[i], batch_size=4, shuffle=False)

    model.eval()
    output_pred = []
    output_prob = []
    labels = []

    # The loop variable is `batch`, not `data`, so the raw records loaded
    # earlier under the global name `data` are not overwritten.
    for batch in tqdm(dataloader):
        with torch.no_grad():
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
                token_type_ids=batch['token_type_ids'].to(device)
            )
        logits = outputs[0]
        prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
        logits = logits.detach().cpu().numpy()
        result = np.argmax(logits, axis=-1)
        labels.append(batch['label'].tolist())

        output_pred.append(result)
        output_prob.append(prob)

    pred_answer, output_prob = np.concatenate(output_pred).tolist(), np.concatenate(output_prob, axis=0).tolist()

    infers[i].extend(pred_answer)
    infer_labels[i].extend(labels)

In [None]:
# Report the test metrics of every per-aspect classifier.
show_test_evaluation(val='aspect', infer_labels=infer_labels, infers=infers)

In [None]:
### sentiment trainer
# Fine-tune a single 3-class polarity classifier; the best model is saved to
# './ABSA_pol_best'.

odir = './ABSA_pol'

tmp_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)
# The polarity inputs contain the added "<category>#<aspect>" special tokens,
# so the embedding matrix must cover the full tokenizer. len(tokenizer) counts
# the base vocabulary plus every added token, whereas
# `tokenizer.vocab_size + num_added_toks` is only right when nothing else was
# added to the tokenizer.
tmp_model.resize_token_embeddings(len(tokenizer))

training_ars = TrainingArguments(
    output_dir=odir,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    save_total_limit=5,
    save_strategy = "epoch",
    learning_rate=1e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    metric_for_best_model = 'f1',
    load_best_model_at_end = True,
)

trainer = Trainer(
    model=tmp_model,
    args=training_ars,
    train_dataset=train_sentiment_klue_sets,
    eval_dataset=validation_sentiment_klue_sets,
    tokenizer=tokenizer,
    compute_metrics = lambda x: compute_metrics(val='sentiment', p=x),
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

print("0th Trarining.... ========================================")
trainer.train()
tmp_model.save_pretrained(odir + "_best")

In [None]:
# Clean up the sentiment run's checkpoint directory; the best model was saved
# separately under './ABSA_pol_best'.
# NOTE(review): delete_folder comes from the local `plugin` module and is
# presumably a directory removal helper -- confirm.
delete_folder('./ABSA_pol')

In [None]:
### sentiment evaluate tests
# Run the saved polarity model over the sentiment test set and collect
# predictions and labels in the one-element lists expected by
# show_test_evaluation.

print(0, "th Test.... ========================================")
BEST_MODEL_NAME = './ABSA_pol_best'

model = AutoModelForSequenceClassification.from_pretrained(BEST_MODEL_NAME)
model.to(device)
# shuffle=False keeps predictions aligned with the collected labels.
dataloader = DataLoader(test_sentiment_klue_sets, batch_size=4, shuffle=False)

model.eval()
output_pred = []
output_prob = []
labels = []

# The loop variable is `batch`, not `data`, so the raw records loaded earlier
# under the global name `data` are not overwritten.
for batch in tqdm(dataloader):
    with torch.no_grad():
        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            token_type_ids=batch['token_type_ids'].to(device)
        )
    logits = outputs[0]
    prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
    logits = logits.detach().cpu().numpy()
    result = np.argmax(logits, axis=-1)
    labels.append(batch['label'].tolist())

    output_pred.append(result)
    output_prob.append(prob)

pred_answer, output_prob = np.concatenate(output_pred).tolist(), np.concatenate(output_prob, axis=0).tolist()

# Wrap in one-element lists: index 0 is the single sentiment evaluation.
infers = [pred_answer]
infer_labels = [labels]

In [None]:
# print sentiment test scores
# Must be 'sentiment': infers / infer_labels hold a single evaluation with
# three polarity classes. 'aspect' would iterate over every aspect slot and
# use binary averaging.
show_test_evaluation('sentiment', infer_labels, infers)