In [None]:
# Install the third-party packages imported by this notebook (transformers, pororo).
!pip install transformers
!pip install pororo

In [None]:
# Clone the project repository; the notebook later changes into it and imports
# its `dataset` and `utils.get_cls` modules.
!git clone https://github.com/jjonhwa/Relation_Extraction.git

In [5]:
import pandas as pd
import numpy as np
import torch

from transformers import AutoModel, AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

%cd Relation_Extraction
from dataset import preprocessing_dataset,load_data
from utils.get_cls import get_model_tokenizer, get_cls_token, get_dataset_cls_hidden

/content/Relation_Extraction


In [None]:
from pororo import Pororo
# Multilingual translation pipeline; the augmentation loop calls it as
# mt(text, src = <language code>, tgt = <language code>).
mt = Pororo(task = 'translation', lang = 'multi')

### Label이 적은 Data 확인

In [None]:
# Load the training set that was saved from Entity.ipynb, then discard its
# 'Unnamed: 0' column.
data = pd.read_csv("/content/Relation_Extraction/data/train_final_pororo_sub.csv")
data = data.drop(columns = ['Unnamed: 0'])

In [None]:
# Tally how many rows carry each label (label -> row count).
label_dict = {}
for label in data['label'] :
    label_dict[label] = label_dict.get(label, 0) + 1

In [None]:
# A label belongs to the low group when it has 500 rows or fewer.
low_group = [label_name for label_name, row_count in label_dict.items() if row_count <= 500]

In [None]:
# 17 of the 30 labels have 500 or fewer rows
len(low_group)

17

In [None]:
# Gather the rows of every low-group label into a separate DataFrame.
# Rows stay grouped by label, in the order each label first appears in `data`.
low_frames = [
    data[data['label'] == label]
    for label in data['label'].unique()
    if label in low_group
]

# Concatenate once: growing a frame with pd.concat inside a loop re-copies all
# accumulated rows on every pass, and seeding it with an empty DataFrame relies
# on pandas' deprecated handling of empty frames during concatenation.
if low_frames :
    low_data = pd.concat(low_frames, ignore_index = True)
else :
    # No label is in the low group: keep the original result (an empty frame).
    low_data = pd.DataFrame()

### CLS Token을 만들기 위한 Backbone Model 불러오기

In [None]:
# Use klue/roberta-large as the backbone model for producing the CLS-token vectors
model, tokenizer = get_model_tokenizer('klue/roberta-large')

Downloading:   0%|          | 0.00/375 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/734k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/547 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

Some weights of the model checkpoint at klue/roberta-large were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it f

### Data Augmentation
- 여러 방식으로 번역 진행
- 원문의 Hidden vector와 번역결과물의 Hidden vector의 Cosine Similarity 계산
- Cosine Similarity가 높은 번역물부터 차례로 확인하여, 'subject entity'와 'object entity'를 모두 포함하는 첫 번째 결과물을 추가 데이터로 활용

In [None]:
def _back_translate(masked_sentence) :
    """Return five Korean round-trip translations of `masked_sentence`.

    Uses the notebook-level translator `mt`. Routes, in list order:
    ko->en->ko, ko->ja->ko, ko->en->zh->ko, ko->ja->en->ko, ko->en->ja->ko.
    The ko->en and ko->ja intermediates are translated once and reused.
    """
    # Korean -> English -> Korean
    ko_en = mt(masked_sentence, src = 'ko', tgt = 'en')
    candidates = [mt(ko_en, src = 'en', tgt = 'ko')]

    # Korean -> Japanese -> Korean
    ko_ja = mt(masked_sentence, src = 'ko', tgt = 'ja')
    candidates.append(mt(ko_ja, src = 'ja', tgt = 'ko'))

    # Korean -> English -> Chinese -> Korean
    en_zh = mt(ko_en, src = 'en', tgt = 'zh')
    candidates.append(mt(en_zh, src = 'zh', tgt = 'ko'))

    # Korean -> Japanese -> English -> Korean
    ja_en = mt(ko_ja, src = 'ja', tgt = 'en')
    candidates.append(mt(ja_en, src = 'en', tgt = 'ko'))

    # Korean -> English -> Japanese -> Korean
    en_ja = mt(ko_en, src = 'en', tgt = 'ja')
    candidates.append(mt(en_ja, src = 'ja', tgt = 'ko'))

    return candidates


def _pick_best_candidate(masked_sentence, candidates) :
    """Return the candidate closest to `masked_sentence` that kept both placeholders.

    Candidates are ranked by the cosine similarity between their CLS hidden
    vector and that of `masked_sentence`. The highest-ranked candidate that
    contains both 'AA' and 'BB' is returned; None if there is no such candidate.
    """
    # NOTE(review): get_dataset_cls_hidden is called without model/tokenizer,
    # unlike get_cls_token -- confirm against utils.get_cls that this is intended.
    dataset_cls_hidden = get_dataset_cls_hidden(candidates)
    query_cls_hidden = get_cls_token(masked_sentence, model, tokenizer)

    # Row 0 holds the similarities of the single query against every candidate
    # (assumes the query vector is a one-row 2-D array -- TODO confirm).
    cos_sim = cosine_similarity(query_cls_hidden, dataset_cls_hidden)

    # Identical translations collapse into a single dictionary entry.
    scores = dict(zip(candidates, cos_sim[0]))
    ranked = sorted(scores.items(), key = lambda item : item[1], reverse = True)

    for candidate, _score in ranked :
        if 'AA' in candidate and 'BB' in candidate :
            return candidate
    return None


first_aug_dict = {'id' : [],
                  'sentence' : [],
                  'label' : [],
                  'subject_entity' : [],
                  'object_entity' : []}

for i in tqdm(range(len(low_data))) :
    raw_subject = low_data['subject_entity'][i]
    raw_object = low_data['object_entity'][i]

    # NOTE(security): eval executes whatever expression the CSV cell contains.
    # Only run this on trusted data; if the cells are plain literals,
    # ast.literal_eval is the safe replacement.
    sub, obj = eval(raw_subject), eval(raw_object)

    # Mask the entities with fixed placeholders so they can be found again
    # after translation: subject_entity => 'AA', object_entity => 'BB'
    tmp_sentence = low_data['sentence'][i].replace(sub, 'AA').replace(obj, 'BB')

    # Without both placeholders in the masked sentence, any 'AA'/'BB' that
    # appears in a translation would be spurious, so the expensive translation
    # and embedding passes are skipped for such rows.
    best_candidate = None
    if 'AA' in tmp_sentence and 'BB' in tmp_sentence :
        translation = _back_translate(tmp_sentence)
        best_candidate = _pick_best_candidate(tmp_sentence, translation)

    # Keep at most one augmented sentence per source row.
    if best_candidate is not None :
        aug_sentence = best_candidate.replace('AA', sub).replace('BB', obj)
        first_aug_dict['id'].append(0)
        first_aug_dict['sentence'].append(aug_sentence)
        first_aug_dict['label'].append(low_data['label'][i])
        first_aug_dict['subject_entity'].append(raw_subject)
        first_aug_dict['object_entity'].append(raw_object)

    # Rewrite the CSV after every row so an interrupted run keeps its progress.
    aug_dataset = pd.DataFrame(first_aug_dict)
    aug_dataset.to_csv('/content/Relation_Extraction/data/aug_data.csv', index = False)