In [None]:
!pip install pororo
!pip install transformers

In [None]:
from pororo import Pororo

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
!git clone https://github.com/jjonhwa/Relation_Extraction.git

In [None]:
%cd Relation_Extraction
import ast

import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import AutoConfig, AutoModelForSequenceClassification

from dataset import load_data, label_to_num

In [None]:
# Read the train and test CSV files through the repository's load_data helper,
# then convert the train labels with label_to_num.
train_dataset, test_dataset = (
    load_data(csv_path)
    for csv_path in (
        "/content/Relation_Extraction/data/train.csv",
        "/content/Relation_Extraction/data/test_data.csv",
    )
)
train_label = label_to_num(train_dataset['label'])

In [None]:
# Checkpoint name; the tokenizer is loaded from it here and the model further down.
MODEL_NAME = 'klue/bert-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/425 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/243k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/125 [00:00<?, ?B/s]

## Train/Test - Typed Entity Marker

In [None]:
# Pororo NER pipeline for Korean; called below on each entity string to get its type.
ner = Pororo(task='ner', lang = 'ko')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.



### Train Dataset 생성

In [None]:
# Placeholder columns that will receive the NER-tagged subject / object entities.
for ner_col in ('NER_SUB', 'NER_OBJ') :
    train_dataset[ner_col] = ''

In [None]:
# Tag each train entity with the NER type predicted by Pororo and mirror the
# tag into the sentence.
# ex) '손흥민' -> 손흥민 -> [PERSON]손흥민[/PERSON] -> '[PERSON]손흥민[/PERSON]'
special_token_list = []

for i in tqdm(range(len(train_dataset))) :
    for source_col, tagged_col in (('subject_entity', 'NER_SUB'), ('object_entity', 'NER_OBJ')) :
        original_entity = train_dataset.at[i, source_col]

        # Peel off the quote wrapped around the entity; it is restored after tagging.
        # `quote` defaults to '' so an unquoted entity neither raises NameError nor
        # inherits the quote character of a previous row.
        entity = original_entity.strip()
        quote = entity[0] if entity[:1] in ("'", '"') else ''
        entity = entity[len(quote):]
        if entity[-1:] in ("'", '"') :
            entity = entity[:-1]

        # NER does not work on every input; a failure (or an empty entity) means "no tag".
        try :
            ner_result = ner(entity) if entity else []
        except Exception :
            ner_result = []

        # Two or more NER spans are treated as ambiguous => tag only when exactly one span comes back.
        if len(ner_result) == 1 :
            ner_type = ner_result[0][1]
            open_tag, close_tag = '[' + ner_type + ']', '[/' + ner_type + ']'
            tagged_entity = open_tag + entity + close_tag
            # .at writes into the frame itself; chained `df[col][i] = ...` can write to a copy.
            train_dataset.at[i, tagged_col] = ' ' + quote + tagged_entity + quote
            train_dataset.at[i, 'sentence'] = train_dataset.at[i, 'sentence'].replace(entity, tagged_entity)
            special_token_list.append(open_tag)
            special_token_list.append(close_tag)
        else :
            train_dataset.at[i, tagged_col] = original_entity

In [None]:
# Drop the original "subject_entity" / "object_entity" columns and let the newly
# built NER_SUB / NER_OBJ columns take over their names.
# Renaming by name does not depend on the column order, and the guard keeps a
# re-run of this cell from dropping the already-renamed columns.
if 'NER_SUB' in train_dataset.columns :
    train_dataset = (
        train_dataset
        .drop(columns = ['subject_entity', 'object_entity'])
        .rename(columns = {'NER_SUB': 'subject_entity', 'NER_OBJ': 'object_entity'})
    )
train_dataset.tail()

In [None]:
# Train pass 1: every entity that carries no tag yet is wrapped in [O]...[/O].
for i in tqdm(range(len(train_dataset))) :
    for entity_col in ('subject_entity', 'object_entity') :
        entity = train_dataset.at[i, entity_col]

        # Already tagged by the NER step => leave this entity untouched.
        if '[' in entity and '[/' in entity :
            continue

        # Peel off the quote wrapped around the entity; it is restored after tagging.
        # `quote` defaults to '' so an unquoted entity neither raises NameError nor
        # inherits the quote character of a previous row.
        entity = entity.strip()
        quote = entity[0] if entity[:1] in ("'", '"') else ''
        entity = entity[len(quote):]
        if entity[-1:] in ("'", '"') :
            entity = entity[:-1]

        tagged_entity = "[O]" + entity + "[/O]"
        # .at writes into the frame itself; chained `df[col][i] = ...` can write to a copy.
        train_dataset.at[i, entity_col] = ' ' + quote + tagged_entity + quote
        # str.replace with an empty pattern would insert the tag between every character.
        if entity :
            train_dataset.at[i, 'sentence'] = train_dataset.at[i, 'sentence'].replace(entity, tagged_entity)

In [None]:
# Show the last rows to inspect the result of the [O] tagging pass.
train_dataset.tail()

In [None]:
# Train pass 2: add the entity role (SUB / OBJ) to every tag.
# ex) [PERSON]손흥민[/PERSON] -> [SUB-PERSON]손흥민[/SUB-PERSON]
special_token_list = []
for i in tqdm(range(len(train_dataset))) :
    for entity_col, role in (('subject_entity', 'SUB'), ('object_entity', 'OBJ')) :
        # Peel off the quote wrapped around the entity; it is restored after tagging.
        # `quote` defaults to '' so an unquoted entity neither raises NameError nor
        # inherits the quote character of a previous row.
        entity = train_dataset.at[i, entity_col].strip()
        quote = entity[0] if entity[:1] in ("'", '"') else ''
        entity = entity[len(quote):]
        if entity[-1:] in ("'", '"') :
            entity = entity[:-1]

        # Read the type from the opening tag and slice both tags off by length, so a
        # '[' inside the entity text cannot truncate it (splitting on '[' would).
        ner_type = entity[1:entity.find(']')]
        open_tag, close_tag = '[' + ner_type + ']', '[/' + ner_type + ']'
        if not (entity.startswith(open_tag) and entity.endswith(close_tag)) :
            raise ValueError('row {}: {} is not wrapped in a type tag: {!r}'.format(i, entity_col, entity))
        inner_text = entity[len(open_tag):len(entity) - len(close_tag)]

        role_open = '[' + role + '-' + ner_type + ']'
        role_close = '[/' + role + '-' + ner_type + ']'
        role_entity = role_open + inner_text + role_close

        # .at writes into the frame itself; chained `df[col][i] = ...` can write to a copy.
        train_dataset.at[i, entity_col] = ' ' + quote + role_entity + quote
        train_dataset.at[i, 'sentence'] = train_dataset.at[i, 'sentence'].replace(entity, role_entity)

        special_token_list.append(role_open)
        special_token_list.append(role_close)

In [None]:
# Show the last rows to inspect the result of the SUB/OBJ tagging pass.
train_dataset.tail()

In [None]:
# Save next to the test split: the later sections of this notebook read this file
# from /content/Relation_Extraction/data/. The row index is written on purpose;
# the readers drop the resulting 'Unnamed: 0' column.
train_dataset.to_csv('/content/Relation_Extraction/data/train_final_pororo_sub.csv')

### Test Dataset 생성

In [None]:
# Placeholder columns that will receive the NER-tagged subject / object entities.
for ner_col in ('NER_SUB', 'NER_OBJ') :
    test_dataset[ner_col] = ''

In [None]:
# Tag each test entity with the NER type predicted by Pororo and mirror the
# tag into the sentence.
# ex) '손흥민' -> 손흥민 -> [PERSON]손흥민[/PERSON] -> '[PERSON]손흥민[/PERSON]'
for i in tqdm(range(len(test_dataset))) :
    for source_col, tagged_col in (('subject_entity', 'NER_SUB'), ('object_entity', 'NER_OBJ')) :
        original_entity = test_dataset.at[i, source_col]

        # Peel off the quote wrapped around the entity; it is restored after tagging.
        # `quote` defaults to '' so an unquoted entity neither raises NameError nor
        # inherits the quote character of a previous row.
        entity = original_entity.strip()
        quote = entity[0] if entity[:1] in ("'", '"') else ''
        entity = entity[len(quote):]
        if entity[-1:] in ("'", '"') :
            entity = entity[:-1]

        # NER does not work on every input; a failure (or an empty entity) means "no tag".
        try :
            ner_result = ner(entity) if entity else []
        except Exception :
            ner_result = []

        # Two or more NER spans are treated as ambiguous => tag only when exactly one span comes back.
        if len(ner_result) == 1 :
            ner_type = ner_result[0][1]
            tagged_entity = '[' + ner_type + ']' + entity + '[/' + ner_type + ']'
            # .at writes into the frame itself; chained `df[col][i] = ...` can write to a copy.
            test_dataset.at[i, tagged_col] = ' ' + quote + tagged_entity + quote
            test_dataset.at[i, 'sentence'] = test_dataset.at[i, 'sentence'].replace(entity, tagged_entity)
        else :
            test_dataset.at[i, tagged_col] = original_entity

In [None]:
# Drop the original "subject_entity" / "object_entity" columns and let the newly
# built NER_SUB / NER_OBJ columns take over their names.
# Renaming by name does not depend on the column order, and the guard keeps a
# re-run of this cell from dropping the already-renamed columns.
if 'NER_SUB' in test_dataset.columns :
    test_dataset = (
        test_dataset
        .drop(columns = ['subject_entity', 'object_entity'])
        .rename(columns = {'NER_SUB': 'subject_entity', 'NER_OBJ': 'object_entity'})
    )

In [None]:
# Test pass 1: every entity that carries no tag yet is wrapped in [O]...[/O].
for i in tqdm(range(len(test_dataset))) :
    for entity_col in ('subject_entity', 'object_entity') :
        entity = test_dataset.at[i, entity_col]

        # Already tagged by the NER step => leave this entity untouched.
        if '[' in entity and '[/' in entity :
            continue

        # Peel off the quote wrapped around the entity; it is restored after tagging.
        # `quote` defaults to '' so an unquoted entity neither raises NameError nor
        # inherits the quote character of a previous row.
        entity = entity.strip()
        quote = entity[0] if entity[:1] in ("'", '"') else ''
        entity = entity[len(quote):]
        if entity[-1:] in ("'", '"') :
            entity = entity[:-1]

        tagged_entity = "[O]" + entity + "[/O]"
        # .at writes into the frame itself; chained `df[col][i] = ...` can write to a copy.
        test_dataset.at[i, entity_col] = ' ' + quote + tagged_entity + quote
        # str.replace with an empty pattern would insert the tag between every character.
        if entity :
            test_dataset.at[i, 'sentence'] = test_dataset.at[i, 'sentence'].replace(entity, tagged_entity)

In [None]:
# Test pass 2: add the entity role (SUB / OBJ) to every tag.
# ex) [PERSON]손흥민[/PERSON] -> [SUB-PERSON]손흥민[/SUB-PERSON]
for i in tqdm(range(len(test_dataset))) :
    for entity_col, role in (('subject_entity', 'SUB'), ('object_entity', 'OBJ')) :
        # Peel off the quote wrapped around the entity; it is restored after tagging.
        # `quote` defaults to '' so an unquoted entity neither raises NameError nor
        # inherits the quote character of a previous row.
        entity = test_dataset.at[i, entity_col].strip()
        quote = entity[0] if entity[:1] in ("'", '"') else ''
        entity = entity[len(quote):]
        if entity[-1:] in ("'", '"') :
            entity = entity[:-1]

        # Read the type from the opening tag and slice both tags off by length, so a
        # '[' inside the entity text cannot truncate it (splitting on '[' would).
        ner_type = entity[1:entity.find(']')]
        open_tag, close_tag = '[' + ner_type + ']', '[/' + ner_type + ']'
        if not (entity.startswith(open_tag) and entity.endswith(close_tag)) :
            raise ValueError('row {}: {} is not wrapped in a type tag: {!r}'.format(i, entity_col, entity))
        inner_text = entity[len(open_tag):len(entity) - len(close_tag)]

        role_entity = '[' + role + '-' + ner_type + ']' + inner_text + '[/' + role + '-' + ner_type + ']'

        # .at writes into the frame itself; chained `df[col][i] = ...` can write to a copy.
        test_dataset.at[i, entity_col] = ' ' + quote + role_entity + quote
        test_dataset.at[i, 'sentence'] = test_dataset.at[i, 'sentence'].replace(entity, role_entity)

In [None]:
# Persist the tagged test split; index = False keeps the row index out of the CSV.
test_dataset.to_csv('/content/Relation_Extraction/data/test_final_pororo_sub.csv', index = False)

### Special Token 추가 및 저장

In [None]:
# Check that the special tokens get added to the tokenizer.
# sorted() makes the token order (and therefore the assigned token ids) reproducible;
# the iteration order of a set of strings can change between interpreter runs.
added_token_num = tokenizer.add_special_tokens({"additional_special_tokens": sorted(set(special_token_list))})

# Use the first GPU when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

model_config = AutoConfig.from_pretrained(MODEL_NAME)
# NOTE(review): 30 is presumably the number of relation labels — confirm against the label set.
model_config.num_labels = 30

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, config = model_config)
model.to(device)

In [None]:
# Grow the embedding matrix so it covers the added special tokens; the embedding
# layer is printed before and after to make the size change visible.
# len(tokenizer) includes the added tokens, so the size does not depend on
# added_token_num, which is 0 when the tokens were already registered.
print(model.get_input_embeddings())
model.resize_token_embeddings(len(tokenizer))
print(model.get_input_embeddings())

In [None]:
# Save the special tokens as a text file, one token per line.
# special_token_list holds an entry for every row, so duplicates are dropped
# (keeping first-seen order) instead of writing the same token thousands of times.
with open('/content/Relation_Extraction/data/pororo_special_token.txt', 'w', encoding = 'UTF-8') as f :
    for token in dict.fromkeys(special_token_list) :
        f.write(token + '\n')

## Typed Entity Marker Punct (Feat, ENG)

In [None]:
# Reload the typed-entity-marker splits; the train CSV was saved with its row
# index, which comes back as the 'Unnamed: 0' column and is discarded here.
train_dataset = pd.read_csv(
    '/content/Relation_Extraction/data/train_final_pororo_sub.csv'
).drop(columns = ['Unnamed: 0'])
test_dataset = pd.read_csv('/content/Relation_Extraction/data/test_final_pororo_sub.csv')

In [None]:
# Original (typed entity marker) form of the first row, before the punctuation conversion.
train_dataset.head(1)

Unnamed: 0,id,sentence,label,subject_entity,object_entity
0,0,〈Something〉는 [OBJ-PERSON]조지 해리슨[/OBJ-PERSON]이 ...,no_relation,'[SUB-PERSON]비틀즈[/SUB-PERSON]','[OBJ-PERSON]조지 해리슨[/OBJ-PERSON]'


### Train Dataset

In [None]:
# Convert the typed entity markers into punctuation markers with English type names.
# [SUB-PERSON]황의조[/SUB-PERSON] -> @*person*황의조@
# [OBJ-CITY]보르도[/OBJ-CITY] -> #^city^보르도#
punct_special_token = []

for i in tqdm(range(len(train_dataset))) :
    # (entity column, wrapping punctuation, delimiter placed around the type name)
    for entity_col, wrap, type_delim in (('subject_entity', '@', '*'), ('object_entity', '#', '^')) :
        stored_entity = train_dataset.at[i, entity_col]
        # The cell holds a quoted string literal; literal_eval unquotes it without
        # executing arbitrary expressions the way eval would. strip() removes
        # surrounding whitespace, which literal_eval rejects on Python < 3.10.
        entity = ast.literal_eval(stored_entity.strip())

        tag_head = entity.split(']')[0]              # e.g. '[SUB-PERSON'
        open_tag = tag_head + ']'
        close_tag = '[' + entity.split('[')[-1]      # e.g. '[/SUB-PERSON]'
        entity_type = tag_head.split('-')[1].lower()

        stored_entity = stored_entity.replace(open_tag, wrap + type_delim + entity_type + type_delim)
        stored_entity = stored_entity.replace(close_tag, wrap)
        # .at writes into the frame itself; chained `df[col][i] = ...` can write to a copy.
        train_dataset.at[i, entity_col] = stored_entity

        punct_entity = ast.literal_eval(stored_entity.strip())
        train_dataset.at[i, 'sentence'] = train_dataset.at[i, 'sentence'].replace(entity, punct_entity)

        punct_special_token.append(entity_type)

100%|██████████| 32470/32470 [02:33<00:00, 211.67it/s]


In [None]:
# Show the first rows to inspect the punctuation markers with English type names.
train_dataset.head()

Unnamed: 0,id,sentence,label,subject_entity,object_entity
0,0,〈Something〉는 #^person^조지 해리슨#이 쓰고 @*person*비틀즈...,no_relation,'@*person*비틀즈@','#^person^조지 해리슨#'
1,1,호남이 기반인 바른미래당·#^organization^대안신당#·@*organizat...,no_relation,'@*organization*민주평화당@','#^organization^대안신당#'
2,2,K리그2에서 성적 1위를 달리고 있는 @*organization*광주FC@는 지난 ...,org:member_of,'@*organization*광주FC@','#^organization^한국프로축구연맹#'
3,3,균일가 생활용품점 (주)@*organization*아성다이소@(대표 #^person...,org:top_members/employees,'@*organization*아성다이소@','#^person^박정부#'
4,4,#^date^1967#년 프로 야구 드래프트 1순위로 @*organization*요...,no_relation,'@*organization*요미우리 자이언츠@','#^date^1967#'


### Test Dataset

In [None]:
# Convert the typed entity markers of the test split into punctuation markers
# with English type names, e.g. [SUB-PERSON]황의조[/SUB-PERSON] -> @*person*황의조@.
for i in tqdm(range(len(test_dataset))) :
    # (entity column, wrapping punctuation, delimiter placed around the type name)
    for entity_col, wrap, type_delim in (('subject_entity', '@', '*'), ('object_entity', '#', '^')) :
        stored_entity = test_dataset.at[i, entity_col]
        # The cell holds a quoted string literal; literal_eval unquotes it without
        # executing arbitrary expressions the way eval would. strip() removes
        # surrounding whitespace, which literal_eval rejects on Python < 3.10.
        entity = ast.literal_eval(stored_entity.strip())

        tag_head = entity.split(']')[0]              # e.g. '[SUB-PERSON'
        open_tag = tag_head + ']'
        close_tag = '[' + entity.split('[')[-1]      # e.g. '[/SUB-PERSON]'
        entity_type = tag_head.split('-')[1].lower()

        stored_entity = stored_entity.replace(open_tag, wrap + type_delim + entity_type + type_delim)
        stored_entity = stored_entity.replace(close_tag, wrap)
        # .at writes into the frame itself; chained `df[col][i] = ...` can write to a copy.
        test_dataset.at[i, entity_col] = stored_entity

        punct_entity = ast.literal_eval(stored_entity.strip())
        test_dataset.at[i, 'sentence'] = test_dataset.at[i, 'sentence'].replace(entity, punct_entity)

In [None]:
# Write the punctuation-marker (English type) splits without the row index.
# NOTE(review): these relative paths differ from the /content/Relation_Extraction/data/
# location used by the other outputs of this notebook — confirm the target directories exist.
train_dataset.to_csv('../dataset/train/train_typed_entity_marker_punct.csv', index = False)
test_dataset.to_csv('../dataset/test/test_typed_entity_marker_punct.csv', index = False)

### Special Token 추가 및 저장

In [None]:
# Save the English type names, one per line.
# punct_special_token holds two entries per row, so duplicates are dropped
# (keeping first-seen order) before writing; the file is later read back and
# every line is sent through the translator.
with open('/content/Relation_Extraction/data/pororo_special_token_punct.txt', 'w', encoding = 'UTF-8') as f :
    for token in dict.fromkeys(punct_special_token) :
        f.write(token + '\n')

## Typed Entity Marker Punct (Feat, KOR)

In [None]:
from pororo import Pororo
# Pororo multilingual translation pipeline; called below with src = 'en', tgt = 'ko'
# to translate the English type names.
mt = Pororo(task = 'translation', lang = 'multi')






In [None]:
# Reload the typed-entity-marker splits; the train CSV was saved with its row
# index, which comes back as the 'Unnamed: 0' column and is discarded here.
train_dataset = pd.read_csv(
    '/content/Relation_Extraction/data/train_final_pororo_sub.csv'
).drop(columns = ['Unnamed: 0'])
test_dataset = pd.read_csv('/content/Relation_Extraction/data/test_final_pororo_sub.csv')

In [None]:
# Read the saved English type names back, one per line, without the line break.
with open('/content/Relation_Extraction/data/pororo_special_token_punct.txt', 'r', encoding = 'UTF-8') as f :
    special_token_list = [line.split('\n')[0] for line in f]

In [None]:
# Print the machine translation (English -> Korean) of every type name.
# dict.fromkeys drops repeated names (keeping first-seen order) so each type is
# translated and printed once.
for token in dict.fromkeys(special_token_list) :
    print(token, ' : ', mt(token, src = 'en', tgt = 'ko'))

quantity  :  물량
person  :  사람
term  :  임기
o  :  O
event  :  이벤트
study_field  :  스터디 필드
material  :  물질
city  :  도시
time  :  시간
animal  :  동물
location  :  위치
disease  :  질병
civilization  :  문명
occupation  :  직업
organization  :  조직
country  :  나라
artifact  :  유물
date  :  날짜
plant  :  식물
theory  :  이론


### Train Dataset

In [None]:
# Convert the typed entity markers into punctuation markers with Korean type names,
# e.g. [SUB-PERSON]비틀즈[/SUB-PERSON] -> @*사람*비틀즈@.
punct_special_token = []

# Hand-picked Korean names that take precedence over the machine translation.
kor_type_overrides = {'o': '없음', 'material': '재료'}
# English type -> Korean type; each type is translated once instead of once per row.
kor_type_cache = {}

for i in tqdm(range(len(train_dataset))) :
    # (entity column, wrapping punctuation, delimiter placed around the type name)
    for entity_col, wrap, type_delim in (('subject_entity', '@', '*'), ('object_entity', '#', '^')) :
        stored_entity = train_dataset.at[i, entity_col]
        # The cell holds a quoted string literal; literal_eval unquotes it without
        # executing arbitrary expressions the way eval would. strip() removes
        # surrounding whitespace, which literal_eval rejects on Python < 3.10.
        entity = ast.literal_eval(stored_entity.strip())

        tag_head = entity.split(']')[0]              # e.g. '[SUB-PERSON'
        open_tag = tag_head + ']'
        close_tag = '[' + entity.split('[')[-1]      # e.g. '[/SUB-PERSON]'
        entity_type = tag_head.split('-')[1].lower()

        # The override is looked up with this entity's own type, so the object's
        # Korean name never depends on the subject's type.
        if entity_type not in kor_type_cache :
            if entity_type in kor_type_overrides :
                kor_type_cache[entity_type] = kor_type_overrides[entity_type]
            else :
                kor_type_cache[entity_type] = mt(entity_type, src = 'en', tgt = 'ko')
        kor_type = kor_type_cache[entity_type]

        stored_entity = stored_entity.replace(open_tag, wrap + type_delim + kor_type + type_delim)
        stored_entity = stored_entity.replace(close_tag, wrap)
        # .at writes into the frame itself; chained `df[col][i] = ...` can write to a copy.
        train_dataset.at[i, entity_col] = stored_entity

        punct_entity = ast.literal_eval(stored_entity.strip())
        train_dataset.at[i, 'sentence'] = train_dataset.at[i, 'sentence'].replace(entity, punct_entity)

        punct_special_token.append(kor_type)

In [None]:
# Show the first rows to inspect the punctuation markers with Korean type names.
train_dataset.head()

Unnamed: 0,id,sentence,label,subject_entity,object_entity
0,0,〈Something〉는 #^사람^조지 해리슨#이 쓰고 @*사람*비틀즈@가 1969년...,no_relation,'@*사람*비틀즈@','#^사람^조지 해리슨#'
1,1,호남이 기반인 바른미래당·#^조직^대안신당#·@*조직*민주평화당@이 우여곡절 끝에 ...,no_relation,'@*조직*민주평화당@','#^조직^대안신당#'
2,2,K리그2에서 성적 1위를 달리고 있는 @*조직*광주FC@는 지난 26일 #^조직^한...,org:member_of,'@*조직*광주FC@','#^조직^한국프로축구연맹#'
3,3,균일가 생활용품점 (주)@*조직*아성다이소@(대표 #^사람^박정부#)는 코로나19 ...,org:top_members/employees,'@*조직*아성다이소@','#^사람^박정부#'
4,4,#^날짜^1967#년 프로 야구 드래프트 1순위로 @*조직*요미우리 자이언츠@에게 ...,no_relation,'@*조직*요미우리 자이언츠@','#^날짜^1967#'


### Test Dataset

In [None]:
# Convert the typed entity markers of the test split into punctuation markers
# with Korean type names, e.g. [SUB-PERSON]비틀즈[/SUB-PERSON] -> @*사람*비틀즈@.

# Hand-picked Korean names that take precedence over the machine translation.
kor_type_overrides = {'o': '없음', 'material': '재료'}
# English type -> Korean type; each type is translated once instead of once per row.
kor_type_cache = {}

for i in tqdm(range(len(test_dataset))) :
    # (entity column, wrapping punctuation, delimiter placed around the type name)
    for entity_col, wrap, type_delim in (('subject_entity', '@', '*'), ('object_entity', '#', '^')) :
        stored_entity = test_dataset.at[i, entity_col]
        # The cell holds a quoted string literal; literal_eval unquotes it without
        # executing arbitrary expressions the way eval would. strip() removes
        # surrounding whitespace, which literal_eval rejects on Python < 3.10.
        entity = ast.literal_eval(stored_entity.strip())

        tag_head = entity.split(']')[0]              # e.g. '[SUB-PERSON'
        open_tag = tag_head + ']'
        close_tag = '[' + entity.split('[')[-1]      # e.g. '[/SUB-PERSON]'
        entity_type = tag_head.split('-')[1].lower()

        # The override is looked up with this entity's own type, so the object's
        # Korean name never depends on the subject's type.
        if entity_type not in kor_type_cache :
            if entity_type in kor_type_overrides :
                kor_type_cache[entity_type] = kor_type_overrides[entity_type]
            else :
                kor_type_cache[entity_type] = mt(entity_type, src = 'en', tgt = 'ko')
        kor_type = kor_type_cache[entity_type]

        stored_entity = stored_entity.replace(open_tag, wrap + type_delim + kor_type + type_delim)
        stored_entity = stored_entity.replace(close_tag, wrap)
        # .at writes into the frame itself; chained `df[col][i] = ...` can write to a copy.
        test_dataset.at[i, entity_col] = stored_entity

        punct_entity = ast.literal_eval(stored_entity.strip())
        test_dataset.at[i, 'sentence'] = test_dataset.at[i, 'sentence'].replace(entity, punct_entity)

In [None]:
# Write the punctuation-marker (Korean type) splits without the row index.
train_dataset.to_csv('/content/Relation_Extraction/data/train_punct_kor.csv', index = False)
test_dataset.to_csv('/content/Relation_Extraction/data/test_punct_kor.csv', index = False)

### Special Token 추가 및 저장

In [None]:
# Save the Korean type names, one per line.
# punct_special_token holds two entries per row, so duplicates are dropped
# (keeping first-seen order) instead of writing the same name thousands of times.
with open('/content/Relation_Extraction/data/pororo_special_token_punct_kor.txt', 'w', encoding = 'UTF-8') as f :
    for token in dict.fromkeys(punct_special_token) :
        f.write(token + '\n')