In [None]:
# Github Clone
!git clone https://github.com/jjonhwa/KLUE-NLI.git

# Dataset Unzip
!tar -zxvf /content/KLUE-NLI/data/klue-nli-v1.1.tar.gz
!unzip -q /content/KLUE-NLI/data/open.zip -d /content/data

# Transformer Library Download
!pip install transformers

# Kor NLI github clone
!git clone https://github.com/kakaobrain/KorNLUDatasets.git

# Original Dataset Unzip
!unzip -q '/content/KLUE-NLI/data/open.zip' -d '/content/data'

In [None]:
import pandas as pd
import json
import os
import matplotlib.pyplot as plt

from tqdm import tqdm
from transformers import AutoTokenizer

%cd KLUE-NLI
from utils.mk_data import read_json, create_pandas
from utils.nlpdata_eda import corpus_statistic_with_graph

### KLUE OFFICIAL DATASET 추가 (Only Dev)

In [None]:
# Location of the KLUE-NLI dev split, relative to the current working directory
data_dir, valid_filename = "./klue-nli-v1.1", "klue-nli-v1.1_dev.json"
valid_file_path = os.path.join(data_dir, valid_filename)

# Parse the JSON file, then turn it into a DataFrame with the project helpers
valid_json = read_json(valid_file_path)
valid_df = create_pandas(valid_json)

In [None]:
# Remove the 'guid' and 'source' columns in place, then save the remaining columns as CSV
valid_df.drop(columns=['guid', 'source'], inplace=True)
valid_df.to_csv(path_or_buf='/content/KLUE-NLI/data/klue_dev.csv')

### Kor NLI DATASET 추가 (with Token Length)

In [None]:
# Use only the human-translated part of KorNLI (the XNLI dev and test files)
kakao_dev = pd.read_csv(
    '/content/KorNLUDatasets/KorNLI/xnli.dev.ko.tsv',
    sep='\t',
)
kakao_test = pd.read_csv(
    '/content/KorNLUDatasets/KorNLI/xnli.test.ko.tsv',
    sep='\t',
)

# Stack both splits with a fresh 0..n-1 index and rename the columns
kakao_dataset = pd.concat([kakao_dev, kakao_test], ignore_index=True)
kakao_dataset.columns = ['premise', 'hypothesis', 'label']

In [None]:
# Inspect the label distribution of the KorNLI data
feature = kakao_dataset['label']

plt.figure(figsize=(10,7.5))
plt.title('Label Count', fontsize=20)

temp = feature.value_counts()
plt.bar(temp.keys(), temp.values, width=0.5, color='b', alpha=0.5)

# Write the count above every bar. Looping over all counts (instead of three
# hard-coded plt.text calls) works for any number of distinct labels.
# Assumes the labels are strings, so matplotlib places the bars at x = 0, 1, 2, ...
# which matches the original hard-coded offsets.
for position, count in enumerate(temp.values):
    plt.text(position - 0.05, count + 20, s=count)

plt.xticks(temp.keys(), fontsize=12) # x-axis tick values and font size
plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # layout settings
plt.show() # render the figure

In [None]:
# Load the original competition dataset (train and test CSV files)
PATH = '/content/data/open'
train = pd.read_csv(
    os.path.join(PATH, 'train_data.csv'),
    encoding='utf-8',
)
test = pd.read_csv(
    os.path.join(PATH, 'test_data.csv'),
    encoding='utf-8',
)

# Stack train and test into a single frame with a fresh 0..n-1 index
concat_dataset = pd.concat([train, test], ignore_index=True)

In [None]:
# Check how many sentences of the original dataset also occur in the KorNLI data.
# `value in series` tests the Series *index labels*, not its values, so the
# membership test is done with Series.isin, which compares against the values.
premise_cnt = int(
    concat_dataset['premise'].isin(kakao_dataset['premise']).sum()
)
hypothesis_cnt = int(
    concat_dataset['hypothesis'].isin(kakao_dataset['hypothesis']).sum()
)

# Show the result so the outcome of the check is visible in the notebook
print("Duplicated premises:", premise_cnt)
print("Duplicated hypotheses:", hypothesis_cnt)

In [None]:
# Fetch the tokenizer that belongs to the checkpoint named in MODEL_NAME
MODEL_NAME = "klue/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
)

In [None]:
# Print an example: the tokens produced for the first KorNLI premise
example_format = kakao_dataset['premise'].iloc[0]
print(
    tokenizer.tokenize(example_format)
)

In [None]:
# Inspect the token-length distribution of the KorNLI premises ...
premise_texts = kakao_dataset['premise'].tolist()
train_premise_eda = corpus_statistic_with_graph(
    premise_texts,
    tokenizer_type='wordpiece',
    tokenizer=tokenizer,
)
print(train_premise_eda)

# ... and of the KorNLI hypotheses
hypothesis_texts = kakao_dataset['hypothesis'].tolist()
train_hypothesis_eda = corpus_statistic_with_graph(
    hypothesis_texts,
    tokenizer_type='wordpiece',
    tokenizer=tokenizer,
)
print(train_hypothesis_eda)

In [None]:
# Keep only the pairs whose premise and hypothesis are each at most 50 tokens long
def _exceeds_limit(text):
    """Return True when `text` is split into more than 50 tokens by the tokenizer."""
    return len(tokenizer.tokenize(text)) > 50

# Row labels to discard; label-based lookup relies on the 0..n-1 index of kakao_dataset
drop_list = [
    row
    for row in range(len(kakao_dataset))
    if _exceeds_limit(kakao_dataset['premise'][row])
    or _exceeds_limit(kakao_dataset['hypothesis'][row])
]

kakao_dataset.drop(index=drop_list, inplace=True)
kakao_dataset.reset_index(drop=True, inplace=True)

print("추가 Dataset의 개수:", len(kakao_dataset))

In [None]:
# Save the KorNLI frame as CSV (pandas writes the index as the first column by default)
kakao_dataset.to_csv(path_or_buf='/content/KLUE-NLI/data/kor_nli_valid.csv')