In [None]:
# Korean font setup for Colab:
# install the Nanum font package, rebuild the fontconfig cache, and delete
# matplotlib's cache directory (presumably so the new fonts are picked up
# the next time matplotlib is imported).
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

# Install the Hugging Face transformers library
!pip install transformers

# Clone the project repository (provides the utils package and the zipped dataset)
!git clone https://github.com/jjonhwa/KLUE-NLI.git

# Unzip the dataset archive into /content/data
!unzip -q '/content/KLUE-NLI/data/open.zip' -d '/content/data'

In [None]:
# Library Import
import matplotlib.pyplot as plt
plt.rc('font', family='NanumBarunGothic') 
import pandas as pd
import os

from transformers import AutoTokenizer

%cd KLUE-NLI
from utils.nlpdata_eda import corpus_statistic_with_graph

In [None]:
# Load the train / test CSV files and discard their 'index' column
PATH = '/content/data/open'

train = (
    pd.read_csv(os.path.join(PATH, 'train_data.csv'), encoding='utf-8')
    .drop(columns=['index'])
)
test = (
    pd.read_csv(os.path.join(PATH, 'test_data.csv'), encoding='utf-8')
    .drop(columns=['index'])
)

## Dataset 분포 확인

In [None]:
# Check for null values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line
# after each report.
train.info()
print()  # blank line between the two reports
test.info()

In [None]:
# Label distribution
feature = train['label']
temp = feature.value_counts()

plt.figure(figsize=(10,7.5))
plt.title('Label Count', fontsize=20)

plt.bar(temp.keys(), temp.values, width=0.5, color='b', alpha=0.5)

# Write each count just above its bar. Looping over the counts works for any
# number of classes instead of assuming exactly three.
# NOTE(review): the x offsets assume bars sit at positions 0..n-1, which holds
# when labels are plotted on a categorical (string) axis; confirm for numeric labels.
for position, count in enumerate(temp.values):
    plt.text(position - 0.05, count + 20, s=count)

plt.xticks(temp.keys(), fontsize=12) # x-axis tick values and font size
plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # layout margins
plt.show() # render the figure

## Token Length 확인

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint named below
MODEL_NAME = 'klue/roberta-large'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Print the tokenization of one example sentence.
# .iloc[0] reads the first row by position without copying the whole column
# into a Python list first.
example_format = train['premise'].iloc[0]
print(tokenizer.tokenize(example_format))

### Train / Test Tokenizer Check

In [None]:
# Token-length statistics for every (split, column) pair.
# `checks` stays defined at cell level because a later cell iterates over it.
checks = ['train', 'test']
split_frames = {'train': train, 'test': test}

eda_results = {}
for check in checks:
    for column in ('premise', 'hypothesis'):
        # Judging by its name the helper presumably also draws a graph; its
        # return value is printed right after each call, as before.
        stats = corpus_statistic_with_graph(
            list(split_frames[check][column]),
            tokenizer_type='wordpiece',
            tokenizer=tokenizer,
        )
        print(stats)
        eda_results[(check, column)] = stats

# Expose the individual results under their original names.
train_premise_eda = eda_results[('train', 'premise')]
train_hypothesis_eda = eda_results[('train', 'hypothesis')]
test_premise_eda = eda_results[('test', 'premise')]
test_hypothesis_eda = eda_results[('test', 'hypothesis')]

In [None]:
# Collect the sentences that tokenize to more than MAX_TOKEN_LEN tokens
MAX_TOKEN_LEN = 64


def _over_length_indices(sentences, max_len=MAX_TOKEN_LEN):
    """Return positional indices of sentences with more than `max_len` tokens.

    Args:
        sentences: iterable of sentence strings (e.g. a DataFrame column).
        max_len: token-count threshold; only strictly longer sentences are reported.

    Returns:
        List of 0-based positions, in iteration order, of the over-length sentences.
    """
    return [
        position
        for position, sentence in enumerate(sentences)
        if len(tokenizer.tokenize(sentence)) > max_len
    ]


# Each tuple holds (train indices, test indices). The splits are iterated
# directly, so this cell no longer depends on the `checks` list from an earlier cell.
drop_premise_list = tuple(_over_length_indices(frame['premise']) for frame in (train, test))
drop_hypothesis_list = tuple(_over_length_indices(frame['hypothesis']) for frame in (train, test))

In [None]:
import matplotlib.pyplot as plt


def _in_out_ratio(over_limit_indices, total):
    """Return [% of sentences within the limit, % over the limit].

    Args:
        over_limit_indices: indices of the sentences that exceed the token limit.
        total: total number of sentences in the split.
    """
    out_pct = (len(over_limit_indices) / total) * 100
    return [100 - out_pct, out_pct]


def _draw_ratio_pie(position, ratio, labels, title):
    """Draw one within/over pie chart in cell `position` of the current figure's 2x2 grid."""
    plt.subplot(2, 2, position)
    plt.pie(ratio,
            labels=labels,
            autopct='%.2f%%',
            colors=['darkgreen', 'firebrick'],
            startangle=90,
            textprops={'fontsize': 14, 'weight': 'bold'},
            shadow=False,
            explode=(0.1, 0.1))
    plt.title(title, size=25)


# Index 0 of each drop list belongs to the train split, index 1 to the test split.
train_premise_ratio = _in_out_ratio(drop_premise_list[0], len(train))
test_premise_ratio = _in_out_ratio(drop_premise_list[1], len(test))

train_hypothesis_ratio = _in_out_ratio(drop_hypothesis_list[0], len(train))
test_hypothesis_ratio = _in_out_ratio(drop_hypothesis_list[1], len(test))

# The drop lists were built with a strict "> 64" test, so the second slice is
# "more than 64" (64초과); the previous label "64이상" (64 or more) was inaccurate.
train_labels = ['64이하', '64초과']
test_labels = ['64이하', '64초과']

# Figure 1: train split
plt.figure(figsize = (12,8))
_draw_ratio_pie(1, train_premise_ratio, train_labels, 'Train - Premise')
_draw_ratio_pie(2, train_hypothesis_ratio, train_labels, 'Train - Hypothesis')

# Figure 2: test split
plt.figure(figsize = (12,8))
_draw_ratio_pie(1, test_premise_ratio, test_labels, 'Test - Premise')
_draw_ratio_pie(2, test_hypothesis_ratio, test_labels, 'Test - Hypothesis')
plt.show()