# UNK token을 포함한 문장에는 무엇이 있는가 확인하는 ipynb

## 1. 기초 함수 세팅

In [1]:
import json

import pandas as pd
from matplotlib import pyplot as plt

from tqdm import tqdm
from transformers import AutoTokenizer

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Load the data and tokenizer, then add score-class, token-count and UNK-count columns

from collections import Counter, OrderedDict

# Locations of the train / dev CSV splits, relative to the working directory.
train_path, dev_path = './data/train.csv', './data/dev.csv'

# Read both splits into DataFrames.
train_data, dev_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, dev_path)
)

# Tokenizer loaded from the 'klue/roberta-small' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('klue/roberta-small')

# Function Settings
def get_num_tokens(df):
    """Count tokens and UNK tokens for both sentence columns of ``df``.

    Every ``sentence_1`` / ``sentence_2`` value is passed through the
    module-level ``tokenizer``. For each one, the length of the returned
    ``input_ids`` and the number of ids equal to
    ``tokenizer.unk_token_id`` are recorded.

    Args:
        df: DataFrame with ``sentence_1`` and ``sentence_2`` columns.

    Returns:
        A 4-tuple of lists with one entry per row, in row order:
        ``(sentence_1 token counts, sentence_2 token counts,
        sentence_1 UNK counts, sentence_2 UNK counts)``.
    """
    # NOTE(review): counts are taken over the full ``input_ids`` list, so
    # any special tokens the tokenizer adds are presumably included - confirm.
    unk_id = tokenizer.unk_token_id

    first_lengths, second_lengths = [], []
    first_unks, second_unks = [], []

    for text_a, text_b in zip(df['sentence_1'], df['sentence_2']):
        ids_a = tokenizer(text_a)['input_ids']
        ids_b = tokenizer(text_b)['input_ids']

        first_lengths.append(len(ids_a))
        second_lengths.append(len(ids_b))

        first_unks.append(ids_a.count(unk_id))
        second_unks.append(ids_b.count(unk_id))

    return first_lengths, second_lengths, first_unks, second_unks


# 1. Bucket every row's label into an integer score class and add it as a column.
train_data_scored = train_data.copy(deep=True)

# The label is truncated with int() before bucketing. A truncated value of 0
# maps to class 0; any other value is clamped into the range 1..5.
# NOTE(review): because of the truncation, a label such as 0.4 lands in
# class 0 and the result has six classes (0..5) - confirm this is intended.
score_integer = [
    0 if whole_part == 0 else min(max(whole_part, 1), 5)
    for whole_part in map(int, train_data_scored['label'])
]
train_data_scored['score_class'] = score_integer

# 2. Attach per-sentence token counts and UNK-token counts as new columns.
s1_len, s2_len, s1_unk, s2_unk = get_num_tokens(train_data_scored)

for column_name, column_values in (
    ('s1_num_tokens', s1_len),
    ('s2_num_tokens', s2_len),
    ('s1_num_unk', s1_unk),
    ('s2_num_unk', s2_unk),
):
    train_data_scored[column_name] = column_values

## 2. unk token 문장 확인 후 CSV 저장

In [3]:
# Collect every sentence that contains at least one UNK token, source by
# source, and write the sentences with their UNK counts to a CSV file.

unk1_token_sentence = []
unk2_token_sentence = []
unk1_count, unk2_count = [], []

source_list = sorted(train_data['source'].unique())

# Group once, outside the loop, by a scalar key so that get_group() accepts
# the plain source name.
grouped_by_source = train_data_scored.groupby('source')

for source_name in source_list:
    # Use the source currently being iterated. The group name used to be
    # hard-coded to 'nsmc-sampled', which appended that single group once
    # per source and ignored all the other sources.
    g = grouped_by_source.get_group(source_name)

    # Rows whose first / second sentence has at least one UNK token.
    s1_unk_rows = g[g['s1_num_unk'] >= 1]
    s2_unk_rows = g[g['s2_num_unk'] >= 1]

    unk1_token_sentence.extend(s1_unk_rows['sentence_1'].values.tolist())
    unk2_token_sentence.extend(s2_unk_rows['sentence_2'].values.tolist())
    unk1_count.extend(s1_unk_rows['s1_num_unk'].values.tolist())
    unk2_count.extend(s2_unk_rows['s2_num_unk'].values.tolist())


# The two sentence lists are selected independently and can differ in length;
# pad the shorter pair with empty strings so all four columns line up.
max_len = max(len(unk1_token_sentence), len(unk2_token_sentence))
unk1_token_sentence += [''] * (max_len - len(unk1_token_sentence))
unk1_count += [''] * (max_len - len(unk1_count))
unk2_token_sentence += [''] * (max_len - len(unk2_token_sentence))
unk2_count += [''] * (max_len - len(unk2_count))

unk_pd = pd.DataFrame({
    'unk1_sentences': unk1_token_sentence,
    'unk1_count': unk1_count,
    'unk2_sentences': unk2_token_sentence,
    'unk2_count': unk2_count,
})
# NOTE(review): the file name spelling ("setences") is kept as-is so that
# anything reading this path keeps working.
unk_pd.to_csv('./unk_setences_train.csv')