In [None]:
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
from pprint import pprint

In [None]:
# Raw corpora, read as tab-separated text with no header row (column 0 holds the text).
# Row counts below are the ones recorded when the files were first loaded.
df1 = pd.read_csv('../data/raw_spoken.txt', sep='\t', header=None)        # spoken: ~500k rows (499998)
df2 = pd.read_csv('../data/raw_ratings.txt', sep='\t', header=None)       # ratings: ~200k rows (199992)
df3 = pd.read_csv('../data/raw_wiki_ko_sent.txt', sep='\t', header=None)  # wiki: ~4M rows (4528804)

## 1. Basic Corpus Statistics

In [None]:
# The three corpora are always reported in the same order: df1, df2, df3.
corpora = (df1, df2, df3)

print("Number of Sentences")
for corpus in corpora:
    print(len(corpus))

# Character count of every row's text (column 0), stored as a new 'length' column.
for corpus in corpora:
    corpus['length'] = corpus[0].apply(len)

print("\n==Basic Stat==")
for corpus in corpora:
    lengths = corpus['length']
    print(lengths.mean(), lengths.median(), lengths.std())

print("\n==Total Character==")
for corpus in corpora:
    print(corpus['length'].sum())



In [None]:
# Histogram of per-row character counts for df1 (loaded from raw_spoken.txt).
# The Axes is kept instead of printed so the figure can be labelled; the
# trailing ';' suppresses the textual repr of the last expression.
ax = df1['length'].hist()
ax.set(title="Sentence length distribution: spoken corpus",
       xlabel="Characters per sentence",
       ylabel="Number of sentences");

In [None]:
# Histogram of per-row character counts for df2 (loaded from raw_ratings.txt).
# The Axes is kept instead of printed so the figure can be labelled; the
# trailing ';' suppresses the textual repr of the last expression.
ax = df2['length'].hist()
ax.set(title="Sentence length distribution: ratings corpus",
       xlabel="Characters per sentence",
       ylabel="Number of sentences");

In [None]:
# Histogram of per-row character counts for df3 (loaded from raw_wiki_ko_sent.txt).
# The Axes is kept instead of printed so the figure can be labelled; the
# trailing ';' suppresses the textual repr of the last expression.
ax = df3['length'].hist()
ax.set(title="Sentence length distribution: wiki corpus",
       xlabel="Characters per sentence",
       ylabel="Number of sentences");

## 2. Extract Initial Consonants



In [None]:
# Read the cleaned corpus into `sents`, one list entry per line of the file.
# NOTE(review): assumes clean_corpus.txt stores one sentence per line — confirm.
# The handle is not called `input` so the builtin of that name stays usable.
with open('../data/clean_corpus.txt', 'r', encoding='utf-8') as corpus_file:
    sents = [line.rstrip('\n') for line in corpus_file]

In [None]:
# Upper bound on the number of characters kept per sentence.
max_char_length = 128

# Zero-filled float vector with one slot per character position.
a = np.zeros((max_char_length,))

In [None]:
from itertools import product
import re


class NGRAMTokenizer():
    """Turn Korean text into three aligned id sequences, one id per character.

    Every precomposed Hangul syllable is split into its initial consonant
    (head), medial vowel (mid) and final consonant (tail). Any other character
    (spaces, punctuation, standalone jamo, ...) is kept as its own "head" and
    gets the placeholder '@' as mid and tail.

    * head ids index an n-gram of heads centred on the character; the window
      is padded with ' ' beyond the sentence boundaries,
    * mid ids / tail ids index the character's own vowel / final consonant.

    All ids are shifted by +1 so that 0 is left for padding positions.

    Args:
        ngram: Size of the head n-gram window.
        max_char_length: Sentences are truncated to, and outputs padded to,
            this many characters.
        head_list, mid_list, tail_list: Optional replacement symbol tables;
            ``None`` (or an empty list) selects the class defaults.

    Note:
        A head that is neither in ``EXTRA_LIST`` nor in ``head_list`` (e.g. a
        Latin letter or a digit) is not part of the n-gram vocabulary, so
        encoding it raises ``KeyError``.
    """

    # Layout used to decompose a precomposed syllable:
    #   code point = BASE_CODE + head * HEAD + mid * MID + tail
    BASE_CODE, HEAD, MID = 44032, 588, 28

    # Initial consonants (choseong), indices 00 ~ 18
    HEAD_LIST = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
    # Non-Hangul symbols that may appear in a head n-gram; ' ' doubles as the window padding.
    EXTRA_LIST = [' ', ',', '.', '?', '!', '~', '∼']

    # Medial vowels (jungseong), indices 00 ~ 20; the trailing '@' marks a non-syllable character
    MID_LIST = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ', '@']

    # Final consonants (jongseong), indices 00 ~ 27; '#' at index 0 means "no final consonant",
    # the trailing '@' marks a non-syllable character
    TAIL_LIST = ['#', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ', '@']

    def __init__(self, ngram, max_char_length, head_list=None, mid_list=None, tail_list=None):
        self.ngram = ngram
        self.max_char_length = max_char_length
        self.head_list = head_list if head_list else self.HEAD_LIST
        self.mid_list = mid_list if mid_list else self.MID_LIST
        self.tail_list = tail_list if tail_list else self.TAIL_LIST

        # Head vocabulary = every ordered n-gram over (extra symbols + initial consonants).
        self.head2id = self.generate_head_ngram2id(self.EXTRA_LIST+self.head_list, self.ngram)
        self.mid2id = {mid:i for i,mid in enumerate(self.mid_list)}
        self.tail2id = {tail:i for i,tail in enumerate(self.tail_list)}

    def generate_head_ngram2id(self, head_list, ngram):
        """Map every ``ngram``-tuple over ``head_list`` to a 0-based id.

        Ids follow the order in which ``itertools.product`` yields the tuples.
        """
        return {ngram_head: i for i, ngram_head in enumerate(product(head_list, repeat=ngram))}

    def encode(self, sent_list):
        """Encode each sentence of ``sent_list`` with :meth:`encode_sent`.

        Returns:
            Three lists (head ids, mid ids, tail ids), each holding one array
            of length ``max_char_length`` per input sentence.
        """
        list_head_ids = list()
        list_mid_ids = list()
        list_tail_ids = list()

        for sent in sent_list:
            head_ids, mid_ids, tail_ids = self.encode_sent(sent)
            list_head_ids.append(head_ids)
            list_mid_ids.append(mid_ids)
            list_tail_ids.append(tail_ids)

        return list_head_ids, list_mid_ids, list_tail_ids

    def encode_sent(self, sent):
        """Encode one sentence into head-n-gram, mid and tail id arrays.

        Args:
            sent: The sentence; only its first ``max_char_length`` characters are used.

        Returns:
            Tuple ``(head_ids, mid_ids, tail_ids)`` of integer arrays of length
            ``max_char_length``; positions past the end of the sentence stay 0.

        Raises:
            KeyError: If a head n-gram contains a symbol outside the vocabulary.
        """
        heads = list()
        mids = list()
        tails = list()

        for char in sent[:self.max_char_length]: # truncate with max_char_length
            # Only precomposed syllables can be decomposed arithmetically.
            # Standalone jamo (e.g. 'ㄱ', 'ㅏ') lie outside this range and must
            # not go through the arithmetic below: it would yield negative indices.
            if '가' <= char <= '힣':
                char_code = ord(char) - self.BASE_CODE
                head_idx, remainder = divmod(char_code, self.HEAD)
                mid_idx, tail_idx = divmod(remainder, self.MID)
                heads.append(self.head_list[head_idx])
                mids.append(self.mid_list[mid_idx])
                tails.append(self.tail_list[tail_idx])
            else: # not a precomposed syllable: keep the character itself as the head
                heads.append(char)
                mids.append('@')
                tails.append('@')

        # np.int was removed from NumPy; the builtin int is the type it aliased.
        head_ids = np.zeros(self.max_char_length, dtype=int)
        mid_ids = np.zeros(self.max_char_length, dtype=int)
        tail_ids = np.zeros(self.max_char_length, dtype=int)

        # The window around position i spans [i - left_offset, i + right_offset];
        # for an even ngram the extra slot goes to the left side.
        left_offset = self.ngram // 2
        right_offset = (self.ngram - 1) // 2

        # Padding once up front makes every window a plain slice of length ngram.
        padded_heads = [' '] * left_offset + heads + [' '] * right_offset

        # Convert symbols to ids (+1 keeps 0 free for padding)
        for i, (mid, tail) in enumerate(zip(mids, tails)):
            window = tuple(padded_heads[i:i + self.ngram])
            head_ids[i] = self.head2id[window] + 1
            mid_ids[i] = self.mid2id[mid] + 1
            tail_ids[i] = self.tail2id[tail] + 1

        return head_ids, mid_ids, tail_ids
            

In [16]:
# Encode one sample sentence with a trigram tokenizer capped at 15 characters.
sentence = ["내가 너 엄청 좋아해!"]
tokenizer = NGRAMTokenizer(ngram=3, max_char_length=15)

# Vocabulary sizes of the three lookup tables.
print("Num Head Vocab:", len(tokenizer.head2id))
print("Num  Mid Vocab:", len(tokenizer.mid2id))
print("Num Tail Vocab:", len(tokenizer.tail2id))

head_ids, mid_ids, tail_ids = tokenizer.encode(sentence)

# Show the id array of the first (only) sentence for each component.
for title, ids in (
    ("Head Consonant ID", head_ids),
    ("Mid Consonant ID", mid_ids),
    ("Tail Consonant ID", tail_ids),
):
    print()
    print(title)
    print('->', ids[0])


Num Head Vocab: 17576
Num  Mid Vocab: 22
Num Tail Vocab: 29

Head Consonant ID
-> [  242  6267  4742   235  6103   490 12715 14216   513 13338 12823 17005
     0     0     0]

Mid Consonant ID
-> [ 2  1 22  5 22  5  5 22  9  1  2 22  0  0  0]

Tail Consonant ID
-> [ 1  1 29  1 29 17 22 29 28  1  1 29  0  0  0]


In [None]:
# Same sample sentence and trigram tokenizer as in the demo cell, without printing.
sentence = ["내가 너 엄청 좋아해!"]
tokenizer = NGRAMTokenizer(ngram=3, max_char_length=15)
head_ids, mid_ids, tail_ids = tokenizer.encode(sentence)



In [None]:
# Character n-grams of every sentence in `sentence`.
# `sentence` is a list of strings, so the sliding window has to run over each
# string's characters; slicing the list itself yields nothing once the list
# has fewer than N entries.
N = 3
grams = [text[i:i+N] for text in sentence for i in range(len(text)-N+1)]


In [None]:
# Display the n-grams built in the previous cell.
grams