## 21세기 세종 형태분석 말뭉치 통계 사전 구축 과정


In [137]:
import sys
from utils import load, get_sentences, load_processed_corpus, tageojeol_to_tuple
from hangle import decompose, compose, moum_begin, moum_end, jaum_begin, jaum_end, kor_begin, kor_end

In [116]:
import glob
# glob.glob() returns paths in arbitrary (filesystem-dependent) order; sort
# them so that every later pass walks the corpus in a reproducible order.
tagged_corpus = sorted(glob.glob('../sejong/*'))
len(tagged_corpus)

446

In [125]:
# Dry run: try to load every raw corpus file and report the unreadable ones.
for file_name in tagged_corpus:
    try:
        lines = load(file_name)
    except Exception as err:
        # Report and move on so that all failing files are listed in one pass.
        print('ERROR: %s: %s' % (file_name, str(err)))

ERROR: ../sejong\BTAE0201.txt: 'utf-16-le' codec can't decode bytes in position 2172-2173: illegal encoding


# XML 태그 제거

In [202]:
def summary(sentences):
    """Flatten tagged sentences into ``"<text>\\t<tags>"`` lines.

    Args:
        sentences: Iterable of sentences, each a non-empty sequence of
            ``(text, tagged)`` string pairs.

    Returns:
        A list with one string per sentence: the space-joined first elements,
        a tab, then the space-joined second elements.
    """
    def _as_line(sentence):
        # Transpose the pairs into a tuple of texts and a tuple of tags.
        text, tagged = zip(*sentence)
        return '\t'.join((' '.join(text), ' '.join(tagged)))

    return [_as_line(sentence) for sentence in sentences]

In [203]:
import os
# Write one refined file per raw corpus file: one sentence per line in the
# "<text>\t<tags>" format produced by summary().
directory = '../refinement'
# Create the output directory once, up front; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(directory, exist_ok=True)
for file_path in tagged_corpus:
    refinement_file_path = "{0:s}/{1:s}".format(directory, os.path.basename(file_path))
    try:
        sentences = summary(get_sentences(file_path))
    except Exception as e:
        print('ERROR: %s: %s' % (file_path, str(e)))
        # Skip this file. Without the skip, the sentences of the previously
        # processed file would be written out under this file's name.
        continue
    with open(refinement_file_path, 'w', encoding='utf-8') as f:
        f.writelines('%s\n' % sentence for sentence in sentences)

ERROR: ../sejong\BTAE0201.txt: 'utf-16-le' codec can't decode bytes in position 2172-2173: illegal encoding


In [204]:
# Sort the listing: glob.glob() gives no ordering guarantee, so without it
# "the first file" (and the row order of anything built from this list) could
# differ between runs or machines.
refinement_corpus = sorted(glob.glob('../refinement/*'))
# Peek at the first line of the first refined file as a sanity check.
with open(refinement_corpus[0], encoding='utf-8') as f:
    line = f.readline()
line

'프랑스의 세계적인 의상 디자이너 엠마누엘 웅가로가 실내 장식용 직물 디자이너로 나섰다.\t프랑스/NNP+의/JKG 세계/NNG+적/XSN+이/VCP+ᆫ/ETM 의상/NNG 디자이너/NNG 엠마누엘/NNP 웅가로/NNP+가/JKS 실내/NNG 장식/NNG+용/XSN 직물/NNG 디자이너/NNG+로/JKB 나서/VV+었/EP+다/EF+./SF\n'

# 형태소 단위로 변환

In [209]:
def is_all_complete_hangle(eojeol):
    """Return True if every character's code point is in [kor_begin, kor_end].

    An empty string yields True, since there is no character to reject.
    """
    return all(kor_begin <= ord(ch) <= kor_end for ch in eojeol)

def to_lr(e, w, t):
    """Split an eojeol into a left part (L) and a right part (R).

    Args:
        e: Surface string of the eojeol.
        w: Sequence of morpheme strings of the eojeol, parallel to ``t``.
        t: Sequence of tag strings, one per morpheme.

    Returns:
        A 4-tuple ``(lw, r, tag, l0)``:
            lw:  prefix of ``e`` whose length equals the combined length of
                 the morphemes assigned to L.
            r:   the remainder of ``e`` after ``lw``.
            tag: first character of ``t[0]``; forced to 'N' in the ETN case
                 below, and 'X' is mapped to 'N' on the fall-through return.
            l0:  either ``lw`` with its last syllable rebuilt by ``compose``
                 or the plain concatenation of the L morphemes.

    NOTE(review): ``w[i+1]`` is read without a bounds check. When every
    morpheme ends up in L and ``t[i]`` starts with 'N' or 'V', this raises
    IndexError; the visible caller swallows that and skips the eojeol.
    Confirm whether that is intended.
    """
    tag = t[0][0]
    # i is the index of the last morpheme that belongs to L.
    i = 0
    for i_, ti in enumerate(t):
        # Eojeol starts with a noun: L ends before the first V* morpheme.
        if t[0][0] == 'N' and ti[0] == 'V':
            break
        # Eojeol starts with a verb and hits an ETN morpheme that is a single
        # character in the jaum_begin..jaum_end range: label the result 'N'
        # and stop before it.
        if t[0][0] == 'V' and (ti == 'ETN' and len(w[i_]) == 1 and jaum_begin <= ord(w[i_][0]) <= jaum_end):
            tag = 'N'
            break
        # Only N*, XSN, VV*, VA* and XR morphemes may extend L.
        if not (ti[0] == 'N' or ti == 'XSN' or ti[:2] == 'VV' or ti[:2] == 'VA' or ti == 'XR'):
            break
        i = i_
    # The length comes from the morphemes but the characters come from the
    # surface form, so lw keeps the surface spelling.
    lw = e[:len(''.join(w[:i+1]))]
    r = e[len(lw):]
    
    # The morpheme right after L starts with a character in the
    # jaum_begin..jaum_end range, e.g.
    #   아빤 = 아빠/N + ㄴ/J
    #   갈꺼야 = 가/V + ㄹ/E + 꺼야/E
    if (t[i][0] == 'N' or t[i][0] == 'V') and (jaum_begin <= ord(w[i+1][0]) <= jaum_end):
        # Rebuild the last syllable of lw from its first two decomposed parts,
        # passing ' ' as the third -- presumably this drops the final
        # consonant; verify against hangle.compose.
        last_l = decompose(lw[-1])
        l0 = lw[:-1] + compose(last_l[0], last_l[1], ' ')
        return lw, r, tag, l0

    # Otherwise the stemmed L is the concatenation of the L morphemes, e.g.
    #   가? = 가/V + ㅏ/E + ?/S
    #   먹었어 = 먹/V + 었어/E
    return lw, r, tag.replace('X','N'), ''.join(w[:i+1])

def is_compound_noun(t):
    """Tell whether a tag sequence looks like a compound noun.

    Args:
        t: Sequence of tag strings, one per morpheme of an eojeol.

    Returns:
        True when ``t`` has at least two tags and either every tag is
        noun-like (starts with 'N' or equals 'XSN'), or at least two tags are
        noun-like and both the first and the last tag start with 'N'.
        False otherwise (always a real bool, never None).
    """
    if len(t) <= 1:
        return False
    n_count = sum(1 for ti in t if ti.startswith('N') or ti == 'XSN')
    if n_count == len(t):
        return True
    if n_count <= 1:
        return False
    # The original compared ``[-1][0]`` (the integer -1) with 'N', which is
    # never true; the last tag of ``t`` is what has to be inspected.
    return t[0].startswith('N') and t[-1].startswith('N')
        
def print_tolr(args):
    """Print a 4-tuple ``(L, R, tag, stemmed L)`` on one labelled line."""
    template = 'L=%s, R=%s, tag=%s, L(원형)=%s'
    print(template % args)

In [210]:
# Output directory for the generated dictionary files.
directory = '../db'
# exist_ok=True makes this safe to re-run and avoids the race between a
# separate os.path.exists() check and the directory creation.
os.makedirs(directory, exist_ok=True)

In [213]:
# Build the L/R table: one tab-separated row per eligible eojeol.
with open('../db/lrdb.csv', 'w', encoding='utf-8') as f:
    f.write('%s\n' % '\t'.join(['eojeol', 'l', 'r', 'tag', 'lstemmed']))
    for fname in refinement_corpus:
        texts, texttags = load_processed_corpus(fname)
        for text, tag in zip(texts, texttags):
            for eojeol, eojeoltag in zip(text.split(), tag.split()):
                # Only eojeols that pass is_all_complete_hangle() are kept.
                if not is_all_complete_hangle(eojeol):
                    continue

                # Skip eojeols whose tagged form cannot be parsed. Catch
                # Exception rather than everything, so that Ctrl-C
                # (KeyboardInterrupt) still stops this long-running loop.
                try:
                    w, t = tageojeol_to_tuple(eojeoltag)
                except Exception:
                    continue

                # Single-morpheme eojeols have nothing to split.
                if len(t) <= 1:
                    continue
                # Keep only eojeols whose first tag starts with 'N' or 'V'.
                if t[0][0] not in ('N', 'V'):
                    continue
                if is_compound_noun(t):
                    continue

                # Skip eojeols that to_lr() cannot split. The write is kept
                # outside the try block so that genuine I/O errors are no
                # longer silently discarded.
                try:
                    l, r, y, l0 = to_lr(eojeol, w, t)
                except Exception:
                    continue
                f.write('%s\n' % '\t'.join([eojeol, l, r, y, l0]))

In [219]:
import pandas as pd
# Read the generated tab-separated L/R table back in and preview its first rows.
df = pd.read_csv(
    '../db/lrdb.csv',
    encoding='utf-8',
    sep='\t',
)
df.head()

Unnamed: 0,eojeol,l,r,tag,lstemmed
0,프랑스의,프랑스,의,N,프랑스
1,세계적인,세계적,인,N,세계적
2,웅가로가,웅가로,가,N,웅가로
3,디자이너로,디자이너,로,N,디자이너
4,웅가로는,웅가로,는,N,웅가로
