In [1]:
__all__ = ["split_syllable_char", "split_syllables",
           "join_jamos", "join_jamos_char",
           "CHAR_INITIALS", "CHAR_MEDIALS", "CHAR_FINALS"]

import itertools

INITIAL = 0x001
MEDIAL = 0x010
FINAL = 0x100
CHAR_LISTS = {
    INITIAL: list(map(chr, [
        0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139,
        0x3141, 0x3142, 0x3143, 0x3145, 0x3146, 0x3147,
        0x3148, 0x3149, 0x314a, 0x314b, 0x314c, 0x314d,
        0x314e
    ])),
    MEDIAL: list(map(chr, [
        0x314f, 0x3150, 0x3151, 0x3152, 0x3153, 0x3154,
        0x3155, 0x3156, 0x3157, 0x3158, 0x3159, 0x315a,
        0x315b, 0x315c, 0x315d, 0x315e, 0x315f, 0x3160,
        0x3161, 0x3162, 0x3163
    ])),
    FINAL: list(map(chr, [
        0x3131, 0x3132, 0x3133, 0x3134, 0x3135, 0x3136,
        0x3137, 0x3139, 0x313a, 0x313b, 0x313c, 0x313d,
        0x313e, 0x313f, 0x3140, 0x3141, 0x3142, 0x3144,
        0x3145, 0x3146, 0x3147, 0x3148, 0x314a, 0x314b,
        0x314c, 0x314d, 0x314e
    ]))
}
CHAR_INITIALS = CHAR_LISTS[INITIAL]
CHAR_MEDIALS = CHAR_LISTS[MEDIAL]
CHAR_FINALS = CHAR_LISTS[FINAL]
CHAR_SETS = {k: set(v) for k, v in CHAR_LISTS.items()}
CHARSET = set(itertools.chain(*CHAR_SETS.values()))
CHAR_INDICES = {k: {c: i for i, c in enumerate(v)}
                for k, v in CHAR_LISTS.items()}


def is_hangul_syllable(c):
    return 0xac00 <= ord(c) <= 0xd7a3  # Hangul Syllables


def is_hangul_jamo(c):
    return 0x1100 <= ord(c) <= 0x11ff  # Hangul Jamo


def is_hangul_compat_jamo(c):
    return 0x3130 <= ord(c) <= 0x318f  # Hangul Compatibility Jamo


def is_hangul_jamo_exta(c):
    return 0xa960 <= ord(c) <= 0xa97f  # Hangul Jamo Extended-A


def is_hangul_jamo_extb(c):
    return 0xd7b0 <= ord(c) <= 0xd7ff  # Hangul Jamo Extended-B


def is_hangul(c):
    return (is_hangul_syllable(c) or
            is_hangul_jamo(c) or
            is_hangul_compat_jamo(c) or
            is_hangul_jamo_exta(c) or
            is_hangul_jamo_extb(c))


def is_supported_hangul(c):
    return is_hangul_syllable(c) or is_hangul_compat_jamo(c)


def check_hangul(c, jamo_only=False):
    if not ((jamo_only or is_hangul_compat_jamo(c)) or is_supported_hangul(c)):
        raise ValueError(f"'{c}' is not a supported hangul character. "
                         f"'Hangul Syllables' (0xac00 ~ 0xd7a3) and "
                         f"'Hangul Compatibility Jamos' (0x3130 ~ 0x318f) are "
                         f"supported at the moment.")


def get_jamo_type(c):
    check_hangul(c)
    assert is_hangul_compat_jamo(c), f"not a jamo: {ord(c):x}"
    return sum(t for t, s in CHAR_SETS.items() if c in s)


def split_syllable_char(c):
    """
    Splits a given korean syllable into its components. Each component is
    represented by Unicode in 'Hangul Compatibility Jamo' range.

    Arguments:
        c: A Korean character.

    Returns:
        A triple (initial, medial, final) of Hangul Compatibility Jamos.
        If no jamo corresponds to a position, `None` is returned there.

    Example:
        >>> split_syllable_char("안")
        ("ㅇ", "ㅏ", "ㄴ")
        >>> split_syllable_char("고")
        ("ㄱ", "ㅗ", None)
        >>> split_syllable_char("ㅗ")
        (None, "ㅗ", None)
        >>> split_syllable_char("ㅇ")
        ("ㅇ", None, None)
    """
    check_hangul(c)
    if len(c) != 1:
        raise ValueError("Input string must have exactly one character.")

    init, med, final = None, None, None
    if is_hangul_syllable(c):
        offset = ord(c) - 0xac00
        x = (offset - offset % 28) // 28
        init, med, final = x // 21, x % 21, offset % 28
        if not final:
            final = None
        else:
            final -= 1
    else:
        pos = get_jamo_type(c)
        if pos & INITIAL == INITIAL:
            pos = INITIAL
        elif pos & MEDIAL == MEDIAL:
            pos = MEDIAL
        elif pos & FINAL == FINAL:
            pos = FINAL
        idx = CHAR_INDICES[pos][c]
        if pos == INITIAL:
            init = idx
        elif pos == MEDIAL:
            med = idx
        else:
            final = idx
    return tuple(CHAR_LISTS[pos][idx] if idx is not None else None
                 for pos, idx in
                 zip([INITIAL, MEDIAL, FINAL], [init, med, final]))


def split_syllables(s, ignore_err=True, pad= '_'):
    """
    Performs syllable-split on a string.

    Arguments:
        s (str): A string (possibly mixed with non-Hangul characters).
        ignore_err (bool): If set False, it ensures that all characters in
            the string are Hangul-splittable and throws a ValueError otherwise.
            (default: True)
        pad (str): Pad empty jamo positions (initial, medial, or final) with
            `pad` character. This is useful for cases where fixed-length
            strings are needed. (default: None)

    Returns:
        Hangul-split string

    Example:
        >>> split_syllables("안녕하세요")
        "ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ"
        >>> split_syllables("안녕하세요~~", ignore_err=False)
        ValueError: encountered an unsupported character: ~ (0x7e)
        >>> split_syllables("안녕하세요ㅛ", pad="x")
        'ㅇㅏㄴㄴㅕㅇㅎㅏxㅅㅔxㅇㅛxxㅛx'
    """

    def try_split(c):
        try:
            return split_syllable_char(c)
        except ValueError:
            if ignore_err:
                return (c,)
            raise ValueError(f"encountered an unsupported character: "
                             f"{c} (0x{ord(c):x})")

    s = map(try_split, s)
    if pad is not None:
        tuples = map(lambda x: tuple(pad if y is None else y for y in x), s)
    else:
        tuples = map(lambda x: filter(None, x), s)
    return "".join(itertools.chain(*tuples))


def join_jamos_char(init, med, final=None):
    """
    Combines jamos into a single syllable.

    Arguments:
        init (str): Initial jao.
        med (str): Medial jamo.
        final (str): Final jamo. If not supplied, the final syllable is made
            without the final. (default: None)

    Returns:
        A Korean syllable.
    """
    chars = (init, med, final)
    for c in filter(None, chars):
        check_hangul(c, jamo_only=True)

    idx = tuple(CHAR_INDICES[pos][c] if c is not None else c
                for pos, c in zip((INITIAL, MEDIAL, FINAL), chars))
    init_idx, med_idx, final_idx = idx
    # final index must be shifted once as
    # final index with 0 points to syllables without final
    final_idx = 0 if final_idx is None else final_idx + 1
    return chr(0xac00 + 28 * 21 * init_idx + 28 * med_idx + final_idx)


def join_jamos(s, ignore_err=True):
    """
    Combines a sequence of jamos to produce a sequence of syllables.

    Arguments:
        s (str): A string (possible mixed with non-jamo characters).
        ignore_err (bool): If set False, it will ensure that all characters
            will be consumed for the making of syllables. It will throw a
            ValueError when it fails to do so. (default: True)

    Returns:
        A string

    Example:
        >>> join_jamos("ㅇㅏㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
        "안녕하세요"
        >>> join_jamos("ㅇㅏㄴㄴㄴㅕㅇㅎㅏㅅㅔㅇㅛ")
        "안ㄴ녕하세요"
        >>> join_jamos()
        아니 ㅇㅏㄴㅣ
        아니 ㅇㅏ_ㄴㅣ
    """
    last_t = 0
    queue = []
    new_string = ""

    def flush(n=0):
        new_queue = []
        while len(queue) > n:
            new_queue.append(queue.pop())
        if len(new_queue) == 1:
            if not ignore_err:
                raise ValueError(f"invalid jamo character: {new_queue[0]}")
            result = new_queue[0]
        elif len(new_queue) >= 2:
            try:
                result = join_jamos_char(*new_queue)
            except (ValueError, KeyError):
                # Invalid jamo combination
                if not ignore_err:
                    raise ValueError(f"invalid jamo characters: {new_queue}")
                result = "".join(new_queue)
        else:
            result = None
        return result

    for c in s:
        if c == '_':  #jeonghui add
            continue  #jeonghui add
        if c not in CHARSET:
            if queue:
                new_c = flush() + c
            else:
                new_c = c
            last_t = 0
        else:
            t = get_jamo_type(c)
            new_c = None
            if t & FINAL == FINAL:
                if not (last_t == MEDIAL):
                    new_c = flush()
            elif t == INITIAL:
                new_c = flush()
            elif t == MEDIAL:
                if last_t & INITIAL == INITIAL:
                    new_c = flush(1)
                else:
                    new_c = flush()
            last_t = t
            queue.insert(0, c)
        if new_c:
            new_string += new_c
    if queue:
        new_string += flush()
    return new_string



In [40]:
from khaiii import KhaiiiApi
from collections import deque
api = KhaiiiApi()
da_low_full = ['다', 'ㄴ다', '는다', '더다', '느냐다', '단다', '서다', '어서다', '세다', '는지다', '란다', 'ㄴ단다', '냐다', '라다', '드다', 'ㄹ지다', '더란다', '잖다', '넌다', '냐이다', 'ㄴ가다', '매다', '건다', '대다', '가다', '으란다', '어다', 'ㅂ세다', 'ㄴ지다', '아다', '앗다', '거다', '다다', '뵀다', '서란다', 'ㄹ다', '로다', '나다', '엿다', '여다', '딘다', '으다', '쫒는다', '네다', '더랬다', '는단다', '어라다', 'ㄹ트다', '소이다', '뫼다', 'ㅂ디다', '는가다', 'ㄴ는다', '을지다', '더이다', 'ㅂ다', '두다', '인다', '렌다', '은다', '구다', '습다', '련다', '차다', '아야다', 'ㄴ다다', 'ㄴ거다', '을게다', '기다', '프다', '오다', 'ㅁ다', '엇다', '조다', '햇다', '아서다']

da_case1 = ['더다', '느냐다', '서다', '어서다', '세다', '는지다', '냐다', '라다', '드다', 'ㄹ지다', 'ㄴ가다', '매다', '대다', '가다', '어다', 'ㅂ세다', 'ㄴ지다', '거다', 'ㄹ다', '네다', '어라다', 'ㄹ트다', '뫼다', '는가다', '을지다', '두다', '구다', '아야다', '조다', '아서다', '오다', '기다']
da_case2 = ['ㄴ다', '단다', '란다', 'ㄴ단다', '더란다', '넌다', '냐이다', '건다', '으란다', '서란다', '나다', '는단다', '인다', '렌다', '은다', '련다', '차다', 'ㄴ거다', '프다', '올게다']
da_case3 = ['는다', '잖다', '뵀다', '쫒는다', '더랬다', 'ㅂ다']
an_low = ['아', '야', '어', '지', '자', '을까', 'ㄹ까', '어라']



da_low= ['다', 'ㄴ다', '는다', '느냐다', '단다', '서다', '란다', 'ㄴ단다'] 

def Changer(input):

    result = []

    analyzed = api.analyze(input)  
    for data in analyzed:
        lis_word = []
        lis_tag = []
        EF_in = 0
        for morph in data.morphs:
            lis_word.append(morph.lex)
            lis_tag.append(morph.tag)
            if morph.tag == 'EF':
                EF_in = 1
        if EF_in == 1 and len(data.lex) != 2:
            result.append(to_high(lis_word,lis_tag))
        else:
            result.append(data.lex)

    return ' '.join(result)
        
def to_high(lis_word, lis_tag):
    print(lis_word)
    print(lis_tag)
    result = ''
    for i in range(len(lis_word)):
        #다로 끝나는 친구들
        if lis_tag[i] == 'EF' and lis_word[i] in da_case1:
            lis_word[i] = lis_word[i].replace('다', '입니다')
        elif lis_tag[i] == 'EF' and lis_word[i] in da_case2:
            if lis_word[i] == 'ㄴ다':
                lis_word[i] = lis_word[i].replace('ㄴ다', 'ㅂ니다')
            else:
                lis_word[i] = lis_word[i].replace('다', 'ㅂ니다')
        elif lis_tag[i] == 'EF' and lis_word[i] in da_case3:
            if '는다' in lis_word[i]:
                lis_word[i] = lis_word[i].replace('는다', '습니다')
            else:
                lis_word[i] = lis_word[i].replace('다', '습니다')
        elif lis_tag[i] == 'EF' and lis_word[i] == '다':
            jamo = split_syllables(lis_word[i-1])[-1]
            if jamo != '_':
                lis_word[i] = lis_word[i].replace('다', '습니다')
            #받침없으면 ㅂ니다
            else:
                lis_word[i] = lis_word[i].replace('다', 'ㅂ니다')
        
        #나머지 친구들
        EF_dict = {
            '야':'case1',
            '아라':'case2',
            '어라':'case2',
            '라':'case2',
            
        }
        if lis_tag[i] == 'EF' and lis_word[i] in an_low:
            jamo = split_syllables(lis_word[i-1])[-1]
            #'야' 의 경우
            if lis_word[i] == '야':
                lis_word[i] = '에요'
            elif lis_word[i] == '아':
                lis_word[i] = '아요'
            elif lis_word[i] == '어':
                lis_word[i] = '어요'
            elif lis_word[i]== '아라' or lis_word[i] == '어라' or lis_word[i] == '라':
                    if jamo == '_':
                        lis_word[i] = '세요'
                    else:
                        if lis_word[i] == '아라':
                            lis_word[i] = '아요'
                        elif lis_word[i] == '어라':
                            lis_word[i] = '어요'
                        else:
                            lis_word = '으세요'
            elif lis_word[i] == '지' or lis_word[i] == '자':
                lis_word[i] = '죠'
        
        
        
        #이거 합치는거 조금 손봐줘야겠다 정희야 이거 집가서 하자.
        if lis_word[i][0] >= 'ㄱ' and lis_word[i][0] <= 'ㅎ' :
            result = join_jamos(split_syllables(result)[:-1] + lis_word[i][0])+lis_word[i][1:]
        else:
            result += lis_word[i]
    return result

In [78]:
#txt = "그건 올라."
txt = "먹어라. 기어라. 참아라. 실어라. 와라. 믿어라. 빨리 해라. 먹는다더라. 빨리 가라. 이게 뭐야? 알겠어? 오호라."

print(Changer(txt))

['먹', '어라', '.']
['VV', 'EF', 'SF']
['기', '어', '라', '.']
['MAG', 'NNG', 'EF', 'SF']
['싣', '어', '라', '.']
['VV', 'EF', 'JKV', 'SF']
['오', '아라', '.']
['VV', 'EF', 'SF']
['믿', '어라', '.']
['VV', 'EF', 'SF']
['하', '아라', '.']
['VV', 'EF', 'SF']
['먹', '는다더라', '.']
['VV', 'EF', 'SF']
['가', '아라', '.']
['VV', 'EF', 'SF']
['뭐', '이', '야', '?']
['NP', 'VCP', 'EF', 'SF']
['알', '겠', '어', '?']
['VV', 'EP', 'EF', 'SF']
['오', '호', '이', '라', '.']
['NNP', 'IC', 'VCP', 'EF', 'SF']
먹어요. 기어라. 참아라. 싣어요라. 오아라. 믿어요. 빨리 하아라. 먹는다더라. 빨리 가아라. 이게 뭐이에요? 알겠어요? 오호이라.


In [56]:
부호 = ['./SF', '?/SF', '!/SF', ')/SS', '"/SS 다는 EF, 나머지는 EC', '…/SE', '”/SS 다는 EF, 나머지는 EC', '．/SF', ':/SP', '~/SO', ']/SS EF로나옴', '┃/XSV', ',/SP', '」/SS']
kr = open('/Users/jeonghui/Desktop/aihub.kr.shuf', "r" , encoding = "utf-8")
high = open('/Users/jeonghui/Desktop/aihub.khaiii번경.shuf' , "w", encoding = "utf-8")
  
texts=kr.readline()

while(texts != ''):
    try:
        text = Changer(texts)
        if text != texts:
            high.write(text+'\n')
        
    except:
        print(texts)
    
    texts=kr.readline()

kr.close()
high.close()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/jeonghui/Desktop/aihub.kr.shuf'

In [5]:

sentence = "의장 또는 위원장은 방호원으로 하여금 의회와 회의장의 질서를 유지케 할 수 있으며 필요에 따라 청원경찰 또는 경찰관의 파견을 요구할 수 있 다."

a = Changer(sentence)
print(a)

의장 또는 위원장은 방호원으로 하여금 의회와 회의장의 질서를 유지케 할 수 있으며 필요에 따라 청원경찰 또는 경찰관의 파견을 요구할 수 있 다.


In [2]:
CC = open('/Users/jeonghui/Desktop/오리지널확인/aihub.CC.kr', "r" , encoding = "utf-8")
kr = open('/Users/jeonghui/Desktop/오리지널확인/aihub.kr', "r" , encoding = "utf-8")
diff = open('/Users/jeonghui/Desktop/오리지널확인/aihub비교', "w" , encoding = "utf-8")

text1=CC.readline()
text2=kr.readline()

while(text1 != ''):
    if text1 != text2:
        diff.write(text1)
        diff.write(text2)
        diff.write('\n')
    
    text1=CC.readline()
    text2=kr.readline()

kr.close()
CC.close()
diff.close()

In [15]:
import pickle as pkl


with open('/Users/jeonghui/Desktop/aihub.vocab.kr.tok.sym.10000sub.safe.P10.pkl','rb') as f:
    src_dict = pkl.load(f,encoding = "utf-8")

print(src_dict)


{'UNK': 1, '<s>': 0, '</s>': 0, '.': 2, ',': 3, '이': 4, '에': 5, '수': 6, '한': 7, '의': 8, '을': 9, '있다': 10, '은': 11, '(': 12, ')': 13, '__P0': 14, '할': 15, '하는': 16, '‘': 17, '’': 18, '하고': 19, '과': 20, '·': 21, '“': 22, '한다': 23, '고': 24, '”': 25, '있는': 26, '가': 27, '에서': 28, '도': 29, '로': 30, '는': 31, '했다': 32, '해': 33, '를': 34, '으로': 35, '등': 36, '이@@': 37, '?': 38, '지@@': 39, '와': 40, '인': 41, '수@@': 42, '기@@': 43, '__P0@@': 44, '대@@': 45, '사@@': 46, '주@@': 47, '시@@': 48, '된': 49, '가@@': 50, '및': 51, '것으로': 52, '전@@': 53, '일@@': 54, '인@@': 55, '부@@': 56, '정@@': 57, '고@@': 58, '지': 59, '다': 60, '그': 61, '대한': 62, '자@@': 63, '신@@': 64, '해@@': 65, '유@@': 66, '조@@': 67, '보@@': 68, '동@@': 69, '상@@': 70, '전': 71, '원@@': 72, '기': 73, '공@@': 74, '비@@': 75, '성@@': 76, '한@@': 77, '장@@': 78, '게': 79, '%': 80, '소@@': 81, '위해': 82, '3@@': 83, '하게': 84, '2@@': 85, '도@@': 86, '자': 87, '리@@': 88, '에는': 89, '어@@': 90, '__P1': 91, '1@@': 92, '구@@': 93, '재@@': 94, '제@@': 95, '들이': 96, '연@@': 97, '여@@':