# In [22]:
import time
import matplotlib.pyplot as plt
import numpy as np
import math as mt
import seaborn as sns
from tqdm import tqdm
import pandas as pd
from konlpy.tag import Hannanum, Mecab
from hangul_utils import split_syllables, join_jamos
from tqdm.auto import tqdm
from kiwipiepy import Kiwi
import re
from pykospacing import Spacing
#from eunjeon import Mecab
# Morphological analyzers. `mec` is the one called by to2lists(); `han` is
# only referenced from a commented-out line there.
han = Hannanum()
mec = Mecab()

# Not referenced anywhere in the visible part of this file.
number = 100

# Initial consonants (choseong), indices 00 ~ 18.
CHOSUNG_LIST = ['ㄱ', 'ㄲ', 'ㄴ', 'ㄷ', 'ㄸ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅃ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅉ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']
# Medial vowels (jungseong), indices 00 ~ 20.
JUNGSUNG_LIST = ['ㅏ', 'ㅐ', 'ㅑ', 'ㅒ', 'ㅓ', 'ㅔ', 'ㅕ', 'ㅖ', 'ㅗ', 'ㅘ', 'ㅙ', 'ㅚ', 'ㅛ', 'ㅜ', 'ㅝ', 'ㅞ', 'ㅟ', 'ㅠ', 'ㅡ', 'ㅢ', 'ㅣ']
# Final consonants (jongseong), indices 00 ~ 27; index 0 ('_') stands for
# "no final consonant".
JONGSUNG_LIST = ['_', 'ㄱ', 'ㄲ', 'ㄳ', 'ㄴ', 'ㄵ', 'ㄶ', 'ㄷ', 'ㄹ', 'ㄺ', 'ㄻ', 'ㄼ', 'ㄽ', 'ㄾ', 'ㄿ', 'ㅀ', 'ㅁ', 'ㅂ', 'ㅄ', 'ㅅ', 'ㅆ', 'ㅇ', 'ㅈ', 'ㅊ', 'ㅋ', 'ㅌ', 'ㅍ', 'ㅎ']

# Substrings that Changer.high_low / Changer.low_high look for in the
# second-to-last morpheme to decide whether a sentence is in polite speech.
H_LIST = ['니다', '니까', '요', '시오', '죠']

# Vowel-contraction rules used by Jamodealer.make_one through unite():
# each entry is [two adjacent vowel jamo, the single compound vowel that
# replaces them]. unite() applies the entries in the order listed here.
con_dict = [
    
    ['ㅏㅣ','ㅐ'], ['ㅑㅣ','ㅒ'], ['ㅓㅣ','ㅔ'],
    ['ㅕㅣ','ㅖ'], ['ㅗㅣ','ㅚ'], ['ㅗㅐ','ㅙ'],
    ['ㅜㅓ','ㅝ'], ['ㅜㅔ','ㅞ'], ['ㅡㅣ','ㅢ'],
    ['ㅣㅏ','ㅑ'], ['ㅣㅓ','ㅕ'], ['ㅣㅗ','ㅛ'],
    ['ㅣㅜ','ㅠ'], ['ㅗㅏ','ㅘ']
    
]

#########<high -> low>###########
# Rule tables for Changer.high_low. Every rule is a list of jamo lists:
# item 0 is the pattern to find, item 1 the replacement. Rules with a third
# item carry an alternative replacement whose index is chosen by
# Changer.indicator. makestrdict() later joins each jamo list into a string.

# Converted to EP_dict and registered under 'EP' in Dict_list / Dict_map.
EP = [
    [['ㅈ','ㅓ','_','ㄴ','ㅡ','ㄴ'],['ㄴ','ㅏ','_','ㄴ','ㅡ','ㄴ']],
    [[' ','ㅈ','ㅓ','ㄴ',' '],['ㄴ','ㅏ','_','ㄴ','ㅡ','ㄴ']],
    [['ㅇ','ㅓ','_','ㅇ','ㅛ','_'],['ㄷ','ㅏ','_']]
    ,[['ㅅ','ㅔ'],['']]
]

# Applied by high_low when a morpheme's tag is exactly 'EF'.
EF_ONLY = [
    
    [['ㅅ','ㅡ','ㅂ','ㄴ','ㅣ','ㄷ','ㅏ'],['ㄷ','ㅏ']]
    
]

# Applied by high_low when a tag contains both 'EF' and 'EP'
# (also reused by low_high for the same tag shape).
EP_EF = [
    
    [['ㅅ','ㅔ','ㅇ','ㅛ'],['']]
    # "-시오" family
    ,[['ㅡ','ㅅ','ㅣ','ㅂ','ㅅ','ㅣ','ㅇ','ㅗ'],['ㅓ']]
    ,[['ㅅ','ㅣ','ㅂ','ㅅ','ㅣ','ㅇ','ㅗ'],['']]
    
]

# Applied by high_low when a tag contains 'EF' and 'VCP' but not 'EP';
# currently holds no rules.
VCP_EF = [
    
    
    
]

# Applied by high_low (and low_high) to any other tag that contains 'EF'.
A_EF = [
    
    [['ㅂ','ㄴ','ㅣ','ㄷ','ㅏ'],['ㄴ','ㄷ','ㅏ']]
    ,[['ㅇ','ㅛ'],['']]
    
]

# Sentence-final endings. Converted to EF_dict and registered under 'EF'
# in Dict_list / Dict_map.
EF = [
    [['ㅅ','ㅡ','ㅂ','ㄴ','ㅣ','ㄷ','ㅏ'],['ㄷ','ㅏ']],
    [[' ','ㅈ','ㅓ','ㄴ',' '],['ㄴ','ㅏ','_','ㄴ','ㅡ','ㄴ']],
    [['ㅇ','ㅓ','_','ㅇ','ㅛ','_'],['ㄷ','ㅏ','_']],
    [['ㅂ','ㄴ','ㅣ','ㄷ','ㅏ'],['ㄴ','ㄷ','ㅏ']],
    [['ㅇ','ㅔ','ㅇ','ㅛ'],['ㅇ','ㅑ']],
    [['ㅇ','ㅛ'],['']]
    #,[['ㅅ','ㅔ'],['ㅝ'],['ㅘ']]
]

# Converted to NP_dict and registered under 'NP' in Dict_list / Dict_map.
NP = [
    
    [['ㅈ','ㅓ','ㄴ'],['ㄴ','ㅏ','ㄴ']],
    [['ㅈ','ㅓ'],['ㄴ','ㅏ']]
    
]

# Auxiliary particles. Registered under 'JX' in Dict_list / Dict_map.
JX = [
   [['ㅇ','ㅣ','ㅇ','ㅛ'],['ㅇ','ㅣ','ㅇ','ㅑ']] 
]

# Registered under 'VX'. Three-item rule: [pattern, replacement at index 1,
# replacement at index 2]; Changer.indicator picks the index.
VX = [
    
    [['ㅈ','ㅜ'],['ㅈ','ㅜ'],['ㅈ','ㅝ']]
    
]

# Registered under 'VV'. Same three-item rule shape as VX.
VV = [
    
    [['ㅇ','ㅗ'],['ㅇ','ㅗ'],['ㅇ','ㅘ']],
    [['ㅈ','ㅜ'],['ㅈ','ㅜ'],['ㅈ','ㅝ']],
    [['ㅎ','ㅏ'],['ㅎ','ㅏ'],['ㅎ','ㅐ']]
    
]

# Not used.
EXC = [
    
    ['ㅜ',1],
    ['ㅗ',2],
    ['ㅏ',3]
    
]

# Passed to Changer.indicator as `ex_word`: [jamo substring to look for in
# the following morpheme, replacement index to use when it is found].
EXC_word = [
    
    ['ㅅㅔ',2]
    ,['ㅅㅣㅂ',2]
    
]

# Passed to Changer.indicator as `exc_tags`: tags the following morpheme
# must carry for EXC_word to be consulted.
EXC_tags = [
    
    'EP',
    'EF'
    
]

#########<high -> low>###########

#########<low -> high>###########
#
# Rule tables for Changer.low_high. Item 0 of a rule is the pattern, item 1
# the replacement; low_high always uses item 1 and ignores any third item.

# Chosen when the final tag is exactly 'EF' and the preceding tag neither
# contains '+' nor equals 'VCP'.
EF_ONLY_4S = [
    
    [['ㄷ','ㅏ'],['ㅅ','ㅡ','ㅂ','ㄴ','ㅣ','ㄷ','ㅏ'],['special']]
    
]

# Chosen when the final tag is exactly 'EF' and the preceding tag contains '+'.
EF_ONLY_4C = [
    
    [['ㄷ','ㅏ'],['ㅅ','ㅡ','ㅂ','ㄴ','ㅣ','ㄷ','ㅏ']]
    ,[['ㅇ','ㅓ'],['ㅅ','ㅡ','ㅂ','ㄴ','ㅣ','ㄷ','ㅏ']]
    
]

# Chosen when the final tag contains 'EF' and 'VCP' (and not 'EP') in a
# single combined tag.
VCP_EF_4 = [
    
    [['ㄷ','ㅏ'],['ㅇ','ㅣ','ㅂ','ㄴ','ㅣ','ㄷ','ㅏ']]
    
]

# Different from VCP+EF: this covers the case where /VCP and /EF are two
# separate morphemes (final tag 'EF', preceding tag 'VCP').
EF_AFTER_VCP_4 = [
    
    [['ㄷ','ㅏ'],['ㅂ','ㄴ','ㅣ','ㄷ','ㅏ']]
    
]
#########<low -> high>###########

def unite(input, dict):
    """Apply every ``[pattern, replacement]`` rule of ``dict`` to ``input``.

    Rules are applied in order with ``re.sub``, each one operating on the
    result of the previous rule. Returns the rewritten string.
    """
    merged = input
    for rule in dict:
        pattern, replacement = rule[0], rule[1]
        merged = re.sub(pattern, replacement, merged)
    return merged
    

class Jamodealer:
    """Hold the jamo form of a list of words and rebuild text from it.

    Attributes:
        jamo: list with one entry per input word, each the result of
            ``split_syllables`` on that word. Callers may replace or edit
            this list before calling ``make_one``.
        pp: the concatenated, vowel-merged jamo string produced by the most
            recent ``make_one`` call.
    """

    # Class-level defaults are kept so the attribute names stay available on
    # the class; __init__ always rebinds ``jamo``, so this shared list is
    # never mutated through an instance.
    jamo = []
    pp = ''

    def __init__(self, lis_word):
        """Decompose every word of ``lis_word`` with ``split_syllables``."""
        self.jamo = [split_syllables(word) for word in lis_word]

    def make_one(self):
        """Join all jamo entries, merge split vowels and compose the text.

        The entries of ``self.jamo`` are concatenated, adjacent vowels are
        contracted with the ``con_dict`` rules via ``unite``, and the result
        is passed to ``join_jamos``. The intermediate string is stored in
        ``self.pp``.

        The earlier implementation additionally mapped every character to an
        integer index and straight back before composing; that round trip
        was an identity transform and has been dropped.
        """
        self.pp = unite(''.join(self.jamo), con_dict)
        return join_jamos(self.pp)

def tojamo(korean_word):
    """Split each character of ``korean_word`` into its jamo.

    Leading and trailing whitespace is stripped first. Every precomposed
    Hangul syllable ('가'..'힣') becomes a three-item list
    ``[initial, medial, final]`` looked up in CHOSUNG_LIST, JUNGSUNG_LIST and
    JONGSUNG_LIST; any other character (e.g. Latin letters) is kept as a
    one-item list.
    """
    base = ord('가')
    decomposed = []
    for char in korean_word.strip():
        if not ('가' <= char <= '힣'):
            # Non-Hangul characters are passed through untouched.
            decomposed.append([char])
            continue
        # The initial consonant changes every 588 code points and the medial
        # vowel every 28; what is left over is the final-consonant index.
        initial_idx, remainder = divmod(ord(char) - base, 588)
        medial_idx, final_idx = divmod(remainder, 28)
        decomposed.append([
            CHOSUNG_LIST[initial_idx],
            JUNGSUNG_LIST[medial_idx],
            JONGSUNG_LIST[final_idx],
        ])
    return decomposed

def toword(arr):
    """Placeholder: ignores ``arr`` and only prints the literal 'wow'."""
    print('wow')
    

def to1dim(input):
    """Flatten one nesting level: return the items of every inner iterable
    of ``input`` as a single list, in order."""
    return [item for group in input for item in group]

def to2dim(input):
    """Regroup a flat jamo sequence into syllable-sized chunks.

    A ``' '`` item is emitted immediately as ``[' ']``. All other items are
    collected and emitted as a list each time three of them have
    accumulated. Items left over at the end (fewer than three) are dropped.
    """
    grouped = []
    pending = []
    for token in input:
        if token == ' ':
            grouped.append([' '])
            continue
        pending.append(token)
        if len(pending) == 3:
            grouped.append(pending)
            pending = []
    return grouped

def makeone(input):
    """Compose a list of jamo groups back into a string.

    Args:
        input: iterable of groups as produced by ``tojamo``: either a
            three-item ``[initial, medial, final]`` list whose items appear
            in CHOSUNG_LIST, JUNGSUNG_LIST and JONGSUNG_LIST, or a one-item
            list holding a character to copy through unchanged.

    Returns:
        The composed string.

    Raises:
        ValueError: if a jamo of a three-item group is not in its list.
        IndexError: if a group is empty or has exactly two items.

    One-item groups used to be accepted only for ``' '``; any other
    pass-through character emitted by ``tojamo`` (punctuation, Latin
    letters) raised ``IndexError``. They are now copied through as well.
    """
    base = ord('가')
    pieces = []
    for group in input:
        if group[0] == ' ':
            pieces.append(' ')
        elif len(group) == 1:
            # Non-Hangul character that tojamo left untouched.
            pieces.append(group[0])
        else:
            # 588 code points per initial consonant, 28 per medial vowel.
            code = (
                base
                + CHOSUNG_LIST.index(group[0]) * 588
                + JUNGSUNG_LIST.index(group[1]) * 28
                + JONGSUNG_LIST.index(group[2])
            )
            pieces.append(chr(code))
    return ''.join(pieces)
        
def li2str(input):
    """Concatenate the strings of ``input`` into one string.

    Uses ``str.join`` instead of repeated ``+`` so the cost stays linear in
    the total length. Returns ``""`` for an empty input; raises ``TypeError``
    if an item is not a string.
    """
    return "".join(input)

def str2li(input):
    """Return the items of ``input`` as a new list.

    For a string this yields one list entry per character. Any iterable is
    accepted, not only sequences that support ``len`` and indexing.
    """
    return list(input)

def makejamodict(input):
    """Turn ``[word, word]`` pairs into ``[jamo list, jamo list]`` pairs.

    For each entry of ``input`` the items at index 0 and 1 are decomposed
    with ``tojamo`` and each decomposition is flattened into a single list
    of jamo. Returns the list of resulting two-item entries.
    """
    table = []
    for pair in input:
        source_jamo = [jamo for syllable in tojamo(pair[0]) for jamo in syllable]
        target_jamo = [jamo for syllable in tojamo(pair[1]) for jamo in syllable]
        table.append([source_jamo, target_jamo])
    return table

def makestrdict(input):
    """Convert a rule table of jamo lists into a rule table of strings.

    Every inner jamo list of every rule is joined with ``li2str``; the
    nesting (one list per rule) is preserved.
    """
    return [[li2str(jamo_seq) for jamo_seq in rule] for rule in input]

# Build the string-form versions of the rule tables defined above and map
# them to their tags.

EP_dict = makestrdict(EP)
EF_dict = makestrdict(EF)
NP_dict = makestrdict(NP)
JX_dict = makestrdict(JX)
VX_dict = makestrdict(VX)
VV_dict = makestrdict(VV)

# Dict_list[k] is the tag name whose rules live in Dict_map[k]; the two
# lists must stay in the same order.
Dict_list=['EP','EF','NP','JX','VX','VV']

Dict_map = [EP_dict, EF_dict,NP_dict,JX_dict,VX_dict, VV_dict]

EF_ONLY_dict = makestrdict(EF_ONLY)
EP_EF_dict = makestrdict(EP_EF)
VCP_EF_dict = makestrdict(VCP_EF)
A_EF_dict = makestrdict(A_EF)

# End of the section above.

# String-form tables for the low -> high direction.
EF_ONLY_4S_dict = makestrdict(EF_ONLY_4S)
EF_ONLY_4C_dict = makestrdict(EF_ONLY_4C)
EF_AFTER_VCP_4_dict = makestrdict(EF_AFTER_VCP_4)
VCP_EF_4_dict = makestrdict(VCP_EF_4)


def to2lists(input):
    """Tag ``input`` with the module-level ``mec`` analyzer and split the
    result into two parallel lists.

    Returns ``(words, tags)`` where ``words[i]`` is item 0 and ``tags[i]``
    is item 1 of the i-th pair returned by ``mec.pos``.
    """
    # Alternative tagger that was tried earlier:
    # han.pos(input, ntags=22, flatten=True, join=False)
    tagged = mec.pos(input)
    words = [pair[0] for pair in tagged]
    tags = [pair[1] for pair in tagged]
    return words, tags

#add in 2021.09.26

def indee(lis, input):
    """Locate every occurrence of ``input`` in ``lis``.

    Each returned position is the index of the occurrence minus the number
    of earlier occurrences, i.e. the index it would have if all earlier
    occurrences were removed from ``lis``.
    """
    hits = [pos for pos, item in enumerate(lis) if item == input]
    return [pos - rank for rank, pos in enumerate(hits)]

def ind_lili(lis_space,lis_lis):
    """Return the indices of the items in ``lis_lis`` that start at an
    offset listed in ``lis_space``.

    The offset of an item is the summed length of all items before it.
    """
    marked = []
    offset = 0
    for idx, chunk in enumerate(lis_lis):
        if offset in lis_space:
            marked.append(idx)
        offset += len(chunk)
    return marked

def union(lis, lis_lis):
    """Insert a ``' '`` item into ``lis_lis`` for every position in ``lis``.

    ``lis_lis`` is modified in place and nothing is returned. The n-th
    position is shifted by n to account for the spaces already inserted.
    """
    for shift, position in enumerate(lis):
        lis_lis.insert(position + shift, ' ')


class Changer(object):
    """Rewrite a Korean sentence between polite (high) and plain (low) speech.

    The sentence is tagged with ``to2lists``, every morpheme is decomposed
    into jamo by ``Jamodealer``, tag-specific regex rules from the
    module-level rule tables are applied, and the text is recomposed with
    the original spaces re-inserted.

    Both directions inspect the second-to-last morpheme, i.e. they assume
    the last morpheme is sentence-final punctuation.
    """

    def high_low(self, stc):
        """Convert a polite sentence to plain speech.

        Args:
            stc: the sentence to convert.

        Returns:
            The converted sentence. If the second-to-last morpheme contains
            none of the ``H_LIST`` markers (or there are fewer than two
            morphemes) the sentence is recomposed without applying rules.
            Previously that case returned a string holding only spaces,
            because the morpheme list was left empty.
        """
        space_list = indee(stc, ' ')
        lis_word, lis_tag = to2lists(stc)
        space_location = ind_lili(space_list, lis_word)
        jam = Jamodealer(lis_word)

        if self._ends_with_honorific(lis_word):
            lis = [
                self._lower_morpheme(i, jam.jamo, lis_tag)
                for i in range(len(lis_tag))
            ]
        else:
            # Nothing to lower: keep the morphemes as they are.
            lis = list(jam.jamo)

        # Trace output kept from the original implementation.
        print(lis)
        union(space_location, lis)
        jam.jamo = lis
        return jam.make_one()

    def low_high(self, stc):
        """Convert a plain sentence to polite speech.

        Only the second-to-last morpheme (the sentence ending) is rewritten.

        Args:
            stc: the sentence to convert.

        Returns:
            The converted sentence. The sentence is recomposed unchanged when
            its ending already contains one of the ``H_LIST`` markers or when
            there are fewer than two morphemes. Previously the rewrite ran
            whenever *any* single marker was missing, which is practically
            always.
        """
        space_list = indee(stc, ' ')
        lis_word, lis_tag = to2lists(stc)
        space_location = ind_lili(space_list, lis_word)
        jam = Jamodealer(lis_word)

        if len(lis_word) >= 2 and not self._ends_with_honorific(lis_word):
            final_tag = lis_tag[-2]
            # Guard against two-morpheme sentences, which have no tag at -3.
            prev_tag = lis_tag[-3] if len(lis_tag) >= 3 else ''

            dic = []
            if 'EF' in final_tag:
                if final_tag == 'EF':
                    if '+' in prev_tag:
                        dic = EF_ONLY_4C_dict
                    elif prev_tag == 'VCP':
                        dic = EF_AFTER_VCP_4_dict
                    else:
                        dic = EF_ONLY_4S_dict
                elif 'EP' in final_tag:
                    # NOTE(review): EP_EF_dict and A_EF_dict are the
                    # high -> low tables; confirm they are intended here.
                    dic = EP_EF_dict
                elif 'VCP' in final_tag:
                    dic = VCP_EF_4_dict
                else:
                    dic = A_EF_dict

            res = jam.jamo[-2]
            for rule in dic:
                # Item 1 is always the replacement, whatever the rule length.
                res = re.sub(rule[0], rule[1], res)
            jam.jamo[-2] = res

        # Trace output kept from the original implementation.
        print(jam.jamo)
        union(space_location, jam.jamo)
        return jam.make_one()

    def _ends_with_honorific(self, lis_word):
        """Return True when the second-to-last morpheme contains an H_LIST
        marker; False when it does not or when there is no such morpheme."""
        if len(lis_word) < 2:
            return False
        ending = lis_word[-2]
        return any(marker in ending for marker in H_LIST)

    def _lower_morpheme(self, i, jamo, lis_tag):
        """Apply the high -> low rules selected by ``lis_tag[i]`` to ``jamo[i]``.

        Tags containing 'EF' select exactly one EF table; every other tag
        gets the table of each ``Dict_list`` name it contains. The EF table
        used to be re-applied once per ``Dict_list`` entry; it is now applied
        a single time.
        """
        tag = lis_tag[i]
        if 'EF' in tag:
            if tag == 'EF':
                rule_sets = [EF_ONLY_dict]
            elif 'EP' in tag:
                rule_sets = [EP_EF_dict]
            elif 'VCP' in tag:
                rule_sets = [VCP_EF_dict]
            else:
                rule_sets = [A_EF_dict]
        else:
            # NOTE(review): this is a substring test, so e.g. 'NP' also
            # matches a tag such as 'NNP'; confirm that is intended.
            rule_sets = [
                Dict_map[k] for k, name in enumerate(Dict_list) if name in tag
            ]

        res = jamo[i]
        for rules in rule_sets:
            for rule in rules:
                if self.isExcept(rule) == 1:
                    # Rule with alternatives: the following morpheme decides
                    # which replacement is used.
                    ind = self.indicator(i, jamo, lis_tag, EXC_word, EXC_tags)
                    res = re.sub(rule[0], rule[ind], res)
                else:
                    res = re.sub(rule[0], rule[1], res)
        return res

    def isExcept(self, input):
        """Return 1 if the rule ``input`` has three or more items (i.e. it
        carries an alternative replacement), otherwise 0."""
        return 1 if len(input) >= 3 else 0

    def indicator(self, ind, lis, tag, ex_word, exc_tags):
        """Pick the replacement index for a rule with alternatives.

        Args:
            ind: index of the morpheme being rewritten.
            lis: jamo strings of all morphemes.
            tag: tags of all morphemes.
            ex_word: ``[substring, index]`` pairs.
            exc_tags: tags that enable the ``ex_word`` lookup.

        Returns:
            The index of the first ``ex_word`` entry whose substring occurs
            in the following morpheme, provided that morpheme's tag contains
            one of ``exc_tags``; otherwise 1. Also 1 when there is no
            following morpheme (this used to raise ``IndexError``).
        """
        nxt = ind + 1
        if nxt >= len(tag) or nxt >= len(lis):
            return 1
        if not any(exc_tag in tag[nxt] for exc_tag in exc_tags):
            return 1
        for entry in ex_word:
            if entry[0] in lis[nxt]:
                return entry[1]
        return 1

    def processText(self,stc):
        """Public entry point for high -> low conversion of ``stc``."""
        # Re-spacing with pykospacing's Spacing was tried here and disabled.
        return self.high_low(stc)

    def processText_0(self,stc):
        """Public entry point for low -> high conversion of ``stc``."""
        return self.low_high(stc)
    

# In [24]:
#txt = '전 그것이 맞다고 생각합니다.'
# Sample sentences; only `tx` is converted below.
tx = '찬성하는 사람은 이리로 오ㅏ.'
txt2 = '나는 천재다.'
txt3 = '나는 그것이 맞다고 생각하는 바보다.'
ch = Changer()
#tt = ch.processText(txt)
# Run the low -> high conversion and show the result.
ttt = ch.processText_0(tx)
#print(tt)
print(ttt)

# Output:
# ['ㅊㅏㄴㅅㅓㅇ', 'ㅎㅏ', 'ㄴㅡㄴ', 'ㅅㅏㄹㅏㅁ', 'ㅇㅡㄴ', 'ㅇㅣㄹㅣㄹㅗ', 'ㅇㅗ', 'ㅏ', '.']
# 찬성하는 사람은 이리로 와.
