In [1]:
import numpy as np
import pandas as pd
from konlpy.tag import Komoran, Okt, Kkma, Hannanum

from collections import Counter

from keras.utils.np_utils import to_categorical

from keras import models
from keras import layers

In [2]:
data = pd.read_csv('kedi.csv', encoding='cp949')

In [3]:
sentence = data.프로그램명[0]
okt = Okt()
okt.nouns(sentence)

['명중', '명', '한글맞춤법']

In [4]:
komoran = Komoran()
komoran.nouns(sentence)


['명', '중', '명', '한글', '맞춤법']

In [5]:
kkma = Kkma()
kkma.nouns(sentence)

['10', '10명중9', '명', '중', '9', '한글', '한글맞춤법', '맞춤법']

In [6]:
def make_nouns(x):
    return okt.nouns(x)

In [7]:
data = data[["프로그램명","소분류코드","소분류"]]

In [8]:
data.columns = ['program_name','code','code_name']

In [9]:
data = data.iloc[1:,:]

In [10]:
data.reset_index(inplace=True)

In [11]:
data.drop('index',axis=1,inplace=True)

In [12]:
X = data.program_name.apply(make_nouns)

In [36]:
def remove_one(x):
    new_x = []
    for i in x:
        if len(i) > 1:
            new_x.append(i)
    return new_x

In [37]:
X = X.apply(remove_one)

In [38]:
data = data.iloc[1:,:]

In [39]:
full_text = []
for i in X:
    full_text.extend(i)

In [40]:

word_cnt = Counter(full_text)
common_word = word_cnt.most_common(1000)

In [41]:
used_word = [ i for i,j in common_word]

In [42]:
unique_word = list(set(full_text))

In [43]:
data.program_name.values.reshape(-1,1).shape

(125167, 1)

In [44]:
unique_word = [ i for i,j in common_word]

In [45]:
word_index = { k:v for k,v in enumerate(unique_word) }

In [46]:
index_word = { v:k for k,v in word_index.items()}

In [47]:
## 정수 인코딩
def label_encode(x):
    encoded_x = []
    for i in x:
        encoded_x.append(index_word.get(i,0))
    return encoded_x

In [48]:
label_encoded = X.apply(label_encode)

In [49]:
def vectorize_word(x,dimension=1000):
    t = np.zeros((x.shape[0],dimension))
    for k,v in enumerate(x.values):
        for i in v:
            if i < dimension:
                t[k,i] += 1
    return t

In [50]:
one_hot = vectorize_word(label_encoded)

In [51]:
code_text = []
for i in data.code:
    code_text.append(i)

In [52]:
unique_code = list(set(code_text))

In [53]:
code_index = { k:v for k,v in enumerate(unique_code)} 
index_code = { v:k for k,v in code_index.items()}

In [54]:
def code_encode(x):
    encoded_x = []
    for i in x:
        encoded_x.append(index_code.get(i,0))
    return encoded_x

In [55]:
label_code = []
for i in data.code:
    label_code.append(index_code[i])
data['label'] = label_code

In [56]:
y = to_categorical(data.label.values)

In [57]:
dt = pd.DataFrame(np.c_[one_hot,y])

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 125169 and the array at index 1 has size 125167

In [None]:
idx = np.random.choice(dt.index,len(dt.index),replace=False)

In [None]:
tr_idx = idx[:int(0.8*len(idx))]
te_idx = idx[int(0.8*len(idx)):]

In [None]:
X_train = dt.iloc[tr_idx,:1000].values
X_test = dt.iloc[te_idx,:1000].values
y_train = dt.iloc[tr_idx,1000:].values
y_test = dt.iloc[te_idx,1000:].values

In [None]:
input_shape = X_train.shape[1]
output_shape = y_train.shape[1]

In [None]:
model = Sequential()
model.add(Dense(256,activation='relu',input_shape=(input_shape,)))
model.add(Dense(128,activation='sigmoid'))
model.add(Dense(output_shape,activation='softmax'))

In [None]:
model.summary()

In [None]:
optimizer = 'rmsprop'
loss = 'categorical_crossentropy'
metrics = ['accuracy']
model.compile(optimizer=optimizer,
             loss=loss,
             metrics=metrics)

In [None]:
epochs = 100
batch_size = 300 
model.fit(X_train,
         y_train,
         epochs=epochs,
         batch_size=batch_size)

In [None]:
"""
make_sentences(X_train):
input : np.array: ([123,12,15,48],[51,74,63], ...)
output : list:['text', 'text', ...]


동작

np 어레이를 리스트로 typecasting 후
내부 리스트 하나씩(sentence) 읽어와서
원소(index) 하나씩 단어사전(index_word) 참조
만약 index가 사전에 최댓값 보다 크다면 
사전 참조하지 않고 continue
단어 단위를 더할때에는 사이에 공백(' ')추가
리스트에 문장을 넣을 때 양 끝에 공백 벗기고(.strip())추가
리스트 반환

"""
def make_sentences(dummy):
    dummy_list = []
    for sentence in list(dummy):
        context = ''
        for index in sentence:
            if index > max_vlaue:
                continue
            context += index_word[index]+' '
        dummy_list.append(context.strip())
    return dummy_list

"""

contexts_to_index(sentence_list):
input: text_list: ['abc asv', 'bdvs brrwab brb', ...] 
output: index_list: [[1594, 2456, 24154, 1514, 1248, 12],[154,87,7,213,8], ...]

동작
인덱스리스트의 내부 리스트(sentence)를 하나씩 가져와서
만약 sentence에 공백이 존재하면 (하나 이상의 단어라는 의미)
word_list를 sentence.split으로 생성 단어 리스트
단어리스트를 보면서 index_word를 사전참조해
tmp에 append
tmp를 index_list에 담는다.

만약 공백이 존재하지 않으면? --> 단일 단어
바로 index_word에 사전참조하여 [리스트에 담아서] index_list에 담는다.

index_list 반환

"""

def contexts_to_index(sentence_list):
    index_list = []
    for sentence in sentence_list:
        if " " in sentence:
            tmp = []
            word_list = sentence.split()
            for word in word_list:
                try:
                    tmp.append(word_index[word])
                except:
                    try:
                        tmp.append(word_index[word.strip()])
                    except:
                        pass
                    # 만약 안되면 추가
                    tmp.append(0)
            index_list.append(tmp)
        else:
            index_list.append([word_index[sentence]])
    return index_list

"""
make_full_text(sentence_list):
input:
output:


"""

def make_full_text(sentence_list):
    full_text = ''
    for sentence in sentence_list:
        full_text += sentence + ' '
    return full_text.strip()


def vectorize_sequences(sequences, dimension=10000):
    # 크기가 (len(sequences), dimension))이고 모든 원소가 0인 행렬을 만듭니다
    results = np.zeros((len(sequences), dimension))
    for i, sequence in enumerate(sequences):
        results[i, sequence] = 1.  # results[i]에서 특정 인덱스의 위치를 1로 만듭니다
    return results