In [1]:
from konlpy.tag import Okt,Kkma,Komoran,Mecab,Hannanum
import os
import re

import gensim
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import fasttext
import h5py

import sys
from random import shuffle

from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import MaxPooling1D, Embedding, Dense, Concatenate, Input, Reshape, Bidirectional, LSTM, Flatten, Dropout, Conv1D, Conv2D, MaxPooling2D, GRU, TimeDistributed
from keras.models import Model, Sequential, model_from_json
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
def CNNModel(num_filters=128, drop=0.7):
    """Build the multi-kernel text CNN used for 8-way classification.

    The network embeds an int32 token-index sequence with the module-level
    ``embedding_layer``, treats the result as a single-channel image of shape
    (MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, 1), and runs three parallel Conv2D
    branches whose kernels span 3, 4 and 5 tokens across the full embedding
    width. Each branch is max-pooled over its whole remaining length, the
    branches are concatenated, flattened, passed through dropout and fed to an
    8-unit softmax layer.

    Relies on the module-level names ``MAX_SEQUENCE_LENGTH``, ``EMBEDDING_DIM``
    and ``embedding_layer`` being defined before the call.

    Args:
        num_filters: number of filters in each convolution branch.
        drop: rate passed to the Dropout layer.

    Returns:
        An uncompiled Keras ``Model`` mapping token-index sequences to
        8 softmax outputs.
    """
    kernel_heights = (3, 4, 5)

    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded = embedding_layer(token_ids)
    as_image = Reshape((MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, 1))(embedded)

    # All convolution branches are created before any pooling layer so that
    # layers are instantiated in the same order as before (conv x3, pool x3).
    conv_maps = [
        Conv2D(num_filters,
               kernel_size=(height, EMBEDDING_DIM),
               padding='valid',
               kernel_initializer='normal',
               activation='relu')(as_image)
        for height in kernel_heights
    ]

    # A 'valid' convolution of height h leaves MAX_SEQUENCE_LENGTH - h + 1
    # positions; pooling over all of them keeps one value per filter.
    pooled_maps = [
        MaxPooling2D(pool_size=(MAX_SEQUENCE_LENGTH - height + 1, 1),
                     strides=(1, 1),
                     padding='valid')(conv_map)
        for height, conv_map in zip(kernel_heights, conv_maps)
    ]

    merged = Concatenate(axis=1)(pooled_maps)
    flat = Flatten()(merged)
    regularized = Dropout(drop)(flat)
    class_probs = Dense(8, activation='softmax')(regularized)

    return Model(token_ids, class_probs)

In [61]:
# Rebuild the CNN architecture and restore trained weights from a checkpoint.
# CNNModel() reads the module-level MAX_SEQUENCE_LENGTH, EMBEDDING_DIM and
# embedding_layer, so the cells defining them must have been run first.
model = CNNModel()
model.load_weights('./model_log_demo/CNN/val_loss-0.6965.hdf5')

W1215 17:20:24.976094  2644 nn_ops.py:4224] Large dropout rate: 0.7 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


In [91]:
# Turn the tokenized text in Xdata into a padded array of vocabulary indices.
xTest = getSequence(Xdata, MAX_FEATURES, word_index)

In [94]:
# Softmax outputs of the model: one row per input sequence, one column per class.
model.predict(xTest)

array([[0.1348152 , 0.1260254 , 0.05959042, 0.03534745, 0.0488081 ,
        0.15353604, 0.00550095, 0.4363764 ]], dtype=float32)

In [4]:
MAX_SEQUENCE_LENGTH = 525  # length every index sequence is padded to (pad_sequences maxlen, model input length)
EMBEDDING_DIM = 300  # number of columns in the embedding matrix / width of each word vector

In [20]:
from gensim.models.wrappers import FastText
# Load the pretrained vectors from the word2vec-format text file.
ftModel = KeyedVectors.load_word2vec_format('wiki.ko.vec')
# Read the vocabulary straight from the KeyedVectors object: the `.wv` alias
# on KeyedVectors is deprecated and only returns the object itself.
ftVocab = list(ftModel.vocab)

  This is separate from the ipykernel package so we can avoid doing imports until


In [21]:
def ebdIdx(model, vocab_list):
    """Build a word -> vector dictionary from an indexable embedding model.

    Args:
        model: object supporting ``model[word]`` lookups.
        vocab_list: iterable of words to look up in ``model``.

    Returns:
        dict mapping each word in ``vocab_list`` to ``model[word]``.
        The number of entries is printed before returning.
    """
    vectors_by_word = {word: model[word] for word in vocab_list}
    print(len(vectors_by_word))
    return vectors_by_word

In [22]:
# Copy every vocabulary word's vector into a plain dict (word -> vector).
embedding_idx = ebdIdx(ftModel, ftVocab)

879129


In [9]:
def getSequence(text, MAX_FEATURES, word_index):
    """Convert tokenized documents into a padded array of vocabulary indices.

    Args:
        text: iterable of documents, each an indexable sequence of tokens.
        MAX_FEATURES: exclusive upper bound on usable indices; tokens whose
            index is >= this value are dropped.
        word_index: dict mapping token -> integer index.

    Returns:
        numpy array with one row per document, zero-padded at the end
        (``padding='post'``) to the module-level ``MAX_SEQUENCE_LENGTH``.

    Tokens that are absent from ``word_index`` are skipped instead of raising
    ``KeyError``, so text containing out-of-vocabulary words can be encoded.
    """
    seq = []
    for line in text:
        lineseq = []
        for token in line:
            idx = word_index.get(token)
            # Drop out-of-vocabulary tokens and indices beyond the feature cap.
            if idx is not None and idx < MAX_FEATURES:
                lineseq.append(idx)
        seq.append(lineseq)
    seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return np.array(seq)

In [10]:
MAX_FEATURES = len(embedding_idx)  # vocabulary cap: number of entries in the word -> vector dict
MAX_SEQUENCE_LENGTH = 525  # NOTE(review): also assigned in another cell of this notebook; keep the two values in sync
EMBEDDING_DIM = 300  # NOTE(review): also assigned in another cell of this notebook; keep the two values in sync
print(MAX_FEATURES)

879129


In [71]:
# Fit a Keras Tokenizer on Xdata to obtain the token -> index mapping.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(Xdata)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(Xdata)
# Histogram of sequence lengths, shown as the cell output.
seqlen = np.array([len(sequence) for sequence in sequences])
np.histogram(seqlen, bins=50)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0], dtype=int64),
 array([187.5 , 187.52, 187.54, 187.56, 187.58, 187.6 , 187.62, 187.64,
        187.66, 187.68, 187.7 , 187.72, 187.74, 187.76, 187.78, 187.8 ,
        187.82, 187.84, 187.86, 187.88, 187.9 , 187.92, 187.94, 187.96,
        187.98, 188.  , 188.02, 188.04, 188.06, 188.08, 188.1 , 188.12,
        188.14, 188.16, 188.18, 188.2 , 188.22, 188.24, 188.26, 188.28,
        188.3 , 188.32, 188.34, 188.36, 188.38, 188.4 , 188.42, 188.44,
        188.46, 188.48, 188.5 ]))

In [17]:
print('Preparing embedding matrix...')

# Number of rows in the embedding matrix.
num_words = min(MAX_FEATURES, len(ftVocab) + 1)  # +1 because of the unknown word
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Bound by the matrix's real row count: num_words can be smaller than
    # MAX_FEATURES, and indexing past it would raise IndexError.
    if i >= num_words:
        continue
    embedding_vector = embedding_idx.get(word)
    # Words without a pretrained vector keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised from the pretrained vectors.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

Preparing embedding matrix...


In [83]:
# Sanity check: (number of rows, EMBEDDING_DIM) of the matrix given to the Embedding layer.
print(embedding_matrix.shape)

(15515, 300)


In [13]:
def text_cleaner(text):
    """Strip bylines, bracketed spans, outlet names and punctuation from articles.

    Each pattern below is removed from every input string, in order:
    the words "특파원" and "기자", text enclosed in (), [], {} or 【】 on a
    single line, the outlet names "YTN" and "OSEN", a set of punctuation and
    symbol characters, and finally tab, newline and carriage-return characters.

    Args:
        text: iterable of article strings.

    Returns:
        list of cleaned strings. Items that are not strings are skipped, so
        the result can be shorter than the input.
    """
    removal_patterns = (
        r"특파원",
        r"기자",
        r"\((.*?)\)",
        r"\[(.*?)\]",
        r"\{(.*?)\}",
        r"\【(.*?)\】",
        # Alternation, not a character class: "[YTN,OSEN]" would delete every
        # single Y, T, N, O, S, E and comma instead of the two outlet names.
        r"YTN|OSEN",
        r"[=_\.,;:~…\"\"\'\'◇%\<\>/·○★☆]",
        r"\t",
        r"\n",
        r"\r",
    )
    compiled_patterns = [re.compile(pattern) for pattern in removal_patterns]

    cleantext = []
    for line in text:
        try:
            clean = line
            for pattern in compiled_patterns:
                clean = pattern.sub('', clean)
        except TypeError:
            # Non-string input cannot be cleaned; skip it as before, but let
            # every other exception surface instead of being swallowed.
            continue
        cleantext.append(clean)
    return cleantext

In [14]:
def morphs_process(lines, tagger):
    """Keep only the noun and verb morphemes of each line.

    Args:
        lines: iterable of strings to analyse.
        tagger: object whose ``pos(line)`` returns a sequence of
            (morpheme, tag) pairs.

    Returns:
        list with one entry per input line; each entry is the list of
        morphemes whose tag is 'Noun' or 'Verb', in their original order.
    """
    wanted_tags = ['Noun', 'Verb']
    return [
        [pair[0] for pair in tagger.pos(line) if pair[1] in wanted_tags]
        for line in lines
    ]

In [93]:
# Preprocess a single article end-to-end: read, clean, keep nouns/verbs,
# then encode as a padded index sequence.
Xdata=[]
# Context manager so the file handle is closed once the article is read.
with open('./newsData/0/정치17.txt',encoding='utf-8') as f:
    Xdata.append(f.read())
Xdata = text_cleaner(Xdata)
# Separate name for the morphological analyser so `tokenizer` only ever
# refers to the Keras Tokenizer.
okt_tagger = Okt()
Xdata = morphs_process(Xdata, okt_tagger)
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(Xdata)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(Xdata)
seqlen = np.array([len(sequence) for sequence in sequences])
# Not the last expression of the cell, so this histogram is computed but not displayed.
np.histogram(seqlen, bins=50)
xTest = getSequence(Xdata, MAX_FEATURES, word_index)

In [39]:
# Category name -> numeric label; the label is also the sub-folder name on disk.
categoriesFolder = {"정치":0,"경제":1,"사회":2,"생활문화":3,"세계":4,"IT과학":5,"연예":6,"스포츠":7}
Xtrain = []
Ytrain = []
Xtest = []
Ytest = []
Xdata = []
Ydata = []
def readNewsTrainData(categoryName,newsNumber,dataDir='./newsData'):
    """Load news articles from disk into the module-level Xdata / Ydata lists.

    Files are expected at ``<dataDir>/<label>/<category><n>.txt`` for every
    category in ``categoriesFolder`` and every n in 1..newsNumber. Each file
    is read as UTF-8; if that fails to decode, it is re-read as cp949.
    The article text is appended to ``Xdata`` and the label (as a string) to
    ``Ydata``, and a "<path> saved" line is printed per file.

    Args:
        categoryName: only the value "all" triggers loading; any other value
            leaves Xdata / Ydata untouched.
        newsNumber: number of articles to read per category.
        dataDir: root directory containing the per-label sub-folders.

    Raises:
        FileNotFoundError: if an expected article file does not exist.
    """
    if categoryName != "all":
        return
    for n in range(1, newsNumber + 1):
        for folder, label in categoriesFolder.items():
            folderName = str(label)
            articlePath = os.path.join(dataDir, folderName, folder + str(n) + ".txt")
            try:
                with open(articlePath, encoding='utf-8') as f:
                    content = f.read()
            except UnicodeDecodeError:
                # Some articles are stored in the legacy Korean code page.
                with open(articlePath, 'rb') as f:
                    content = f.read().decode('cp949')
            Xdata.append(content)
            Ydata.append(folderName)
            print(articlePath + " saved")

In [40]:
readNewsTrainData("all",1000)

./newsData\0\정치1.txt saved
./newsData\1\경제1.txt saved
./newsData\2\사회1.txt saved
./newsData\3\생활문화1.txt saved
./newsData\4\세계1.txt saved
./newsData\5\IT과학1.txt saved
./newsData\6\연예1.txt saved
./newsData\7\스포츠1.txt saved
./newsData\0\정치2.txt saved
./newsData\1\경제2.txt saved
./newsData\2\사회2.txt saved
./newsData\3\생활문화2.txt saved
./newsData\4\세계2.txt saved
./newsData\5\IT과학2.txt saved
./newsData\6\연예2.txt saved
./newsData\7\스포츠2.txt saved
./newsData\0\정치3.txt saved
./newsData\1\경제3.txt saved
./newsData\2\사회3.txt saved
./newsData\3\생활문화3.txt saved
./newsData\4\세계3.txt saved
./newsData\5\IT과학3.txt saved
./newsData\6\연예3.txt saved
./newsData\7\스포츠3.txt saved
./newsData\0\정치4.txt saved
./newsData\1\경제4.txt saved
./newsData\2\사회4.txt saved
./newsData\3\생활문화4.txt saved
./newsData\4\세계4.txt saved
./newsData\5\IT과학4.txt saved
./newsData\6\연예4.txt saved
./newsData\7\스포츠4.txt saved
./newsData\0\정치5.txt saved
./newsData\1\경제5.txt saved
./newsData\2\사회5.txt saved
./newsData\3\생활문화5.txt saved
./news

./newsData\2\사회62.txt saved
./newsData\3\생활문화62.txt saved
./newsData\4\세계62.txt saved
./newsData\5\IT과학62.txt saved
./newsData\6\연예62.txt saved
./newsData\7\스포츠62.txt saved
./newsData\0\정치63.txt saved
./newsData\1\경제63.txt saved
./newsData\2\사회63.txt saved
./newsData\3\생활문화63.txt saved
./newsData\4\세계63.txt saved
./newsData\5\IT과학63.txt saved
./newsData\6\연예63.txt saved
./newsData\7\스포츠63.txt saved
./newsData\0\정치64.txt saved
./newsData\1\경제64.txt saved
./newsData\2\사회64.txt saved
./newsData\3\생활문화64.txt saved
./newsData\4\세계64.txt saved
./newsData\5\IT과학64.txt saved
./newsData\6\연예64.txt saved
./newsData\7\스포츠64.txt saved
./newsData\0\정치65.txt saved
./newsData\1\경제65.txt saved
./newsData\2\사회65.txt saved
./newsData\3\생활문화65.txt saved
./newsData\4\세계65.txt saved
./newsData\5\IT과학65.txt saved
./newsData\6\연예65.txt saved
./newsData\7\스포츠65.txt saved
./newsData\0\정치66.txt saved
./newsData\1\경제66.txt saved
./newsData\2\사회66.txt saved
./newsData\3\생활문화66.txt saved
./newsData\4\세계66.txt save

./newsData\0\정치117.txt saved
./newsData\1\경제117.txt saved
./newsData\2\사회117.txt saved
./newsData\3\생활문화117.txt saved
./newsData\4\세계117.txt saved
./newsData\5\IT과학117.txt saved
./newsData\6\연예117.txt saved
./newsData\7\스포츠117.txt saved
./newsData\0\정치118.txt saved
./newsData\1\경제118.txt saved
./newsData\2\사회118.txt saved
./newsData\3\생활문화118.txt saved
./newsData\4\세계118.txt saved
./newsData\5\IT과학118.txt saved
./newsData\6\연예118.txt saved
./newsData\7\스포츠118.txt saved
./newsData\0\정치119.txt saved
./newsData\1\경제119.txt saved
./newsData\2\사회119.txt saved
./newsData\3\생활문화119.txt saved
./newsData\4\세계119.txt saved
./newsData\5\IT과학119.txt saved
./newsData\6\연예119.txt saved
./newsData\7\스포츠119.txt saved
./newsData\0\정치120.txt saved
./newsData\1\경제120.txt saved
./newsData\2\사회120.txt saved
./newsData\3\생활문화120.txt saved
./newsData\4\세계120.txt saved
./newsData\5\IT과학120.txt saved
./newsData\6\연예120.txt saved
./newsData\7\스포츠120.txt saved
./newsData\0\정치121.txt saved
./newsData\1\경제121.txt 

./newsData\4\세계171.txt saved
./newsData\5\IT과학171.txt saved
./newsData\6\연예171.txt saved
./newsData\7\스포츠171.txt saved
./newsData\0\정치172.txt saved
./newsData\1\경제172.txt saved
./newsData\2\사회172.txt saved
./newsData\3\생활문화172.txt saved
./newsData\4\세계172.txt saved
./newsData\5\IT과학172.txt saved
./newsData\6\연예172.txt saved
./newsData\7\스포츠172.txt saved
./newsData\0\정치173.txt saved
./newsData\1\경제173.txt saved
./newsData\2\사회173.txt saved
./newsData\3\생활문화173.txt saved
./newsData\4\세계173.txt saved
./newsData\5\IT과학173.txt saved
./newsData\6\연예173.txt saved
./newsData\7\스포츠173.txt saved
./newsData\0\정치174.txt saved
./newsData\1\경제174.txt saved
./newsData\2\사회174.txt saved
./newsData\3\생활문화174.txt saved
./newsData\4\세계174.txt saved
./newsData\5\IT과학174.txt saved
./newsData\6\연예174.txt saved
./newsData\7\스포츠174.txt saved
./newsData\0\정치175.txt saved
./newsData\1\경제175.txt saved
./newsData\2\사회175.txt saved
./newsData\3\생활문화175.txt saved
./newsData\4\세계175.txt saved
./newsData\5\IT과학175.tx

./newsData\6\연예207.txt saved
./newsData\7\스포츠207.txt saved
./newsData\0\정치208.txt saved
./newsData\1\경제208.txt saved
./newsData\2\사회208.txt saved
./newsData\3\생활문화208.txt saved
./newsData\4\세계208.txt saved
./newsData\5\IT과학208.txt saved
./newsData\6\연예208.txt saved
./newsData\7\스포츠208.txt saved
./newsData\0\정치209.txt saved
./newsData\1\경제209.txt saved
./newsData\2\사회209.txt saved
./newsData\3\생활문화209.txt saved
./newsData\4\세계209.txt saved
./newsData\5\IT과학209.txt saved
./newsData\6\연예209.txt saved
./newsData\7\스포츠209.txt saved
./newsData\0\정치210.txt saved
./newsData\1\경제210.txt saved
./newsData\2\사회210.txt saved
./newsData\3\생활문화210.txt saved
./newsData\4\세계210.txt saved
./newsData\5\IT과학210.txt saved
./newsData\6\연예210.txt saved
./newsData\7\스포츠210.txt saved
./newsData\0\정치211.txt saved
./newsData\1\경제211.txt saved
./newsData\2\사회211.txt saved
./newsData\3\생활문화211.txt saved
./newsData\4\세계211.txt saved
./newsData\5\IT과학211.txt saved
./newsData\6\연예211.txt saved
./newsData\7\스포츠211.txt

./newsData\3\생활문화244.txt saved
./newsData\4\세계244.txt saved
./newsData\5\IT과학244.txt saved
./newsData\6\연예244.txt saved
./newsData\7\스포츠244.txt saved
./newsData\0\정치245.txt saved
./newsData\1\경제245.txt saved
./newsData\2\사회245.txt saved
./newsData\3\생활문화245.txt saved
./newsData\4\세계245.txt saved
./newsData\5\IT과학245.txt saved
./newsData\6\연예245.txt saved
./newsData\7\스포츠245.txt saved
./newsData\0\정치246.txt saved
./newsData\1\경제246.txt saved
./newsData\2\사회246.txt saved
./newsData\3\생활문화246.txt saved
./newsData\4\세계246.txt saved
./newsData\5\IT과학246.txt saved
./newsData\6\연예246.txt saved
./newsData\7\스포츠246.txt saved
./newsData\0\정치247.txt saved
./newsData\1\경제247.txt saved
./newsData\2\사회247.txt saved
./newsData\3\생활문화247.txt saved
./newsData\4\세계247.txt saved
./newsData\5\IT과학247.txt saved
./newsData\6\연예247.txt saved
./newsData\7\스포츠247.txt saved
./newsData\0\정치248.txt saved
./newsData\1\경제248.txt saved
./newsData\2\사회248.txt saved
./newsData\3\생활문화248.txt saved
./newsData\4\세계248.tx

./newsData\2\사회279.txt saved
./newsData\3\생활문화279.txt saved
./newsData\4\세계279.txt saved
./newsData\5\IT과학279.txt saved
./newsData\6\연예279.txt saved
./newsData\7\스포츠279.txt saved
./newsData\0\정치280.txt saved
./newsData\1\경제280.txt saved
./newsData\2\사회280.txt saved
./newsData\3\생활문화280.txt saved
./newsData\4\세계280.txt saved
./newsData\5\IT과학280.txt saved
./newsData\6\연예280.txt saved
./newsData\7\스포츠280.txt saved
./newsData\0\정치281.txt saved
./newsData\1\경제281.txt saved
./newsData\2\사회281.txt saved
./newsData\3\생활문화281.txt saved
./newsData\4\세계281.txt saved
./newsData\5\IT과학281.txt saved
./newsData\6\연예281.txt saved
./newsData\7\스포츠281.txt saved
./newsData\0\정치282.txt saved
./newsData\1\경제282.txt saved
./newsData\2\사회282.txt saved
./newsData\3\생활문화282.txt saved
./newsData\4\세계282.txt saved
./newsData\5\IT과학282.txt saved
./newsData\6\연예282.txt saved
./newsData\7\스포츠282.txt saved
./newsData\0\정치283.txt saved
./newsData\1\경제283.txt saved
./newsData\2\사회283.txt saved
./newsData\3\생활문화283.tx

./newsData\1\경제314.txt saved
./newsData\2\사회314.txt saved
./newsData\3\생활문화314.txt saved
./newsData\4\세계314.txt saved
./newsData\5\IT과학314.txt saved
./newsData\6\연예314.txt saved
./newsData\7\스포츠314.txt saved
./newsData\0\정치315.txt saved
./newsData\1\경제315.txt saved
./newsData\2\사회315.txt saved
./newsData\3\생활문화315.txt saved
./newsData\4\세계315.txt saved
./newsData\5\IT과학315.txt saved
./newsData\6\연예315.txt saved
./newsData\7\스포츠315.txt saved
./newsData\0\정치316.txt saved
./newsData\1\경제316.txt saved
./newsData\2\사회316.txt saved
./newsData\3\생활문화316.txt saved
./newsData\4\세계316.txt saved
./newsData\5\IT과학316.txt saved
./newsData\6\연예316.txt saved
./newsData\7\스포츠316.txt saved
./newsData\0\정치317.txt saved
./newsData\1\경제317.txt saved
./newsData\2\사회317.txt saved
./newsData\3\생활문화317.txt saved
./newsData\4\세계317.txt saved
./newsData\5\IT과학317.txt saved
./newsData\6\연예317.txt saved
./newsData\7\스포츠317.txt saved
./newsData\0\정치318.txt saved
./newsData\1\경제318.txt saved
./newsData\2\사회318.txt 

./newsData\3\생활문화350.txt saved
./newsData\4\세계350.txt saved
./newsData\5\IT과학350.txt saved
./newsData\6\연예350.txt saved
./newsData\7\스포츠350.txt saved
./newsData\0\정치351.txt saved
./newsData\1\경제351.txt saved
./newsData\2\사회351.txt saved
./newsData\3\생활문화351.txt saved
./newsData\4\세계351.txt saved
./newsData\5\IT과학351.txt saved
./newsData\6\연예351.txt saved
./newsData\7\스포츠351.txt saved
./newsData\0\정치352.txt saved
./newsData\1\경제352.txt saved
./newsData\2\사회352.txt saved
./newsData\3\생활문화352.txt saved
./newsData\4\세계352.txt saved
./newsData\5\IT과학352.txt saved
./newsData\6\연예352.txt saved
./newsData\7\스포츠352.txt saved
./newsData\0\정치353.txt saved
./newsData\1\경제353.txt saved
./newsData\2\사회353.txt saved
./newsData\3\생활문화353.txt saved
./newsData\4\세계353.txt saved
./newsData\5\IT과학353.txt saved
./newsData\6\연예353.txt saved
./newsData\7\스포츠353.txt saved
./newsData\0\정치354.txt saved
./newsData\1\경제354.txt saved
./newsData\2\사회354.txt saved
./newsData\3\생활문화354.txt saved
./newsData\4\세계354.tx

./newsData\2\사회386.txt saved
./newsData\3\생활문화386.txt saved
./newsData\4\세계386.txt saved
./newsData\5\IT과학386.txt saved
./newsData\6\연예386.txt saved
./newsData\7\스포츠386.txt saved
./newsData\0\정치387.txt saved
./newsData\1\경제387.txt saved
./newsData\2\사회387.txt saved
./newsData\3\생활문화387.txt saved
./newsData\4\세계387.txt saved
./newsData\5\IT과학387.txt saved
./newsData\6\연예387.txt saved
./newsData\7\스포츠387.txt saved
./newsData\0\정치388.txt saved
./newsData\1\경제388.txt saved
./newsData\2\사회388.txt saved
./newsData\3\생활문화388.txt saved
./newsData\4\세계388.txt saved
./newsData\5\IT과학388.txt saved
./newsData\6\연예388.txt saved
./newsData\7\스포츠388.txt saved
./newsData\0\정치389.txt saved
./newsData\1\경제389.txt saved
./newsData\2\사회389.txt saved
./newsData\3\생활문화389.txt saved
./newsData\4\세계389.txt saved
./newsData\5\IT과학389.txt saved
./newsData\6\연예389.txt saved
./newsData\7\스포츠389.txt saved
./newsData\0\정치390.txt saved
./newsData\1\경제390.txt saved
./newsData\2\사회390.txt saved
./newsData\3\생활문화390.tx

./newsData\3\생활문화421.txt saved
./newsData\4\세계421.txt saved
./newsData\5\IT과학421.txt saved
./newsData\6\연예421.txt saved
./newsData\7\스포츠421.txt saved
./newsData\0\정치422.txt saved
./newsData\1\경제422.txt saved
./newsData\2\사회422.txt saved
./newsData\3\생활문화422.txt saved
./newsData\4\세계422.txt saved
./newsData\5\IT과학422.txt saved
./newsData\6\연예422.txt saved
./newsData\7\스포츠422.txt saved
./newsData\0\정치423.txt saved
./newsData\1\경제423.txt saved
./newsData\2\사회423.txt saved
./newsData\3\생활문화423.txt saved
./newsData\4\세계423.txt saved
./newsData\5\IT과학423.txt saved
./newsData\6\연예423.txt saved
./newsData\7\스포츠423.txt saved
./newsData\0\정치424.txt saved
./newsData\1\경제424.txt saved
./newsData\2\사회424.txt saved
./newsData\3\생활문화424.txt saved
./newsData\4\세계424.txt saved
./newsData\5\IT과학424.txt saved
./newsData\6\연예424.txt saved
./newsData\7\스포츠424.txt saved
./newsData\0\정치425.txt saved
./newsData\1\경제425.txt saved
./newsData\2\사회425.txt saved
./newsData\3\생활문화425.txt saved
./newsData\4\세계425.tx

./newsData\0\정치459.txt saved
./newsData\1\경제459.txt saved
./newsData\2\사회459.txt saved
./newsData\3\생활문화459.txt saved
./newsData\4\세계459.txt saved
./newsData\5\IT과학459.txt saved
./newsData\6\연예459.txt saved
./newsData\7\스포츠459.txt saved
./newsData\0\정치460.txt saved
./newsData\1\경제460.txt saved
./newsData\2\사회460.txt saved
./newsData\3\생활문화460.txt saved
./newsData\4\세계460.txt saved
./newsData\5\IT과학460.txt saved
./newsData\6\연예460.txt saved
./newsData\7\스포츠460.txt saved
./newsData\0\정치461.txt saved
./newsData\1\경제461.txt saved
./newsData\2\사회461.txt saved
./newsData\3\생활문화461.txt saved
./newsData\4\세계461.txt saved
./newsData\5\IT과학461.txt saved
./newsData\6\연예461.txt saved
./newsData\7\스포츠461.txt saved
./newsData\0\정치462.txt saved
./newsData\1\경제462.txt saved
./newsData\2\사회462.txt saved
./newsData\3\생활문화462.txt saved
./newsData\4\세계462.txt saved
./newsData\5\IT과학462.txt saved
./newsData\6\연예462.txt saved
./newsData\7\스포츠462.txt saved
./newsData\0\정치463.txt saved
./newsData\1\경제463.txt 

./newsData\1\경제496.txt saved
./newsData\2\사회496.txt saved
./newsData\3\생활문화496.txt saved
./newsData\4\세계496.txt saved
./newsData\5\IT과학496.txt saved
./newsData\6\연예496.txt saved
./newsData\7\스포츠496.txt saved
./newsData\0\정치497.txt saved
./newsData\1\경제497.txt saved
./newsData\2\사회497.txt saved
./newsData\3\생활문화497.txt saved
./newsData\4\세계497.txt saved
./newsData\5\IT과학497.txt saved
./newsData\6\연예497.txt saved
./newsData\7\스포츠497.txt saved
./newsData\0\정치498.txt saved
./newsData\1\경제498.txt saved
./newsData\2\사회498.txt saved
./newsData\3\생활문화498.txt saved
./newsData\4\세계498.txt saved
./newsData\5\IT과학498.txt saved
./newsData\6\연예498.txt saved
./newsData\7\스포츠498.txt saved
./newsData\0\정치499.txt saved
./newsData\1\경제499.txt saved
./newsData\2\사회499.txt saved
./newsData\3\생활문화499.txt saved
./newsData\4\세계499.txt saved
./newsData\5\IT과학499.txt saved
./newsData\6\연예499.txt saved
./newsData\7\스포츠499.txt saved
./newsData\0\정치500.txt saved
./newsData\1\경제500.txt saved
./newsData\2\사회500.txt 

./newsData\5\IT과학531.txt saved
./newsData\6\연예531.txt saved
./newsData\7\스포츠531.txt saved
./newsData\0\정치532.txt saved
./newsData\1\경제532.txt saved
./newsData\2\사회532.txt saved
./newsData\3\생활문화532.txt saved
./newsData\4\세계532.txt saved
./newsData\5\IT과학532.txt saved
./newsData\6\연예532.txt saved
./newsData\7\스포츠532.txt saved
./newsData\0\정치533.txt saved
./newsData\1\경제533.txt saved
./newsData\2\사회533.txt saved
./newsData\3\생활문화533.txt saved
./newsData\4\세계533.txt saved
./newsData\5\IT과학533.txt saved
./newsData\6\연예533.txt saved
./newsData\7\스포츠533.txt saved
./newsData\0\정치534.txt saved
./newsData\1\경제534.txt saved
./newsData\2\사회534.txt saved
./newsData\3\생활문화534.txt saved
./newsData\4\세계534.txt saved
./newsData\5\IT과학534.txt saved
./newsData\6\연예534.txt saved
./newsData\7\스포츠534.txt saved
./newsData\0\정치535.txt saved
./newsData\1\경제535.txt saved
./newsData\2\사회535.txt saved
./newsData\3\생활문화535.txt saved
./newsData\4\세계535.txt saved
./newsData\5\IT과학535.txt saved
./newsData\6\연예535.tx

./newsData\0\정치569.txt saved
./newsData\1\경제569.txt saved
./newsData\2\사회569.txt saved
./newsData\3\생활문화569.txt saved
./newsData\4\세계569.txt saved
./newsData\5\IT과학569.txt saved
./newsData\6\연예569.txt saved
./newsData\7\스포츠569.txt saved
./newsData\0\정치570.txt saved
./newsData\1\경제570.txt saved
./newsData\2\사회570.txt saved
./newsData\3\생활문화570.txt saved
./newsData\4\세계570.txt saved
./newsData\5\IT과학570.txt saved
./newsData\6\연예570.txt saved
./newsData\7\스포츠570.txt saved
./newsData\0\정치571.txt saved
./newsData\1\경제571.txt saved
./newsData\2\사회571.txt saved
./newsData\3\생활문화571.txt saved
./newsData\4\세계571.txt saved
./newsData\5\IT과학571.txt saved
./newsData\6\연예571.txt saved
./newsData\7\스포츠571.txt saved
./newsData\0\정치572.txt saved
./newsData\1\경제572.txt saved
./newsData\2\사회572.txt saved
./newsData\3\생활문화572.txt saved
./newsData\4\세계572.txt saved
./newsData\5\IT과학572.txt saved
./newsData\6\연예572.txt saved
./newsData\7\스포츠572.txt saved
./newsData\0\정치573.txt saved
./newsData\1\경제573.txt 

./newsData\2\사회604.txt saved
./newsData\3\생활문화604.txt saved
./newsData\4\세계604.txt saved
./newsData\5\IT과학604.txt saved
./newsData\6\연예604.txt saved
./newsData\7\스포츠604.txt saved
./newsData\0\정치605.txt saved
./newsData\1\경제605.txt saved
./newsData\2\사회605.txt saved
./newsData\3\생활문화605.txt saved
./newsData\4\세계605.txt saved
./newsData\5\IT과학605.txt saved
./newsData\6\연예605.txt saved
./newsData\7\스포츠605.txt saved
./newsData\0\정치606.txt saved
./newsData\1\경제606.txt saved
./newsData\2\사회606.txt saved
./newsData\3\생활문화606.txt saved
./newsData\4\세계606.txt saved
./newsData\5\IT과학606.txt saved
./newsData\6\연예606.txt saved
./newsData\7\스포츠606.txt saved
./newsData\0\정치607.txt saved
./newsData\1\경제607.txt saved
./newsData\2\사회607.txt saved
./newsData\3\생활문화607.txt saved
./newsData\4\세계607.txt saved
./newsData\5\IT과학607.txt saved
./newsData\6\연예607.txt saved
./newsData\7\스포츠607.txt saved
./newsData\0\정치608.txt saved
./newsData\1\경제608.txt saved
./newsData\2\사회608.txt saved
./newsData\3\생활문화608.tx

./newsData\4\세계639.txt saved
./newsData\5\IT과학639.txt saved
./newsData\6\연예639.txt saved
./newsData\7\스포츠639.txt saved
./newsData\0\정치640.txt saved
./newsData\1\경제640.txt saved
./newsData\2\사회640.txt saved
./newsData\3\생활문화640.txt saved
./newsData\4\세계640.txt saved
./newsData\5\IT과학640.txt saved
./newsData\6\연예640.txt saved
./newsData\7\스포츠640.txt saved
./newsData\0\정치641.txt saved
./newsData\1\경제641.txt saved
./newsData\2\사회641.txt saved
./newsData\3\생활문화641.txt saved
./newsData\4\세계641.txt saved
./newsData\5\IT과학641.txt saved
./newsData\6\연예641.txt saved
./newsData\7\스포츠641.txt saved
./newsData\0\정치642.txt saved
./newsData\1\경제642.txt saved
./newsData\2\사회642.txt saved
./newsData\3\생활문화642.txt saved
./newsData\4\세계642.txt saved
./newsData\5\IT과학642.txt saved
./newsData\6\연예642.txt saved
./newsData\7\스포츠642.txt saved
./newsData\0\정치643.txt saved
./newsData\1\경제643.txt saved
./newsData\2\사회643.txt saved
./newsData\3\생활문화643.txt saved
./newsData\4\세계643.txt saved
./newsData\5\IT과학643.tx

./newsData\2\사회675.txt saved
./newsData\3\생활문화675.txt saved
./newsData\4\세계675.txt saved
./newsData\5\IT과학675.txt saved
./newsData\6\연예675.txt saved
./newsData\7\스포츠675.txt saved
./newsData\0\정치676.txt saved
./newsData\1\경제676.txt saved
./newsData\2\사회676.txt saved
./newsData\3\생활문화676.txt saved
./newsData\4\세계676.txt saved
./newsData\5\IT과학676.txt saved
./newsData\6\연예676.txt saved
./newsData\7\스포츠676.txt saved
./newsData\0\정치677.txt saved
./newsData\1\경제677.txt saved
./newsData\2\사회677.txt saved
./newsData\3\생활문화677.txt saved
./newsData\4\세계677.txt saved
./newsData\5\IT과학677.txt saved
./newsData\6\연예677.txt saved
./newsData\7\스포츠677.txt saved
./newsData\0\정치678.txt saved
./newsData\1\경제678.txt saved
./newsData\2\사회678.txt saved
./newsData\3\생활문화678.txt saved
./newsData\4\세계678.txt saved
./newsData\5\IT과학678.txt saved
./newsData\6\연예678.txt saved
./newsData\7\스포츠678.txt saved
./newsData\0\정치679.txt saved
./newsData\1\경제679.txt saved
./newsData\2\사회679.txt saved
./newsData\3\생활문화679.tx

./newsData\2\사회711.txt saved
./newsData\3\생활문화711.txt saved
./newsData\4\세계711.txt saved
./newsData\5\IT과학711.txt saved
./newsData\6\연예711.txt saved
./newsData\7\스포츠711.txt saved
./newsData\0\정치712.txt saved
./newsData\1\경제712.txt saved
./newsData\2\사회712.txt saved
./newsData\3\생활문화712.txt saved
./newsData\4\세계712.txt saved
./newsData\5\IT과학712.txt saved
./newsData\6\연예712.txt saved
./newsData\7\스포츠712.txt saved
./newsData\0\정치713.txt saved
./newsData\1\경제713.txt saved
./newsData\2\사회713.txt saved
./newsData\3\생활문화713.txt saved
./newsData\4\세계713.txt saved
./newsData\5\IT과학713.txt saved
./newsData\6\연예713.txt saved
./newsData\7\스포츠713.txt saved
./newsData\0\정치714.txt saved
./newsData\1\경제714.txt saved
./newsData\2\사회714.txt saved
./newsData\3\생활문화714.txt saved
./newsData\4\세계714.txt saved
./newsData\5\IT과학714.txt saved
./newsData\6\연예714.txt saved
./newsData\7\스포츠714.txt saved
./newsData\0\정치715.txt saved
./newsData\1\경제715.txt saved
./newsData\2\사회715.txt saved
./newsData\3\생활문화715.tx

./newsData\7\스포츠747.txt saved
./newsData\0\정치748.txt saved
./newsData\1\경제748.txt saved
./newsData\2\사회748.txt saved
./newsData\3\생활문화748.txt saved
./newsData\4\세계748.txt saved
./newsData\5\IT과학748.txt saved
./newsData\6\연예748.txt saved
./newsData\7\스포츠748.txt saved
./newsData\0\정치749.txt saved
./newsData\1\경제749.txt saved
./newsData\2\사회749.txt saved
./newsData\3\생활문화749.txt saved
./newsData\4\세계749.txt saved
./newsData\5\IT과학749.txt saved
./newsData\6\연예749.txt saved
./newsData\7\스포츠749.txt saved
./newsData\0\정치750.txt saved
./newsData\1\경제750.txt saved
./newsData\2\사회750.txt saved
./newsData\3\생활문화750.txt saved
./newsData\4\세계750.txt saved
./newsData\5\IT과학750.txt saved
./newsData\6\연예750.txt saved
./newsData\7\스포츠750.txt saved
./newsData\0\정치751.txt saved
./newsData\1\경제751.txt saved
./newsData\2\사회751.txt saved
./newsData\3\생활문화751.txt saved
./newsData\4\세계751.txt saved
./newsData\5\IT과학751.txt saved
./newsData\6\연예751.txt saved
./newsData\7\스포츠751.txt saved
./newsData\0\정치752.txt

./newsData\1\경제783.txt saved
./newsData\2\사회783.txt saved
./newsData\3\생활문화783.txt saved
./newsData\4\세계783.txt saved
./newsData\5\IT과학783.txt saved
./newsData\6\연예783.txt saved
./newsData\7\스포츠783.txt saved
./newsData\0\정치784.txt saved
./newsData\1\경제784.txt saved
./newsData\2\사회784.txt saved
./newsData\3\생활문화784.txt saved
./newsData\4\세계784.txt saved
./newsData\5\IT과학784.txt saved
./newsData\6\연예784.txt saved
./newsData\7\스포츠784.txt saved
./newsData\0\정치785.txt saved
./newsData\1\경제785.txt saved
./newsData\2\사회785.txt saved
./newsData\3\생활문화785.txt saved
./newsData\4\세계785.txt saved
./newsData\5\IT과학785.txt saved
./newsData\6\연예785.txt saved
./newsData\7\스포츠785.txt saved
./newsData\0\정치786.txt saved
./newsData\1\경제786.txt saved
./newsData\2\사회786.txt saved
./newsData\3\생활문화786.txt saved
./newsData\4\세계786.txt saved
./newsData\5\IT과학786.txt saved
./newsData\6\연예786.txt saved
./newsData\7\스포츠786.txt saved
./newsData\0\정치787.txt saved
./newsData\1\경제787.txt saved
./newsData\2\사회787.txt 

./newsData\7\스포츠818.txt saved
./newsData\0\정치819.txt saved
./newsData\1\경제819.txt saved
./newsData\2\사회819.txt saved
./newsData\3\생활문화819.txt saved
./newsData\4\세계819.txt saved
./newsData\5\IT과학819.txt saved
./newsData\6\연예819.txt saved
./newsData\7\스포츠819.txt saved
./newsData\0\정치820.txt saved
./newsData\1\경제820.txt saved
./newsData\2\사회820.txt saved
./newsData\3\생활문화820.txt saved
./newsData\4\세계820.txt saved
./newsData\5\IT과학820.txt saved
./newsData\6\연예820.txt saved
./newsData\7\스포츠820.txt saved
./newsData\0\정치821.txt saved
./newsData\1\경제821.txt saved
./newsData\2\사회821.txt saved
./newsData\3\생활문화821.txt saved
./newsData\4\세계821.txt saved
./newsData\5\IT과학821.txt saved
./newsData\6\연예821.txt saved
./newsData\7\스포츠821.txt saved
./newsData\0\정치822.txt saved
./newsData\1\경제822.txt saved
./newsData\2\사회822.txt saved
./newsData\3\생활문화822.txt saved
./newsData\4\세계822.txt saved
./newsData\5\IT과학822.txt saved
./newsData\6\연예822.txt saved
./newsData\7\스포츠822.txt saved
./newsData\0\정치823.txt

./newsData\7\스포츠854.txt saved
./newsData\0\정치855.txt saved
./newsData\1\경제855.txt saved
./newsData\2\사회855.txt saved
./newsData\3\생활문화855.txt saved
./newsData\4\세계855.txt saved
./newsData\5\IT과학855.txt saved
./newsData\6\연예855.txt saved
./newsData\7\스포츠855.txt saved
./newsData\0\정치856.txt saved
./newsData\1\경제856.txt saved
./newsData\2\사회856.txt saved
./newsData\3\생활문화856.txt saved
./newsData\4\세계856.txt saved
./newsData\5\IT과학856.txt saved
./newsData\6\연예856.txt saved
./newsData\7\스포츠856.txt saved
./newsData\0\정치857.txt saved
./newsData\1\경제857.txt saved
./newsData\2\사회857.txt saved
./newsData\3\생활문화857.txt saved
./newsData\4\세계857.txt saved
./newsData\5\IT과학857.txt saved
./newsData\6\연예857.txt saved
./newsData\7\스포츠857.txt saved
./newsData\0\정치858.txt saved
./newsData\1\경제858.txt saved
./newsData\2\사회858.txt saved
./newsData\3\생활문화858.txt saved
./newsData\4\세계858.txt saved
./newsData\5\IT과학858.txt saved
./newsData\6\연예858.txt saved
./newsData\7\스포츠858.txt saved
./newsData\0\정치859.txt

./newsData\4\세계889.txt saved
./newsData\5\IT과학889.txt saved
./newsData\6\연예889.txt saved
./newsData\7\스포츠889.txt saved
./newsData\0\정치890.txt saved
./newsData\1\경제890.txt saved
./newsData\2\사회890.txt saved
./newsData\3\생활문화890.txt saved
./newsData\4\세계890.txt saved
./newsData\5\IT과학890.txt saved
./newsData\6\연예890.txt saved
./newsData\7\스포츠890.txt saved
./newsData\0\정치891.txt saved
./newsData\1\경제891.txt saved
./newsData\2\사회891.txt saved
./newsData\3\생활문화891.txt saved
./newsData\4\세계891.txt saved
./newsData\5\IT과학891.txt saved
./newsData\6\연예891.txt saved
./newsData\7\스포츠891.txt saved
./newsData\0\정치892.txt saved
./newsData\1\경제892.txt saved
./newsData\2\사회892.txt saved
./newsData\3\생활문화892.txt saved
./newsData\4\세계892.txt saved
./newsData\5\IT과학892.txt saved
./newsData\6\연예892.txt saved
./newsData\7\스포츠892.txt saved
./newsData\0\정치893.txt saved
./newsData\1\경제893.txt saved
./newsData\2\사회893.txt saved
./newsData\3\생활문화893.txt saved
./newsData\4\세계893.txt saved
./newsData\5\IT과학893.tx

./newsData\5\IT과학924.txt saved
./newsData\6\연예924.txt saved
./newsData\7\스포츠924.txt saved
./newsData\0\정치925.txt saved
./newsData\1\경제925.txt saved
./newsData\2\사회925.txt saved
./newsData\3\생활문화925.txt saved
./newsData\4\세계925.txt saved
./newsData\5\IT과학925.txt saved
./newsData\6\연예925.txt saved
./newsData\7\스포츠925.txt saved
./newsData\0\정치926.txt saved
./newsData\1\경제926.txt saved
./newsData\2\사회926.txt saved
./newsData\3\생활문화926.txt saved
./newsData\4\세계926.txt saved
./newsData\5\IT과학926.txt saved
./newsData\6\연예926.txt saved
./newsData\7\스포츠926.txt saved
./newsData\0\정치927.txt saved
./newsData\1\경제927.txt saved
./newsData\2\사회927.txt saved
./newsData\3\생활문화927.txt saved
./newsData\4\세계927.txt saved
./newsData\5\IT과학927.txt saved
./newsData\6\연예927.txt saved
./newsData\7\스포츠927.txt saved
./newsData\0\정치928.txt saved
./newsData\1\경제928.txt saved
./newsData\2\사회928.txt saved
./newsData\3\생활문화928.txt saved
./newsData\4\세계928.txt saved
./newsData\5\IT과학928.txt saved
./newsData\6\연예928.tx

./newsData\0\정치960.txt saved
./newsData\1\경제960.txt saved
./newsData\2\사회960.txt saved
./newsData\3\생활문화960.txt saved
./newsData\4\세계960.txt saved
./newsData\5\IT과학960.txt saved
./newsData\6\연예960.txt saved
./newsData\7\스포츠960.txt saved
./newsData\0\정치961.txt saved
./newsData\1\경제961.txt saved
./newsData\2\사회961.txt saved
./newsData\3\생활문화961.txt saved
./newsData\4\세계961.txt saved
./newsData\5\IT과학961.txt saved
./newsData\6\연예961.txt saved
./newsData\7\스포츠961.txt saved
./newsData\0\정치962.txt saved
./newsData\1\경제962.txt saved
./newsData\2\사회962.txt saved
./newsData\3\생활문화962.txt saved
./newsData\4\세계962.txt saved
./newsData\5\IT과학962.txt saved
./newsData\6\연예962.txt saved
./newsData\7\스포츠962.txt saved
./newsData\0\정치963.txt saved
./newsData\1\경제963.txt saved
./newsData\2\사회963.txt saved
./newsData\3\생활문화963.txt saved
./newsData\4\세계963.txt saved
./newsData\5\IT과학963.txt saved
./newsData\6\연예963.txt saved
./newsData\7\스포츠963.txt saved
./newsData\0\정치964.txt saved
./newsData\1\경제964.txt 

./newsData\0\정치996.txt saved
./newsData\1\경제996.txt saved
./newsData\2\사회996.txt saved
./newsData\3\생활문화996.txt saved
./newsData\4\세계996.txt saved
./newsData\5\IT과학996.txt saved
./newsData\6\연예996.txt saved
./newsData\7\스포츠996.txt saved
./newsData\0\정치997.txt saved
./newsData\1\경제997.txt saved
./newsData\2\사회997.txt saved
./newsData\3\생활문화997.txt saved
./newsData\4\세계997.txt saved
./newsData\5\IT과학997.txt saved
./newsData\6\연예997.txt saved
./newsData\7\스포츠997.txt saved
./newsData\0\정치998.txt saved
./newsData\1\경제998.txt saved
./newsData\2\사회998.txt saved
./newsData\3\생활문화998.txt saved
./newsData\4\세계998.txt saved
./newsData\5\IT과학998.txt saved
./newsData\6\연예998.txt saved
./newsData\7\스포츠998.txt saved
./newsData\0\정치999.txt saved
./newsData\1\경제999.txt saved
./newsData\2\사회999.txt saved
./newsData\3\생활문화999.txt saved
./newsData\4\세계999.txt saved
./newsData\5\IT과학999.txt saved
./newsData\6\연예999.txt saved
./newsData\7\스포츠999.txt saved
./newsData\0\정치1000.txt saved
./newsData\1\경제1000.tx

In [41]:
# Materialise labels and documents as lists so they can be sliced by position.
Ydata2, Xdata2 = list(Ydata), list(Xdata)

# The first 3200 samples become the training set; the remainder is the test set.
Xtrain, Xtest = Xdata2[:3200], Xdata2[3200:]
Ytrain, Ytest = Ydata2[:3200], Ydata2[3200:]

In [42]:
# Run the notebook's text_cleaner over the full corpus and both splits,
# rebinding each name to its cleaned version (same order as before:
# Xdata, Xtrain, Xtest).
Xdata, Xtrain, Xtest = (text_cleaner(part) for part in (Xdata, Xtrain, Xtest))

In [43]:
# Okt morphological analyser used by morphs_process in the next cell.
# NOTE(review): the name `tokenizer` is rebound to a Keras Tokenizer in a later
# cell, so this cell must be re-run before re-running morphs_process.
tokenizer= Okt()

In [44]:
# Apply morphs_process with the current `tokenizer` to both splits and the
# full corpus (processing order unchanged: Xtrain, Xtest, Xdata).
Xtrain, Xtest, Xdata = (
    morphs_process(part, tokenizer) for part in (Xtrain, Xtest, Xdata)
)

In [45]:
# Train a Word2Vec model on Xdata (the output of morphs_process above):
# 300-dimensional vectors, context window of 5, words occurring fewer than
# 10 times are ignored, single worker thread.
# NOTE(review): `size=300` must stay equal to EMBEDDING_DIM, which is only
# defined in a later cell.
from gensim.models import Word2Vec
embedding_model2 = Word2Vec(Xdata,size=300, window=5, min_count=10, workers=1)

In [46]:
# List form of the trained model's vocabulary (iterating `wv.vocab` —
# presumably yields the words themselves; confirm for the installed gensim).
w2vVocab=list(embedding_model2.wv.vocab)

In [47]:
# ebdIdx is defined earlier in the notebook. Its result is used below with
# len() and .get(word) -> vector, so it is presumably a word -> vector dict.
embedding_idx2 = ebdIdx(embedding_model2, w2vVocab)

15515


  """


In [48]:
# Vocabulary size: number of entries in the embedding index built above.
MAX_FEATURES = len(embedding_idx2)
# Fixed sequence length: used as `maxlen` for pad_sequences and as the model's
# input length.
MAX_SEQUENCE_LENGTH = 525
# Embedding width; must match the `size` passed to Word2Vec.
EMBEDDING_DIM = 300
print(MAX_FEATURES)

15515


In [67]:
# NOTE(review): this rebinds `tokenizer`, which an earlier cell bound to the
# Okt analyser; re-running the morphs_process cell after this one would hand
# it a Keras Tokenizer instead.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
# Fit the Keras tokenizer on the corpus and keep its word -> integer index.
tokenizer.fit_on_texts(Xdata)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(Xdata)
# Length of each encoded document; the histogram is shown as the cell output.
seqlen = np.array([len(sequence) for sequence in sequences])
np.histogram(seqlen, bins=50)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0], dtype=int64),
 array([187.5 , 187.52, 187.54, 187.56, 187.58, 187.6 , 187.62, 187.64,
        187.66, 187.68, 187.7 , 187.72, 187.74, 187.76, 187.78, 187.8 ,
        187.82, 187.84, 187.86, 187.88, 187.9 , 187.92, 187.94, 187.96,
        187.98, 188.  , 188.02, 188.04, 188.06, 188.08, 188.1 , 188.12,
        188.14, 188.16, 188.18, 188.2 , 188.22, 188.24, 188.26, 188.28,
        188.3 , 188.32, 188.34, 188.36, 188.38, 188.4 , 188.42, 188.44,
        188.46, 188.48, 188.5 ]))

In [68]:
print('Preparing embedding matrix...')

# Number of rows in the embedding matrix; +1 because of the unknown word.
num_words = min(MAX_FEATURES, len(w2vVocab) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

# Copy each known word's pre-trained vector into the row given by its
# tokenizer index. Rows for words without a vector stay all-zero.
for word, i in word_index.items():
    # Guard against the matrix's real row count: num_words can be smaller than
    # MAX_FEATURES, and indexing past it would raise IndexError.
    if i >= num_words:
        continue
    embedding_vector = embedding_idx2.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised with the pre-trained vectors.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

Preparing embedding matrix...


In [51]:
# Sanity check: expect (num_words, EMBEDDING_DIM).
print(embedding_matrix.shape)

(15515, 300)


In [56]:
def getSequence(text, MAX_FEATURES, word_index):
    """Convert tokenised documents into a padded matrix of word indices.

    Args:
        text: iterable of documents, each an iterable of tokens.
        MAX_FEATURES: exclusive upper bound on the word indices that are kept.
        word_index: mapping from token to integer index.

    Returns:
        numpy array with one row per document, padded at the end
        (``padding='post'``) to the global ``MAX_SEQUENCE_LENGTH``.

    Tokens missing from ``word_index`` and tokens whose index is
    ``>= MAX_FEATURES`` are skipped. Previously a missing token raised
    ``KeyError``.
    """
    seq = []
    for line in text:
        lineseq = []
        for token in line:
            # Single lookup; None marks an out-of-vocabulary token.
            idx = word_index.get(token)
            if idx is not None and idx < MAX_FEATURES:
                lineseq.append(idx)
        seq.append(lineseq)
    seq = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
    return np.array(seq)

In [57]:
# Encode both splits as padded index matrices (train first, then test).
x_train, x_test = (
    getSequence(docs, MAX_FEATURES, word_index) for docs in (Xtrain, Xtest)
)
# One-hot encode the labels of each split.
y_train, y_test = (
    to_categorical(np.asarray(labels)) for labels in (Ytrain, Ytest)
)

In [58]:
# Diagnostics: tensor shapes for both splits.
print('Shape of x_train tensor:', x_train.shape)
print('Shape of y_train tensor:', y_train.shape)
print('Shape of x_test tensor:', x_test.shape)
print('Shape of y_test tensor:', y_test.shape)
# Largest word index in the training set (should be < MAX_FEATURES) and the
# largest label value.
print(np.amax(x_train))
print(np.amax(y_train))
# First encoded document and its label, for eyeballing.
print(x_train[0])
print(y_train[0])

Shape of x_train tensor: (3200, 525)
Shape of y_train tensor: (3200, 8)
Shape of x_test tensor: (4800, 525)
Shape of y_test tensor: (4800, 8)
15448
1.0
[11541  1963  1419   154   904  9919  7942  6243  2581   121    16  6950
  2581 11541  1963  1161  4478     7   154  1419   778  6243    31   173
    49   173   165     3   138   244  1059   109 11541   148    39   779
    19  2129  6843   157  5031   901  1627  3714 13873   157   902  2357
   926  3714   448    19    26  3264   259   571  1106     1  4216   113
  2129    80  4478  2840  7943  4479    95  2478   154  1419     1     8
  3534  3005   127     4     3  3005  4478  7240     7  4216   154  1419
     1  4478    83   196   574    26  3415 14634    20    17     1   154
   653    59   333   254  1303     7  4216   248  1535  7241  8137   154
  5405  1141  2496  1571  1963  9920   654  6614  5405   244   344  4478
 14634   196  9919    95     3  7942  6243     7   952     1  7942   205
   259  9614  3714   865   204  1106  3264   

In [52]:
# Root directory under which all model checkpoints are saved.
BASE_PATH = './model_log_demo/'
# exist_ok=True removes the check-then-create race and makes the cell safe to
# re-run.
os.makedirs(BASE_PATH, exist_ok=True)

In [53]:
def create_checkpoint(model_name):
    """Return a ModelCheckpoint that saves the best-``val_loss`` model.

    Args:
        model_name: sub-directory of ``BASE_PATH`` that receives the files.

    Returns:
        A ``ModelCheckpoint`` monitoring ``val_loss`` with
        ``save_best_only=True``. It writes files named
        ``val_loss-<value>.hdf5`` into ``BASE_PATH/<model_name>``.
    """
    MODEL_PATH = os.path.join(BASE_PATH, model_name)
    # makedirs also creates BASE_PATH when it is missing and tolerates a
    # directory that already exists (no exists()/mkdir() race).
    os.makedirs(MODEL_PATH, exist_ok=True)

    return ModelCheckpoint(filepath=os.path.join(MODEL_PATH, 'val_loss-{val_loss:.4f}.hdf5'),
                           monitor='val_loss',
                           verbose=1,
                           save_best_only=True)

def create_checkpoint2(model_name):
    """Return a ModelCheckpoint that saves the best-``val_acc`` model.

    Args:
        model_name: sub-directory of ``BASE_PATH`` that receives the files.

    Returns:
        A ``ModelCheckpoint`` monitoring ``val_acc`` with
        ``save_best_only=True``. It writes files named
        ``val_acc-<value>.hdf5`` into ``BASE_PATH/<model_name>``.
    """
    MODEL_PATH = os.path.join(BASE_PATH, model_name)
    # makedirs also creates BASE_PATH when it is missing and tolerates a
    # directory that already exists (no exists()/mkdir() race).
    os.makedirs(MODEL_PATH, exist_ok=True)

    return ModelCheckpoint(filepath=os.path.join(MODEL_PATH, 'val_acc-{val_acc:.4f}.hdf5'),
                           monitor='val_acc',
                           verbose=1,
                           save_best_only=True)

In [54]:
# Training hyper-parameters shared by the model.fit call below.
batch_size = 32
max_epochs = 100

# Early-stopping callback watching validation loss with patience=10.
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=10)

In [59]:
# Build the CNN defined at the top of the notebook. CNNModel reads the globals
# MAX_SEQUENCE_LENGTH, EMBEDDING_DIM and embedding_layer, so the embedding
# cell must have been run first.
model = CNNModel()
model.summary()

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
# Two checkpoint callbacks writing into the same 'CNN' directory: one tracks
# the best val_loss, the other the best val_acc.
checkloss = create_checkpoint('CNN') 
checkacc = create_checkpoint2('CNN')
    
print('Training...')
# validation_split=0.15 holds out part of x_train/y_train for validation
# (the log reports 2720 training / 480 validation samples).
history = model.fit(x_train, y_train,
                    batch_size=batch_size, 
                    epochs=max_epochs, 
                    verbose=1, 
                    shuffle=True,
                    callbacks=[checkloss, checkacc, early_stopping], 
                    validation_split=0.15)

W1215 17:10:44.493546  2644 nn_ops.py:4224] Large dropout rate: 0.7 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 525)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 525, 300)     4654500     input_5[0][0]                    
__________________________________________________________________________________________________
reshape_3 (Reshape)             (None, 525, 300, 1)  0           embedding_3[1][0]                
__________________________________________________________________________________________________
conv2d_7 (Conv2D)               (None, 523, 1, 128)  115328      reshape_3[0][0]                  
__________________________________________________________________________________________________
conv2d_8 (

W1215 17:10:44.781298  2644 deprecation.py:323] From C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 2720 samples, validate on 480 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 1.11482, saving model to ./model_log_demo/CNN\val_loss-1.1148.hdf5

Epoch 00001: val_acc improved from -inf to 0.69792, saving model to ./model_log_demo/CNN\val_acc-0.6979.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 1.11482 to 0.91032, saving model to ./model_log_demo/CNN\val_loss-0.9103.hdf5

Epoch 00002: val_acc improved from 0.69792 to 0.71250, saving model to ./model_log_demo/CNN\val_acc-0.7125.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.91032 to 0.82446, saving model to ./model_log_demo/CNN\val_loss-0.8245.hdf5

Epoch 00003: val_acc improved from 0.71250 to 0.71667, saving model to ./model_log_demo/CNN\val_acc-0.7167.hdf5
Epoch 4/100

Epoch 00004: val_loss improved from 0.82446 to 0.74737, saving model to ./model_log_demo/CNN\val_loss-0.7474.hdf5

Epoch 00004: val_acc improved from 0.71667 to 0.73542, saving model to ./model_log_demo/CNN\val_acc-0.7354.hdf5


In [78]:
# Sanity check: expect (number of training samples, MAX_SEQUENCE_LENGTH).
x_train.shape

(3200, 525)