In [1]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import random

In [2]:
news_dataset = pd.read_csv("news_dataset.csv", index_col = 0)

In [3]:
news_dataset.head()

Unnamed: 0,date,title,content,label
0,20170101,"朴대통령 ""뇌물죄, 완전히 엮은 것…세월호 허위 걷혀야""(종합)","새해 첫날 청와대서 사실상 기자간담회…직무정지 23일 만에 첫 입장표명""공모나 누구...",BH
1,20170102,"정유라, 덴마크서 불법 체류 혐의로 체포···특검 “송환 협조중” (종합)",[아시아경제 정준영 기자] 이화여대 학사부정 및 삼성 특혜지원 의혹의 수혜자 겸 공...,BH
2,20170103,"[단독]정유라, “(주사 아줌마)누구인지 알 것 같다”…현지 답변태도 분석, 사전 ...",덴마크 올보르 법원에서 잠시 휴정중 기자들의 질문에 답변하는 정유라씨 사진=현지교...,BH
3,20170104,"[단독]""정유라, 이대학장실 등 교내서 교수 6명에 학점취득 코치받아""","[연합뉴스TV제공]김병욱, 교육부 자료 확인…""학점 좋은이유 모른다더니""담당교수들 ...",Politic
4,20170105,"윤전추 ""기억안나. 몰라. 말못해""… 헌재 ""본인범죄 외 답해라""","""외부인 동행 없다"" 주장하다 ""세월호 당일 미용사 태워왔다"" 윤전추 헌재 탄핵심리...",BH


In [4]:
## 총 6개의 뉴스 카테고리 존재 확인
set(news_dataset["label"])

{'Admin', 'BH', 'Con/Party', 'Defence/Diplo', 'North', 'Politic'}

In [5]:
category = ['BH', "Con/Party", "North", "Admin", "Defence/Diplo", "Politic"]
pd.DataFrame({"cate_name" : category}).to_csv("news_class.csv")

In [6]:
classes = pd.read_csv("news_class.csv", index_col = 0)
classes

Unnamed: 0,cate_name
0,BH
1,Con/Party
2,North
3,Admin
4,Defence/Diplo
5,Politic


In [7]:
def make_input(documents):
    max_document_length = 1000
    vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length)
    x = np.array(list(vocab_processor.transform(documents)))
    vocab_dict = vocab_processor.vocabulary_._mapping
    sorted_vocab = sorted(vocab_dict.items(), key = lambda x : x[1])
    vocabulary  = list(list(zip(*sorted_vocab))[0])
    return x, vocabulary, len(vocab_processor.vocabulary_)

In [8]:
def make_output(category):
    classes = pd.read_csv("news_class.csv", index_col = 0)
    one_hot_vectors = np.eye(len(classes), dtype = int)
    class_vectors = {}
    y = []
    for i, cls in enumerate(list(classes["cate_name"])):
        class_vectors[cls] = one_hot_vectors[i]
    for c in category:
        y.append(class_vectors[c])
    return np.array(y)

In [9]:
## 함수에 대한 설명 : ~~

def load_data():
    cnn = pd.read_csv("news_dataset.csv", index_col = 0)
    contents = cnn["content"]
    category = cnn["label"]
    
    x, vocabulary, vocabulary_size = make_input(contents)
    
    y = make_output(category)
    return x, y, vocabulary, vocabulary_size

In [10]:
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras.optimizers import Adam
from keras.models import Model
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [11]:
## data loading

x, y, vocabulary, vocabulary_size = load_data()
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

In [12]:
sequence_length = x.shape[1]
embedding_dim = 256
filter_sizes = [3, 4, 5]
num_filters= 512
drop = 0.5
epochs = 50
batch_size = 30

In [13]:
# this returns a tensor
inputs = Input(shape=(sequence_length,), dtype='int32')
embedding = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, input_length=sequence_length)(inputs)
reshape = Reshape((sequence_length,embedding_dim,1))(embedding)

In [14]:
conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(sequence_length - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(sequence_length - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(sequence_length - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=6, activation='softmax')(dropout)

In [15]:
# this creates a model that includes
model = Model(inputs=inputs, outputs=output)

In [16]:
checkpoint = ModelCheckpoint('weights.{epoch:03d}-{val_acc:.4f}.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1000)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1000, 256)    9347328     input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 1000, 256, 1) 0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 998, 1, 512)  393728      reshape_1[0][0]                  
__________________________________________________________________________________________________
conv2d_2 (

In [18]:
## Model Training...

early_stopping = EarlyStopping(monitor = "val_loss", patience = 10)
history_1 = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, 
                     callbacks=[checkpoint, 
                                early_stopping, 
                                TensorBoard(log_dir = "/tmp/logs/history_1")], 
                     validation_data=(X_test, y_test))

Train on 292 samples, validate on 73 samples
Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.83333, saving model to weights.001-0.8333.hdf5
Epoch 2/50

Epoch 00002: val_acc did not improve
Epoch 3/50

Epoch 00003: val_acc did not improve
Epoch 4/50

Epoch 00004: val_acc did not improve
Epoch 5/50

Epoch 00005: val_acc did not improve
Epoch 6/50

Epoch 00006: val_acc did not improve
Epoch 7/50

Epoch 00007: val_acc did not improve
Epoch 8/50

Epoch 00008: val_acc improved from 0.83333 to 0.83562, saving model to weights.008-0.8356.hdf5
Epoch 9/50

Epoch 00009: val_acc improved from 0.83562 to 0.84247, saving model to weights.009-0.8425.hdf5
Epoch 10/50

Epoch 00010: val_acc did not improve
Epoch 11/50

Epoch 00011: val_acc improved from 0.84247 to 0.84475, saving model to weights.011-0.8447.hdf5
Epoch 12/50

Epoch 00012: val_acc improved from 0.84475 to 0.84932, saving model to weights.012-0.8493.hdf5
Epoch 13/50

Epoch 00013: val_acc did not improve
Epoch 14/50

Epoch 00014: v


Epoch 00044: val_acc did not improve
Epoch 45/50

Epoch 00045: val_acc did not improve
Epoch 46/50

Epoch 00046: val_acc did not improve
Epoch 47/50

Epoch 00047: val_acc did not improve
Epoch 48/50

Epoch 00048: val_acc did not improve
Epoch 49/50

Epoch 00049: val_acc did not improve
Epoch 50/50

Epoch 00050: val_acc did not improve


In [19]:
score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 0.34068038733038186
Test accuracy: 0.8835616568996482


<img src = "./history_1_graph.PNG">

<img src = "./history_1_scala.PNG">