# CNN으로 IMDB 분류하기

Conv1D(1차원 합성곱) 층을 이용하여 IMDB 영화 리뷰 텍스트를 이진 분류한다.

원본 - https://github.com/fchollet/keras/blob/master/examples/imdb_cnn.py

In [1]:
from __future__ import print_function

In [3]:
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb

In [4]:
# --- Input geometry -------------------------------------------------------
# Vocabulary cap: passed as num_words to imdb.load_data and used as the
# Embedding input size.
max_features = 5000
# Every review is padded to this many time steps before entering the model.
maxlen = 400

# --- Network dimensions ---------------------------------------------------
embedding_dims = 50   # size of each word vector produced by Embedding
filters = 250         # number of Conv1D filters
kernel_size = 3       # width of each Conv1D filter
hidden_dims = 250     # units in the hidden Dense layer

# --- Training schedule ----------------------------------------------------
batch_size = 32
epochs = 2

# 데이터 로드하기

In [5]:
print("데이터를 로드하고 있습니다...")
# load_data returns a (train, test) pair, each of which is an (inputs, labels)
# pair; num_words caps the vocabulary at max_features.
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split
# Report how many reviews each split contains.
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

데이터를 로드하고 있습니다...
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
25000 test sequences


# 데이터 축소해 주기 (내가 추가)

In [18]:
# Shrink both splits to their first N samples so the example trains quickly.
num_train = 5000
num_test = 5000

# Plain slices take the first N samples without materialising an index list
# (the earlier version built list(range(N)) and used fancy indexing). A slice
# also clamps, rather than raising, if N exceeds the number of samples.
#
# NOTE(review): padding to `maxlen` happens in a later step, so at this point
# the x_* arrays presumably still hold variable-length sequences and are 1-D,
# i.e. (num_train,) / (num_test,) rather than (N, maxlen) -- confirm by
# running the notebook top to bottom on a fresh kernel.
x_train = x_train[:num_train]
y_train = y_train[:num_train]

x_test = x_test[:num_test]
y_test = y_test[:num_test]

In [19]:
# Sanity check: print the shape of every array, train split first, inputs
# before labels.
for split_array in (x_train, y_train, x_test, y_test):
    print(split_array.shape)

(5000, 400)
(5000,)
(5000, 400)
(5000,)


# 시퀀스 패딩하기

In [20]:
print("시퀀스를 패딩하고 있습니다 (samples x time)")
# Run both splits through pad_sequences with the same target length so that
# train and test end up with identical (samples, maxlen) layouts.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=maxlen)
    for split in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

시퀀스를 패딩하고 있습니다 (samples x time)
x_train shape: (5000, 400)
x_test shape: (5000, 400)


# 모델 구축하기

In [21]:
print("모델을 구축합니다...")
# Start from an empty Sequential container; the layers are appended one by
# one with model.add() in the cells that follow.
model = Sequential()

모델을 구축합니다...


In [22]:
# Input stage: look up an `embedding_dims`-sized vector for each of the
# `maxlen` word indices in a review (indices range over `max_features`).
model.add(
    Embedding(input_dim=max_features,
              output_dim=embedding_dims,
              input_length=maxlen)
)
# Regularise the embeddings with a dropout rate of 0.2.
model.add(Dropout(0.2))

In [23]:
# Convolution stage: `filters` kernels of width `kernel_size` slide over the
# embedded sequence one step at a time, without padding ('valid'), followed
# by a ReLU non-linearity.
model.add(
    Conv1D(filters=filters,
           kernel_size=kernel_size,
           strides=1,
           padding='valid',
           activation='relu')
)

In [24]:
# Global max pooling over the sequence axis: keeps one value per convolution
# filter, so the following Dense layer receives a fixed-size vector.
model.add(GlobalMaxPooling1D())

In [25]:
# Hidden block: fully connected layer, dropout (rate 0.2), then ReLU.
# The layers are created and appended in exactly this order.
hidden_stack = (
    Dense(hidden_dims),
    Dropout(0.2),
    Activation('relu'),
)
for hidden_layer in hidden_stack:
    model.add(hidden_layer)

In [26]:
# Output block: a single unit squashed by a sigmoid, giving one value in
# (0, 1) per review for the binary label.
model.add(Dense(1))
model.add(Activation('sigmoid'))

# 모델 컴파일하기

In [27]:
# Binary cross-entropy matches the single sigmoid output unit; accuracy is
# tracked as an additional metric alongside the loss.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# 모델 학습시키기

In [31]:
# Train for `epochs` passes over the training subset in mini-batches of
# `batch_size`.
# NOTE: the test split is reused as validation data here, so the per-epoch
# validation metrics are computed on the same samples as the final test score.
# The call is left as a bare expression so the notebook echoes the returned
# History object.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))

Train on 5000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x11e2c4f50>

# 스코어 구해주기

In [32]:
# Evaluate on the test subset. The result is indexed below as
# score[0] (loss) and score[1] (accuracy); verbose=1 shows a progress bar.
score = model.evaluate(x_test, y_test, verbose=1)



In [33]:
# Pull the two evaluation results out by position before reporting them:
# index 0 is the loss, index 1 is the accuracy metric.
test_loss = score[0]
test_accuracy = score[1]
print("Test loss:", test_loss)
print("Test accuracy", test_accuracy)

Test loss: 0.407824087286
Test accuracy 0.8434
