# IMDB 영화 리뷰 감성분석 - Conv1D

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import imdb
import warnings
warnings.filterwarnings('ignore')

### Conv1D로 IMDB 리뷰 감성 분석
- 사용 단어 수 : 빈도수 상위 10,000개 (전체 88,584개)
- 리뷰당 단어 수 : 500개로 맞춤 (최대 2,494개)
- test data 중 40%(10,000개)는 검증용으로 분리

In [3]:
import tensorflow as tf

# Seed TensorFlow's and NumPy's global random generators with the same value
# so that repeated runs of the notebook start from the same random state.
seed = 2022
tf.random.set_seed(seed)
np.random.seed(seed)

In [4]:
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Dense, Dropout
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [5]:
# Load the IMDB reviews, limiting the vocabulary to 10,000 words via `num_words`.
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10_000)

# Sanity check: shape of each split right after loading.
X_train.shape, X_test.shape, y_train.shape

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


((25000,), (25000,), (25000,))

In [6]:
# Bring every review to exactly `max_len` tokens. The library defaults are
# spelled out: padding and truncation both happen at the front of the sequence.
max_len = 500
X_train = pad_sequences(X_train, maxlen=max_len, padding='pre', truncating='pre')
X_test = pad_sequences(X_test, maxlen=max_len, padding='pre', truncating='pre')

# Both arrays should now have a second dimension of 500.
X_train.shape, X_test.shape

((25000, 500), (25000, 500))

In [7]:
from sklearn.model_selection import train_test_split

# Carve 40% of the test set off as a validation set, stratified on the labels.
# NOTE: X_test / y_test are overwritten here, so re-running this cell on its
# own splits the already-reduced test set a second time.
X_test, X_valid, y_test, y_valid = train_test_split(
    X_test,
    y_test,
    test_size=0.4,
    random_state=seed,
    stratify=y_test,
)

X_test.shape, X_valid.shape, y_test.shape, y_valid.shape

((15000, 500), (10000, 500), (15000,), (10000,))

- Case 1) Conv1D × 2, MaxPooling1D × 2, Dropout, GlobalMaxPooling1D
    - embedding dim : 100
    
    

In [8]:
# Case 1: two Conv1D + MaxPooling1D stages followed by a global max pool
# and a single sigmoid unit for the binary sentiment label.
model1 = Sequential([
    Embedding(input_dim=10000, output_dim=100, input_length=max_len),
    Dropout(rate=0.5),
    Conv1D(filters=64, kernel_size=7, activation='relu'),
    MaxPooling1D(pool_size=7),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=5),
    GlobalMaxPooling1D(),
    Dense(units=1, activation='sigmoid'),
])
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 500, 100)          1000000   
                                                                 
 dropout (Dropout)           (None, 500, 100)          0         
                                                                 
 conv1d (Conv1D)             (None, 494, 64)           44864     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 70, 64)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 66, 64)            20544     
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 13, 64)           0         
 1D)                                                    

In [9]:
# Keyword arguments are used on purpose: the positional slot after `loss`
# is not the same parameter in every Keras release, so passing the metrics
# list positionally is fragile.
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_path = 'best-imdb-conv1d.h5'
# Write the model to disk only on epochs where the validation loss improves.
mc = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, verbose=1)
# Stop training after 5 consecutive epochs without a validation-loss improvement.
es = EarlyStopping(monitor='val_loss', patience=5)

In [10]:
# Train case 1. The validation split drives both callbacks (checkpointing and
# early stopping). validation_data is passed as a tuple, which is the form
# documented for Model.fit(); a list is not handled uniformly across releases.
hist1 = model1.fit(
    X_train, y_train,
    epochs=30,
    batch_size=128,
    validation_data=(X_valid, y_valid),
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.35699, saving model to best-imdb-conv1d.h5
Epoch 2/30
Epoch 00002: val_loss improved from 0.35699 to 0.32018, saving model to best-imdb-conv1d.h5
Epoch 3/30
Epoch 00003: val_loss did not improve from 0.32018
Epoch 4/30
Epoch 00004: val_loss did not improve from 0.32018
Epoch 5/30
Epoch 00005: val_loss did not improve from 0.32018
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.32018
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.32018


In [11]:
# Reload the checkpoint saved at `model_path` and score it on the test split.
# The cell output is the list returned by evaluate(): loss, then accuracy.
best_model = load_model(filepath=model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.316537082195282, 0.8658666610717773]

- Case 2) Conv1D + LSTM

In [12]:
from tensorflow.keras.layers import  LSTM

In [13]:
# Case 2: one Conv1D + MaxPooling1D stage shortens the sequence before it is
# fed to an LSTM; a single sigmoid unit produces the sentiment probability.
model2 = Sequential([
    Embedding(input_dim=10000, output_dim=100, input_length=max_len),
    Dropout(rate=0.5),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(units=100),
    Dense(units=1, activation='sigmoid'),
])
model2.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 500, 100)          1000000   
                                                                 
 dropout_1 (Dropout)         (None, 500, 100)          0         
                                                                 
 conv1d_2 (Conv1D)           (None, 496, 64)           32064     
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 124, 64)          0         
 1D)                                                             
                                                                 
 lstm (LSTM)                 (None, 100)               66000     
                                                                 
 dense_1 (Dense)             (None, 1)                 101       
                                                      

In [14]:
# Keyword arguments are used on purpose: the positional slot after `loss`
# is not the same parameter in every Keras release, so passing the metrics
# list positionally is fragile.
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_path = 'best-imdb-conv1d-lstm.h5'
# Write the model to disk only on epochs where the validation loss improves.
mc = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, verbose=1)
# Stop training after 5 consecutive epochs without a validation-loss improvement.
es = EarlyStopping(monitor='val_loss', patience=5)

In [15]:
# Train case 2. The validation split drives both callbacks (checkpointing and
# early stopping). validation_data is passed as a tuple, which is the form
# documented for Model.fit(); a list is not handled uniformly across releases.
hist2 = model2.fit(
    X_train, y_train,
    epochs=30,
    batch_size=128,
    validation_data=(X_valid, y_valid),
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.30066, saving model to best-imdb-conv1d-lstm.h5
Epoch 2/30
Epoch 00002: val_loss improved from 0.30066 to 0.28842, saving model to best-imdb-conv1d-lstm.h5
Epoch 3/30
Epoch 00003: val_loss did not improve from 0.28842
Epoch 4/30
Epoch 00004: val_loss did not improve from 0.28842
Epoch 5/30
Epoch 00005: val_loss did not improve from 0.28842
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.28842
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.28842


In [16]:
# Reload the checkpoint saved at `model_path` and score it on the test split.
# The cell output is the list returned by evaluate(): loss, then accuracy.
best_model = load_model(filepath=model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.28742682933807373, 0.8805999755859375]

- Case 3) Conv1D + GlobalMaxPooling1D + Dense

In [17]:
# Case 3: one Conv1D + MaxPooling1D stage, a global max pool, then a small
# fully connected head (100 ReLU units) before the sigmoid output.
model3 = Sequential([
    Embedding(input_dim=10000, output_dim=100, input_length=max_len),
    Dropout(rate=0.5),
    Conv1D(filters=64, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),
    GlobalMaxPooling1D(),
    Dense(units=100, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])
model3.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 500, 100)          1000000   
                                                                 
 dropout_2 (Dropout)         (None, 500, 100)          0         
                                                                 
 conv1d_3 (Conv1D)           (None, 496, 64)           32064     
                                                                 
 max_pooling1d_3 (MaxPooling  (None, 124, 64)          0         
 1D)                                                             
                                                                 
 global_max_pooling1d_1 (Glo  (None, 64)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 100)              

In [18]:
# Keyword arguments are used on purpose: the positional slot after `loss`
# is not the same parameter in every Keras release, so passing the metrics
# list positionally is fragile.
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_path = 'best-imdb-conv1d-fcn.h5'
# Write the model to disk only on epochs where the validation loss improves.
mc = ModelCheckpoint(model_path, monitor='val_loss', save_best_only=True, verbose=1)
# Stop training after 5 consecutive epochs without a validation-loss improvement.
es = EarlyStopping(monitor='val_loss', patience=5)

In [19]:
# Train case 3. The validation split drives both callbacks (checkpointing and
# early stopping). validation_data is passed as a tuple, which is the form
# documented for Model.fit(); a list is not handled uniformly across releases.
hist3 = model3.fit(
    X_train, y_train,
    epochs=30,
    batch_size=128,
    validation_data=(X_valid, y_valid),
    callbacks=[mc, es],
)

Epoch 1/30
Epoch 00001: val_loss improved from inf to 0.35787, saving model to best-imdb-conv1d-fcn.h5
Epoch 2/30
Epoch 00002: val_loss improved from 0.35787 to 0.27438, saving model to best-imdb-conv1d-fcn.h5
Epoch 3/30
Epoch 00003: val_loss improved from 0.27438 to 0.26261, saving model to best-imdb-conv1d-fcn.h5
Epoch 4/30
Epoch 00004: val_loss did not improve from 0.26261
Epoch 5/30
Epoch 00005: val_loss did not improve from 0.26261
Epoch 6/30
Epoch 00006: val_loss did not improve from 0.26261
Epoch 7/30
Epoch 00007: val_loss did not improve from 0.26261
Epoch 8/30
Epoch 00008: val_loss did not improve from 0.26261


In [20]:
# Reload the checkpoint saved at `model_path` and score it on the test split.
# The cell output is the list returned by evaluate(): loss, then accuracy.
best_model = load_model(filepath=model_path)
best_model.evaluate(x=X_test, y=y_test)



[0.2606480121612549, 0.8919333219528198]