In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 3.3.5 순환 신경망으로 IMDB 감정 분류

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense 

from sklearn.model_selection import train_test_split

In [2]:
from tensorflow.keras import datasets

(x_train, y_train), (x_test, y_test) = datasets.imdb.load_data(num_words=25000)

print('\ntrain dataset :', x_train.shape, y_train.shape)
print('test dataset :', x_test.shape, y_test.shape)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz

train dataset : (25000,) (25000,)
test dataset : (25000,) (25000,)


In [3]:
print(x_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]


In [4]:
len(x_train[0]), len(x_train[1])

(218, 189)

In [5]:
word_index = datasets.imdb.get_word_index()
word_index

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


{'fawn': 34701,
 'tsukino': 52006,
 'nunnery': 52007,
 'sonja': 16816,
 'vani': 63951,
 'woods': 1408,
 'spiders': 16115,
 'hanging': 2345,
 'woody': 2289,
 'trawling': 52008,
 "hold's": 52009,
 'comically': 11307,
 'localized': 40830,
 'disobeying': 30568,
 "'royale": 52010,
 "harpo's": 40831,
 'canet': 52011,
 'aileen': 19313,
 'acurately': 52012,
 "diplomat's": 52013,
 'rickman': 25242,
 'arranged': 6746,
 'rumbustious': 52014,
 'familiarness': 52015,
 "spider'": 52016,
 'hahahah': 68804,
 "wood'": 52017,
 'transvestism': 40833,
 "hangin'": 34702,
 'bringing': 2338,
 'seamier': 40834,
 'wooded': 34703,
 'bravora': 52018,
 'grueling': 16817,
 'wooden': 1636,
 'wednesday': 16818,
 "'prix": 52019,
 'altagracia': 34704,
 'circuitry': 52020,
 'crotch': 11585,
 'busybody': 57766,
 "tart'n'tangy": 52021,
 'burgade': 14129,
 'thrace': 52023,
 "tom's": 11038,
 'snuggles': 52025,
 'francesco': 29114,
 'complainers': 52027,
 'templarios': 52125,
 '272': 40835,
 '273': 52028,
 'zaniacs': 52130,

In [6]:
import plotly.express as px

review_len = [len(review) for review in x_train]

fig = px.histogram(review_len)
fig.show()

In [7]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

print( len(x_train[0]), len(x_train[1]) )

x_train = pad_sequences(sequences=x_train, maxlen=256)
x_test = pad_sequences(sequences=x_test, maxlen=256)

print( len(x_train[0]), len(x_train[1]) )

218 189
256 256


In [8]:
print(x_train[0])

[    0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     1    14    22    16    43   530   973  1622  1385    65
   458  4468    66  3941     4   173    36   256     5    25   100    43
   838   112    50   670 22665     9    35   480   284     5   150     4
   172   112   167 21631   336   385    39     4   172  4536  1111    17
   546    38    13   447     4   192    50    16     6   147  2025    19
    14    22     4  1920  4613   469     4    22    71    87    12    16
    43   530    38    76    15    13  1247     4    22    17   515    17
    12    16   626    18 19193     5    62   386    12     8   316     8
   106     5     4  2223  5244    16   480    66  3785    33     4   130
    12    16    38   619     5    25   124    51    36   135    48    25
  1415    33     6    22    12   215    28    77   

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense 

num_words = 25000

model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=16))
model.add(LSTM(32, return_sequences=True))
model.add(LSTM(32))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          400000    
                                                                 
 lstm (LSTM)                 (None, None, 32)          6272      
                                                                 
 lstm_1 (LSTM)               (None, 32)                8320      
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 415,137
Trainable params: 415,137
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [11]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2)

print('train dataset :', x_train.shape, y_train.shape)
print('validation dataset :', x_val.shape, y_val.shape)

train dataset : (20000, 256) (20000,)
validation dataset : (5000, 256) (5000,)


In [12]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

early_stopping = EarlyStopping(patience = 10)
checkpoint_model = ModelCheckpoint('ModelCheckpoint.h5')

In [13]:
history = model.fit(x_train, y_train,
                    validation_data=(x_val, y_val),
                    epochs=100,
                    batch_size=256,
                    callbacks=[early_stopping, checkpoint_model])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


In [14]:
model.evaluate(x_test,  y_test)



[0.8674032688140869, 0.8348399996757507]

In [15]:
history_dict = history.history
history_dict.keys()

dict_keys(['loss', 'accuracy', 'val_loss', 'val_accuracy'])

In [17]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scattergl(y=history.history['accuracy'], name='Train'))
fig.add_trace(go.Scattergl(y=history.history['val_accuracy'], name='Valid'))
fig.update_layout(height=500, width=700, title='Accuracy', xaxis_title='Epoch', yaxis_title='Accuracy')
fig.show()

In [18]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scattergl(y=history.history['loss'],name='Train'))
fig.add_trace(go.Scattergl(y=history.history['val_loss'],name='Valid'))
fig.update_layout(height=500, width=700,title='Loss',xaxis_title='Epoch',yaxis_title='Loss')
fig.show()