# IMDB Data Analysis Based on RNN

In [2]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline
%env KERAS_BACKEND = tensorflow

env: KERAS_BACKEND=tensorflow


In [3]:
from keras.datasets import imdb
from keras.utils import np_utils
from keras.models import Sequential
from keras.optimizers import SGD
from keras.layers import Dense, LSTM, Embedding, Dropout
from keras.constraints import max_norm
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


### 把文字的種類限制到 10000 字

In [4]:
(x_train,y_train),(x_test,y_test) = imdb.load_data(num_words=10000)

### 看一下 training data 和 testing data 的 shape

In [5]:
print(f'shape of training data: {x_train.shape}; shape of testing data: {x_test.shape}')

shape of training data: (25000,); shape of testing data: (25000,)


### 把 training 和 testing input 的字數都刪減或增補到 300 字

In [6]:
x_train_seq = sequence.pad_sequences(x_train, maxlen = 300)
x_test_seq = sequence.pad_sequences(x_test, maxlen = 300)

### 看一下被動過手腳之後的 input shape

In [7]:
print(f'shape of training data: {x_train_seq.shape}; shape of testing data: {x_test_seq.shape}')

shape of training data: (25000, 300); shape of testing data: (25000, 300)


## Model 1<sup>st</sup>
### 1 層 Embedding + 1 層 LSTM + 1 層  Dense 的神經網路
之前試過好多參數，測試資料準確率都不超過 86% ，參考學弟致元的做法及建議，改用 nadam 當 optimizer， N 跟 K 從 (8,5) 改大到 (320,16) ，測試資料的準確率好不容易來到 86.80 % 。

In [8]:
N = 320# 把10000壓到 N 維
K = 16 # LSTM 有 K 個神經元

model_1st = Sequential()
model_1st.add(Embedding(10000,N))
model_1st.add(LSTM(K))
model_1st.add(Dense(1,activation='sigmoid'))
model_1st.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
model_1st.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 320)         3200000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                21568     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 3,221,585
Trainable params: 3,221,585
Non-trainable params: 0
_________________________________________________________________


就如各位同學所指出的，這次的訓練資料很容易產生 overfitting，所以在 fit 的參數裡面設了 early stopping 的 callback, 採用 testing data 做 validation_data, 讓結果能收在 testing data 準確率最高的權重 (有開 restore_best_weights= True)。

In [11]:
#model_1st.load_weights('./RNN_IMDB_Machines/model_1st_weights.h5')

model_1st_early = EarlyStopping(monitor='val_acc', min_delta=0, patience=0, verbose=0, mode='max', restore_best_weights=True)
model_1st_history = model_1st.fit(x_train_seq,y_train, batch_size=256, epochs=5, validation_data=(x_test_seq, y_test), callbacks = [model_1st_early])

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


其實可以不用做這步，因為上面訓練的時候validation_data 已經餵 testing data 進去了，直接看 valid_acc 就好~

In [12]:
model_1st_score = model_1st.evaluate(x_test_seq,y_test)
model_1st_score



[0.3323085970878601, 0.86804]

In [79]:
print(f'The arrucay for 1st model is {100*model_1st_score[1]:.2f}% ; loss is {model_1st_score[0]:.4f} .' )

The arrucay for 1st model is 86.80% ; loss is 0.3323 .


In [15]:
#model_1st_json = model_1st.to_json()
#open('./RNN_IMDB_Machines/model_1st','w').write(model_1st_json)
#model_1st.save_weights('./RNN_IMDB_Machines/model_1st_weights.h5')

## Model 2 <sup>nd</sup>
### 用 N=8 及 LSTM Cell = 5，加上 Dropout 以及權重限制 Constraint，設計新的 RNN 結構: 1 層 Embedding + Dropout +1 層 LSTM + Dropout + 1 層  Dense
對 LSTM Cell ，能對 kernel, recurrent, 以及 bias 做限制，值在3.0以下; 對 Dense 層，限制 kernel, bias 在2.0 以下。 Dropout 一開始採建議的 0.2-0.5 之間，發現效果沒很好，所以開始亂試畸形的數值，最後發現第一層 Dropout 用低一點效果不錯。用 Dropout 以及 Constraint 減緩 overfitting 後，測試資料準確率能達到 86.1 %

In [9]:
N = 8 # 把10000壓到 N 維
K = 5 # LSTM 有 K 個神經元

model_2nd = Sequential()
model_2nd.add(Embedding(10000,N))
model_2nd.add(Dropout(0.75))
model_2nd.add(LSTM(K, kernel_constraint=max_norm(3.0),recurrent_constraint=max_norm(3.0), bias_constraint=max_norm(3.0)))
model_2nd.add(Dropout(0.2))
model_2nd.add(Dense(1,activation='sigmoid',kernel_constraint=max_norm(2.0),bias_constraint=max_norm(2.0) ))
model_2nd.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
model_2nd.summary()

Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 8)           80000     
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 8)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 5)                 280       
_________________________________________________________________
dropout_2 (Dropout)          (None, 5)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 6         
Total params: 80,286
Trainable params: 80,286
Non-trainable params: 0
_________________________________________________________________


In [64]:
model_2nd_early = EarlyStopping(monitor='val_acc', min_delta=0, patience=0, verbose=0, mode='max', restore_best_weights=True)
model_2nd_history = model_2nd.fit(x_train_seq,y_train, batch_size=256, epochs=5,validation_data=(x_test_seq, y_test), callbacks = [model_2nd_early] )

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [65]:
model_2nd_score = model_2nd.evaluate(x_test_seq,y_test)
model_2nd_score



[0.3701012341213226, 0.86108]

In [87]:
#model_2nd_json = model_2nd.to_json()
#open('./RNN_IMDB_Machines/model_2nd','w').write(model_2nd_json)
#model_2nd.save_weights('./RNN_IMDB_Machines/model_2nd_weights.h5')

In [85]:
print(f'The arrucay for 2nd model is {100*model_2nd_score[1]:.2f}% ; loss is {model_2nd_score[0]:.4f} .' )

The arrucay for 2nd model is 86.11% ; loss is 0.3701 .


## Model 3<sup>rd</sup>: 最後輸出 one-hot encode 的 output ，所以最後 Dense 層有兩個神經元

想說因為結果是想要區別評論是正面或負面，有點像分類，所以把 y_train 跟 y_test one-hot encoding 後不知道隊訓練有沒有幫助，所以試做這個模型。<br><br>
先前在測試的時候用同樣的 RNN 結構，可是 maxlen = 150， Dropout 的參數略為不同(但同樣是第一層 0.7-0.75，第二層在 0.2 以下, optimizer 用 adam ，權重的 constraint 忘記了，不過都是在 3.0 以下 XD)，試出一組本來測試資料準確率大約 86.3%左右的神經網路。用這個網路的權重當基礎，經過一次把第一層 Dropout 設成 0.75 ，第二層設成 0.25 ，optimizer 改 nadam ，加上 early stopping 的訓練，測試資料準確率約來到 88.1%; 再用第一層 Dropout=0.7，第二層 Dropout= 0.2， optimizer 用 nadam 再訓練一次，準確率略為上升到 88.4% 。

In [80]:
y_train_one_hot = np_utils.to_categorical(y_train,2)
y_test_one_hot = np_utils.to_categorical(y_test,2)

In [81]:
N = 8 # 把10000壓到 N 維
K = 5 # LSTM 有 K 個神經元

model_3rd = Sequential()
model_3rd.add(Embedding(10000,N))
model_3rd.add(Dropout(0.7))
model_3rd.add(LSTM(K, kernel_constraint=max_norm(3.0),recurrent_constraint=max_norm(3.0), bias_constraint=max_norm(3.0)))
model_3rd.add(Dropout(0.2))
model_3rd.add(Dense(2,activation='sigmoid',kernel_constraint=max_norm(2.0),bias_constraint=max_norm(2.0)))
model_3rd.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
model_3rd.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, None, 8)           80000     
_________________________________________________________________
dropout_33 (Dropout)         (None, None, 8)           0         
_________________________________________________________________
lstm_20 (LSTM)               (None, 5)                 280       
_________________________________________________________________
dropout_34 (Dropout)         (None, 5)                 0         
_________________________________________________________________
dense_20 (Dense)             (None, 2)                 12        
Total params: 80,292
Trainable params: 80,292
Non-trainable params: 0
_________________________________________________________________


In [67]:
model_3rd.load_weights('./RNN_IMDB_Machines/model_3rd_weights.h5')

model_3rd_early = EarlyStopping(monitor='val_acc', min_delta=0, patience=0, verbose=0, mode='max', restore_best_weights=True)
model_3rd_history = model_3rd.fit(x_train_seq,y_train_one_hot, batch_size=256, epochs=10,validation_data=(x_test_seq, y_test_one_hot), callbacks = [model_3rd_early])

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10


In [68]:
model_3rd_score = model_3rd.evaluate(x_test_seq,y_test_one_hot)
model_3rd_score



[0.3118935162830353, 0.88426]

In [69]:
#model_3rd_json = model_3rd.to_json()
#open('./RNN_IMDB_Machines/model_3rd','w').write(model_3rd_json)
#model_3rd.save_weights('./RNN_IMDB_Machines/model_3rd_weights.h5')

In [86]:
print(f'The arrucay for 3rd model is {100*model_3rd_score[1]:.2f}% ; loss is {model_3rd_score[0]:.4f} .' )

The arrucay for 3rd model is 88.43% ; loss is 0.3119 .
