# 利用RNN模型進行imdb評價分析(正評或負評)

## 1. 載入keras等套件

In [150]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Select TensorFlow as the Keras backend; this is set before keras is imported below.
%env KERAS_BACKEND = tensorflow

# NOTE(review): plt and np are not referenced by any cell visible in this notebook.
import matplotlib.pyplot as plt
import numpy as np
# Loader for the IMDB movie-review sentiment dataset that ships with Keras.
from keras.datasets import imdb

env: KERAS_BACKEND=tensorflow


## 2. 從imdb資料庫讀入資料，並確認資料數

In [151]:
# Load the IMDB reviews as (inputs, labels) pairs for the train and test splits.
# num_words=10000 restricts the vocabulary to the 10,000 most frequent words,
# which matches the vocabulary size given to the Embedding layer later on.
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    num_words=10000,
)

In [152]:
# Report how many reviews each split contains (training count, then test count).
for label, split in (("訓練數: ", x_train), ("測試數: ", x_test)):
    print(label, len(split))

訓練數:  25000
測試數:  25000


## 3. 設定輸入資料的長度(固定為300字，太短補0，太長則截斷)

In [153]:
from keras.preprocessing import sequence

In [154]:
# Force every review to exactly 300 tokens so the data forms a rectangular
# array: shorter reviews are zero-padded and longer ones are truncated
# (pad_sequences does both at the front of the sequence by default).
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=300) for split in (x_train, x_test)
)

In [155]:
# Sanity check: after padding, the training inputs should be (num_reviews, 300).
x_train.shape

(25000, 300)

## 4. 建立sequential model

In [156]:
# Layer types and the Sequential container used to assemble the network.
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.models import Sequential

# Start from an empty linear stack; the following cells append layers to it.
model = Sequential()

## 5. 利用Embedding層將文字(相當於10000維的one-hot encoding)壓縮成500維的向量

In [157]:
# Map each word index (vocabulary of 10,000) to a dense 500-dimensional vector,
# i.e. compress the word representation down to 500 dimensions.
# NOTE(review): re-running this cell appends another Embedding layer to `model`;
# re-create the model first when iterating.
embedding_layer = Embedding(input_dim=10000, output_dim=500)
model.add(embedding_layer)

## 6. 添加網路層數並設定Activation function
* 第一層：  
**LSTM**  
Units(神經元數) = 25  
Dropout = 0.25(套用在LSTM的輸入，即Embedding的輸出)
    
      
* 輸出層：  
**Fully-connected NN**  
Units(神經元數) = 1  
Activation function = sigmoid  
Dropout = 0.25(套用在輸出層的輸入，即LSTM的輸出)

In [158]:
# Remaining layers, appended in order:
#   1. Dropout(0.25) on the embedding output
#   2. LSTM with 25 units
#   3. Dropout(0.25) on the LSTM output
#   4. A single sigmoid unit for the binary positive/negative prediction
for layer in (
    Dropout(0.25),
    LSTM(25),
    Dropout(0.25),
    Dense(1, activation="sigmoid"),
):
    model.add(layer)

## 7. 定義loss function與optimizer(learning rate使用Adam的預設值)並進行compile
* **Loss function:**  
Binary crossentropy   
  
    
* **Optimizer:**   
Adam    

In [159]:
# Binary cross-entropy suits the two-class (positive/negative) target; the
# optimizer is Adam with its default settings, and accuracy is tracked as a metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [160]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, None, 500)         5000000   
_________________________________________________________________
dropout_29 (Dropout)         (None, None, 500)         0         
_________________________________________________________________
lstm_16 (LSTM)               (None, 25)                52600     
_________________________________________________________________
dropout_30 (Dropout)         (None, 25)                0         
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 26        
Total params: 5,052,626
Trainable params: 5,052,626
Non-trainable params: 0
_________________________________________________________________


## 8. 對training data進行fitting
每50筆資料(batch size = 50)更新一次參數，對所有訓練資料進行2個epoch的訓練(epoch數設得較少以避免overfitting)

In [161]:
# Train for 2 epochs with mini-batches of 50 reviews; the return value of fit()
# is kept in `model_output`.
# NOTE(review): the test split is also passed as validation data, so the score
# reported by the later evaluate cell comes from data already inspected during
# training runs. Consider holding out part of the training set instead
# (e.g. via validation_split) and keeping the test split for the final check.
model_output = model.fit(
    x_train,
    y_train,
    batch_size=50,
    epochs=2,
    verbose=1,
    validation_data=(x_test, y_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


## 9. 利用testing data評估結果(accuracy > 87%)

In [162]:
# Evaluate on the test split; given the compile settings in this notebook the
# result holds the loss first, followed by the accuracy metric.
score = model.evaluate(x=x_test, y=y_test)



In [163]:
# Show the two evaluation numbers: score[0] is the loss value (printed under the
# label "loss rate") and score[1] is the accuracy.
for label, value in zip(("loss rate", "accuracy"), score):
    print(label, value)

loss rate 0.2951697346448898
accuracy 0.87712


## 10. 結論：多次嘗試後，能達到87%的準確率