<a href="https://colab.research.google.com/github/jumbokh/nknu-class/blob/main/NLP/notebooks/12_01_RNN_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 簡單的RNN實作

## 程式參考來源：
- https://keras.io/api/layers/core_layers/embedding/
- https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
- https://keras.io/guides/working_with_rnns/


## 載入相關套件

In [1]:
# 載入相關套件
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## 嵌入層測試

In [2]:
# 建立模型
model = tf.keras.Sequential()

# 模型只含嵌入層(Embedding layer)
# 字彙表最大為1000，輸出維度為 64，輸入的字數為 10
model.add(layers.Embedding(input_dim=1000, output_dim=64))

# 產生亂數資料，32筆資料，每筆 10 個數字
input_array = np.random.randint(1000, size=(32, 10))

# 指定優化器、損失函數
model.compile('rmsprop', 'mse')

# 預測
output_array = model.predict(input_array)
print(output_array.shape)
output_array[0]

(32, 10, 64)


array([[-8.45818594e-03, -6.08902052e-03, -2.15960275e-02,
        -1.11558810e-02,  4.02867794e-03, -4.61404920e-02,
        -4.74290736e-02,  1.31964944e-02,  3.40586640e-02,
        -1.98386665e-02,  3.81723531e-02, -1.50691047e-02,
        -5.99787384e-03,  2.17188038e-02,  1.06513873e-02,
         4.89194877e-02, -4.20936346e-02, -1.47929303e-02,
         4.41478975e-02,  3.39065827e-02, -4.84524369e-02,
        -5.65917417e-03,  1.89935304e-02, -4.20943871e-02,
        -2.20180396e-02, -3.98682952e-02,  2.65881456e-02,
        -1.55056603e-02, -9.45740938e-03,  2.38053538e-02,
        -1.26759410e-02, -1.56820901e-02, -3.82947922e-02,
         2.46201046e-02, -2.42404230e-02,  1.59515031e-02,
         9.06696171e-03, -3.45064774e-02, -3.19270045e-02,
         8.20305198e-03,  1.36314072e-02, -1.52950361e-03,
         2.52687596e-02, -2.30603106e-02, -4.82298620e-02,
        -4.01755422e-03,  1.25916637e-02,  2.52896287e-02,
         4.31804731e-03,  3.62548865e-02, -3.16821411e-0

## 使用真實的資料轉換

In [3]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 測試資料
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']

# 轉成 one-hot encoding
vocab_size = 50 # 字典最大字數
maxlen = 4      # 語句最大字數
encoded_docs = [one_hot(d, vocab_size) for d in docs]

# 轉成固定長度，長度不足則後面補空白
padded_docs = pad_sequences(encoded_docs, maxlen=maxlen, padding='post')

# 模型只有 Embedding
model = tf.keras.Sequential()
model.add(layers.Embedding(vocab_size, 64, input_length=maxlen))
model.compile('rmsprop', 'mse')

# 預測
output_array = model.predict(padded_docs)
output_array.shape

(10, 4, 64)

In [4]:
# one-hot encoding 轉換結果
print(encoded_docs[0])

# 補空白後的輸入維度
print(padded_docs.shape)

[7, 9]
(10, 4)


## 加上完全連接層(Dense)

In [5]:
# 定義 10 個語句的正面(1)或負面(0)的情緒
labels = np.array([1,1,1,1,1,0,0,0,0,0])

vocab_size = 50
maxlen = 4
encoded_docs = [one_hot(d, vocab_size) for d in docs]
padded_docs = pad_sequences(encoded_docs, maxlen=maxlen, padding='post')

model = tf.keras.Sequential()
model.add(layers.Embedding(vocab_size, 8, input_length=maxlen))
model.add(layers.Flatten())

# 加上完全連接層(Dense)
model.add(layers.Dense(1, activation='sigmoid'))

# 指定優化器、損失函數
model.compile(optimizer='adam', loss='binary_crossentropy', 
              metrics=['accuracy'])

print(model.summary())

# 模型訓練
model.fit(padded_docs, labels, epochs=50, verbose=0)

# 模型評估
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 89.999998


In [6]:
model.predict(padded_docs)

array([[0.5814699 ],
       [0.5583228 ],
       [0.5333313 ],
       [0.5721888 ],
       [0.5555229 ],
       [0.48940808],
       [0.46982658],
       [0.47996175],
       [0.528438  ],
       [0.41726732]], dtype=float32)

## 加上 RNN 神經層

In [7]:
model = tf.keras.Sequential()
model.add(layers.Embedding(vocab_size, 8, input_length=maxlen))

# 加上 RNN 神經層，輸出 128 個神經元
model.add(layers.SimpleRNN(128))

# 加上完全連接層(Dense)
model.add(layers.Dense(1, activation='sigmoid'))

# 指定優化器、損失函數
model.compile(optimizer='adam', loss='binary_crossentropy', 
              metrics=['accuracy'])

print(model.summary())
# 模型訓練
model.fit(padded_docs, labels, epochs=50, verbose=0)

# 模型評估
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 4, 8)              400       
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 128)               17536     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 18,065
Trainable params: 18,065
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 100.000000


In [10]:
y_pred = model.predict(padded_docs)

In [11]:
y_pred

array([[9.9998689e-01],
       [9.9999237e-01],
       [9.9997491e-01],
       [9.9983823e-01],
       [9.9992102e-01],
       [1.7617160e-05],
       [5.5901695e-05],
       [4.1447338e-06],
       [1.4474988e-04],
       [7.4259697e-05]], dtype=float32)

In [14]:
ypred = list(np.argmax(y_pred,axis=0))
ypred
#list(model.predict_classes(padded_docs).reshape(-1))

[1]

## 使用詞向量(Word2Vec)

## 讀取 GloVe 300維的詞向量，產生字典資料型變數，方便搜尋

In [15]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.6B.300d.txt', encoding='utf8') # https://www.kaggle.com/thanakomsn/glove6b300dtxt
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.array(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

## 分詞

In [16]:
# 分詞
from tensorflow.keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(docs)

vocab_size = len(t.word_index) + 1

# 轉為序列整數
encoded_docs = t.texts_to_sequences(docs)

# 補 0
padded_docs = pad_sequences(encoded_docs, maxlen=maxlen, padding='post')
padded_docs

array([[ 6,  2,  0,  0],
       [ 3,  1,  0,  0],
       [ 7,  4,  0,  0],
       [ 8,  1,  0,  0],
       [ 9,  0,  0,  0],
       [10,  0,  0,  0],
       [ 5,  4,  0,  0],
       [11,  3,  0,  0],
       [ 5,  1,  0,  0],
       [12, 13,  2, 14]], dtype=int32)

## 轉換為GloVe 300維的詞向量

In [17]:
# 轉換為 GloVe 300維的詞向量
# 初始化輸出
embedding_matrix = np.zeros((vocab_size, 300))

# 讀取詞向量值
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# 任取一筆觀察        
embedding_matrix[2]

array([ 0.19205999,  0.16459   ,  0.060122  ,  0.17696001, -0.27405   ,
        0.079646  , -0.25292999, -0.11763   ,  0.17614   , -1.97870004,
        0.10707   , -0.028088  ,  0.093991  ,  0.48135   , -0.037581  ,
        0.0059231 , -0.11118   , -0.099847  , -0.22189   ,  0.0062044 ,
        0.17721   ,  0.25786   ,  0.42120999, -0.13085   , -0.32839   ,
        0.39208999, -0.050214  , -0.46766999, -0.063107  , -0.0023065 ,
        0.21005   ,  0.26982   , -0.22652   , -0.42958999, -0.89682001,
        0.21932   , -0.0020377 ,  0.1358    , -0.12661999, -0.058927  ,
        0.0049502 , -0.28457999, -0.29530999, -0.29295999, -0.24212   ,
        0.091915  ,  0.01977   ,  0.14503001,  0.26495999,  0.10817   ,
        0.029115  ,  0.075254  ,  0.16463999,  0.12097   , -0.37494001,
        0.52671999,  0.094318  , -0.054813  , -0.021008  ,  0.081353  ,
        0.18735   , -0.14458001, -0.031203  ,  0.31753999,  0.027703  ,
       -0.28657001,  0.34630999, -0.27772   ,  0.18669   , -0.11

## Embedding 設為不需訓練，直接輸入轉換後的向量

In [18]:
model = tf.keras.Sequential()

# trainable=False：不需訓練，直接輸入轉換後的向量
model.add(layers.Embedding(vocab_size, 300, weights=[embedding_matrix], 
                           input_length=maxlen, trainable=False))
model.add(layers.SimpleRNN(128))
model.add(layers.Dense(1, activation='sigmoid'))

# 指定優化器、損失函數
model.compile(optimizer='adam', loss='binary_crossentropy', 
              metrics=['accuracy'])

print(model.summary())

# 模型訓練
model.fit(padded_docs, labels, epochs=50, verbose=0)

# 模型評估
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 4, 300)            4500      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 128)               54912     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 59,541
Trainable params: 55,041
Non-trainable params: 4,500
_________________________________________________________________
None
Accuracy: 100.000000


In [19]:
#list(model.predict_classes(padded_docs).reshape(-1))
#[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
y_pred = model.predict(padded_docs)
ypred = list(y_pred.reshape(-1))
ypred



[0.9997858,
 0.9997944,
 0.99992037,
 0.99993014,
 0.99990094,
 0.0001231689,
 9.4999166e-05,
 0.00022220612,
 0.00013649464,
 0.00011016103]

In [30]:
def sigmoid(x):
    sig = 1 / (1 + math.exp(-x))
    return round(sig,2)

In [33]:
def mpredit(x):
  d = 0
  if x>0.5:
    d = 1
  return d

In [34]:
import math
for i in ypred:
  x = mpredit(sigmoid(i))
  print(x,end=', ')
print()

1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 
