# Word embeddings

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

!pip install tf-nightly-2.0-preview
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

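# layers.Embedding takes two required arguments:
#   1. the number of possible words (the vocabulary size)
#   2. the dimensionality of the embedding vectors
# embedding_layer = layers.Embedding(1000, 32)

# The weights of layers.Embedding are adjusted while the network trains, which is how similarity between words ends up reflected in their vectors.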

In [0]:
# Load the data.
# IMDB movie-review sentiment data (original note: 25,000 reviews, labelled positive or negative).
vocab_size = 10000 # vocabulary size; passed to load_data as num_words below
imdb = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size)

In [0]:
# Each number printed below is the index of a specific word in the dictionary.
print(train_data[1])

In [0]:
# Build the word -> integer index lookup.
word_index = imdb.get_word_index()

# Shift every index up by 3 so that indices 0-3 are free for the reserved tokens.
word_index = {word: idx + 3 for word, idx in word_index.items()}
word_index.update({
    "<PAD>": 0,     # filler used to pad reviews of unequal length
    "<START>": 1,   # start-of-sequence marker
    "<UNK>": 2,     # words that are not in the vocabulary, e.g. personal names
    "<UNUSED>": 3,  # unused
})

# Inverse lookup: integer index -> word.
reverse_word_index = {idx: word for word, idx in word_index.items()}

def decode_review(text):
    """Turn a sequence of word indices back into a space-separated string.

    Any index that is missing from reverse_word_index is rendered as '?'.
    """
    words = (reverse_word_index.get(index, '?') for index in text)
    return ' '.join(words)

decode_review(train_data[0])

In [0]:
#for key, name in reverse_word_index.items(): 
#    if name == 'this':
#        print(key)

In [0]:
#reverse_word_index[1]

In [0]:
# Standardise the length of the movie reviews with pad_sequences.
maxlen = 500

# Options shared by both splits: fill with the <PAD> index, pad at the end,
# and use maxlen as the target sequence length.
pad_options = dict(value=word_index["<PAD>"], padding='post', maxlen=maxlen)

train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)

In [0]:
#len(train_data[1])

In [0]:
#train_data[1]

In [0]:
embedding_dim = 16

model = keras.Sequential()
# First layer: Embedding(vocabulary size, embedding vector dimension, maximum length of one review).
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
# Global *average* pooling over the sequence (the layer used here is not max pooling).
model.add(layers.GlobalAveragePooling1D())
# Dense layer with 16 units.
model.add(layers.Dense(16, activation='relu'))
# Single output unit; the sigmoid activation keeps the output between 0 and 1
# so it can be read as a probability.
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

In [0]:
#train_data[0]

### Compile and train the model

In [0]:
# Attach the optimizer, the loss criterion and the metric to report.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train the model; validation_split=0.2 sets aside a fraction of the
# training data for validation.
history = model.fit(
    x=train_data,
    y=train_labels,
    batch_size=512,
    epochs=30,
    validation_split=0.2,
)

In [0]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded during model.fit().
history_dict = history.history
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# 1-based epoch numbers for the x axis.
epochs = range(1, len(acc) + 1)

# Figure 1: training vs. validation loss.
loss_fig, loss_ax = plt.subplots(figsize=(12, 9))
loss_ax.plot(epochs, loss, 'bo', label='Training loss')
loss_ax.plot(epochs, val_loss, 'b', label='Validation loss')
loss_ax.set_title('Training and validation loss')
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.legend()
plt.show()

# Figure 2: training vs. validation accuracy, with the y axis limited to 0.5-1.
acc_fig, acc_ax = plt.subplots(figsize=(12, 9))
acc_ax.plot(epochs, acc, 'bo', label='Training acc')
acc_ax.plot(epochs, val_acc, 'b', label='Validation acc')
acc_ax.set_title('Training and validation accuracy')
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.legend(loc='lower right')
acc_ax.set_ylim((0.5, 1))
plt.show()

In [0]:
# Next, export what was just trained so that it can be explored on the Embedding Projector website (http://projector.tensorflow.org/).

In [0]:
# Take the first layer of the model (the Embedding layer in the Sequential
# definition) and read its first weight array.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)
# Here 10000 is the vocabulary size and 16 is the embedding dimension.

In [0]:
import io

# vecs.tsv: the vector file (16 dimensions per row, tab-separated)
# meta.tsv: the word file (10000 words, one per line, in the same row order)
# Both files are opened in a single `with` statement so they are closed even
# if the loop raises part-way through (e.g. a KeyError from reverse_word_index).
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
  for word_num in range(vocab_size):
    word = reverse_word_index[word_num]
    embeddings = weights[word_num]
    out_m.write(word + "\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [0]:
# Offer both TSV files for download when google.colab can be imported;
# otherwise do nothing.
try:
  from google.colab import files
except ImportError:
  pass  # google.colab is unavailable, so skip the download
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)

In [0]:
# Upload vecs.tsv and meta.tsv to http://projector.tensorflow.org/
# Try searching for the word "beautiful".
# Try removing the Dense(16) layer and see what changes.