In [None]:
import tensorflow as tf

# An Embedding layer needs at least two arguments:
#   input_dim  - number of possible words in the vocabulary (1 + maximum word index), 1000 here
#   output_dim - dimensionality of each embedding vector, 32 here
embedding_layer = tf.keras.layers.Embedding(input_dim=1000, output_dim=32)

In [None]:
# Size of the vocabulary; passed to load_data as `num_words`
vocab_size = 10000
imdb = tf.keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(
    num_words=vocab_size,
    start_char=1,  # index that marks the start of a review
    oov_char=2,    # index used for out-of-vocabulary words
    index_from=3,  # actual words are indexed from this value upwards
)

In [None]:
# Show the first training review in its raw, integer-encoded form
print(train_data[0])

In [None]:
# Lookup table shipped with the dataset: word -> integer index
word_index = imdb.get_word_index()

# Shift every index up by 3 so that slots 0-3 stay free for the special tokens below
word_index = {word: index + 3 for word, index in word_index.items()}

# Register the special tokens in the reserved slots
word_index.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNK>": 2,  # unknown
    "<UNUSED>": 3,
})

# Inverse lookup table: integer index -> word
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Convert a sequence of integer word indices into a space-separated string.

    Each index is looked up in `reverse_word_index`; indices that are not
    present there are rendered as '?'.
    """
    words = (reverse_word_index.get(index, '?') for index in text)
    return ' '.join(words)

# Print the first training sample as integers, then display it decoded back into words
print("Train_data[0]: {}\n".format(train_data[0]))
decode_review(train_data[0])

In [None]:
# Label paired with the first training review
train_labels[0]

In [None]:
# Bring every review to the same length so the data can be used as model input
maxlen = 500

# pad_sequences returns a 2D NumPy array of shape (number of reviews, maxlen);
# reviews shorter than maxlen are filled up at the end with the <PAD> index
train_data = tf.keras.preprocessing.sequence.pad_sequences(
    train_data,
    maxlen=maxlen,
    padding='post',  # 'pre' or 'post'
    value=word_index["<PAD>"],
)

# Apply exactly the same padding to the test reviews
test_data = tf.keras.preprocessing.sequence.pad_sequences(
    test_data,
    maxlen=maxlen,
    padding='post',
    value=word_index["<PAD>"],
)

*pad*: 길이가 500(`maxlen`)보다 짧은 리뷰는 뒤쪽에 `<PAD>`(인덱스 0)를 채워 길이를 500으로 맞춤. 500보다 긴 리뷰는 길이 500으로 잘림.

In [None]:
# Check the length of the first review after padding, then show its contents
print("Len of data: {}".format(len(train_data[0])))
print(train_data[0])

Keras Sequential API를 사용하여 모델을 정의
- 첫 번째 레이어는 **Embedding Layer** 
- 이 레이어는 integer로 인코딩된 vocabulary를 갖고 각 단어 인덱스에 대한 embedding vector를 찾음(integer to float). 
- embedding vector는 모델이 학습될 때 학습됨.
- 각 단어의 embedding vector가 출력에 차원을 하나 추가하므로, 이 layer의 출력은 `(배치, 시퀀스, 임베딩)` 형태의 3D tensor가 됨.
- 다음으로 **GlobalAveragePooling1D layer**는 시퀀스 차원에 대해 평균을 내어 `(배치, 임베딩)` 형태의 2D tensor, 즉 각 데이터에 대한 고정 길이 출력 벡터를 반환함.

<img src=https://jsideas.net/assets/materials/20180104/GAP_GMP.png width=600>

- 이 고정 길이 출력 벡터는 16개의 hidden unit이 있는 **Fully Connected (Dense) Layer**를 통과하게 됨.
- 마지막 layer는 출력 노드가 하나인 Dense layer로, Sigmoid 함수를 사용하여 0과 1 사이의 부동 소수점 값을 출력함. 이 값은 리뷰가 긍정적일 확률(또는 신뢰 수준)을 나타냄.

In [None]:
# Length of the embedding vector learned for each word
embedding_dim = 16

# Embedding -> average over the sequence -> 16-unit hidden layer -> single sigmoid output
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=maxlen,
    ),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(units=16, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

model.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss,
# and accuracy as the metric reported during training
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for 30 epochs in mini-batches of 512 samples and keep the returned history.
# NOTE(review): the test split doubles as the validation set here, so no
# untouched data remains for a final evaluation - consider a separate hold-out split.
history = model.fit(
    x=train_data,
    y=train_labels,
    batch_size=512,
    epochs=30,
    validation_data=(test_data, test_labels),
)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit
history_dict = history.history

acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# Epoch numbers for the x-axis, starting at 1
epochs = range(1, 1 + len(acc))

# Figure 1: loss per epoch (dots = training, line = validation)
plt.figure(figsize=(12, 9))
plt.title('Training and validation loss')
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Figure 2: accuracy per epoch, with the y-axis limited to the 0.5-1 range
plt.figure(figsize=(12, 9))
plt.title('Training and validation accuracy')
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.ylim(0.5, 1)
plt.legend(loc='lower right')
plt.show()

In [None]:
# model.layers[]: get the layer information
# Layer 0 is the Embedding layer defined first in the Sequential model.
e = model.layers[0]
# get_weights() returns a list of arrays; index 0 is taken as the embedding matrix.
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

In [None]:
# Embedding vector stored at row 1 of the weight matrix (index 1 is "<START>" in word_index)
weights[1]

In [None]:
import io

# Export the embeddings as two TSV files with matching line order:
#   vecs.tsv - one line per word index, the embedding values separated by tabs
#   meta.tsv - one line per word index, the word itself
# The `with` statement closes both files even if a lookup or write raises
# part-way through the loop, so no file handle is leaked.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(vocab_size):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

[워드 임베딩 프로젝터](https://projector.tensorflow.org/)