# Embedding Layer vs. Pre-trained Embedding

## Embedding layer

In [None]:
from keras.preprocessing.text import one_hot
from keras.models import Sequential
from keras.layers import Embedding

In [None]:
# Toy corpus of three short documents, kept in a fixed order.
corp = [
    "bitty bought a bit of butter",
    "but the bit of butter was a bit bitter",
    "so she bought some better butter to make the bitter butter better",
]

# Expose each document under its own name as well, and record the corpus size.
sample_text_1, sample_text_2, sample_text_3 = corp
no_docs = len(corp)

In [None]:
# Fix the vocabulary size at 50 and turn every document into a list of
# integers, one per word, using Keras' one_hot encoder.
vocab_size = 50
encod_corp = []
for i, doc in enumerate(corp):
    # Encode each document exactly once and reuse the result for both storage
    # and display; pass vocab_size rather than repeating the literal so the
    # encoding always stays in sync with the embedding table size used later.
    encoded_doc = one_hot(doc, vocab_size)
    encod_corp.append(encoded_doc)
    print("The encoding for document",i+1," is : ",encoded_doc)

The encoding for document 1  is :  [35, 49, 14, 39, 43, 16]
The encoding for document 2  is :  [44, 26, 39, 43, 16, 45, 14, 39, 38]
The encoding for document 3  is :  [33, 42, 49, 46, 30, 16, 46, 46, 26, 38, 16, 30]


In [None]:
# Build a model whose only layer is an embedding lookup table with
# vocab_size rows and 128-dimensional vectors.
model = Sequential([Embedding(vocab_size, 128)])

In [None]:
# Keep a copy of the lookup table: the first weight array of the model's first (embedding) layer.
embeddings = model.layers[0].get_weights()[0]

In [None]:
# Check the size of the lookup table (rows = vocabulary size, columns = embedding dimension).
print(embeddings.shape)

(50, 128)


In [None]:
# Inspect the embedding vector stored at row index 34 of the lookup table.
embeddings[34]

array([-0.01106865,  0.0394384 , -0.0478355 ,  0.02675352,  0.04862538,
        0.03961532,  0.02246172, -0.01345694, -0.02711718, -0.02639705,
        0.00436708,  0.01028376,  0.01379221, -0.02657043, -0.02174001,
        0.01797474, -0.0052099 ,  0.02376706, -0.0226912 , -0.03302009,
       -0.03598273,  0.04697586,  0.04923508, -0.03926278,  0.0133314 ,
       -0.01996141,  0.03338749,  0.00021373,  0.02966574, -0.02901641,
        0.03277599, -0.04573483,  0.03233311, -0.01095687,  0.00595096,
        0.00175988, -0.00847365,  0.01309201, -0.01706252, -0.01703366,
       -0.01349332,  0.02245844,  0.0243522 ,  0.03013821,  0.01006918,
        0.04626724,  0.04557112,  0.01195331,  0.02492466,  0.03209781,
       -0.0014979 ,  0.00148914,  0.03699131,  0.02931536, -0.00652859,
       -0.01847283,  0.03345947,  0.03421794,  0.02096744,  0.02205279,
        0.03600856, -0.01789178,  0.0378    ,  0.01164681, -0.00064325,
       -0.03809013,  0.00919255, -0.02953138, -0.03147631, -0.03

In [None]:
# Embed the first document: index the lookup table with its word integers (one row per word).
embeddings[encod_corp[0]].shape

(6, 128)

In [None]:
# Embed the second document: index the lookup table with its word integers (one row per word).
embeddings[encod_corp[1]].shape

(9, 128)

In [None]:
# Embed the third document: index the lookup table with its word integers (one row per word).
embeddings[encod_corp[2]].shape

(12, 128)

## 감성 분류하기(Embedding layer)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [None]:
# Tiny sentiment dataset: each sentence is paired with its label
# (1 = positive, 0 = negative) so the two cannot drift out of alignment.
labeled_sentences = [
    ('nice great best amazing', 1),
    ('stop lies', 0),
    ('pitiful nerd', 0),
    ('excellent work', 1),
    ('supreme quality', 1),
    ('bad', 0),
    ('highly respectable', 1),
]
sentences = [text for text, _ in labeled_sentences]
y_train = [label for _, label in labeled_sentences]

In [None]:
# Fit a tokenizer on the training sentences to build the word -> integer index.
t = Tokenizer()
t.fit_on_texts(sentences)
# Word indices start at 1, so add one to leave room for index 0,
# which the padding step further down uses as the filler value.
vocab_size = len(t.word_index) + 1

print(vocab_size)

16


In [None]:
# Show the word -> integer mapping learned by the tokenizer.
t.word_index

{'amazing': 4,
 'bad': 13,
 'best': 3,
 'excellent': 9,
 'great': 2,
 'highly': 14,
 'lies': 6,
 'nerd': 8,
 'nice': 1,
 'pitiful': 7,
 'quality': 12,
 'respectable': 15,
 'stop': 5,
 'supreme': 11,
 'work': 10}

In [None]:
# Replace every word in each sentence with its integer index from the fitted tokenizer.
X_encoded = t.texts_to_sequences(sentences)
print(X_encoded)

[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]


In [None]:
# Length of the longest encoded sentence; used below as the padded sequence length.
max_len = max(map(len, X_encoded))
print(max_len)

4


In [None]:
# Pad every encoded sentence at the end ('post') up to max_len so all rows have the same length.
X_train=pad_sequences(X_encoded, maxlen=max_len, padding='post')
# Convert the labels to a NumPy array for training.
y_train=np.array(y_train)
print(X_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Classifier with an embedding layer that is learned from scratch.
model = Sequential([
    # Every word index is mapped to a 4-dimensional embedding vector.
    Embedding(vocab_size, 4, input_length=max_len),
    # Flatten the per-word vectors so they can be fed to the Dense layer.
    Flatten(),
    # Single sigmoid unit for the binary (positive / negative) decision.
    Dense(1, activation='sigmoid'),
])

In [None]:
# Configure the model for binary classification and train for 100 epochs on the padded sentences.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 2s - loss: 0.6918 - acc: 0.4286
Epoch 2/100
1/1 - 0s - loss: 0.6901 - acc: 0.4286
Epoch 3/100
1/1 - 0s - loss: 0.6883 - acc: 0.4286
Epoch 4/100
1/1 - 0s - loss: 0.6865 - acc: 0.7143
Epoch 5/100
1/1 - 0s - loss: 0.6848 - acc: 0.7143
Epoch 6/100
1/1 - 0s - loss: 0.6830 - acc: 0.7143
Epoch 7/100
1/1 - 0s - loss: 0.6812 - acc: 0.7143
Epoch 8/100
1/1 - 0s - loss: 0.6795 - acc: 0.7143
Epoch 9/100
1/1 - 0s - loss: 0.6777 - acc: 0.7143
Epoch 10/100
1/1 - 0s - loss: 0.6760 - acc: 0.7143
Epoch 11/100
1/1 - 0s - loss: 0.6742 - acc: 0.7143
Epoch 12/100
1/1 - 0s - loss: 0.6724 - acc: 0.8571
Epoch 13/100
1/1 - 0s - loss: 0.6707 - acc: 0.8571
Epoch 14/100
1/1 - 0s - loss: 0.6689 - acc: 1.0000
Epoch 15/100
1/1 - 0s - loss: 0.6672 - acc: 1.0000
Epoch 16/100
1/1 - 0s - loss: 0.6654 - acc: 1.0000
Epoch 17/100
1/1 - 0s - loss: 0.6636 - acc: 1.0000
Epoch 18/100
1/1 - 0s - loss: 0.6618 - acc: 1.0000
Epoch 19/100
1/1 - 0s - loss: 0.6601 - acc: 1.0000
Epoch 20/100
1/1 - 0s - loss: 0.6583 - a

<tensorflow.python.keras.callbacks.History at 0x7f3d50424f10>

## 감성 분류하기(Pre-trained Embedding)

In [None]:
# Reuse the same padded training sequences for the pre-trained embedding experiment.
print(X_train)

[[ 1  2  3  4]
 [ 5  6  0  0]
 [ 7  8  0  0]
 [ 9 10  0  0]
 [11 12  0  0]
 [13  0  0  0]
 [14 15  0  0]]


In [None]:
import numpy as np
import gensim

In [None]:
# 현재 위치에 구글의 사전 훈련된 Word2Vec을 다운로드
!wget "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2021-06-19 02:06:37--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.226.251
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.226.251|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2021-06-19 02:07:31 (29.3 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [None]:
# Load Google's pre-trained Word2Vec model from the downloaded binary archive.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)  

In [None]:
print(word2vec_model.vectors.shape) # check the size of the model's vector table (words x dimensions)

(3000000, 300)


In [None]:
# Allocate a zero-filled matrix with one row per vocabulary index and 300 columns
# (one column per dimension of the pre-trained vectors).
embedding_matrix = np.zeros(shape=(vocab_size, 300))
embedding_matrix.shape

(16, 300)

In [None]:
def get_vector(word):
    """Look up ``word`` in the loaded ``word2vec_model``.

    Args:
        word: The token to look up.

    Returns:
        The value stored for ``word`` in ``word2vec_model``, or ``None`` when
        the word is not contained in the model.
    """
    return word2vec_model[word] if word in word2vec_model else None

In [None]:
# Walk over the training vocabulary (word, integer index) and copy each word's
# pre-trained vector into the matching row of the embedding matrix.
for word, idx in t.word_index.items():
    vector = get_vector(word)
    if vector is None:
        # The word has no pre-trained vector: leave its row untouched.
        continue
    embedding_matrix[idx] = vector

In [None]:
# Print the pre-trained vector for 'nice' to compare against the embedding matrix row below.
print(word2vec_model['nice'])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [None]:
# Print the integer index the tokenizer assigned to the word 'nice'.
print('단어 nice의 정수 인덱스 :', t.word_index['nice'])

단어 nice의 정수 인덱스 : 1


In [None]:
# Row 1 of the embedding matrix should hold the same values as the pre-trained vector for 'nice'.
print(embedding_matrix[1])

[ 0.15820312  0.10595703 -0.18945312  0.38671875  0.08349609 -0.26757812
  0.08349609  0.11328125 -0.10400391  0.17871094 -0.12353516 -0.22265625
 -0.01806641 -0.25390625  0.13183594  0.0859375   0.16113281  0.11083984
 -0.11083984 -0.0859375   0.0267334   0.34570312  0.15136719 -0.00415039
  0.10498047  0.04907227 -0.06982422  0.08642578  0.03198242 -0.02844238
 -0.15722656  0.11865234  0.36132812  0.00173187  0.05297852 -0.234375
  0.11767578  0.08642578 -0.01123047  0.25976562  0.28515625 -0.11669922
  0.38476562  0.07275391  0.01147461  0.03466797  0.18164062 -0.03955078
  0.04199219  0.01013184 -0.06054688  0.09765625  0.06689453  0.14648438
 -0.12011719  0.08447266 -0.06152344  0.06347656  0.3046875  -0.35546875
 -0.2890625   0.19628906 -0.33203125 -0.07128906  0.12792969  0.09619141
 -0.12158203 -0.08691406 -0.12890625  0.27734375  0.265625    0.1796875
  0.12695312  0.06298828 -0.34375    -0.05908203  0.0456543   0.171875
  0.08935547  0.14648438 -0.04638672 -0.00842285 -0.0279

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input

# Embedding layer initialised from the pre-trained matrix and frozen
# (trainable=False) so its vectors are not updated during training.
e = Embedding(vocab_size, 300, weights=[embedding_matrix], trainable=False)

# Same classifier shape as before, but on top of the fixed pre-trained embeddings.
model = Sequential([
    Input(shape=(max_len,), dtype='int32'),
    e,
    Flatten(),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.6661 - acc: 0.8571
Epoch 2/100
1/1 - 0s - loss: 0.6472 - acc: 0.8571
Epoch 3/100
1/1 - 0s - loss: 0.6288 - acc: 0.8571
Epoch 4/100
1/1 - 0s - loss: 0.6110 - acc: 0.8571
Epoch 5/100
1/1 - 0s - loss: 0.5937 - acc: 1.0000
Epoch 6/100
1/1 - 0s - loss: 0.5770 - acc: 1.0000
Epoch 7/100
1/1 - 0s - loss: 0.5609 - acc: 1.0000
Epoch 8/100
1/1 - 0s - loss: 0.5454 - acc: 1.0000
Epoch 9/100
1/1 - 0s - loss: 0.5304 - acc: 1.0000
Epoch 10/100
1/1 - 0s - loss: 0.5159 - acc: 1.0000
Epoch 11/100
1/1 - 0s - loss: 0.5020 - acc: 1.0000
Epoch 12/100
1/1 - 0s - loss: 0.4886 - acc: 1.0000
Epoch 13/100
1/1 - 0s - loss: 0.4757 - acc: 1.0000
Epoch 14/100
1/1 - 0s - loss: 0.4633 - acc: 1.0000
Epoch 15/100
1/1 - 0s - loss: 0.4513 - acc: 1.0000
Epoch 16/100
1/1 - 0s - loss: 0.4398 - acc: 1.0000
Epoch 17/100
1/1 - 0s - loss: 0.4287 - acc: 1.0000
Epoch 18/100
1/1 - 0s - loss: 0.4180 - acc: 1.0000
Epoch 19/100
1/1 - 0s - loss: 0.4077 - acc: 1.0000
Epoch 20/100
1/1 - 0s - loss: 0.3978 - a

<tensorflow.python.keras.callbacks.History at 0x7f3ca106c990>

# Tensorflow Hub로부터 Pre-trained Embedding 사용하기

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_hub as hub

In [None]:
# Load the IMDB reviews dataset as (text, label) pairs and carve its 'train'
# split into 80% / 10% / 10% slices for training, validation and testing.
(train_data, validation_data, test_data), info = tfds.load(
    'imdb_reviews',
    split=['train[:80%]', 'train[80%:90%]', 'train[90%:]'],
    with_info=True,
    as_supervised=True,
)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteP6E2GM/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteP6E2GM/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteP6E2GM/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Pull the first batch of 10 (review text, label) examples to inspect the raw data.
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))
train_examples_batch

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b"This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting could not redeem this movie's ridiculous storyline. This movie is an early nineties US propaganda piece. The most pathetic scenes were those when the Columbian rebels were making their cases for revolutions. Maria Conchita Alonso appeared phony, and her pseudo-love affair with Walken was nothing but a pathetic emotional plug in a movie that was devoid of any real meaning. I am disappointed that there are movies like this, ruining actor's like Christopher Walken's good name. I could barely sit through it.",
       b'I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However on this occasion I fell 

In [None]:
# Show the labels that belong to the 10 reviews above.
train_labels_batch

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 1, 1, 1, 0, 0, 0, 0])>

In [None]:
# TensorFlow Hub handle of the pre-trained text-embedding module.
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

# Wrap the module as a Keras layer that takes raw strings; trainable=True
# lets the embedding weights be updated together with the rest of the model.
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

# Sanity check: embed the first three reviews of the sample batch.
hub_layer(train_examples_batch[:3])

<tf.Tensor: shape=(3, 20), dtype=float32, numpy=
array([[ 1.765786  , -3.882232  ,  3.9134233 , -1.5557289 , -3.3362343 ,
        -1.7357955 , -1.9954445 ,  1.2989551 ,  5.081598  , -1.1041286 ,
        -2.0503852 , -0.72675157, -0.65675956,  0.24436149, -3.7208383 ,
         2.0954835 ,  2.2969332 , -2.0689783 , -2.9489717 , -1.1315987 ],
       [ 1.8804485 , -2.5852382 ,  3.4066997 ,  1.0982676 , -4.056685  ,
        -4.891284  , -2.785554  ,  1.3874227 ,  3.8476458 , -0.9256538 ,
        -1.896706  ,  1.2113281 ,  0.11474707,  0.76209456, -4.8791065 ,
         2.906149  ,  4.7087674 , -2.3652055 , -3.5015898 , -1.6390051 ],
       [ 0.71152234, -0.6353217 ,  1.7385626 , -1.1168286 , -0.5451594 ,
        -1.1808156 ,  0.09504455,  1.4653089 ,  0.66059524,  0.79308075,
        -2.2268345 ,  0.07446612, -1.4075904 , -0.70645386, -1.907037  ,
         1.4419787 ,  1.9551861 , -0.42660055, -2.8022065 ,  0.43727064]],
      dtype=float32)>

In [None]:
# Text classifier: hub embedding -> 16-unit ReLU hidden layer -> sigmoid output.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 20)                400020    
_________________________________________________________________
dense_2 (Dense)              (None, 16)                336       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training for binary classification and track accuracy.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Train for 20 epochs on shuffled batches of 512 reviews, evaluating on the
# validation slice after every epoch; keep the returned history object.
history = model.fit(train_data.shuffle(10000).batch(512),
                    epochs=20,
                    validation_data=validation_data.batch(512),
                    verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Evaluate on the held-out test slice and print each metric to three decimals.
results = model.evaluate(test_data.batch(512), verbose=2)
for metric_name, metric_value in zip(model.metrics_names, results):
    print(f"{metric_name}: {metric_value:.3f}")

5/5 - 0s - loss: 0.2793 - accuracy: 0.8816
loss: 0.279
accuracy: 0.882
