# Keras Embedding layer

In [2]:
# 문장의 긍정, 부정을 판단하는 감성 분류 모델
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Toy training corpus: one short sentence per entry.
# NOTE(review): "quaility" looks like a typo for "quality"; it is kept verbatim
# here so the tokenizer vocabulary is unchanged.
sentences = [
    "nice great best amazing",
    "stop lies",
    "pitiful nerd",
    "excellent work",
    "supreme quaility",
    "bad",
    "highly respectable",
]
# Binary labels aligned with `sentences` by position
# (presumably 1 = positive, 0 = negative).
y_train = [
    1,  # nice great best amazing
    0,  # stop lies
    0,  # pitiful nerd
    1,  # excellent work
    1,  # supreme quaility
    0,  # bad
    1,  # highly respectable
]

In [4]:
# Build the word -> integer index vocabulary from the training sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
# +1 because word indices start at 1; index 0 is left for the padding value.
vocab_size = len(tokenizer.word_index) + 1

vocab_size

16

In [5]:
# Replace every word of every sentence with its integer index from the fitted tokenizer.
X_encoded = tokenizer.texts_to_sequences(sentences)

X_encoded

[[1, 2, 3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13], [14, 15]]

In [7]:
# Token count of the longest encoded sentence; used as the padded sequence length.
max_len = max(map(len, X_encoded))

max_len

4

In [10]:
# Labels as a NumPy array for training.
y_train = np.array(y_train)

# Right-pad ("post") every encoded sentence with zeros up to max_len.
X_train = pad_sequences(
    X_encoded,
    maxlen=max_len,
    padding="post",
)

X_train

array([[ 1,  2,  3,  4],
       [ 5,  6,  0,  0],
       [ 7,  8,  0,  0],
       [ 9, 10,  0,  0],
       [11, 12,  0,  0],
       [13,  0,  0,  0],
       [14, 15,  0,  0]], dtype=int32)

In [11]:
# 이진 분류
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

embedding_dim = 4

# Trainable embedding -> flatten -> single sigmoid unit for binary classification.
model = Sequential(
    [
        Embedding(vocab_size, embedding_dim, input_length=max_len),
        Flatten(),
        Dense(1, activation="sigmoid"),
    ]
)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["acc"],
)
model.fit(X_train, y_train, epochs=100, verbose=2)

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

Epoch 1/100


2023-07-31 19:21:14.129340: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-07-31 19:21:14.130111: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2023-07-31 19:21:14.308201: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-07-31 19:21:14.515257: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


1/1 - 1s - loss: 0.6871 - acc: 0.7143 - 665ms/epoch - 665ms/step
Epoch 2/100
1/1 - 0s - loss: 0.6855 - acc: 0.7143 - 8ms/epoch - 8ms/step
Epoch 3/100
1/1 - 0s - loss: 0.6839 - acc: 0.7143 - 10ms/epoch - 10ms/step
Epoch 4/100
1/1 - 0s - loss: 0.6823 - acc: 0.7143 - 10ms/epoch - 10ms/step
Epoch 5/100
1/1 - 0s - loss: 0.6807 - acc: 0.7143 - 8ms/epoch - 8ms/step
Epoch 6/100
1/1 - 0s - loss: 0.6791 - acc: 0.8571 - 9ms/epoch - 9ms/step
Epoch 7/100
1/1 - 0s - loss: 0.6775 - acc: 0.8571 - 8ms/epoch - 8ms/step
Epoch 8/100
1/1 - 0s - loss: 0.6759 - acc: 0.8571 - 8ms/epoch - 8ms/step
Epoch 9/100
1/1 - 0s - loss: 0.6742 - acc: 0.8571 - 9ms/epoch - 9ms/step
Epoch 10/100
1/1 - 0s - loss: 0.6726 - acc: 1.0000 - 7ms/epoch - 7ms/step
Epoch 11/100
1/1 - 0s - loss: 0.6710 - acc: 1.0000 - 10ms/epoch - 10ms/step
Epoch 12/100
1/1 - 0s - loss: 0.6694 - acc: 1.0000 - 8ms/epoch - 8ms/step
Epoch 13/100
1/1 - 0s - loss: 0.6678 - acc: 1.0000 - 7ms/epoch - 7ms/step
Epoch 14/100
1/1 - 0s - loss: 0.6662 - acc: 1.000

<keras.callbacks.History at 0x28c2cd8b0>

## Pre-Trained Word Embedding

### GloVe

In [12]:
from urllib.request import urlretrieve, urlopen
import gzip, zipfile
import os

# Fetch the GloVe 6B archive and unpack it into the working directory.
# Both steps are skipped when their result is already on disk, so re-running
# the notebook does not repeat the download or the extraction.
glove_zip = "glove.6B.zip"
glove_txt = "glove.6B.100d.txt"  # extracted file that the loading cell reads

if not os.path.exists(glove_txt):
    if not os.path.exists(glove_zip):
        # Download to a temporary name first so an interrupted transfer never
        # leaves a truncated file under the final archive name.
        partial_zip = glove_zip + ".part"
        urlretrieve("https://nlp.stanford.edu/data/glove.6B.zip", filename=partial_zip)
        os.replace(partial_zip, glove_zip)
    # The context manager closes the archive even if extraction fails part-way.
    with zipfile.ZipFile(glove_zip) as zf:
        zf.extractall()

In [15]:
# Parse the GloVe text file into a {word: vector} lookup table.
embedding_dict = {}

with open("glove.6B.100d.txt", encoding="utf8") as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline) instead of raising IndexError.
            continue
        # First field is the token; the remaining fields are its vector components.
        embedding_dict[fields[0]] = np.asarray(fields[1:], dtype=np.float32)

len(embedding_dict)

400000

In [16]:
# Inspect one pretrained vector together with its dimensionality.
respectable_vec = embedding_dict["respectable"]
respectable_vec, len(respectable_vec)

(array([-0.049773 ,  0.19903  ,  0.10585  ,  0.1391   , -0.32395  ,
         0.44053  ,  0.3947   , -0.22805  , -0.25793  ,  0.49768  ,
         0.15384  , -0.08831  ,  0.0782   , -0.8299   , -0.037788 ,
         0.16772  , -0.45197  , -0.17085  ,  0.74756  ,  0.98256  ,
         0.81872  ,  0.28507  ,  0.16178  , -0.48626  , -0.006265 ,
        -0.92469  , -0.30625  , -0.067318 , -0.046762 , -0.76291  ,
        -0.0025264, -0.018795 ,  0.12882  , -0.52457  ,  0.3586   ,
         0.43119  , -0.89477  , -0.057421 , -0.53724  ,  0.25587  ,
         0.55195  ,  0.44698  , -0.24252  ,  0.29946  ,  0.25776  ,
        -0.8717   ,  0.68426  , -0.05688  , -0.1848   , -0.59352  ,
        -0.11227  , -0.57692  , -0.013593 ,  0.18488  , -0.32507  ,
        -0.90171  ,  0.17672  ,  0.075601 ,  0.54896  , -0.21488  ,
        -0.54018  , -0.45882  , -0.79536  ,  0.26331  ,  0.18879  ,
        -0.16363  ,  0.3975   ,  0.1099   ,  0.1164   , -0.083499 ,
         0.50159  ,  0.35802  ,  0.25677  ,  0.0

In [17]:
# One row per vocabulary index (row 0 is the padding index), 100 columns to
# match the 100-dimensional GloVe vectors loaded from glove.6B.100d.txt.
embedding_matrix = np.zeros((vocab_size, 100))

# Show the word -> index mapping; the index is the matrix row each word's vector is written to.
tokenizer.word_index.items()

dict_items([('nice', 1), ('great', 2), ('best', 3), ('amazing', 4), ('stop', 5), ('lies', 6), ('pitiful', 7), ('nerd', 8), ('excellent', 9), ('work', 10), ('supreme', 11), ('quaility', 12), ('bad', 13), ('highly', 14), ('respectable', 15)])

In [19]:
# Check the tokenizer index and the pretrained vector for "great"
print(tokenizer.word_index["great"], embedding_dict["great"], sep="\n")

2
[-0.013786   0.38216    0.53236    0.15261   -0.29694   -0.20558
 -0.41846   -0.58437   -0.77355   -0.87866   -0.37858   -0.18516
 -0.128     -0.20584   -0.22925   -0.42599    0.3725     0.26077
 -1.0702     0.62916   -0.091469   0.70348   -0.4973    -0.77691
  0.66045    0.09465   -0.44893    0.018917   0.33146   -0.35022
 -0.35789    0.030313   0.22253   -0.23236   -0.19719   -0.0053125
 -0.25848    0.58081   -0.10705   -0.17845   -0.16206    0.087086
  0.63029   -0.76649    0.51619    0.14073    1.019     -0.43136
  0.46138   -0.43585   -0.47568    0.19226    0.36065    0.78987
  0.088945  -2.7814    -0.15366    0.01015    1.1798     0.15168
 -0.050112   1.2626    -0.77527    0.36031    0.95761   -0.11385
  0.28035   -0.02591    0.31246   -0.15424    0.3778    -0.13599
  0.2946    -0.31579    0.42943    0.086969   0.019169  -0.27242
 -0.31696    0.37327    0.61997    0.13889    0.17188    0.30363
 -1.2776     0.044423  -0.52736   -0.88536   -0.19428   -0.61947
 -0.10146   -0.26301

In [20]:
# Copy each known word's pretrained GloVe vector into its row of the matrix.
# Words absent from embedding_dict keep their all-zero row.
for token, row in tokenizer.word_index.items():
    if token in embedding_dict:
        embedding_matrix[row] = embedding_dict[token]

# Row 2 belongs to "great" in this tokenizer's word_index.
embedding_matrix[2]

array([-0.013786  ,  0.38216001,  0.53236002,  0.15261   , -0.29694   ,
       -0.20558   , -0.41846001, -0.58437002, -0.77354997, -0.87866002,
       -0.37858   , -0.18516   , -0.12800001, -0.20584001, -0.22925   ,
       -0.42598999,  0.3725    ,  0.26076999, -1.07019997,  0.62915999,
       -0.091469  ,  0.70348001, -0.4973    , -0.77691001,  0.66044998,
        0.09465   , -0.44893   ,  0.018917  ,  0.33146   , -0.35021999,
       -0.35789001,  0.030313  ,  0.22253001, -0.23236001, -0.19719   ,
       -0.0053125 , -0.25848001,  0.58081001, -0.10705   , -0.17845   ,
       -0.16205999,  0.087086  ,  0.63028997, -0.76648998,  0.51618999,
        0.14072999,  1.01900005, -0.43136001,  0.46138   , -0.43584999,
       -0.47567999,  0.19226   ,  0.36065   ,  0.78987002,  0.088945  ,
       -2.78139997, -0.15366   ,  0.01015   ,  1.17980003,  0.15167999,
       -0.050112  ,  1.26259995, -0.77526999,  0.36030999,  0.95761001,
       -0.11385   ,  0.28035   , -0.02591   ,  0.31246001, -0.15

In [21]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

output_dim = 100  # must equal the dimensionality of the pretrained embedding vectors

# Embedding layer initialised from the pretrained matrix and frozen (trainable=False).
e = Embedding(
    vocab_size,
    output_dim,
    input_length=max_len,
    weights=[embedding_matrix],
    trainable=False,
)
model = Sequential(
    [
        e,
        Flatten(),
        Dense(1, activation="sigmoid"),
    ]
)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["acc"],
)
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.7320 - acc: 0.5714 - 307ms/epoch - 307ms/step
Epoch 2/100
1/1 - 0s - loss: 0.7128 - acc: 0.5714 - 6ms/epoch - 6ms/step
Epoch 3/100
1/1 - 0s - loss: 0.6943 - acc: 0.5714 - 7ms/epoch - 7ms/step
Epoch 4/100
1/1 - 0s - loss: 0.6765 - acc: 0.5714 - 6ms/epoch - 6ms/step
Epoch 5/100
1/1 - 0s - loss: 0.6593 - acc: 0.5714 - 8ms/epoch - 8ms/step
Epoch 6/100
1/1 - 0s - loss: 0.6429 - acc: 0.5714 - 7ms/epoch - 7ms/step
Epoch 7/100


2023-07-31 19:44:15.643603: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


1/1 - 0s - loss: 0.6271 - acc: 0.5714 - 8ms/epoch - 8ms/step
Epoch 8/100
1/1 - 0s - loss: 0.6119 - acc: 0.5714 - 6ms/epoch - 6ms/step
Epoch 9/100
1/1 - 0s - loss: 0.5973 - acc: 0.5714 - 10ms/epoch - 10ms/step
Epoch 10/100
1/1 - 0s - loss: 0.5833 - acc: 0.5714 - 6ms/epoch - 6ms/step
Epoch 11/100
1/1 - 0s - loss: 0.5698 - acc: 0.5714 - 9ms/epoch - 9ms/step
Epoch 12/100
1/1 - 0s - loss: 0.5568 - acc: 0.5714 - 7ms/epoch - 7ms/step
Epoch 13/100
1/1 - 0s - loss: 0.5443 - acc: 0.5714 - 7ms/epoch - 7ms/step
Epoch 14/100
1/1 - 0s - loss: 0.5322 - acc: 0.5714 - 6ms/epoch - 6ms/step
Epoch 15/100
1/1 - 0s - loss: 0.5205 - acc: 0.8571 - 8ms/epoch - 8ms/step
Epoch 16/100
1/1 - 0s - loss: 0.5092 - acc: 1.0000 - 7ms/epoch - 7ms/step
Epoch 17/100
1/1 - 0s - loss: 0.4983 - acc: 1.0000 - 9ms/epoch - 9ms/step
Epoch 18/100
1/1 - 0s - loss: 0.4877 - acc: 1.0000 - 7ms/epoch - 7ms/step
Epoch 19/100
1/1 - 0s - loss: 0.4774 - acc: 1.0000 - 6ms/epoch - 6ms/step
Epoch 20/100
1/1 - 0s - loss: 0.4674 - acc: 1.0000 

<keras.callbacks.History at 0x28bda8a90>

### Word2Vec

In [26]:
import gensim

# The download is left disabled. The load below reads a decompressed local
# .bin file, whereas this URL delivers a .bin.gz archive, so the archive has
# to be fetched and decompressed beforehand.
# urlretrieve("https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz", filename="GoogleNews-vectors-negative300.bin.gz")
# Load the binary-format Word2Vec vectors as gensim KeyedVectors.
word2vec_model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

# (number of vectors, vector dimensionality)
word2vec_model.vectors.shape

(3000000, 300)

In [27]:
# Start a fresh matrix for Word2Vec: one row per vocabulary index,
# 300 columns to match the 300-dimensional GoogleNews vectors.
embedding_matrix = np.zeros((vocab_size, 300))

def get_vector(word, model):
    """Return ``model[word]`` when ``word`` is contained in ``model``, else ``None``.

    ``model`` only needs to support ``in`` and ``[]`` lookups.
    """
    return model[word] if word in model else None

# Copy each known word's Word2Vec vector into its row; unknown words keep zeros.
for token, row in tokenizer.word_index.items():
    pretrained = get_vector(token, word2vec_model)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [28]:
# Display the pretrained Word2Vec vector stored for "nice".
word2vec_model["nice"]

array([ 0.15820312,  0.10595703, -0.18945312,  0.38671875,  0.08349609,
       -0.26757812,  0.08349609,  0.11328125, -0.10400391,  0.17871094,
       -0.12353516, -0.22265625, -0.01806641, -0.25390625,  0.13183594,
        0.0859375 ,  0.16113281,  0.11083984, -0.11083984, -0.0859375 ,
        0.0267334 ,  0.34570312,  0.15136719, -0.00415039,  0.10498047,
        0.04907227, -0.06982422,  0.08642578,  0.03198242, -0.02844238,
       -0.15722656,  0.11865234,  0.36132812,  0.00173187,  0.05297852,
       -0.234375  ,  0.11767578,  0.08642578, -0.01123047,  0.25976562,
        0.28515625, -0.11669922,  0.38476562,  0.07275391,  0.01147461,
        0.03466797,  0.18164062, -0.03955078,  0.04199219,  0.01013184,
       -0.06054688,  0.09765625,  0.06689453,  0.14648438, -0.12011719,
        0.08447266, -0.06152344,  0.06347656,  0.3046875 , -0.35546875,
       -0.2890625 ,  0.19628906, -0.33203125, -0.07128906,  0.12792969,
        0.09619141, -0.12158203, -0.08691406, -0.12890625,  0.27

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Input

# Embedding layer initialised from the Word2Vec matrix and frozen (trainable=False).
e = Embedding(
    vocab_size,
    300,
    input_length=max_len,
    weights=[embedding_matrix],
    trainable=False,
)
# Explicit int32 input of length max_len -> embedding -> flatten -> sigmoid unit.
model = Sequential(
    [
        Input(shape=(max_len, ), dtype=np.int32),
        e,
        Flatten(),
        Dense(1, activation="sigmoid"),
    ]
)

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["acc"],
)
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
1/1 - 0s - loss: 0.7266 - acc: 0.4286 - 295ms/epoch - 295ms/step
Epoch 2/100
1/1 - 0s - loss: 0.7077 - acc: 0.4286 - 9ms/epoch - 9ms/step
Epoch 3/100
1/1 - 0s - loss: 0.6893 - acc: 0.5714 - 7ms/epoch - 7ms/step
Epoch 4/100
1/1 - 0s - loss: 0.6715 - acc: 0.7143 - 8ms/epoch - 8ms/step
Epoch 5/100
1/1 - 0s - loss: 0.6543 - acc: 0.7143 - 7ms/epoch - 7ms/step
Epoch 6/100
1/1 - 0s - loss: 0.6376 - acc: 0.7143 - 7ms/epoch - 7ms/step
Epoch 7/100
1/1 - 0s - loss: 0.6214 - acc: 0.7143 - 6ms/epoch - 6ms/step
Epoch 8/100
1/1 - 0s - loss: 0.6058 - acc: 0.7143 - 6ms/epoch - 6ms/step
Epoch 9/100


2023-07-31 19:57:28.902143: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


1/1 - 0s - loss: 0.5908 - acc: 0.7143 - 5ms/epoch - 5ms/step
Epoch 10/100
1/1 - 0s - loss: 0.5762 - acc: 0.8571 - 7ms/epoch - 7ms/step
Epoch 11/100
1/1 - 0s - loss: 0.5622 - acc: 1.0000 - 6ms/epoch - 6ms/step
Epoch 12/100
1/1 - 0s - loss: 0.5486 - acc: 1.0000 - 7ms/epoch - 7ms/step
Epoch 13/100
1/1 - 0s - loss: 0.5355 - acc: 1.0000 - 6ms/epoch - 6ms/step
Epoch 14/100
1/1 - 0s - loss: 0.5229 - acc: 1.0000 - 6ms/epoch - 6ms/step
Epoch 15/100
1/1 - 0s - loss: 0.5107 - acc: 1.0000 - 6ms/epoch - 6ms/step
Epoch 16/100
1/1 - 0s - loss: 0.4989 - acc: 1.0000 - 5ms/epoch - 5ms/step
Epoch 17/100
1/1 - 0s - loss: 0.4875 - acc: 1.0000 - 6ms/epoch - 6ms/step
Epoch 18/100
1/1 - 0s - loss: 0.4764 - acc: 1.0000 - 6ms/epoch - 6ms/step
Epoch 19/100
1/1 - 0s - loss: 0.4658 - acc: 1.0000 - 7ms/epoch - 7ms/step
Epoch 20/100
1/1 - 0s - loss: 0.4555 - acc: 1.0000 - 6ms/epoch - 6ms/step
Epoch 21/100
1/1 - 0s - loss: 0.4455 - acc: 1.0000 - 7ms/epoch - 7ms/step
Epoch 22/100
1/1 - 0s - loss: 0.4358 - acc: 1.0000 

<keras.callbacks.History at 0x3ddf26b80>

사전 훈련된 임베딩 파일(GloVe, Word2Vec)은 용량 문제로 모두 삭제했다. 이 노트북을 다시 실행하려면 해당 파일을 다시 내려받아야 한다.