# 커널 : A Detailed Explanation of Keras Embedding Layer
- https://www.kaggle.com/rajmehra03/a-detailed-explanation-of-keras-embedding-layer/

In [1]:
#Ignore the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

#data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
#configure
#sets matplotlib to inline mode, which displays graphs below the corresponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#nltk
import nltk

#stop-words
from nltk.corpus import stopwords
stop_words=set(nltk.corpus.stopwords.words('english'))

#tokenizing
from nltk import word_tokenize,sent_tokenize

#keras
import keras
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input
from keras.models import Model

Using TensorFlow backend.


In [2]:
# Toy corpus: three short tongue-twister sentences, kept as individual names
# and also gathered into `corp`.
sample_text_1, sample_text_2, sample_text_3 = (
    "bitty bought a bir of butter",
    "but the bit of butter was a bit bitter",
    "so she bought some better butter to make the bitter butter better",
)

# `corp` holds the documents in order; `no_docs` is how many there are.
corp = [sample_text_1, sample_text_2, sample_text_3]
no_docs = len(corp)

In [3]:
# Size of the hashing space used by one_hot(); the same value is later used as
# the Embedding layer's input_dim, so it is defined once and reused here.
vocab_size=50

# Encode every document as a list of integer word indices.
# Each document is encoded exactly once and that same result is both stored and printed.
encod_corp=[]
for i,doc in enumerate(corp):
    encoded_doc=one_hot(doc,vocab_size)
    encod_corp.append(encoded_doc)
    print("The encoding for document",i+1," is : ",encoded_doc)

The encoding for document 1  is :  [34, 9, 6, 24, 12, 8]
The encoding for document 2  is :  [40, 48, 42, 12, 8, 18, 6, 42, 11]
The encoding for document 3  is :  [3, 28, 9, 24, 9, 8, 46, 33, 48, 11, 8, 9]


In [4]:
# Length of the longest document; needed whenever we create embeddings for the words.
# The length is measured on the encoded sequences themselves, because those are
# what gets padded afterwards. Re-tokenizing the raw text with a different
# tokenizer could give a different count and lead to truncated or over-padded documents.
# An empty corpus yields 0.
maxlen=max((len(encoded_doc) for encoded_doc in encod_corp),default=0)
print("The maximum number of words in any document is : ",maxlen)

The maximum number of words in any document is :  12


In [5]:
# To create embeddings, all documents need to have the same length, so the
# shorter ones are padded with zeros.
# padding='post' places the zeros after the real word indices, and maxlen fixes
# the length of every padded document.
pad_corp=pad_sequences(encod_corp,maxlen=maxlen,padding='post',value=0.0)
print("No of padded documents: ",len(pad_corp))

No of padded documents:  3


In [6]:
# Show each padded document, numbering the documents from 1.
for doc_no,padded_doc in enumerate(pad_corp,start=1):
    print("The padded encoding for document",doc_no," is : ",padded_doc)

The padded encoding for document 1  is :  [34  9  6 24 12  8  0  0  0  0  0  0]
The padded encoding for document 2  is :  [40 48 42 12  8 18  6 42 11  0  0  0]
The padded encoding for document 3  is :  [ 3 28  9 24  9  8 46 33 48 11  8  9]


In [7]:
# Specifying an input tensor with a per-sample shape of (no_docs, maxlen).
# NOTE(review): the name `input` shadows Python's built-in input() for the rest of the session.
# NOTE(review): no later cell visible here references this tensor (the model is
# built from `word_input`) — confirm whether it is still needed.
input=Input(shape=(no_docs,maxlen),dtype='float64')

In [8]:
# Shape of the input:
# each document is a sequence of `maxlen` word indices (12 for this corpus).
# The indices are declared as int32 because the Embedding layer uses them as
# integer row indices into its weight table.
word_input=Input(shape=(maxlen,),dtype='int32')

# Creating the embedding: every word index in [0, vocab_size) is mapped to a
# vector of 8 values, giving an output of shape (batch, maxlen, 8).
word_embedding=Embedding(input_dim=vocab_size,output_dim=8,input_length=maxlen)(word_input)

# Flatten the per-word vectors of a document into a single row of maxlen*8 values.
word_vec=Flatten()(word_embedding)

# Combine everything into a Keras model that maps padded documents to their flattened embeddings.
embed_model =Model([word_input],word_vec)

In [9]:
# Compile the model with the Adam optimizer (learning rate 1e-3), binary
# cross-entropy loss and an accuracy metric.
# NOTE(review): the cells visible here only call predict() on this model, so the
# optimizer, loss and metric are not exercised — confirm whether a later cell trains it.
embed_model.compile(optimizer=keras.optimizers.Adam(lr=1e-3),loss='binary_crossentropy',metrics=['acc']) 
# compiling the model. parameters can be tuned as always.

In [10]:
# Inspect the Embedding layer's output: print its Python type, then the object
# itself (a symbolic tensor, not concrete values).
print(type(word_embedding))
print(word_embedding)

<class 'tensorflow.python.framework.ops.Tensor'>
Tensor("embedding_1/embedding_lookup/Identity_1:0", shape=(None, 12, 8), dtype=float32)


In [11]:
# Summary of the model. summary() prints the table itself and returns None,
# so it is called directly; wrapping it in print() would add a stray "None" line.
embed_model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 12)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 12, 8)             400       
_________________________________________________________________
flatten_1 (Flatten)          (None, 96)                0         
Total params: 400
Trainable params: 400
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
# Run the padded documents through the model to obtain one flattened embedding row per document.
# NOTE(review): no fit() call is visible before this point, so these vectors
# presumably come from the Embedding layer's initial weights — confirm.
embeddings=embed_model.predict(pad_corp) #finally getting the embeddings.

In [13]:
# Show the shape of the flattened embeddings (one row per document), followed by the values.
print("Shape of embeddings : ",embeddings.shape)
print(embeddings)

Shape of embeddings :  (3, 96)
[[ 0.03011117 -0.02670218 -0.00847875  0.034897   -0.01051865 -0.02711916
  -0.02487536 -0.02674422 -0.00881178 -0.0483201  -0.00987961  0.02456332
   0.00283928 -0.01529993  0.01785861 -0.01242632 -0.03739148 -0.02144562
   0.04590155 -0.02630723 -0.0317054  -0.04067265 -0.02565771  0.0188408
   0.01338479  0.01484675 -0.00882455 -0.00779973  0.0283351  -0.04327079
   0.02083454 -0.04233762 -0.03554603  0.04314442 -0.03227742 -0.0430434
  -0.01460426 -0.03224637 -0.02712467 -0.03528814 -0.04699592 -0.02463116
  -0.00887086 -0.04108449 -0.01765037 -0.03592086 -0.02967109  0.00558911
  -0.0462851   0.00482037 -0.01747657 -0.00098497  0.03795874  0.03153331
  -0.00326921 -0.01445078 -0.0462851   0.00482037 -0.01747657 -0.00098497
   0.03795874  0.03153331 -0.00326921 -0.01445078 -0.0462851   0.00482037
  -0.01747657 -0.00098497  0.03795874  0.03153331 -0.00326921 -0.01445078
  -0.0462851   0.00482037 -0.01747657 -0.00098497  0.03795874  0.03153331
  -0.0032

In [14]:
# Restore the per-word structure: (documents, maxlen, embedding size).
# The document axis is pinned and the embedding size is inferred (-1), so this
# stays correct if the Embedding layer's output_dim is changed. With a hard-coded
# size, a different output_dim would silently produce the wrong number of documents.
embeddings=embeddings.reshape(embeddings.shape[0],maxlen,-1)
print("Shape of embeddings : ",embeddings.shape)
print(embeddings)

Shape of embeddings :  (3, 12, 8)
[[[ 0.03011117 -0.02670218 -0.00847875  0.034897   -0.01051865
   -0.02711916 -0.02487536 -0.02674422]
  [-0.00881178 -0.0483201  -0.00987961  0.02456332  0.00283928
   -0.01529993  0.01785861 -0.01242632]
  [-0.03739148 -0.02144562  0.04590155 -0.02630723 -0.0317054
   -0.04067265 -0.02565771  0.0188408 ]
  [ 0.01338479  0.01484675 -0.00882455 -0.00779973  0.0283351
   -0.04327079  0.02083454 -0.04233762]
  [-0.03554603  0.04314442 -0.03227742 -0.0430434  -0.01460426
   -0.03224637 -0.02712467 -0.03528814]
  [-0.04699592 -0.02463116 -0.00887086 -0.04108449 -0.01765037
   -0.03592086 -0.02967109  0.00558911]
  [-0.0462851   0.00482037 -0.01747657 -0.00098497  0.03795874
    0.03153331 -0.00326921 -0.01445078]
  [-0.0462851   0.00482037 -0.01747657 -0.00098497  0.03795874
    0.03153331 -0.00326921 -0.01445078]
  [-0.0462851   0.00482037 -0.01747657 -0.00098497  0.03795874
    0.03153331 -0.00326921 -0.01445078]
  [-0.0462851   0.00482037 -0.01747657 -0

In [15]:
# Walk over every document and every word position inside it (both numbered
# from 1) and print the embedding vector found at that position.
for doc_no,doc_matrix in enumerate(embeddings,start=1):
    for word_no,vector in enumerate(doc_matrix,start=1):
        print("The encoding for ",word_no,"th word","in",doc_no,"th document is : \n\n",vector)

The encoding for  1 th word in 1 th document is : 

 [ 0.03011117 -0.02670218 -0.00847875  0.034897   -0.01051865 -0.02711916
 -0.02487536 -0.02674422]
The encoding for  2 th word in 1 th document is : 

 [-0.00881178 -0.0483201  -0.00987961  0.02456332  0.00283928 -0.01529993
  0.01785861 -0.01242632]
The encoding for  3 th word in 1 th document is : 

 [-0.03739148 -0.02144562  0.04590155 -0.02630723 -0.0317054  -0.04067265
 -0.02565771  0.0188408 ]
The encoding for  4 th word in 1 th document is : 

 [ 0.01338479  0.01484675 -0.00882455 -0.00779973  0.0283351  -0.04327079
  0.02083454 -0.04233762]
The encoding for  5 th word in 1 th document is : 

 [-0.03554603  0.04314442 -0.03227742 -0.0430434  -0.01460426 -0.03224637
 -0.02712467 -0.03528814]
The encoding for  6 th word in 1 th document is : 

 [-0.04699592 -0.02463116 -0.00887086 -0.04108449 -0.01765037 -0.03592086
 -0.02967109  0.00558911]
The encoding for  7 th word in 1 th document is : 

 [-0.0462851   0.00482037 -0.0174765