# 커널 : A Detailed Explanation of Keras Embedding Layer
- https://www.kaggle.com/rajmehra03/a-detailed-explanation-of-keras-embedding-layer/

In [6]:
# Silence warnings for the rest of the notebook.
# NOTE(review): the 'always' filter is immediately followed by a blanket 'ignore'
# filter, so the first call looks redundant -- confirm and drop it.
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

# Data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# Plot configuration:
# render matplotlib figures inline, directly below the cell that produced them.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

# NLTK
import nltk

# English stop words, kept as a set.
# NOTE(review): stop_words is not referenced in the visible cells; presumably the NLTK
# 'stopwords' corpus has to be available locally for this line to work -- confirm.
from nltk.corpus import stopwords
stop_words=set(nltk.corpus.stopwords.words('english'))

# Tokenizers
from nltk import word_tokenize,sent_tokenize

# Keras: text preprocessing helpers, layers and model containers
import keras
from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense , Flatten ,Embedding,Input
from keras.models import Model

In [7]:
# Toy corpus: three short sentences that share most of their vocabulary.
# NOTE(review): "bir" in the first sentence looks like a typo for "bit"; it is kept
# unchanged because the recorded encodings below were produced from this exact text.
corp = [
    "bitty bought a bir of butter",
    "but the bit of butter was a bit bitter",
    "so she bought some better butter to make the bitter butter better",
]

# Keep the individual sentences reachable under their own names as well.
sample_text_1, sample_text_2, sample_text_3 = corp

# Number of documents in the corpus.
no_docs = len(corp)

In [8]:
# Hash-encode every document into a list of integer word ids.
# `vocab_size` is the size of the id space handed to `one_hot`; it is also reused
# later as the embedding layer's `input_dim`, so the two must stay in sync -- hence
# the variable is passed here instead of repeating the literal 50.
# NOTE(review): the ids come from hashing, so distinct words can collide -- in the
# recorded output for document 2, "bit" and "of" both map to 32.
vocab_size=50
encod_corp=[]
for i,doc in enumerate(corp):
    # Encode once and reuse the result for both storing and printing.
    encoded_doc=one_hot(doc,vocab_size)
    encod_corp.append(encoded_doc)
    print("The encoding for document",i+1," is : ",encoded_doc)

The encoding for document 1  is :  [45, 30, 37, 8, 32, 27]
The encoding for document 2  is :  [46, 31, 32, 32, 27, 43, 37, 32, 20]
The encoding for document 3  is :  [9, 18, 30, 25, 22, 27, 1, 34, 31, 20, 27, 22]


In [11]:
# Length of the longest document; every document is padded to this length before it
# is fed to the embedding layer.
# The length is measured on the encoded sequences themselves, because those are what
# gets padded. Counting tokens with a different tokenizer (nltk.word_tokenize) can
# disagree with how one_hot splits text (e.g. around punctuation or hyphens), which
# could make maxlen too small and cause documents to be truncated when padding.
# `default=0` keeps the cell from raising on an empty corpus.
maxlen=max((len(encoded_doc) for encoded_doc in encod_corp),default=0)
print("The maximum number of words in any document is : ",maxlen)

The maximum number of words in any document is :  12


In [13]:
# The embedding layer needs inputs of one fixed length, so every encoded document is
# extended to `maxlen` entries by appending zeros at the end ('post' padding).
pad_corp=pad_sequences(
    encod_corp,
    maxlen=maxlen,
    padding='post',
    value=0.0,
)
print("No of padded documents: ",len(pad_corp))

No of padded documents:  3


In [14]:
# Show each padded sequence, numbering the documents from 1.
for doc_no,padded_doc in enumerate(pad_corp,start=1):
    print("The padded encoding for document",doc_no," is : ",padded_doc)

The padded encoding for document 1  is :  [45 30 37  8 32 27  0  0  0  0  0  0]
The padded encoding for document 2  is :  [46 31 32 32 27 43 37 32 20  0  0  0]
The padded encoding for document 3  is :  [ 9 18 30 25 22 27  1 34 31 20 27 22]


In [15]:
# Declare a Keras input tensor with shape=(no_docs, maxlen).
# NOTE(review): this rebinds the built-in name `input`, and the tensor is not used by
# any of the visible cells that follow (the model is built from `word_input`) --
# confirm whether this cell is still needed.
input=Input(dtype='float64',shape=(no_docs,maxlen))

In [16]:
# Input: one padded document per sample, i.e. `maxlen` word ids
# (12 for this corpus, per the maximum length printed earlier).
word_input=Input(dtype='float64',shape=(maxlen,))

# Embedding: map each of the `maxlen` word ids to an 8-dimensional vector.
embedding_layer=Embedding(input_dim=vocab_size,output_dim=8,input_length=maxlen)
word_embedding=embedding_layer(word_input)

# Flatten the per-word vectors into a single vector per document.
word_vec=Flatten()(word_embedding)

# Wrap the graph in a Keras model: padded word ids in, flattened embeddings out.
embed_model=Model([word_input],word_vec)

In [19]:
# Compile the model with Adam (learning rate 1e-3), binary cross-entropy loss and an
# accuracy metric; these settings can be tuned as usual.
embed_model.compile(
    loss='binary_crossentropy',
    metrics=['acc'],
    optimizer=keras.optimizers.Adam(lr=1e-3),
)