### 임베딩 : Embedding
- 텍스트를 유사도 기준으로 수치화 ==> 밀집행렬
- 토큰화 후 커진 희소행렬을 밀집행렬로 변환 ==> 차원 축소

In [50]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import SimpleRNN, RNN, LSTM, GRU, Input, Dense, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy corpus: three short weather sentences used throughout this walkthrough.
sentences = [
    'Today is a sunny day',
    'Today is a rainy day',
    'Is it sunny today?',
]

텍스트 전처리 ==> 토큰화 / 패딩

In [3]:
# Tokenizer settings.
#   NUM_WORDS: upper bound on how many vocabulary entries the tokenizer uses
#              when encoding (the most frequent words are kept).
#   OOV:       placeholder token substituted for any word that is not in the
#              vocabulary.
NUM_WORDS = 1000
OOV = '<UNK>'

# Build the tokenizer with the settings above.
tokenizer = Tokenizer(
    num_words=NUM_WORDS,
    oov_token=OOV,
)

In [4]:
# Build the vocabulary (word -> integer index, plus per-word counts) from the
# sample sentences. This must run before any text is converted to sequences.
tokenizer.fit_on_texts(sentences)

In [5]:
# Inspect the learned vocabulary: per-word frequencies and the
# word -> integer index mapping.
tokenizer.word_counts, tokenizer.word_index

(OrderedDict([('today', 3),
              ('is', 3),
              ('a', 2),
              ('sunny', 2),
              ('day', 2),
              ('rainy', 1),
              ('it', 1)]),
 {'<UNK>': 1,
  'today': 2,
  'is': 3,
  'a': 4,
  'sunny': 5,
  'day': 6,
  'rainy': 7,
  'it': 8})

In [6]:
# Number of entries in the vocabulary, including the '<UNK>' OOV entry.
# NOTE: despite its name, `word_index` holds an int (a count), not the
# tokenizer's word_index dict. It is used later as `word_index + 1` for the
# Embedding input_dim.
word_index = len(tokenizer.word_index)
word_index

8

In [8]:
# Convert each sentence into a list of integer word ids.
# Input, for reference:
# sentences = ['Today is a sunny day',
#               'Today is a rainy day',
#               'Is it sunny today?']
# In the recorded output, 'Is' and 'is' share one id and 'today?' gets the
# same id as 'today'.
rets = tokenizer.texts_to_sequences(sentences)
rets

[[2, 3, 4, 5, 6], [2, 3, 4, 7, 6], [3, 8, 5, 2]]

In [10]:
# Sentences have different token counts; the longest one sets the length
# that every sequence will be padded to.
lengths = list(map(len, rets))
TOKEN_LENGTH = max(lengths)

In [12]:
# Pad at the end ('post') so every sequence has exactly TOKEN_LENGTH ids.
# Anything longer would also be cut at the end.
rets2 = pad_sequences(
    rets,
    maxlen=TOKEN_LENGTH,
    padding='post',
    truncating='post',
)
# Show the padded matrix together with its shape and rank.
rets2, rets2.shape, rets2.ndim

(array([[2, 3, 4, 5, 6],
        [2, 3, 4, 7, 6],
        [3, 8, 5, 2, 0]]),
 (3, 5),
 2)

### 모델에 적용
- Embedding 층의 역할 확인

In [51]:
# Start from an empty Sequential container; layers are appended with model.add().
model = Sequential()

In [45]:
# Five tokens make up one sentence ==> these are the timesteps.
# An Embedding input has one column per vocabulary word ==> reduced to 2 columns.
# Per token: a row as wide as the vocabulary, 1 X 9 =====> 1 X 2
#                   input_dim, output_dim, input_length
#                     fixed      chosen       fixed
# model.add(Embedding(word_index+1, 7, input_length=5))
# NOTE(review): Keras `Input(shape=...)` does not include the batch axis, so
# (3, 5) declares samples of 3 timesteps x 5 features, whereas `rets2` is
# (3 sentences, 5 tokens). Confirm this shape is intended before calling
# predict on rets2.
model.add(Input(shape=(3, 5)))
model.add(SimpleRNN(4))
model.add(Dense(1))

In [52]:
# Embedding: maps each of the TOKEN_LENGTH token ids in a sentence to a 7-dim
# dense vector. input_dim is word_index + 1 because word ids start at 1 and
# id 0 is used for padding.
model.add(Embedding(word_index + 1, 7, input_length=TOKEN_LENGTH))
# return_sequences=True keeps one output per timestep, so the Dense layers
# below are applied to every token position.
model.add(Bidirectional(LSTM(16, return_sequences=True)))
# Hidden layer uses ReLU. Softmax here would force the 32 activations to sum
# to 1 and squash the signal before the output layer; softmax belongs on a
# multi-class output layer only.
model.add(Dense(32, activation='relu'))
# One sigmoid unit: a probability-like score in [0, 1] per timestep.
model.add(Dense(1, activation='sigmoid'))

In [53]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 5, 7)              63        
                                                                 
 bidirectional (Bidirectiona  (None, 5, 32)            3072      
 l)                                                              
                                                                 
 dense_3 (Dense)             (None, 5, 32)             1056      
                                                                 
 dense_4 (Dense)             (None, 5, 1)              33        
                                                                 
Total params: 4,224
Trainable params: 4,224
Non-trainable params: 0
_________________________________________________________________


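🔹Embedding parameter count = output_dim * input_dim, where input_dim = vocabulary size (8) + 1 for the padding index 0 = 9. With output_dim = 2: 2 * 9 = 18. In the summary above (output_dim = 7): 7 * 9 = 63.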

In [47]:
# `loss` defaults to None in compile(), so it is passed explicitly here.
# No optimizer is given, so the Keras default is used.
model.compile(loss='mse')

In [54]:
# Binary cross-entropy pairs with the sigmoid output unit; optimize with Adam
# and report accuracy under the metric name 'acc'.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

In [48]:
# Forward pass over the padded sequences. No fit() call is visible before this
# point, so the weights are presumably still at their initial values.
out = model.predict(rets2)



In [49]:
# Shape of the model input: (number of sentences, tokens per sentence).
rets2.shape

(3, 5)

In [20]:
# Padded id matrix that was fed to predict, shown for comparison with `out`.
rets2

array([[2, 3, 4, 5, 6],
       [2, 3, 4, 7, 6],
       [3, 8, 5, 2, 0]])

In [21]:
# Shape after embedding: one vector per token, so
# (sentences, tokens, output_dim).
# NOTE(review): the recorded (3, 5, 2) presumably comes from a run where the
# model was a single Embedding layer with output_dim=2. Confirm.
out.shape

(3, 5, 2)

In [22]:
# Inspect the vectors. In the recorded output, positions holding the same
# token id in rets2 carry the same vector (for example id 2 at the start of
# the first two sentences).
out

array([[[ 0.02840031,  0.04759468],
        [ 0.00658851, -0.0191051 ],
        [ 0.00370295,  0.01637696],
        [ 0.02575673,  0.030749  ],
        [ 0.00236554,  0.00055034]],

       [[ 0.02840031,  0.04759468],
        [ 0.00658851, -0.0191051 ],
        [ 0.00370295,  0.01637696],
        [ 0.02577429,  0.03535037],
        [ 0.00236554,  0.00055034]],

       [[ 0.00658851, -0.0191051 ],
        [-0.01274029,  0.00075477],
        [ 0.02575673,  0.030749  ],
        [ 0.02840031,  0.04759468],
        [ 0.0499277 , -0.00040925]]], dtype=float32)