In [1]:
from keras.preprocessing.text import Tokenizer

#three short example reviews used as the corpus in the cells below
sentences = [
    'Taste is bad.',
    'Steak is chewy and bad.',
    'So bad and sad hate the steak',
]
s1, s2, s3 = sentences

#build the vocabulary from the corpus
#for a large corpus use Tokenizer(num_words=n): only the n-1 most frequent words are kept when texts are encoded
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# เปลี่ยนประโยค เป็น Vectors (Tokenizer)

In [2]:
#number of docs/sentences the tokenizer was fitted on
tokenizer.document_count

3

In [3]:
#number of docs/sentences in which each word appears
tokenizer.word_docs

defaultdict(int,
            {'bad': 3,
             'is': 2,
             'taste': 1,
             'steak': 2,
             'chewy': 1,
             'and': 2,
             'so': 1,
             'the': 1,
             'hate': 1,
             'sad': 1})

In [4]:
#total number of occurrences of each word across all docs
tokenizer.word_counts

OrderedDict([('taste', 1),
             ('is', 2),
             ('bad', 3),
             ('steak', 2),
             ('chewy', 1),
             ('and', 2),
             ('so', 1),
             ('sad', 1),
             ('hate', 1),
             ('the', 1)])

In [5]:
#word -> integer index mapping (indices start at 1, most frequent word first); this is what encodes the sentences
tokenizer.word_index

{'bad': 1,
 'is': 2,
 'steak': 3,
 'and': 4,
 'taste': 5,
 'chewy': 6,
 'so': 7,
 'sad': 8,
 'hate': 9,
 'the': 10}

In [7]:
#look up the index of a single word
tokenizer.word_index['bad']

1

### Text to index (Encoding)
ก่อนใช้เป็น input ต้อง encode ก่อน

In [8]:
#encode each sentence as a list of word indices
sent_encoded = tokenizer.texts_to_sequences(sentences)
sent_encoded

[[5, 2, 1], [3, 2, 6, 4, 1], [7, 1, 4, 8, 9, 10, 3]]

In [9]:
#compare the encoded sentence with the original sentence
print(s2)
tokenizer.texts_to_sequences([s2])

Steak is chewy and bad.


[[3, 2, 6, 4, 1]]

### Padding
ปกติแล้วเวลาจะ Input ข้อมูลเข้าไป ต้องมี input_shape เท่าๆกัน ดังนั้นต้อง padding เพื่อทำให้ input_shape เท่ากัน
แปลว่า Padding คือ ขั้นตอนทำ data preprocessing (เติม 0 ให้ทุกประโยคยาวเท่ากัน) ไม่ใช่ data augmentation เพราะไม่ได้สร้างข้อมูลใหม่เพิ่ม

In [10]:
#Padding the sentences
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [11]:
#length of the longest encoded sentence; used as the target length for padding
max_len = max(map(len, sent_encoded))
print(max_len)

7


In [12]:
#pad every encoded sentence to max_len
#padding='post'/'pre': where the 0s are added
#truncating='post'/'pre': which end is cut off when a sentence is longer than maxlen
sents_pad = pad_sequences(
    sent_encoded,
    maxlen=max_len,
    padding='post',
    truncating='post',
)
print(sents_pad)

[[ 5  2  1  0  0  0  0]
 [ 3  2  6  4  1  0  0]
 [ 7  1  4  8  9 10  3]]


In [13]:
#decode the padded sequences back to text (the padding 0s map to no word, so they do not appear in the result)
tokenizer.sequences_to_texts(sents_pad)

['taste is bad', 'steak is chewy and bad', 'so bad and sad hate the steak']

# เปลี่ยนคำเป็น Vectors (Word Embedding)

In [14]:
from keras.models import Sequential
from keras.layers import Embedding

vocab_size = len(tokenizer.word_index) + 1 #+1 because word_index starts at 1; index 0 is not assigned to any word
vocab_size

11

In [15]:
#Embedding arguments:
#  input_dim    = vocab_size; the +1 is needed because word_index does not start at 0,
#                 but padding introduced the index 0, which carries no meaning
#  output_dim   = embed_len, the size of each word vector; larger values give a finer
#                 representation (e.g. Google's Word2Vec uses size = 300)
#  input_length = length of each sentence/doc after processing (padded or truncated)
embed_len = 5

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_len, input_length=max_len))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary() #Param # = 55 comes from vocab_size x embed_len = 11 * 5

Metal device set to: Apple M1
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 7, 5)              55        
                                                                 
Total params: 55
Trainable params: 55
Non-trainable params: 0
_________________________________________________________________


2022-04-22 15:43:38.389920: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-04-22 15:43:38.390543: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [16]:
#run the padded sentences through the embedding layer: one vector per token position
vectors = model.predict(sents_pad)
vectors.shape #(number of docs, number of words, embed_len)

2022-04-22 15:43:38.555570: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-04-22 15:43:38.600059: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


(3, 7, 5)

In [17]:
#show every embedding vector, rounded to 3 decimals
print(vectors.round(3))

[[[-0.009 -0.038 -0.047 -0.021 -0.014]
  [-0.039 -0.009 -0.013 -0.04   0.02 ]
  [-0.013  0.002 -0.039 -0.047  0.021]
  [ 0.032 -0.036  0.043 -0.003 -0.008]
  [ 0.032 -0.036  0.043 -0.003 -0.008]
  [ 0.032 -0.036  0.043 -0.003 -0.008]
  [ 0.032 -0.036  0.043 -0.003 -0.008]]

 [[-0.014 -0.001  0.04  -0.013 -0.012]
  [-0.039 -0.009 -0.013 -0.04   0.02 ]
  [-0.046  0.045 -0.043  0.015  0.026]
  [-0.023 -0.029  0.045  0.027 -0.03 ]
  [-0.013  0.002 -0.039 -0.047  0.021]
  [ 0.032 -0.036  0.043 -0.003 -0.008]
  [ 0.032 -0.036  0.043 -0.003 -0.008]]

 [[ 0.032 -0.026 -0.011  0.026 -0.036]
  [-0.013  0.002 -0.039 -0.047  0.021]
  [-0.023 -0.029  0.045  0.027 -0.03 ]
  [ 0.003  0.011  0.046 -0.005 -0.048]
  [-0.012  0.036 -0.006  0.019  0.001]
  [-0.027  0.027  0.041 -0.016 -0.045]
  [-0.014 -0.001  0.04  -0.013 -0.012]]]


In [20]:
#vector of the word 'taste' (first word of the first sentence)
print(vectors[0][0].round(3))

[-0.009 -0.038 -0.047 -0.021 -0.014]


In [19]:
#show each embedding vector next to the word it represents
#each token id is looked up in tokenizer.index_word, so the label always belongs to the
#vector on the same row, whichever side the padding is on; ids with no word (the padding 0)
#are printed with a blank label
print('Sentence, Word, Vector')
print('---------------------------------------------------------')
for i, (token_ids, sent_vectors) in enumerate(zip(sents_pad, vectors)):
    for token_id, word_v in zip(token_ids, sent_vectors):
        word = tokenizer.index_word.get(int(token_id), '') #'' when the id has no word
        print(f'{i+1:7} {word:>8} {word_v.round(3)}')

Sentence, Word, Vector
---------------------------------------------------------
      1    taste [-0.009 -0.038 -0.047 -0.021 -0.014]
      1       is [-0.039 -0.009 -0.013 -0.04   0.02 ]
      1      bad [-0.013  0.002 -0.039 -0.047  0.021]
      1          [ 0.032 -0.036  0.043 -0.003 -0.008]
      1          [ 0.032 -0.036  0.043 -0.003 -0.008]
      1          [ 0.032 -0.036  0.043 -0.003 -0.008]
      1          [ 0.032 -0.036  0.043 -0.003 -0.008]
      2    steak [-0.014 -0.001  0.04  -0.013 -0.012]
      2       is [-0.039 -0.009 -0.013 -0.04   0.02 ]
      2    chewy [-0.046  0.045 -0.043  0.015  0.026]
      2      and [-0.023 -0.029  0.045  0.027 -0.03 ]
      2      bad [-0.013  0.002 -0.039 -0.047  0.021]
      2          [ 0.032 -0.036  0.043 -0.003 -0.008]
      2          [ 0.032 -0.036  0.043 -0.003 -0.008]
      3       so [ 0.032 -0.026 -0.011  0.026 -0.036]
      3      bad [-0.013  0.002 -0.039 -0.047  0.021]
      3      and [-0.023 -0.029  0.045  0.027 -0.03 ]
 