In [2]:
import tensorflow as tf

# Toy vocabulary: every word maps to an integer id equal to its position in this list.
vocab = {
    word: idx
    for idx, word in enumerate(
        ["i", "need", "some", "more", "coffee", "cake", "cat", "dog"]
    )
}

sentence = "i i i i need some more coffee coffee coffee"
# Replace each whitespace-separated token of the sentence above with its vocabulary id.
_input = list(map(vocab.__getitem__, sentence.split()))  # [0, 0, 0, 0, 1, 2, 3, 4, 4, 4]

vocab_size = len(vocab)   # 8

# One row per token; the column given by the token id is 1.0, all others are 0.0.
one_hot = tf.one_hot(_input, vocab_size)
print(one_hot.numpy())    # show the one-hot encoded vectors

[[1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]]


In [3]:
# Use a 2-dimensional distributed representation so the numbers are easy to read.
distribution_size = 2
# Bias-free Dense layer, so the output is just the input multiplied by the weight matrix.
linear = tf.keras.layers.Dense(distribution_size, use_bias=False)
one_hot_linear = linear(one_hot)

# Because every input row is one-hot, each output row equals the weight row
# selected by that token's id (compare the two printouts).
print("Linear Weight", linear.weights[0].numpy(), sep="\n")
print("\nOne-Hot Linear Result", one_hot_linear.numpy(), sep="\n")

Linear Weight
[[ 0.5653559  -0.5170727 ]
 [ 0.2654872   0.7272166 ]
 [ 0.5326898  -0.4164199 ]
 [ 0.6900704   0.5792124 ]
 [-0.01504093  0.6197102 ]
 [ 0.04579431  0.3707863 ]
 [-0.32532495  0.53717947]
 [-0.6155596  -0.31493938]]

One-Hot Linear Result
[[ 0.5653559  -0.5170727 ]
 [ 0.5653559  -0.5170727 ]
 [ 0.5653559  -0.5170727 ]
 [ 0.5653559  -0.5170727 ]
 [ 0.2654872   0.7272166 ]
 [ 0.5326898  -0.4164199 ]
 [ 0.6900704   0.5792124 ]
 [-0.01504093  0.6197102 ]
 [-0.01504093  0.6197102 ]
 [-0.01504093  0.6197102 ]]


In [5]:
# A batch holding one sentence made of word ids 3, 57 and 35.
words = tf.constant([[3, 57, 35]])
print("Embedding을 진행할 문장:", words.shape)

# Embedding layer covering 64 word ids, each represented by a 100-dimensional vector.
embedding_layer = tf.keras.layers.Embedding(input_dim=64, output_dim=100)
embedded_words = embedding_layer(words)

print("Embedding된 문장:", embedded_words.shape)
print("Embedding Layer의 Weight 형태:", embedding_layer.weights[0].shape)

Embedding을 진행할 문장: (1, 3)
Embedding된 문장: (1, 3, 100)
Embedding Layer의 Weight 형태: (64, 100)


## One-hot encoding 실습

In [6]:
import numpy as np

In [7]:
# Each sample pairs an input sentence (model input) with its label:
# 1 = student, 0 = anything else.
_samples = [
    ("나는 학생 입니다", 1),
    ("나는 좋은 선생님 입니다", 0),
    ("당신은 매우 좋은 선생님 입니다", 0),
]

# Split the pairs back into parallel lists of sentences and labels.
raw_inputs = [text for text, _ in _samples]
raw_labels = [label for _, label in _samples]

In [8]:
# Flatten all sentences into a single list of whitespace-separated tokens
# (duplicates are kept at this stage).
words = [token for sentence_text in raw_inputs for token in sentence_text.split()]

In [9]:
# Show the flattened token list (duplicates still included).
print(words)

['나는', '학생', '입니다', '나는', '좋은', '선생님', '입니다', '당신은', '매우', '좋은', '선생님', '입니다']


In [10]:
# Remove duplicate words. dict.fromkeys keeps only the first occurrence of each
# key, in insertion order, so the original word order is preserved.
words = list(dict.fromkeys(words))

In [11]:
# Show the de-duplicated word list.
print(words)

['나는', '학생', '입니다', '좋은', '선생님', '당신은', '매우']


In [12]:
# Ids 0 and 1 are reserved for the padding and unknown-word tokens.
word_to_id = dict(zip(("<PAD>", "<UNK>"), range(2)))

# Give every word the next free id (the current size of the mapping) and
# print the mapping after each insertion to show how it grows.
for token in words:
  word_to_id[token] = len(word_to_id)
  print(word_to_id)

{'<PAD>': 0, '<UNK>': 1, '나는': 2}
{'<PAD>': 0, '<UNK>': 1, '나는': 2, '학생': 3}
{'<PAD>': 0, '<UNK>': 1, '나는': 2, '학생': 3, '입니다': 4}
{'<PAD>': 0, '<UNK>': 1, '나는': 2, '학생': 3, '입니다': 4, '좋은': 5}
{'<PAD>': 0, '<UNK>': 1, '나는': 2, '학생': 3, '입니다': 4, '좋은': 5, '선생님': 6}
{'<PAD>': 0, '<UNK>': 1, '나는': 2, '학생': 3, '입니다': 4, '좋은': 5, '선생님': 6, '당신은': 7}
{'<PAD>': 0, '<UNK>': 1, '나는': 2, '학생': 3, '입니다': 4, '좋은': 5, '선생님': 6, '당신은': 7, '매우': 8}


In [13]:
# Inspect the (word, id) pairs of the vocabulary.
word_to_id.items()

dict_items([('<PAD>', 0), ('<UNK>', 1), ('나는', 2), ('학생', 3), ('입니다', 4), ('좋은', 5), ('선생님', 6), ('당신은', 7), ('매우', 8)])

In [14]:
# Reverse lookup table (id -> word), built by pairing each id with its word.
id_to_words = dict(zip(word_to_id.values(), word_to_id.keys()))

In [15]:
# Show the reverse (id -> word) mapping.
print(id_to_words)

{0: '<PAD>', 1: '<UNK>', 2: '나는', 3: '학생', 4: '입니다', 5: '좋은', 6: '선생님', 7: '당신은', 8: '매우'}


In [16]:
# Convert every sentence into a fixed-length sequence of word ids.
pad_id = word_to_id["<PAD>"]
unk_id = word_to_id["<UNK>"]

# Tokenise once so the padded length can be derived from the data instead of
# being hard-coded (for the three sentences in raw_inputs this is 5).
tokenized = [s.split() for s in raw_inputs]
n_seq = max((len(tokens) for tokens in tokenized), default=0)

train_inputs = []
for tokens in tokenized:
  # Words missing from the vocabulary map to <UNK> instead of raising KeyError.
  row = [word_to_id.get(w, unk_id) for w in tokens]

  # Right-pad with <PAD> so that every row has exactly n_seq entries.
  row += [pad_id] * (n_seq - len(row))
  train_inputs.append(row)

train_inputs = np.array(train_inputs)
print(train_inputs)

[[2 3 4 0 0]
 [2 5 6 4 0]
 [7 8 5 6 4]]


In [17]:
# (number of sentences, padded sequence length)
train_inputs.shape

(3, 5)

In [18]:
# Build the one-hot lookup table: a square identity matrix with one row per
# vocabulary entry, so row i is the one-hot vector for word id i.
onehot_matrix = np.identity(len(word_to_id))
print(onehot_matrix)

[[1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]


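`train_inputs` from the cells above; in the next cell each word id below is used as a row index into `onehot_matrix`:

```
[[2 3 4 0 0]
 [2 5 6 4 0]
 [7 8 5 6 4]]
```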

In [20]:
# Replace every word id with the matching row of the one-hot table; the result
# gains a trailing axis whose length is the number of rows/columns of the table.
train_onehot = np.take(onehot_matrix, train_inputs, axis=0)
print(train_onehot)

[[[0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0.]]]


In [21]:
# (number of sentences, padded sequence length, vocabulary size)
train_onehot.shape

(3, 5, 9)

In [23]:
# The position of the 1.0 along the last (one-hot) axis is the original word id.
print(train_onehot.argmax(axis=-1))

[[2 3 4 0 0]
 [2 5 6 4 0]
 [7 8 5 6 4]]


In [24]:
# Recover the word-id matrix from the one-hot tensor (argmax over the last axis).
x = train_onehot.argmax(axis=-1)

In [25]:
# Show the recovered word-id matrix.
print(x)

[[2 3 4 0 0]
 [2 5 6 4 0]
 [7 8 5 6 4]]


In [26]:
import tensorflow as tf
import tensorflow.keras.layers as L

In [27]:
# NOTE(review): despite the name, this stores the full shape tuple of
# train_onehot (recorded output: (3, 5, 9)), not per-sentence token counts —
# confirm that this is what was intended.
x_len = train_onehot.shape

In [28]:
# Show the shape tuple stored in x_len.
print(x_len)

(3, 5, 9)


In [29]:
# Word-id matrix as an int32 tensor.
inp = tf.convert_to_tensor(x, dtype=tf.int32)
# x_len is a shape tuple, so this yields a 1-D int32 tensor with one entry per dimension.
# NOTE(review): in the cells visible here inp_len is only displayed, never fed
# into a computation — confirm whether it is needed.
inp_len = tf.convert_to_tensor(x_len, dtype=tf.int32)

In [30]:
# Display both tensors.
print(inp)
print(inp_len)

tf.Tensor(
[[2 3 4 0 0]
 [2 5 6 4 0]
 [7 8 5 6 4]], shape=(3, 5), dtype=int32)
tf.Tensor([3 5 9], shape=(3,), dtype=int32)


In [31]:
# Rich repr of the same two tensors (as a tuple).
inp, inp_len

(<tf.Tensor: shape=(3, 5), dtype=int32, numpy=
 array([[2, 3, 4, 0, 0],
        [2, 5, 6, 4, 0],
        [7, 8, 5, 6, 4]], dtype=int32)>,
 <tf.Tensor: shape=(3,), dtype=int32, numpy=array([3, 5, 9], dtype=int32)>)

In [35]:
# Embedding table with 1000 rows of 5-dimensional vectors.
# Note: this rebinds vocab_size (set to 8 earlier in the notebook), and 1000 is
# larger than the 9-entry word_to_id vocabulary built above.
vocab_size, dim = 1000, 5
embed = L.Embedding(input_dim=vocab_size, output_dim=dim)

In [36]:
# Look up one embedding vector for every word id in inp.
a = embed(inp)

In [37]:
# Show the embedded batch.
print(a)

tf.Tensor(
[[[ 0.03834956 -0.02609004  0.04314781 -0.03576643  0.03387083]
  [-0.02082502 -0.02331835  0.01972257  0.00847347  0.04327509]
  [ 0.0133037   0.03124977 -0.00494809 -0.01096557  0.01424554]
  [-0.0203968   0.00360709  0.01525115  0.0487816   0.00620184]
  [-0.0203968   0.00360709  0.01525115  0.0487816   0.00620184]]

 [[ 0.03834956 -0.02609004  0.04314781 -0.03576643  0.03387083]
  [ 0.04193551 -0.0094525   0.03540405  0.00751925  0.0479377 ]
  [-0.03734121 -0.01758115  0.03752077  0.04897464 -0.03444402]
  [ 0.0133037   0.03124977 -0.00494809 -0.01096557  0.01424554]
  [-0.0203968   0.00360709  0.01525115  0.0487816   0.00620184]]

 [[ 0.01935952  0.00383105 -0.00446109 -0.0100923   0.04213479]
  [ 0.0457113  -0.04805186  0.04489099  0.0112355   0.00024242]
  [ 0.04193551 -0.0094525   0.03540405  0.00751925  0.0479377 ]
  [-0.03734121 -0.01758115  0.03752077  0.04897464 -0.03444402]
  [ 0.0133037   0.03124977 -0.00494809 -0.01096557  0.01424554]]], shape=(3, 5, 5), dtype

In [38]:
# (number of sentences, padded sequence length, embedding dimension)
a.shape

TensorShape([3, 5, 5])