In [4]:
import tensorflow as tf

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [10]:
# Sample sentence to one-hot encode (roughly: "One-hot encoding is an array in
# which only the correct element is 1 and all the others are 0").
# NOTE(review): the name `str` shadows Python's built-in `str` type; it is kept
# because the following cells pass `[str]` to the tokenizer.
str = "원 핫 인코딩이란 정답인 원소만 1이고 나머지는 모두 0인 배열입니다"

**1. keras one hot encoding 활용.**

In [12]:
# Pass the sentence held in `str` to the tokenizer's fit_on_texts method.
# The text is split on whitespace and every distinct token gets an integer
# index; the printed word_index shows those indices starting at 1.

tokenizer = Tokenizer()
tokenizer.fit_on_texts([str])

print(tokenizer.word_index)

{'원': 1, '핫': 2, '인코딩이란': 3, '정답인': 4, '원소만': 5, '1이고': 6, '나머지는': 7, '모두': 8, '0인': 9, '배열입니다': 10}


In [13]:
# At this point the tokenizer holds the index value of every word.
# texts_to_sequences is given a one-element list, so [0] picks out the index
# sequence that belongs to that single sentence.
encoded = tokenizer.texts_to_sequences([str])[0]
print(encoded)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [14]:
# Turn every index in `encoded` into a one-hot row.
# No class count is passed here; the printed result has 11 columns for the
# indices 1-10, and column 0 is never set.
onehot = tf.keras.utils.to_categorical(encoded)
print(onehot)

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [15]:
# The width of the encoding can be chosen explicitly, e.g. to leave room for
# more indices than the ones currently in use. Here every row gets 20 columns.
onehot_20 = tf.keras.utils.to_categorical(encoded, num_classes=20)
print(onehot_20)

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


**2. 단어를 기준으로 one-hot encoding code 구현**

In [16]:
import numpy as np

In [50]:
# Two sample sentences for the hand-written, word-level one-hot encoder.
# NOTE(review): this rebinds `str` (a name that shadows the built-in type)
# from a single string to a list of strings.
str = ['새로운 코드를 짜서 제대로 구현되는지 확인해보려고 합니다.','싸이의 예술이야를 듣고 감동 받았어요.']

In [51]:
# Dictionary that will hold the index assigned to each token cut out of `str`.
token_index = {}

In [54]:
# Build the vocabulary: walk over every sentence and every whitespace-separated
# token inside it.
for sample in str:
    for word in sample.split():
        print(word)
        # A token seen for the first time receives the next free index.
        # Indices start at 1, so index 0 is never assigned to any token.
        token_index.setdefault(word, len(token_index) + 1)

print(token_index)

새로운
코드를
짜서
제대로
구현되는지
확인해보려고
합니다.
싸이의
예술이야를
듣고
감동
받았어요.
{'새로운': 1, '코드를': 2, '짜서': 3, '제대로': 4, '구현되는지': 5, '확인해보려고': 6, '합니다.': 7, '싸이의': 8, '예술이야를': 9, '듣고': 10, '감동': 11, '받았어요.': 12}


In [55]:
# Show only the assigned indices (the values of the dictionary).
print(token_index.values())

dict_values([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])


In [56]:
# Allocate the all-zero array that the one-hot encoding is written into later.
max_length = 10  # at most 10 words (tokens) are encoded per sentence

# Axes: sentence, word position inside the sentence, token index.
# The last axis has (largest index + 1) slots so that the largest token index
# is itself a valid position.
results = np.zeros(
    shape=(len(str), max_length, max(token_index.values()) + 1)
)

print(results)

[[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]


In [59]:
# First three word positions of the first sentence.
print(results[0, :3])

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [60]:
# Fill in the one-hot array: results[i, j, k] = 1 means that the j-th word of
# the i-th sentence has token index k.
# enumerate() yields (position, item) pairs counted from 0, which supplies the
# sentence position i and the word position j.
for i, sample in enumerate(str):
    # Only the first max_length words of each sentence are encoded.
    for j, word in enumerate(sample.split()[:max_length]):
        # Look the word up in token_index; the stored value is its index.
        index = token_index.get(word)
        if index is None:
            # A word without an index must not be used as a subscript: NumPy
            # treats a None subscript as a new axis, so the assignment below
            # would set the entire row to 1. Leave the row all zeros instead.
            continue
        results[i, j, index] = 1.

print(results)

[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]


In [61]:
# First three word positions of the first sentence.
print(results[0, :3])

[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
