One-Hot Encoding
-

In [6]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence

text = '해보지 않으면 해낼 수 없다'

result = text_to_word_sequence(text)
print("\n원문\n", text)
print("\n토큰화: \n" , result)

2025-04-17 22:28:32.767970: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



원문
 해보지 않으면 해낼 수 없다

토큰화: 
 ['해보지', '않으면', '해낼', '수', '없다']


In [10]:
import numpy as np
samples = ['The cat sat on the mat.','The dog ate my homework.']

token_index = {}
for sample in samples:
    print(sample)
    for word in sample.replace('.', ' ').split():
        if word not in token_index:
            token_index[word] = len(token_index) + 1

print(token_index)

The cat sat on the mat.
The dog ate my homework.
{'The': 1, 'cat': 2, 'sat': 3, 'on': 4, 'the': 5, 'mat': 6, 'dog': 7, 'ate': 8, 'my': 9, 'homework': 10}


In [12]:
max_length = 10

results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))

for i, smaple in enumerate(samples):
    print(sample)
    for j, word in list(enumerate(sample.replace('.', ' ').split()))[:max_length]:
        index = token_index.get(word)
        results[i, j, index] = 1.
print(results)

The dog ate my homework.
The dog ate my homework.
[[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]]


In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

samples = ['The cat sat on the mat.', 'The dog ate my homework.']

#select 1000 most occuring words 
tokenizer = Tokenizer (num_words=1000)

#putting sample in tokenizer
tokenizer.fit_on_texts(samples)
sequences = tokenizer.texts_to_sequences(samples)

#one hot encoding
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

#get word index 
word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

x = to_categorical(sequences[0], num_classes=10)
print(x)
y = to_categorical(sequences[1], num_classes = 10)
print(y)
      

2025-04-18 19:26:52.993396: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Found 9 unique tokens
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


In [5]:
import numpy
import tensorflow as tf
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding

docs = ['먼저 텍스트의 각 단어를 나누어 토큰화 합니다.',
       '텍스트의 단어로 토큰화 해야 딥러닝에서 인식됩니다.',
       '토큰화 한 결과는 딥러닝에서 사용 할 수 있습니다.'
       ]

token = Tokenizer()
token.fit_on_texts(docs)

print("\n딘아 키운트: ", token.document_count)
print("\n각 단어가 몇개의 문장에 포함되어 있는가:\n", token.word_docs)
print("\n각 단어에 매겨진 인덱스 값:\n", token.word_index)


딘아 키운트:  3

각 단어가 몇개의 문장에 포함되어 있는가:
 defaultdict(<class 'int'>, {'나누어': 1, '텍스트의': 2, '토큰화': 3, '단어를': 1, '각': 1, '먼저': 1, '합니다': 1, '딥러닝에서': 2, '인식됩니다': 1, '해야': 1, '단어로': 1, '사용': 1, '있습니다': 1, '수': 1, '한': 1, '할': 1, '결과는': 1})

각 단어에 매겨진 인덱스 값:
 {'토큰화': 1, '텍스트의': 2, '딥러닝에서': 3, '먼저': 4, '각': 5, '단어를': 6, '나누어': 7, '합니다': 8, '단어로': 9, '해야': 10, '인식됩니다': 11, '한': 12, '결과는': 13, '사용': 14, '할': 15, '수': 16, '있습니다': 17}


Word Embedding
-

In [11]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
import warnings
warnings.filterwarnings('ignore')

sentences = [
    "왕은 나라를 다스리는 남자입니다.",
    "여왕은 나라를 다스리는 여자입니다.",
    "남자가 왕이 되었습니다.",
    "여자가 여왕이 되었습니다.",
    "왕과 여왕은 함께 궁전에서 살고 있습니다.",
    "남자와 여자가 파티에 참석했습니다.",
    "왕은 충성스러운 신하들을 거느립니다.",
    "여왕은 아름다운 드레스를 입고 있었습니다.",
    "남자가 여왕의 조언자가 되었습니다.",
    "여자가 왕의 자리를 노리고 있습니다."
]

tokenizer = Tokenizer(
    filters = '!"#$%&)_*+,-./:;<=>?@[\\]^_`{}~\t\n'
)

tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
word_index = tokenizer.word_index
print("단어 목록:", word_index)
padded = pad_sequences(sequences, padding='post')
print("패딩된 시퀀스:\n", padded)

vocab_size=  len(word_index) + 1
embedding_dim = 8
max_len = padded.shape[1]
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim = embedding_dim))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer ='adam', loss='binary_crossentropy')
model.fit(padded, np.ones((len(sentences), 1)), epochs=10, verbose=2)
embedding_matrix = model.layers[0].get_weights()[0]

for word in word_index:
    if "왕" in word:
        print(f"\n*  단어 '{word}'의 임베딩 벡터: ")
        print(embedding_matrix[word_index[word]])

단어 목록: {'여왕은': 1, '되었습니다': 2, '여자가': 3, '왕은': 4, '나라를': 5, '다스리는': 6, '남자가': 7, '있습니다': 8, '남자입니다': 9, '여자입니다': 10, '왕이': 11, '여왕이': 12, '왕과': 13, '함께': 14, '궁전에서': 15, '살고': 16, '남자와': 17, '파티에': 18, '참석했습니다': 19, '충성스러운': 20, '신하들을': 21, '거느립니다': 22, '아름다운': 23, '드레스를': 24, '입고': 25, '있었습니다': 26, '여왕의': 27, '조언자가': 28, '왕의': 29, '자리를': 30, '노리고': 31}
패딩된 시퀀스:
 [[ 4  5  6  9  0  0]
 [ 1  5  6 10  0  0]
 [ 7 11  2  0  0  0]
 [ 3 12  2  0  0  0]
 [13  1 14 15 16  8]
 [17  3 18 19  0  0]
 [ 4 20 21 22  0  0]
 [ 1 23 24 25 26  0]
 [ 7 27 28  2  0  0]
 [ 3 29 30 31  8  0]]
Epoch 1/10
1/1 - 2s - 2s/step - loss: 0.7072
Epoch 2/10
1/1 - 0s - 46ms/step - loss: 0.7016
Epoch 3/10
1/1 - 0s - 75ms/step - loss: 0.6961
Epoch 4/10
1/1 - 0s - 44ms/step - loss: 0.6907
Epoch 5/10
1/1 - 0s - 103ms/step - loss: 0.6853
Epoch 6/10
1/1 - 0s - 205ms/step - loss: 0.6799
Epoch 7/10
1/1 - 0s - 64ms/step - loss: 0.6746
Epoch 8/10
1/1 - 0s - 53ms/step - loss: 0.6693
Epoch 9/10
1/1 - 0s - 58ms/step - loss: 0.6640
E

텍스트를 읽고 긍정, 부정 예측하기
-

In [14]:
from numpy import array

docs = ["너무 재밌네요","최고예요","참 잘 만든 영화예요","추천하고 싶은 영화입니다","한번 더 보고싶네요","글쎄요","별로예요","생각보다 지루하네요","연기가 어색해요","재미없어요"]

classes = array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

from tensorflow.keras.preprocessing.text import Tokenizer
token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)
x = token.texts_to_sequences(docs)
print("\n리뷰 텍스트, 토큰화 결과:\n", x)

from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_x = pad_sequences(x, 4)
print("\n패딩 결과: \n", padded_x)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding

print("\n딥러닝 모델 시작: ")
word_size = len(token.word_index) +1
model = Sequential()
model.add(Embedding(word_size, 8, input_length=4))
#embedding_layer = Embedding(word_size, 8, input_length=4)
#model.add(embedding_layer)

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(padded_x, classes, epochs=20)
print("\n Accuracy: %.4f" % (model.evaluate(padded_x, classes)[1]))

{'너무': 1, '재밌네요': 2, '최고예요': 3, '참': 4, '잘': 5, '만든': 6, '영화예요': 7, '추천하고': 8, '싶은': 9, '영화입니다': 10, '한번': 11, '더': 12, '보고싶네요': 13, '글쎄요': 14, '별로예요': 15, '생각보다': 16, '지루하네요': 17, '연기가': 18, '어색해요': 19, '재미없어요': 20}

리뷰 텍스트, 토큰화 결과:
 [[1, 2], [3], [4, 5, 6, 7], [8, 9, 10], [11, 12, 13], [14], [15], [16, 17], [18, 19], [20]]

패딩 결과: 
 [[ 0  0  1  2]
 [ 0  0  0  3]
 [ 4  5  6  7]
 [ 0  8  9 10]
 [ 0 11 12 13]
 [ 0  0  0 14]
 [ 0  0  0 15]
 [ 0  0 16 17]
 [ 0  0 18 19]
 [ 0  0  0 20]]

딥러닝 모델 시작: 
Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.5000 - loss: 0.6882   
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.6000 - loss: 0.6860
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.7000 - loss: 0.6839
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step - accuracy: 0.8000 - loss: 0.6818
Epoch 5/20
[1m1/1[0m [32m━━

In [20]:
#임베딩 벡터 확인 및 유사도 측정
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

embedding_matrix = model.layers[0].get_weights()[0]
print("\n[단어 임베딩 행령 shape]:", embedding_matrix.shape)

def print_word_vector(word):
    idx = token.word_index[word]
    print(f"단어 '{word}'의 임베딩 벡터:")
    print(embedding_matrix[idx])

def similarity(word1, word2):
    vec1 = embedding_matrix[token.word_index[word1]].reshape(1, -1)
    vec2 = embedding_matrix[token.word_index[word2]].reshape(1, -1)
    sim = cosine_similarity(vec1, vec2)[0][0]
    print(f"'{word1}' 와 '{word2}' 의 코사인 유사도: {sim:.4f}")

print_word_vector("최고예요")
print_word_vector("지루하네요")

similarity("최고예요", "재밌네요") #pos - pos
similarity("지루하네요", "재미없어요") #neg - neg
similarity("최고예요", "지루하네요") #pos-neg


[단어 임베딩 행령 shape]: (21, 8)
단어 '최고예요'의 임베딩 벡터:
[-0.06290983 -0.01066267 -0.0399123   0.06911495 -0.02837222  0.03329787
 -0.01973979  0.00547971]
단어 '지루하네요'의 임베딩 벡터:
[ 0.04863785 -0.03293007  0.03272315 -0.06564949  0.02040491 -0.0581199
 -0.03901257  0.01794027]
'최고예요' 와 '재밌네요' 의 코사인 유사도: -0.1104
'지루하네요' 와 '재미없어요' 의 코사인 유사도: 0.4976
'최고예요' 와 '지루하네요' 의 코사인 유사도: -0.7501


In [24]:
#새 리뷰 예측
new_reviews = ["참 재밌네요", "별로였어요", "잘 만든 영화입니다", "지루하네요 재미없어요"]
new_sequences = token.texts_to_sequences(new_reviews)
padded_new_sequences = pad_sequences(new_sequences, 4)

predictions = model.predict(padded_new_sequences)

for review, prediction in zip(new_reviews, predictions):
    print(f"리뷰: {review} -> 예측: {'긍정' if prediction > 0.5 else '부정'} (확률: {prediction[0]:.4f})")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
리뷰: 참 재밌네요 -> 예측: 부정 (확률: 0.4942)
리뷰: 별로였어요 -> 예측: 부정 (확률: 0.4700)
리뷰: 잘 만든 영화입니다 -> 예측: 긍정 (확률: 0.5203)
리뷰: 지루하네요 재미없어요 -> 예측: 부정 (확률: 0.4862)


토크나이저, 모델저장
-

In [28]:
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding
import pickle

docs = ["너무 재밌네요","최고예요","참 잘 만든 영화예요","추천하고 싶은 영화입니다","한번 더 보고싶네요","글쎄요","별로예요","생각보다 지루하네요","연기가 어색해요","재미없어요"]

classes = array([1, 1, 1, 1, 1, 0, 0, 0, 0, 0])

token = Tokenizer()
token.fit_on_texts(docs)
print(token.word_index)

with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(token, handle, protocol=pickle.HIGHEST_PROTOCOL)

x = token.texts_to_sequences(docs)
print("\n리뷰 텍스트, 토큰화 결과:\n", x)
padded_x = pad_sequences(x, 4)
print("\n패딩 결과: \n", padded_x)

print("\n딥러닝 모델 시작: ")
word_size = len(token.word_index) +1
model = Sequential()
embedding_layer = Embedding(word_size, 2, input_length=4)
model.add(embedding_layer)
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(padded_x, classes, epochs=20)
print("\n Accuracy: %.4f" % (model.evaluate(padded_x, classes)[1]))

model.save('sentiment_model.keras')

{'너무': 1, '재밌네요': 2, '최고예요': 3, '참': 4, '잘': 5, '만든': 6, '영화예요': 7, '추천하고': 8, '싶은': 9, '영화입니다': 10, '한번': 11, '더': 12, '보고싶네요': 13, '글쎄요': 14, '별로예요': 15, '생각보다': 16, '지루하네요': 17, '연기가': 18, '어색해요': 19, '재미없어요': 20}

리뷰 텍스트, 토큰화 결과:
 [[1, 2], [3], [4, 5, 6, 7], [8, 9, 10], [11, 12, 13], [14], [15], [16, 17], [18, 19], [20]]

패딩 결과: 
 [[ 0  0  1  2]
 [ 0  0  0  3]
 [ 4  5  6  7]
 [ 0  8  9 10]
 [ 0 11 12 13]
 [ 0  0  0 14]
 [ 0  0  0 15]
 [ 0  0 16 17]
 [ 0  0 18 19]
 [ 0  0  0 20]]

딥러닝 모델 시작: 
Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 989ms/step - accuracy: 0.3000 - loss: 0.7025
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 62ms/step - accuracy: 0.3000 - loss: 0.7014
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.3000 - loss: 0.7002
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.3000 - loss: 0.6990
Epoch 5/20
[1m1/1[0m [32m━━

In [30]:
#model load
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import pickle

with open('tokenizer.pickle', 'rb') as handle:
    token = pickle.load(handle)

model = load_model('sentiment_model.keras')

new_reviews = ["너무 재밌네요", "나는 별로예요", "잘 만든 영화입니다", "지루하네요 재미없어요"]
new_sequences = token.texts_to_sequences(new_reviews)
print(new_sequences)
padded_new_sequences = pad_sequences(new_sequences, 4)

predictions = model.predict(padded_new_sequences)

for review, predictions in zip(new_reviews, predictions):
    print(f"리뷰: {review} -> 예측: {'긍정' if prediction > 0.5 else '부정'} (확률: {prediction[0]:.4f})")

[[1, 2], [15], [5, 6, 10], [17, 20]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
리뷰: 너무 재밌네요 -> 예측: 부정 (확률: 0.4862)
리뷰: 나는 별로예요 -> 예측: 부정 (확률: 0.4862)
리뷰: 잘 만든 영화입니다 -> 예측: 부정 (확률: 0.4862)
리뷰: 지루하네요 재미없어요 -> 예측: 부정 (확률: 0.4862)
