# 단어의 임베딩
- 빈도수 계산 : 빈도기반 - TF (비율) 상대빈도   
- TDM(metrix) TF를 행렬로 만든 것, 사전을 이용한 단순빈도   
- TF-IDF : TF*IDF   
- IDF : 역문서빈도 

In [1]:
text = "John likes to watch movies. Mary likes movies too. \
Mary also likes to watch football games."

words = text.replace('.','').split()
print(words)

['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too', 'Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']


In [2]:
import numpy as np
word_count = np.unique(words, return_counts=True)
word_count

(array(['John', 'Mary', 'also', 'football', 'games', 'likes', 'movies',
        'to', 'too', 'watch'], dtype='<U8'),
 array([1, 2, 1, 1, 1, 3, 2, 2, 1, 2]))

In [3]:
# 딕셔너리 TF 생성
word_to_cnt = {}
for word, cnt in zip (*word_count):
    word_to_cnt[word] = cnt
print(word_to_cnt)

{'John': 1, 'Mary': 2, 'also': 1, 'football': 1, 'games': 1, 'likes': 3, 'movies': 2, 'to': 2, 'too': 1, 'watch': 2}


In [4]:
word_to_cnt['movies']

2

In [2]:
corpus = [
"John likes to watch movies. Mary likes movies too.",
"Mary also likes to watch football games."
]

# TDM : 문장 내에 있는 단어 빈도수

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
vector = CountVectorizer()
tdm_array = vector.fit_transform(corpus).toarray()
tf_dic = vector.vocabulary_
print(tdm_array)
print(tf_dic)

[[0 0 0 1 2 1 2 1 1 1]
 [1 1 1 0 1 1 0 1 0 1]]
{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [4]:
tf_dic = vector.vocabulary_
print(vector.vocabulary_)

{'john': 3, 'likes': 4, 'to': 7, 'watch': 9, 'movies': 6, 'mary': 5, 'too': 8, 'also': 0, 'football': 1, 'games': 2}


In [6]:
import pandas as pd
tf_dic_sorted = dict(sorted(tf_dic.items(),
                            key=lambda item: item[1]))
tdm = pd.DataFrame(tdm_array, columns=tf_dic_sorted.keys())
print(tdm)

   also  football  games  john  likes  mary  movies  to  too  watch
0     0         0      0     1      2     1       2   1    1      1
1     1         1      1     0      1     1       0   1    0      1


# TF-IDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer()
tfidf_array = tfidf_vec.fit_transform(corpus).toarray()
tfidf_dic = tfidf_vec.vocabulary_
tfidf_dic_sorted = dict(sorted(tfidf_dic.items(),
                               key=lambda item: item[1]))
tfidf_dtm = pd.DataFrame(tfidf_array,
                         columns=tfidf_dic_sorted.keys())
tfidf_dtm

Unnamed: 0,also,football,games,john,likes,mary,movies,to,too,watch
0,0.0,0.0,0.0,0.323699,0.460629,0.230315,0.647398,0.230315,0.323699,0.230315
1,0.446101,0.446101,0.446101,0.0,0.317404,0.317404,0.0,0.317404,0.0,0.317404


In [9]:
from gensim.models import Word2Vec
corpus = [
"John likes to watch movies. Mary likes movies too.",
"Mary also likes to watch football games."
]
word_list = []

In [10]:
for word in corpus:
    word_list.append(word.replace('.','').split())
word_list

[['John', 'likes', 'to', 'watch', 'movies', 'Mary', 'likes', 'movies', 'too'],
 ['Mary', 'also', 'likes', 'to', 'watch', 'football', 'games']]

In [11]:
model = Word2Vec(word_list, sg=0, vector_size=100,
                 window=3, min_count=1)
model.wv.most_similar('likes', 'movies')

[('John', 0.1716446876525879),
 ('also', 0.06594576686620712),
 ('Mary', 0.008838452398777008),
 ('watch', -0.06765827536582947),
 ('games', -0.08544929325580597),
 ('football', -0.08948154002428055),
 ('too', -0.11860240250825882),
 ('to', -0.13643862307071686)]

In [12]:
model = Word2Vec(word_list, sg=1, vector_size=100,
                 window=3, min_count=1)
model.wv.most_similar('likes', 'movies')

[('John', 0.1716446876525879),
 ('also', 0.06594576686620712),
 ('Mary', 0.008853347040712833),
 ('watch', -0.06765827536582947),
 ('games', -0.08544929325580597),
 ('football', -0.08948154002428055),
 ('too', -0.11860240250825882),
 ('to', -0.13643866777420044)]

In [13]:
model = Word2Vec(word_list, sg=0, vector_size=100,
                 window=3, min_count=1)
model.wv.most_similar('John', 'Mary')

[('likes', 0.15334714949131012),
 ('football', 0.07839644700288773),
 ('also', 0.015055439434945583),
 ('too', 0.007465750444680452),
 ('movies', -0.006201200652867556),
 ('games', -0.07736954092979431),
 ('to', -0.12009605765342712),
 ('watch', -0.16032634675502777)]

In [14]:
from tensorflow.keras.datasets import imdb
(X_train, y_train),(X_test, y_test) = imdb.load_data(num_words=10000)
(X_train, y_train),(X_test, y_test)

2025-09-04 16:14:16.287776: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


((array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
         list([1, 194, 1153, 194, 8255, 78, 

In [15]:
from tensorflow.keras import Sequential, layers
model_dnn = Sequential([
    layers.Input(shape=(80,)), #80개 단어
    layers.Embedding(input_dim=10000, output_dim=32),  #(80,32)
    layers.Flatten(), #1차원으로 펼침
    layers.Dense(64, activation='relu'),
    layers.Dense(2,activation='softmax')
])


2025-09-04 16:14:24.334833: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-09-04 16:14:24.547100: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-09-04 16:14:24.547192: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-09-04 16:14:24.548265: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:984] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2025-09-04 16:14:24.548335: I external/local_xla/xla/stream_executor

In [16]:
model_dnn.summary()

In [18]:
# 텍스트데이터 전처리
from tensorflow.keras.preprocessing.sequence import pad_sequences
X_train_pad = pad_sequences(X_train, maxlen=80, truncating='post', padding='post')
X_test_pad = pad_sequences(X_test, maxlen=80, truncating='post', padding='post')

In [19]:
model_dnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_dnn.fit(X_train_pad, y_train, epochs=10, batch_size=200)

Epoch 1/10


I0000 00:00:1756970094.587574  101656 service.cc:145] XLA service 0x7a5654004760 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1756970094.587631  101656 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce RTX 3050 Laptop GPU, Compute Capability 8.6
2025-09-04 16:14:54.622140: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-09-04 16:14:54.781373: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m 59/125[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m0s[0m 3ms/step - accuracy: 0.5481 - loss: 0.6865

I0000 00:00:1756970096.749399  101656 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.6882 - loss: 0.5641
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8872 - loss: 0.2813
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9788 - loss: 0.0864
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9981 - loss: 0.0172
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9999 - loss: 0.0038
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0018
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0011
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 1.0000 - loss: 8.0040e-04
Epoch 9/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7a5731223400>

In [21]:
# RNN 모델
from tensorflow.keras import Sequential, layers
model_rnn = Sequential([
    layers.Input(shape=(80,)), #80개 단어
    layers.Embedding(input_dim=10000, output_dim=32),  #(80,32)
    #layers.Flatten(), #1차원으로 펼침
    #layers.Dense(64, activation='relu'),
    layers.SimpleRNN(64),
    layers.Dense(2,activation='softmax')
])


In [22]:
model_rnn.summary()

In [25]:
model_rnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn.fit(X_train_pad, y_train, epochs=10, batch_size=200)
model_rnn.evaluate(X_test_pad, y_test)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 23ms/step - accuracy: 0.9964 - loss: 0.0115
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.9984 - loss: 0.0064
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.9986 - loss: 0.0054
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.9971 - loss: 0.0106
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.9928 - loss: 0.0215
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.9963 - loss: 0.0111
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.9992 - loss: 0.0034
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.9950 - loss: 0.0169
Epoch 9/10
[1m125/125[0m [32m

[1.5203739404678345, 0.7264000177383423]

앞의 모형에서 아래의 내용을 바꿔보고 그 결과를 기록해 보세요.

1 옵티마이저를 sgd로 바꿔보세요. accuracy: 51.3   
2 전체 단어의 개수를 1000개로 바꿔보세요. accuracy: 77.5   
3 영화평의 길이를 200개로 바꿔보세요. accuracy: 51.9   
4 pad_sequence의 truncating과 padding을 pre로 바꿔보세요. accuracy: 80.9   
5 RNN 층(뉴런 128개)을 하나 더 추가해 보세요. accuracy: 76.9

In [28]:
# RNN 모델
from tensorflow.keras import Sequential, layers
model_rnn_1 = Sequential([
    layers.Input(shape=(200,)), #80개 단어
    layers.Embedding(input_dim=1000, output_dim=32),  #(80,32)
    #layers.Flatten(), #1차원으로 펼침
    #layers.Dense(64, activation='relu'),
    layers.SimpleRNN(64),
    layers.Dense(2,activation='softmax')
])
X_train_pad_1 = pad_sequences(X_train, maxlen=200, truncating='pre', padding='pre')
X_test_pad_1 = pad_sequences(X_test, maxlen=200, truncating='pre', padding='pre')

model_rnn_1.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn_1.fit(X_train_pad_1, y_train, epochs=10, batch_size=200)
model_rnn_1.evaluate(X_test_pad_1, y_test)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 43ms/step - accuracy: 0.5064 - loss: 0.6966
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.5133 - loss: 0.6929
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 36ms/step - accuracy: 0.5205 - loss: 0.6921
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 32ms/step - accuracy: 0.5212 - loss: 0.6914
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 34ms/step - accuracy: 0.5254 - loss: 0.6906
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.5334 - loss: 0.6898
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 36ms/step - accuracy: 0.5354 - loss: 0.6888
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 53ms/step - accuracy: 0.5415 - loss: 0.6879
Epoch 9/10
[1m125/125[0m [32m

[0.6955499053001404, 0.5169600248336792]

In [29]:
# RNN 모델
from tensorflow.keras import Sequential, layers
model_rnn_2 = Sequential([
    layers.Input(shape=(200,)), #80개 단어
    layers.Embedding(input_dim=1000, output_dim=32),  #(80,32)
    #layers.Flatten(), #1차원으로 펼침
    #layers.Dense(64, activation='relu'),
    layers.SimpleRNN(64),
    layers.Dense(2,activation='softmax')
])
X_train_pad_2 = pad_sequences(X_train, maxlen=200, truncating='post', padding='post')
X_test_pad_2 = pad_sequences(X_test, maxlen=200, truncating='post', padding='post')

model_rnn_2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_rnn_2.fit(X_train_pad_2, y_train, epochs=10, batch_size=200)
model_rnn_2.evaluate(X_test_pad_2, y_test)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 43ms/step - accuracy: 0.5042 - loss: 0.6952
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.5342 - loss: 0.6868
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.5693 - loss: 0.6690
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.6057 - loss: 0.6352
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 31ms/step - accuracy: 0.6434 - loss: 0.5880
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.6591 - loss: 0.5516
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.6873 - loss: 0.5016
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 33ms/step - accuracy: 0.6839 - loss: 0.5223
Epoch 9/10
[1m125/125[0m [32m

[0.9938990473747253, 0.5362799763679504]

# RNN 구조의 한계
# 하이퍼파라미터의 튜닝 & EDA
# CNN 으로 구성

In [38]:
model_cnn = Sequential(
    [
        layers.Input(shape=(200,)),
        layers.Embedding(input_dim=1000, output_dim=32),
        layers.Conv1D(64, 3, activation='relu'),
        layers.GlobalMaxPool1D(),
        layers.Dense(64, activation='relu'),
        layers.Dense(2,activation='softmax')
    ]
)

In [39]:
model_cnn.summary()

In [40]:
X_train_pad = pad_sequences(X_train, maxlen=200, truncating='post', padding='post')
X_test_pad  = pad_sequences(X_test, maxlen=200, truncating='post', padding='post')

In [41]:
model_cnn.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_cnn.fit(X_train_pad, y_train, epochs=10, batch_size=200)
model_cnn.evaluate(X_test_pad, y_test)

Epoch 1/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.6924 - loss: 0.5739
Epoch 2/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8281 - loss: 0.3839
Epoch 3/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8615 - loss: 0.3234
Epoch 4/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8829 - loss: 0.2828
Epoch 5/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8974 - loss: 0.2491
Epoch 6/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9139 - loss: 0.2163
Epoch 7/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9290 - loss: 0.1862
Epoch 8/10
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9384 - loss: 0.1629
Epoch 9/10
[1m125/125[0m [32m━━━━━━━━

[0.4546443223953247, 0.8416799902915955]