<a href="https://colab.research.google.com/github/jihyoung-lee/AI-study/blob/main/stock_news_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [54]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# 데이터
texts = ["상승", "하락", "급등세", "큰폭 하락"]
labels = [1, 0, 1, 0]
# 토크나이징 = 사전등록
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)

#시퀀스로 변경 (단어->숫자로 변경)
sequences = tokenizer.texts_to_sequences(texts)

#사전 갯수 한도
vocab_size = tokenizer.num_words if tokenizer.num_words else len(tokenizer.word_index) + 1

# 길이 맞추기(패딩)
padded = pad_sequences(sequences, padding='pre')
sequence_length = len(padded[0])

# numpy 변환
padded = np.array(padded)
labels = np.array(labels)

print("토큰화 : ", tokenizer.word_index)
print("시퀀스 : ", sequences)
print("데이터 : ", padded)
print("데이터 길이 : " ,len(padded[0]))



토큰화 :  {'하락': 1, '상승': 2, '급등세': 3, '큰폭': 4}
시퀀스 :  [[2], [1], [3], [4, 1]]
데이터 :  [[0 2]
 [0 1]
 [0 3]
 [4 1]]
데이터 길이 :  2


In [55]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

model = Sequential([
    Embedding(input_dim= vocab_size, output_dim=8, input_length=sequence_length), # 숫자 인덱스를 단어벡터로 변환해주는 층
    Flatten(), # 벡터들을 하나로 쭉 펴주는 층
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid') # 이진 분류
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(padded, labels, epochs=20)


Epoch 1/20




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.7500 - loss: 0.6899
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 0.7500 - loss: 0.6881
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.7500 - loss: 0.6864
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.7500 - loss: 0.6848
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.7500 - loss: 0.6831
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 1.0000 - loss: 0.6815
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - accuracy: 1.0000 - loss: 0.6798
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step - accuracy: 1.0000 - loss: 0.6783
Epoch 9/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms

<keras.src.callbacks.history.History at 0x79367f3d3fd0>

In [59]:
test_text = ["급등", "폭락", "금리 인상", "상승", "하락"]
test_seq = tokenizer.texts_to_sequences(test_text)
test_padded = pad_sequences(test_seq, padding='pre', maxlen=sequence_length)
predictions = model.predict(test_padded)

for text, prob in zip(test_text, predictions):
    print(f"'{text}' → 호재 확률: {prob[0]:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
'급등' → 호재 확률: 0.5015
'폭락' → 호재 확률: 0.5015
'금리 인상' → 호재 확률: 0.5015
'상승' → 호재 확률: 0.5184
'하락' → 호재 확률: 0.4866
