# RNN

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, GRU, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Sample text corpus: three short Korean sentences, one per line.
data = "\n".join((
    "나는 오늘 기분이 좋아.",
    "나는 내일도 기분이 좋을 거야.",
    "기분이 좋은 날엔 춤을 추고 싶어.",
))

In [None]:
# 1. Tokenization: fit a vocabulary on the sentences, then convert each
#    sentence into its sequence of integer word ids.
sentences = data.split('\n')
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

In [None]:
# 2. Inspect the word index: the tokenizer's word -> integer id mapping,
#    kept in a variable because later cells size the models from its length.
word_index = tokenizer.word_index
print("단어 인덱스:", word_index)

In [None]:
# 3. Turn each tokenized sentence into training samples: every prefix of
#    length >= 2, so the last id of a prefix can serve as the prediction target.
input_sequences = [
    tokens[:end]
    for tokens in sequences
    for end in range(2, len(tokens) + 1)
]

In [None]:
# Show the prefix sequences built in the previous step.
print(input_sequences)

In [None]:
# 4. Padding: left-pad ('pre') every prefix up to the length of the longest
#    one so all samples share the same length.
max_len = max(map(len, input_sequences))
input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_len,
    padding='pre',
)

In [None]:
# 5. Split into inputs and targets: every column except the last is the
#    model input (X); the last column is the word id to predict (y).
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [None]:
# Show the padded input matrix (all columns but the last).
print(X)

In [None]:
# Show the target word ids (last column of each padded sequence).
print(y)

In [None]:
# 6. One-hot encode the targets. The class count is len(word_index) + 1, the
#    same size used for the models' softmax output layer.
num_classes = len(word_index) + 1
y = tf.keras.utils.to_categorical(y, num_classes=num_classes)

In [None]:
# RNN model definition.
# The vocabulary size is len(word_index) + 1 and is shared by the embedding
# table and the softmax output so every word id is a valid row / class.
vocab_size = len(word_index) + 1
rnn_model = Sequential([
    # Declare the input shape explicitly instead of the deprecated
    # Embedding(input_length=...) argument, which Keras 3 warns about.
    tf.keras.Input(shape=(max_len - 1,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=10),  # embedding layer
    SimpleRNN(64, return_sequences=False),  # recurrent layer, last state only
    Dense(vocab_size, activation='softmax'),  # next-word distribution
])

# Compile and train the model.
rnn_model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
rnn_model.fit(X, y, epochs=20, verbose=1)

In [None]:
# LSTM model definition.
# The vocabulary size is len(word_index) + 1 and is shared by the embedding
# table and the softmax output so every word id is a valid row / class.
vocab_size = len(word_index) + 1
lstm_model = Sequential([
    # Declare the input shape explicitly instead of the deprecated
    # Embedding(input_length=...) argument, which Keras 3 warns about.
    tf.keras.Input(shape=(max_len - 1,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=10),  # embedding layer
    LSTM(64, return_sequences=False),  # LSTM layer, last state only
    Dense(vocab_size, activation='softmax'),  # next-word distribution
])
# Compile and train the model.
lstm_model.compile(loss='categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])
lstm_model.fit(X, y, epochs=20, verbose=1)

In [None]:
# GRU model definition.
# The vocabulary size is len(word_index) + 1 and is shared by the embedding
# table and the softmax output so every word id is a valid row / class.
vocab_size = len(word_index) + 1
gru_model = Sequential([
    # Declare the input shape explicitly instead of the deprecated
    # Embedding(input_length=...) argument, which Keras 3 warns about.
    tf.keras.Input(shape=(max_len - 1,), dtype="int32"),
    Embedding(input_dim=vocab_size, output_dim=10),  # embedding layer
    GRU(64, return_sequences=False),  # GRU layer, last state only
    Dense(vocab_size, activation='softmax'),  # next-word distribution
])
# Compile and train the model.
gru_model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
gru_model.fit(X, y, epochs=20, verbose=1)

In [None]:
def generate_text(model, tokenizer, seed_text: str, next_words: int, max_len: int) -> str:
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    At each step the current text is tokenized, left-padded to
    ``max_len - 1`` ids, fed to ``model``, and the highest-scoring word id is
    mapped back to a word and appended after a single space.

    Args:
        model: Trained model whose ``predict`` returns one score per word id
            for a batch containing a single padded sequence.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` (id -> word) mapping.
        seed_text: Starting text to continue.
        next_words: Maximum number of words to append.
        max_len: Padded sequence length used in training (inputs are
            ``max_len - 1`` long because the last id was the target).

    Returns:
        The seed text followed by the generated words.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len - 1, padding='pre')
        scores = model.predict(token_list, verbose=0)
        # The batch holds one sequence, so take element 0 and make it a plain
        # int rather than comparing words against a shape-(1,) array.
        predicted = int(np.argmax(scores, axis=-1)[0])
        # Direct id -> word lookup instead of scanning word_index every step.
        word = tokenizer.index_word.get(predicted)
        if word is None:
            # No word has this id (e.g. the padding id). The text would not
            # change, so every remaining step would repeat this prediction.
            break
        seed_text += " " + word
    return seed_text

In [None]:
# Text generation demo: continue the same seed with each trained model and
# print the results in RNN, LSTM, GRU order.
seed_text = "나는 내일"
for label, trained_model in (
    ("RNN 생성 결과:", rnn_model),
    ("LSTM 생성 결과:", lstm_model),
    ("GRU 생성 결과:", gru_model),
):
    print(label, generate_text(trained_model, tokenizer, seed_text, next_words=5, max_len=max_len))