# 문자 단위 RNN 언어 모델

In [86]:
import numpy as np
from urllib import request
from tensorflow.keras.utils import to_categorical

In [87]:
request.urlretrieve("http://www.gutenberg.org/files/11/11-0.txt", filename="11-0.txt")

sentences = []
with open("11-0.txt") as f:
    for sentence in f:
        # 공백, 바이트 열 제거 및 소문자화
        sentence = sentence.strip().lower().encode().decode("ascii", "ignore")
        if len(sentence) > 0:
            sentences.append(sentence)

In [88]:
sentences[:5]

['the project gutenberg ebook of alices adventures in wonderland, by lewis carroll',
 'this ebook is for the use of anyone anywhere in the united states and',
 'most other parts of the world at no cost and with almost no restrictions',
 'whatsoever. you may copy it, give it away or re-use it under the terms',
 'of the project gutenberg license included with this ebook or online at']

In [89]:
total_data = " ".join(sentences)
char_vocab = sorted(list(set(total_data)))
vocab_size = len(char_vocab)

len(total_data), vocab_size

(159484, 56)

In [90]:
char_to_idx = {char: idx for idx, char in enumerate(char_vocab)}
idx_to_char = {idx: char for char, idx in char_to_idx.items()}

In [91]:
# 훈련 데이터 구성
seq_len = 60 # 한 문장의 길이를 60으로 설정
n_samples = int(np.floor((len(total_data) - 1) / seq_len))

n_samples

2658

In [92]:
X_train = []
y_train = []

for i in range(n_samples):
    X_sample = total_data[i * seq_len:(i + 1) * seq_len]
    X_encoded = [char_to_idx[char] for char in X_sample] # 정수 인코딩
    X_train.append(X_encoded)

    y_sample = total_data[i * seq_len + 1:(i + 1) * seq_len + 1] # 한 문자 shift
    y_encoded = [char_to_idx[char] for char in y_sample]
    y_train.append(y_encoded)

In [93]:
# 문자 단위 RNN에서는 Word Embedding을 사용하지 않고 OHE를 사용
X_train_ohe = to_categorical(X_train)
y_train_ohe = to_categorical(y_train)

X_train_ohe.shape, y_train_ohe.shape # batch size: 2658, timestpes: 60, input_dim: 56

((2658, 60, 56), (2658, 60, 56))

In [94]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed

In [95]:
# Modeling
hiddent_size = 256

model = Sequential()
model.add(LSTM(hiddent_size, input_shape=(None, X_train_ohe.shape[2]), return_sequences=True))
model.add(LSTM(hiddent_size, return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size, activation="softmax")))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(X_train_ohe, y_train_ohe, epochs=80, verbose=2)

Epoch 1/80


2023-07-18 20:29:50.389069: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-07-18 20:29:50.688806: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-07-18 20:29:50.875779: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-07-18 20:29:51.130384: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-07-18 20:29:51.401864: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


84/84 - 7s - loss: 3.0593 - accuracy: 0.1826 - 7s/epoch - 81ms/step
Epoch 2/80
84/84 - 4s - loss: 2.6972 - accuracy: 0.2579 - 4s/epoch - 42ms/step
Epoch 3/80
84/84 - 4s - loss: 2.3825 - accuracy: 0.3290 - 4s/epoch - 44ms/step
Epoch 4/80
84/84 - 4s - loss: 2.2483 - accuracy: 0.3605 - 4s/epoch - 43ms/step
Epoch 5/80
84/84 - 4s - loss: 2.1442 - accuracy: 0.3847 - 4s/epoch - 44ms/step
Epoch 6/80
84/84 - 4s - loss: 2.0639 - accuracy: 0.4057 - 4s/epoch - 45ms/step
Epoch 7/80
84/84 - 4s - loss: 2.0037 - accuracy: 0.4221 - 4s/epoch - 46ms/step
Epoch 8/80
84/84 - 4s - loss: 1.9405 - accuracy: 0.4392 - 4s/epoch - 46ms/step
Epoch 9/80
84/84 - 4s - loss: 1.8908 - accuracy: 0.4517 - 4s/epoch - 47ms/step
Epoch 10/80
84/84 - 4s - loss: 1.8488 - accuracy: 0.4628 - 4s/epoch - 48ms/step
Epoch 11/80
84/84 - 4s - loss: 1.8040 - accuracy: 0.4727 - 4s/epoch - 48ms/step
Epoch 12/80
84/84 - 4s - loss: 1.7634 - accuracy: 0.4847 - 4s/epoch - 48ms/step
Epoch 13/80
84/84 - 4s - loss: 1.7276 - accuracy: 0.4957 - 4

<keras.callbacks.History at 0x2c363d490>

In [107]:
def generate_sentence(model, length):
    """특정 문자를 전달 받아 다음 문자를 계속 생성하는 함수"""
    idx = [np.random.randint(vocab_size)]
    y_char = [idx_to_char[idx[-1]]]
    print(f"{idx[-1]}번 문자 {y_char[0]}로 예측 시작")
    X = np.zeros((1, length, vocab_size)) # LSTM의 입력 시퀀스

    for i in range(length):
        X[0][i][idx] = 1 # 예측 문자의 인덱스를 입력 시퀀스에 추가
        print(idx_to_char[idx[-1]], end="")
        idx = np.argmax(model.predict(X[:, :i + 1, :])[0], axis=1)
        y_char.append(idx_to_char[idx[-1]])
    return "".join(y_char)

res = generate_sentence(model, 100)

res

51번 문자 v로 예측 시작


've raglgviyaaaeiiyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy'

## 다대일 RNN

In [129]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [130]:
raw_text = '''
I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''

In [131]:
# 하나의 문자열로
tokens = raw_text.split()
raw_text = " ".join(tokens)

raw_text

"I get on with life as a programmer, I like to contemplate beer. But when I start to daydream, My mind turns straight to wine. Do I love wine more than beer? I like to use words about beer. But when I stop my talking, My mind turns straight to wine. I hate bugs and errors. But I just think back to wine, And I'm happy once again. I like to hang out with programming and deep learning. But when left alone, My mind turns straight to wine."

In [132]:
# 중복을 제거한 문자 집합 생성
char_vocab = sorted(list(set(raw_text)))
vocab_size = len(char_vocab)

print(char_vocab)
print(vocab_size)

[' ', "'", ',', '.', '?', 'A', 'B', 'D', 'I', 'M', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']
33


In [133]:
char_to_idx = {char: idx for idx, char in enumerate(char_vocab)}

print(char_to_idx)

{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}


In [134]:
# timesteps = 10
length = 11
sequences = []

for i in range(length, len(raw_text)):
    seq = raw_text[i - length: i] 
    sequences.append(seq)

sequences[:10]

['I get on wi',
 ' get on wit',
 'get on with',
 'et on with ',
 't on with l',
 ' on with li',
 'on with lif',
 'n with life',
 ' with life ',
 'with life a']

In [135]:
encoded_sequences = []

for seq in sequences:
    encoded_sequence = [char_to_idx[char] for char in seq]
    encoded_sequences.append(encoded_sequence)

encoded_sequences[:5]

[[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18],
 [0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28],
 [16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17],
 [14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0],
 [28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21]]

In [136]:
encoded_sequences = np.array(encoded_sequences)
X_train = encoded_sequences[:, :-1]
y_train = encoded_sequences[:, -1]

X_train[:5]

array([[ 8,  0, 16, 14, 28,  0, 24, 23,  0, 31],
       [ 0, 16, 14, 28,  0, 24, 23,  0, 31, 18],
       [16, 14, 28,  0, 24, 23,  0, 31, 18, 28],
       [14, 28,  0, 24, 23,  0, 31, 18, 28, 17],
       [28,  0, 24, 23,  0, 31, 18, 28, 17,  0]])

In [137]:
# OHE
X_train_ohe = np.array([to_categorical(X, num_classes=vocab_size) for X in X_train])
y_train_ohe = to_categorical(y_train, num_classes=vocab_size)

In [138]:
X_train_ohe.shape

(426, 10, 33)

In [139]:
# Modeling
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

hidden_units = 64

model = Sequential()
model.add(LSTM(hidden_units, input_shape=(X_train_ohe.shape[1], X_train_ohe.shape[2])))
model.add(Dense(vocab_size, activation="softmax"))

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
model.fit(X_train_ohe, y_train_ohe, epochs=100, verbose=2)

Epoch 1/100


2023-07-19 19:49:33.340695: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-07-19 19:49:33.487582: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-07-19 19:49:33.598965: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


14/14 - 1s - loss: 3.4621 - accuracy: 0.1338 - 1s/epoch - 97ms/step
Epoch 2/100
14/14 - 0s - loss: 3.3339 - accuracy: 0.1972 - 264ms/epoch - 19ms/step
Epoch 3/100
14/14 - 0s - loss: 3.0535 - accuracy: 0.1972 - 242ms/epoch - 17ms/step
Epoch 4/100
14/14 - 0s - loss: 2.9835 - accuracy: 0.1972 - 248ms/epoch - 18ms/step
Epoch 5/100
14/14 - 0s - loss: 2.9585 - accuracy: 0.1972 - 234ms/epoch - 17ms/step
Epoch 6/100
14/14 - 0s - loss: 2.9451 - accuracy: 0.1972 - 226ms/epoch - 16ms/step
Epoch 7/100
14/14 - 0s - loss: 2.9257 - accuracy: 0.1972 - 285ms/epoch - 20ms/step
Epoch 8/100
14/14 - 0s - loss: 2.9123 - accuracy: 0.1972 - 273ms/epoch - 20ms/step
Epoch 9/100
14/14 - 0s - loss: 2.9043 - accuracy: 0.1972 - 252ms/epoch - 18ms/step
Epoch 10/100
14/14 - 0s - loss: 2.8707 - accuracy: 0.1972 - 257ms/epoch - 18ms/step
Epoch 11/100
14/14 - 0s - loss: 2.8429 - accuracy: 0.1995 - 224ms/epoch - 16ms/step
Epoch 12/100
14/14 - 0s - loss: 2.8223 - accuracy: 0.2019 - 256ms/epoch - 18ms/step
Epoch 13/100
14/

<keras.callbacks.History at 0x2c948e460>

In [141]:
def generate_sentence(model, char_to_idx, seq_len, seed_text, n):
    """문자열을 입력 받아 해당 문자열로부터 다음 문자를 예측하는 것을 반복하여 그 문장을 반환하는 함수"""
    init_text = seed_text
    sentence = ""

    for _ in range(n): # n번만 예측
        encoded = [char_to_idx[char] for char in seed_text] # 정수 인코딩
        encoded = pad_sequences([encoded], maxlen=seq_len, padding="pre") # 패딩
        encoded = to_categorical(encoded, num_classes=len(char_to_idx)) # OHE

        res = model.predict(encoded, verbose=0)
        res = np.argmax(res, axis=1)

        for char, idx in char_to_idx.items():
            if idx == res:
                break
        
        seed_text = seed_text + char
        sentence = sentence + char
    sentence = init_text + sentence
    return sentence

print(generate_sentence(model, char_to_idx, 10, "I get on w", 80))

2023-07-19 19:51:18.510470: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-07-19 19:51:18.565965: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


I get on with life as a programmer, I like to use words about beer. But when I stapt to aa
