In [10]:
import numpy as np
from tensorflow.keras.utils import to_categorical

In [11]:
# Training corpus: a short multi-line poem kept as one string literal
# (line breaks and blank lines included), echoed so the raw form is visible.
raw_text = '''
I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.
'''
print(raw_text)


I get on with life as a programmer,
I like to contemplate beer.
But when I start to daydream,
My mind turns straight to wine.

Do I love wine more than beer?

I like to use words about beer.
But when I stop my talking,
My mind turns straight to wine.

I hate bugs and errors.
But I just think back to wine,
And I'm happy once again.

I like to hang out with programming and deep learning.
But when left alone,
My mind turns straight to wine.



In [12]:
# Remove the paragraph structure from the string: split() with no arguments
# breaks on any run of whitespace (spaces, line breaks, blank lines), and the
# pieces are joined back with single spaces, giving one continuous line.
tokens = raw_text.split()
raw_text = " ".join(tokens)

print(raw_text)


I get on with life as a programmer, I like to contemplate beer. But when I start to daydream, My mind turns straight to wine. Do I love wine more than beer? I like to use words about beer. But when I stop my talking, My mind turns straight to wine. I hate bugs and errors. But I just think back to wine, And I'm happy once again. I like to hang out with programming and deep learning. But when left alone, My mind turns straight to wine.


In [13]:
# Character vocabulary: every distinct character of the text, in sorted order
# so that the index assigned to each character is reproducible.
char_vocab = sorted(set(raw_text))
vocab_size = len(char_vocab)
print('문자 집합 :', char_vocab)
print(f'문자 집합의 크기 : {vocab_size}')

문자 집합 : [' ', "'", ',', '.', '?', 'A', 'B', 'D', 'I', 'M', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y']
문자 집합의 크기 : 33


In [14]:
# Give each character a unique integer id: its position in the sorted vocabulary.
char_to_index = {char: index for index, char in enumerate(char_vocab)}
print(char_to_index)

{' ': 0, "'": 1, ',': 2, '.': 3, '?': 4, 'A': 5, 'B': 6, 'D': 7, 'I': 8, 'M': 9, 'a': 10, 'b': 11, 'c': 12, 'd': 13, 'e': 14, 'f': 15, 'g': 16, 'h': 17, 'i': 18, 'j': 19, 'k': 20, 'l': 21, 'm': 22, 'n': 23, 'o': 24, 'p': 25, 'r': 26, 's': 27, 't': 28, 'u': 29, 'v': 30, 'w': 31, 'y': 32}


In [15]:
# Length of the text in characters (shown as the cell's output).
len(raw_text)

437

In [19]:
# Slide a fixed-size window over the text one character at a time and keep
# every window as a sample: characters 0-10, then 1-11, then 2-12, and so on.
# NOTE(review): range() stops at len(raw_text) - 1, so the last window ends one
# character before the end of the text and the final character is never part
# of any sample. Use len(raw_text) + 1 as the stop value to include it.
length = 11
sequences = [raw_text[end - length:end] for end in range(length, len(raw_text))]

print('총 훈련 샘플의 수 :', len(sequences))

총 훈련 샘플의 수 : 426


In [22]:
# Inspect ten consecutive samples to confirm each window is shifted by one character.
sequences[80:90]

[' to daydrea',
 'to daydream',
 'o daydream,',
 ' daydream, ',
 'daydream, M',
 'aydream, My',
 'ydream, My ',
 'dream, My m',
 'ream, My mi',
 'eam, My min']

In [24]:
# 정수 인코딩 진행하기
encoded_sequences = []
for sequence in sequences: # 전체 데이터에서 문장 샘플을 1개씩 꺼낸다.
    encoded_sequence = [char_to_index[char] for char in sequence] # 문장 샘플에서 각 문자에 대해서 정수 인코딩을 수행.
    encoded_sequences.append(encoded_sequence)

In [25]:
# Show the first five integer-encoded samples.
encoded_sequences[:5]

[[8, 0, 16, 14, 28, 0, 24, 23, 0, 31, 18],
 [0, 16, 14, 28, 0, 24, 23, 0, 31, 18, 28],
 [16, 14, 28, 0, 24, 23, 0, 31, 18, 28, 17],
 [14, 28, 0, 24, 23, 0, 31, 18, 28, 17, 0],
 [28, 0, 24, 23, 0, 31, 18, 28, 17, 0, 21]]

In [27]:
# Convert the list of encoded samples to an array so it can be sliced by column.
encoded_sequences = np.array(encoded_sequences)

# Inputs are all positions except the last one; the target is the character
# at the last position of each sample.
X_data, y_data = encoded_sequences[:, :-1], encoded_sequences[:, -1]

In [32]:
# Preview the first five inputs and their targets (separated by a ruler line),
# followed by the vocabulary size.
print(X_data[:5], "=============================", y_data[:5], sep="\n")
print(vocab_size)

[[ 8  0 16 14 28  0 24 23  0 31]
 [ 0 16 14 28  0 24 23  0 31 18]
 [16 14 28  0 24 23  0 31 18 28]
 [14 28  0 24 23  0 31 18 28 17]
 [28  0 24 23  0 31 18 28 17  0]]
[18 28 17  0 21]
33


In [34]:
# One-hot encoding of the targets and of every input position, using the
# vocabulary size as the number of classes.
y_data_one_hot = to_categorical(y_data, num_classes=vocab_size)
X_data_one_hot = np.array(
    [to_categorical(row, num_classes=vocab_size) for row in X_data]
)

In [35]:
# Shape of the one-hot inputs: (samples, characters per input, vocabulary size).
print(X_data_one_hot.shape)

(426, 10, 33)


In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Network: a single LSTM layer reads the one-hot encoded character window and
# a softmax layer scores every vocabulary character as the next character.
hidden_units = 64

model = Sequential([
    LSTM(hidden_units, input_shape=X_data_one_hot.shape[1:]),
    Dense(vocab_size, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# NOTE(review): no random seed is set in this cell, so the loss/accuracy trace
# may differ from run to run.
model.fit(X_data_one_hot, y_data_one_hot, epochs=100, verbose=2)

2023-03-15 03:42:22.665463: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: /lib/x86_64-linux-gnu/libcuda.so.1: file too short; LD_LIBRARY_PATH: /usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-03-15 03:42:22.665854: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-15 03:42:22.666017: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (f7f8d14d730b): /proc/driver/nvidia/version does not exist
2023-03-15 03:42:22.668223: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/100
14/14 - 2s - loss: 3.4698 - accuracy: 0.1221 - 2s/epoch - 109ms/step
Epoch 2/100
14/14 - 0s - loss: 3.3682 - accuracy: 0.1972 - 115ms/epoch - 8ms/step
Epoch 3/100
14/14 - 0s - loss: 3.1344 - accuracy: 0.1972 - 110ms/epoch - 8ms/step
Epoch 4/100
14/14 - 0s - loss: 2.9971 - accuracy: 0.1972 - 98ms/epoch - 7ms/step
Epoch 5/100
14/14 - 0s - loss: 2.9638 - accuracy: 0.1972 - 96ms/epoch - 7ms/step
Epoch 6/100
14/14 - 0s - loss: 2.9433 - accuracy: 0.1972 - 95ms/epoch - 7ms/step
Epoch 7/100
14/14 - 0s - loss: 2.9187 - accuracy: 0.1972 - 100ms/epoch - 7ms/step
Epoch 8/100
14/14 - 0s - loss: 2.9050 - accuracy: 0.1972 - 96ms/epoch - 7ms/step
Epoch 9/100
14/14 - 0s - loss: 2.8785 - accuracy: 0.1972 - 90ms/epoch - 6ms/step
Epoch 10/100
14/14 - 0s - loss: 2.8514 - accuracy: 0.1972 - 88ms/epoch - 6ms/step
Epoch 11/100
14/14 - 0s - loss: 2.8272 - accuracy: 0.1972 - 98ms/epoch - 7ms/step
Epoch 12/100
14/14 - 0s - loss: 2.7903 - accuracy: 0.1972 - 95ms/epoch - 7ms/step
Epoch 13/100
14/14 - 0

<keras.callbacks.History at 0x7fa1ac22d790>

In [38]:
def sentence_generation(model, char_to_index, seq_length, seed_text, n):
    """Extend ``seed_text`` by ``n`` characters predicted one at a time by ``model``.

    At every step the text produced so far is integer-encoded, cut or
    left-padded to ``seq_length`` positions, one-hot encoded and passed to
    ``model.predict``. The character with the highest score is appended and
    becomes part of the input of the next step (greedy decoding).

    Args:
        model: Object whose ``predict(x, verbose=0)`` returns one row of scores
            per input sample, one score per vocabulary index.
        char_to_index: Mapping from character to integer index.
        seq_length: Number of characters fed to the model at each step.
        seed_text: Starting text; every character must be a key of
            ``char_to_index``.
        n: Number of characters to generate.

    Returns:
        ``seed_text`` followed by the ``n`` generated characters.

    Raises:
        KeyError: If ``seed_text`` contains a character that is not in
            ``char_to_index`` (only checked when ``n`` is positive).
        ValueError: If the model predicts an index that has no character in
            ``char_to_index``.
    """
    # Nothing to generate: hand the seed back untouched.
    if n <= 0:
        return seed_text

    vocab_size = len(char_to_index)

    # Inverse lookup built once instead of scanning the dict for every
    # predicted character. setdefault keeps the first character seen for an
    # index, matching a front-to-back scan of char_to_index.
    index_to_char = {}
    for char, index in char_to_index.items():
        index_to_char.setdefault(index, char)

    # Encode the seed once; predicted indices are appended as they are produced.
    encoded = [char_to_index[char] for char in seed_text]
    generated = []

    for _ in range(n):
        # Keep only the most recent seq_length indices. A shorter input is
        # left-padded with pad_sequences' default fill value (0 per the Keras
        # docs), so padding shares its id with the character mapped to 0.
        window = pad_sequences([encoded], maxlen=seq_length, padding='pre')
        one_hot = to_categorical(window, num_classes=vocab_size)

        # A single sample goes in, so a single row of scores comes back;
        # take the position of its highest score as a plain int.
        scores = model.predict(one_hot, verbose=0)
        predicted_index = int(np.argmax(scores, axis=1)[0])

        if predicted_index not in index_to_char:
            raise ValueError(
                'model predicted index {} which has no character in '
                'char_to_index'.format(predicted_index)
            )

        # The predicted character is both emitted and fed back as context.
        encoded.append(predicted_index)
        generated.append(index_to_char[predicted_index])

    return seed_text + ''.join(generated)

In [39]:
# Generate 80 characters from a 10-character seed; 10 is passed as seq_length,
# which matches the 10 input positions the model was trained on.
print(sentence_generation(model, char_to_index, 10, 'I get on w', 80))

I get on with life as a programmer, I like to hang out with programming and deep learning.
