In [1]:
import keras
import numpy as np

# Fetch the Nietzsche corpus through Keras' download helper; the value it
# returns is used below as the local file path.
path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Read inside a context manager so the file handle is closed deterministically,
# and decode explicitly as UTF-8 instead of relying on the platform default.
# The whole corpus is lower-cased to keep the character vocabulary small.
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('말뭉치 크기:', len(text))

Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/text-datasets/nietzsche.txt
말뭉치 크기: 600893


In [2]:
# Length (in characters) of every extracted input sequence.
maxlen = 60

# Stride between the start positions of consecutive sequences.
step = 3

# Extracted input sequences.
sentences = []

# Targets: the single character that follows each sequence.
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('시퀀스 개수:', len(sentences))

# Sorted list of the distinct characters found in the corpus.
chars = sorted(set(text))
print('고유한 글자:', len(chars))
# Map every character to its position in `chars`.
# enumerate() yields the position directly, avoiding a linear chars.index()
# scan for each character.
char_indices = {char: index for index, char in enumerate(chars)}

# One-hot encode the characters into binary arrays:
#   x[i, t, c] is set when character c sits at position t of sequence i
#   y[i, c]    is set when character c follows sequence i
print('벡터화...')
# The built-in bool is used as dtype: the np.bool alias was deprecated in
# NumPy 1.20 and removed in 1.24.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

시퀀스 개수: 200278
고유한 글자: 57
벡터화...


In [0]:
from keras import layers

# Next-character classifier over one-hot encoded sequences of shape
# (maxlen, len(chars)): one 1D convolution, max-pooling, flattening, and a
# softmax layer with one output per character in the vocabulary.
model = keras.models.Sequential([
    layers.Conv1D(
        filters=128,
        kernel_size=7,
        activation='relu',
        input_shape=(maxlen, len(chars)),
    ),
    layers.MaxPooling1D(pool_size=3),
    layers.Flatten(),
    layers.Dense(units=len(chars), activation='softmax'),
])

In [56]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
# NOTE(review): the summary saved with this notebook reports 10 convolution
# filters and a pooled length of 27, which does not match the
# Conv1D(128, 7) / MaxPooling1D(3) model defined in the preceding cell.
# The recorded output looks stale; restart the kernel and re-run all cells.
model.summary()

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_15 (Conv1D)           (None, 54, 10)            4000      
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 27, 10)            0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 270)               0         
_________________________________________________________________
dense_13 (Dense)             (None, 57)                15447     
Total params: 19,447
Trainable params: 19,447
Non-trainable params: 0
_________________________________________________________________


In [0]:
# RMSprop with a learning rate of 0.01 drives the weight updates.
optimizer = keras.optimizers.RMSprop(lr=0.01)
# The targets `y` are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [0]:
def sample(preds, temperature=1.0):
    """Draw one index from a probability distribution reweighted by temperature.

    The distribution is rescaled as ``softmax(log(preds) / temperature)``:
    temperatures below 1 sharpen it, temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: Reweighting factor. Values <= 0 select the most likely
            index deterministically (the limit of the reweighting as the
            temperature approaches zero) instead of dividing by zero.

    Returns:
        The sampled index into ``preds`` (a NumPy integer).
    """
    preds = np.asarray(preds).astype('float64')

    # Greedy decoding: avoids the division by zero / NaN probabilities that a
    # zero temperature would otherwise produce.
    if temperature <= 0:
        return np.argmax(preds)

    # log(0) legitimately maps to -inf here (probability stays 0 after exp),
    # so silence the divide-by-zero RuntimeWarning for that case.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature

    # Shift so the largest logit is 0 before exponentiating. This leaves the
    # normalised result unchanged but prevents every term from underflowing to
    # 0 at low temperatures (which would give 0/0 = NaN probabilities).
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)

    # A single multinomial trial returns a one-hot row; argmax recovers the
    # drawn index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [0]:
import random
import sys

# Seed Python's RNG so the same seed-text position is chosen on every run.
# NOTE(review): no NumPy seed is set in the visible cells, so the characters
# drawn by sample() still differ between runs.
random.seed(42)
start_index = random.randint(0, len(text) - maxlen - 1)

# Train for 60 epochs, generating sample text after each one.
# range(1, 61) yields epochs 1..60; the previous range(1, 60) stopped after 59.
for epoch in range(1, 61):
    print('에포크', epoch)
    # Fit the model for exactly one pass over the data per iteration.
    model.fit(x, y, batch_size=128, epochs=1)

    # The seed text is identical in every epoch (start_index is fixed once,
    # before the loop), so the generated samples can be compared across epochs.
    seed_text = text[start_index: start_index + maxlen]
    print('--- 시드 텍스트: "' + seed_text + '"')

    # Try a range of sampling temperatures.
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('------ 온도:', temperature)
        generated_text = seed_text
        sys.stdout.write(generated_text)

        # Generate 400 characters, starting from the seed text.
        for i in range(400):
            # One-hot encode the current window of maxlen characters.
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            # Predict the next-character distribution and sample from it.
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            # Slide the window: drop the oldest character, append the new one,
            # so the window length stays at maxlen.
            generated_text = generated_text[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

에포크 1
Epoch 1/1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through foo  aoot ee ito  ooo te  ioaee  tta  oon nee  too  ttt te ota  ttt  ato et e  eot s  too  too  tet t  oat its t  ooa  tot e  oeo  ttoe o  ott ee t  aae tt  ott tee e  tot n  oote en en aee  taae  tnt st thh eeo  ate nt te  oot te ee  oaetee  oto  tno t  ooo ee tt  teh te ooa  ttne tt  tnt tto  oat te ei  taa  itt  toe eto  eat  tto  ttt ee eotn nn teh es  eao e  nan  foo  sto  aot  tat tt  tas te e
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through foo t  eoo ei t e  uron notn calc rtae insat i ni b  atoa t  oao ss irt  oat  aot e gl wtaena ,e olt ass tos oed ihnli l  tohe rtca n  ooot te cors  rtosn ot toeh ter  tanasc ap teaa e uee  stst e nlo
,e thh
 haa et  ifr ef eersn a t t  wrn tetes ntlti t  hror t o  sot tl h o ii  teht u os ea oe oie ese tho  nii te
 aatgea  tedn cerists t e  hhspyr gnos  tca