In [1]:
# Import the Keras API that ships inside TensorFlow and display its version
# string as the cell result.
from tensorflow import keras
keras.__version__

'2.2.4-tf'

In [2]:
import tensorflow as tf  # workaround for a TF 2.0 GPU error

# Ask TensorFlow to use a 0.7 fraction of GPU memory for this process.
gpu_options = tf.compat.v1.GPUOptions(per_process_gpu_memory_fraction=0.7)

# Build the session configuration first, then open the v1-compat session.
session_config = tf.compat.v1.ConfigProto(gpu_options=gpu_options)
sess = tf.compat.v1.Session(config=session_config)

# LSTM으로 텍스트 생성하기

이 노트북은 [케라스 창시자에게 배우는 딥러닝](https://tensorflow.blog/deep-learning-with-python/) 책의 8장 1절의 코드 예제입니다. 책에는 더 많은 내용과 그림이 있습니다. 이 노트북에는 소스 코드에 관련된 설명만 포함합니다. 이 노트북의 설명은 케라스 버전 2.2.2에 맞추어져 있습니다. 케라스 최신 버전이 릴리스되면 노트북을 다시 테스트하기 때문에 설명과 코드의 결과가 조금 다를 수 있습니다.

----

[...]

## 글자 수준의 LSTM 텍스트 생성 모델 구현

이런 아이디어를 케라스로 구현해 보죠. 먼저 언어 모델을 학습하기 위해 많은 텍스트 데이터가 필요합니다. 위키피디아나 반지의 제왕처럼 아주 큰 텍스트 파일이나 텍스트 파일의 묶음을 사용할 수 있습니다. 이 예에서는 19세기 후반 독일의 철학자 니체의 글을 사용하겠습니다(영어로 번역된 글입니다). 학습할 언어 모델은 일반적인 영어 모델이 아니라 니체의 문체와 특정 주제를 따르는 모델일 것입니다.

## 데이터 전처리

먼저 말뭉치를 다운로드하고 소문자로 바꿉니다:

In [3]:
from tensorflow import keras
import numpy as np

# Fetch the Nietzsche corpus and get the local path of the downloaded file.
path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')

# Read inside a context manager so the file handle is always closed, and pin
# the encoding so decoding does not depend on the platform default codec.
with open(path, encoding='utf-8') as corpus_file:
    # Lowercase everything to shrink the character vocabulary.
    text = corpus_file.read().lower()
print('말뭉치 크기:', len(text))

말뭉치 크기: 600893


In [4]:
# Inspect the type of the loaded corpus object.
type(text)

str

그 다음 `maxlen` 길이를 가진 시퀀스를 중복하여 추출합니다. 추출된 시퀀스를 원-핫 인코딩으로 변환하고 크기가 `(sequences, maxlen, unique_characters)`인 3D 넘파이 배열 `x`로 합칩니다. 동시에 훈련 샘플에 상응하는 타깃을 담은 배열 `y`를 준비합니다. 타깃은 추출된 시퀀스 다음에 오는 원-핫 인코딩된 글자입니다.

In [5]:
# Length (in characters) of each extracted sequence.
maxlen = 60

# Stride between the start positions of consecutive sequences.
step = 3

# Extracted sequences (the model inputs).
sentences = []

# Targets: the single character that follows each sequence.
next_chars = []

for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('시퀀스 개수:', len(sentences))

# Sorted list of the unique characters found in the corpus.
chars = sorted(set(text))
print('고유한 글자:', len(chars))
# Map each character to its position in `chars`. enumerate() builds the map
# in one pass instead of a linear chars.index() scan per character.
char_indices = {char: index for index, char in enumerate(chars)}

# One-hot encode the characters into boolean arrays.
print('벡터화...')
# Use the builtin `bool`: the `np.bool` alias was deprecated in NumPy 1.20 and
# removed in 1.24, where accessing it raises AttributeError.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

시퀀스 개수: 200278
고유한 글자: 58
벡터화...


## 네트워크 구성

이 네트워크는 하나의 `LSTM` 층과 그 뒤를 따르는 `Dense` 분류기로 구성됩니다. 분류기는 가능한 모든 글자에 대한 소프트맥스 출력을 만듭니다. 순환 신경망이 시퀀스 데이터를 생성하는 유일한 방법은 아닙니다. 최근에는 1D 컨브넷도 이런 작업에 아주 잘 들어맞는다는 것이 밝혀졌습니다.

In [6]:
from tensorflow.keras import layers

# Single LSTM layer over (maxlen, vocabulary-size) one-hot inputs, followed by
# a softmax classifier with one output unit per character.
model = keras.models.Sequential([
    layers.LSTM(128, input_shape=(maxlen, len(chars))),
    layers.Dense(len(chars), activation='softmax'),
])

W0909 19:57:45.934534  3144 tf_logging.py:161] <tensorflow.python.keras.layers.recurrent.UnifiedLSTM object at 0x000001B394EBE278>: Note that this layer is not optimized for performance. Please use tf.keras.layers.CuDNNLSTM for better performance on GPU.


타깃이 원-핫 인코딩되어 있기 때문에 모델을 훈련하기 위해 `categorical_crossentropy` 손실을 사용합니다:

In [7]:
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases reject.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

## 언어 모델 훈련과 샘플링

훈련된 모델과 시드로 쓰일 간단한 텍스트가 주어지면 다음과 같이 반복하여 새로운 텍스트를 생성할 수 있습니다.

1.	지금까지 생성된 텍스트를 주입하여 모델에서 다음 글자에 대한 확률 분포를 뽑습니다.
2.	특정 온도로 이 확률 분포의 가중치를 조정합니다.
3.	가중치가 조정된 분포에서 무작위로 새로운 글자를 샘플링합니다.
4.	새로운 글자를 생성된 텍스트의 끝에 추가합니다.

다음 코드는 모델에서 나온 원본 확률 분포의 가중치를 조정하고 새로운 글자의 인덱스를 추출합니다(샘플링 함수입니다):

In [8]:
def sample(preds, temperature=1.0):
    """Draw one index from ``preds`` after temperature reweighting.

    Args:
        preds: 1D sequence of probabilities (e.g. a softmax output).
        temperature: Positive scaling factor. Values below 1 sharpen the
            distribution; values above 1 flatten it.

    Returns:
        Index of the sampled entry.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be positive')
    preds = np.asarray(preds).astype('float64')
    # log(0) is legitimately -inf for zero-probability entries; suppress the
    # divide warning rather than letting it interleave with generated text.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0 before exponentiating. Without this, a
    # very small temperature underflows every term to 0 and the
    # normalisation below becomes 0/0 (NaN).
    exp_preds = np.exp(logits - np.max(logits))
    preds = exp_preds / np.sum(exp_preds)
    # One multinomial draw yields a one-hot row; argmax recovers its index.
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

마지막으로 다음 반복문은 반복적으로 훈련하고 텍스트를 생성합니다. 에포크마다 학습이 끝난 후 여러 가지 온도를 사용해 텍스트를 생성합니다. 이렇게 하면 모델이 수렴하면서 생성된 텍스트가 어떻게 진화하는지 볼 수 있습니다. 온도가 샘플링 전략에 미치는 영향도 보여 줍니다.

In [9]:
import random
import sys

# Seed the RNG so the same seed-text position is drawn on every run.
random.seed(42)
start_index = random.randint(0, len(text) - maxlen - 1)

# Train for 60 epochs. range(1, 61) yields epochs 1..60; the previous
# range(1, 60) stopped one epoch short of what was intended.
for epoch in range(1, 61):
    print('에포크', epoch)
    # Fit the model for a single pass over the data.
    model.fit(x, y, batch_size=128, epochs=1)

    # Seed text: the maxlen-character slice starting at the index drawn above.
    seed_text = text[start_index: start_index + maxlen]
    print('--- 시드 텍스트: "' + seed_text + '"')

    # Try several sampling temperatures.
    for temperature in [0.2, 0.5, 1.0, 1.2]:
        print('------ 온도:', temperature)
        generated_text = seed_text
        sys.stdout.write(generated_text)

        # Generate 400 characters, starting from the seed text.
        for i in range(400):
            # One-hot encode the current maxlen-character window.
            sampled = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(generated_text):
                sampled[0, t, char_indices[char]] = 1.

            # Sample the next character from the model's prediction.
            preds = model.predict(sampled, verbose=0)[0]
            next_index = sample(preds, temperature)
            next_char = chars[next_index]

            # Slide the window: append the new character, drop the oldest,
            # so the window stays exactly maxlen characters long.
            generated_text += next_char
            generated_text = generated_text[1:]

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

에포크 1
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the destrues and the saccession of the sense of the still and all the still the discertain to the still the still the saccess of the discertions of the sense of the stould to the domentary the dould the still and the still the still the man and commanses and the still the gree and the saccess of the still the scheired and the discomentation of the scholen and the decelved and selfing the still a
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the intertes will the discertable of the science of the senter of something the sought of
the sense of most
explestion, and the philosophy for the discertioned to the to the part in the sage of the chillitations the the long and in the greated not and deman religions, and of the semselves of the greated in the more without the will where was in the one master and 

creentionh--"faluar; bleur matter tow will the
phriocslyry and the motesly consporicary and
placen of rum
에포크 5
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the spirit of the problem of the same things that the problem of the scholaring the still the morality of the former the problem of the former the strong and strong and the problem of the problem of the strong the spirit of the soul and the dangerous the problem of the possess and the strong the problem of the problem of the same pain of the problem of the soul and such a problem of the morality
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for all ever rull of the former than the the religions and because not the gradule strign the fairs of the cause than an attention is in the varitable of the intellicion, they are that all the through the samethiness have all ender things himself the agreenary bec

inscribes (or. the dut anyshines and frees, butanj hand up strangerest valicity withat wenkig als life vanitever new creatureests
in the atiate perhaps and free upon philosopher, that even as alsolard and
underitually is, is knowledge and iother, nati
에포크 9
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for a mean of the same thinks and the spirit of the superiority of the superiority of the conscience and conscience and in the same than the spirit and superiority of the spirit of the same thinker the sering that the strong and also a superiority of the same thinker and the spirit of the spirit of the superiority of the many and also the strife in their strange of the strength of the same than the 
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the way
in any discoment and "thinker and afforting are minds as i something with their astrupupse and conscience,

fact divine how as loagier with a actumoune man to very result had
nef bare of me, that no donutilant of meitorne
enthusian as
sorl has to man has
had hod among echakiod vise aagot, but dohble soul,
it galg uage."
     in really well
thbe us unicures flow. they are order coted--bllisser to snvi"g in
the upon the ascecue onotiordarily that scienc
에포크 13
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the streest of the strength of the standard of the strength of the strength of the spirit of the false of the spirit of the false when they are any discovers in the state of the most desires the antituge of the sense of the present standard of the spirit, and all the most state of the spirit of the most standard and profounder and the strength of the spirit of the strength is the most sense of t
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for the antidouality,

through found the
sub"yn? they beinds give ourselves, has haige it: i hale: that do conturplusion. senteas clourse tooghes the first now aftermuptionant's gromanism, the consist.=--there possikible who, he more once bad this, all
menequent and
impleas themselves of this lomesicy, coises it, a pains, surmittal,
.yy low te "who one may less being lugesd bienevous.--selfoubless, and accorbat is consider to jud
에포크 17
--- 시드 텍스트: "the slowly ascending ranks and classes, in which,
through fo"
------ 온도: 0.2
the slowly ascending ranks and classes, in which,
through for the same the spirit and some for and in the same the spirit of the superioright of the conscience of the spirit of the self success of the same the reality of the spirit of the spirit of the spirit of the spirit of the most the spirit of the self and the spirit of the still soul is the most existing for a free spirit of the superiorive with the same the spirit of the spirit of the speciation of 
------ 온도: 0.5
the slowly ascen

  This is separate from the ipykernel package so we can avoid doing imports until


 conscience of the superiorificate that and for the most social conscience, but the men, without world the promises and the existence of men of the good decided to the spirit in the words also the fact of the world of the self develo
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through foundation should dealh, to gradul; said inventianciently to yet account of a notice, inphond, for the feal.


reobuty at man, one _mirat.

1achated ageble is therefore humilally, and
inaristensually pariitian dear, and tranghing, privile recirbsations extreched for socialo_ges, roudg thinks by for acchupish anduted depresent
for inphilosistic hase we lookamed of nature of the states of ethic of who
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through formoudporiful trays, as interree before on that it larce of chikely deepenerards, approjused to are
a cust
med or
attentish,
was
      a "faith of slorts; one circeing, which
i--footorial sounwisid trvent; aedutitess

13h but it is the sense in which one must have consequently what is endurable man who so conscious and prepresent as the end in the man, they man, the free spirit
that when i have been should that the world in the commander that men because so nature, is the more morality, and in the mere, and something that one of the fact the person of t
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through forked inavourituwical gheniois-: the ingreise us affeident of justice. the unracper is a new, fell-pogiromed volibou id, i because .xuption and good to caste, thratemently the sunding that one of distinction through the last, the whole that the fact beliehe very pleking the masterurhthy as
one follow which proof it towards the instincts as betray that sexity
they dignofar to the culped to evolves l
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through for its na cordegable
degrees) enthalistic intoishuper with those as it is naturaming to been hver life-condu

through for a romanticism of the soul," and of the means of the also praise all the strength is to do that they were the deserve the science," to be all the stronger and the problements, it is in the same twill will, they in the being suspicions, as a man is not the most the sense and the most strongly and because they be according to the strong to be generally and the way to the disticrable the constance t
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for ever
thought of fa
shomerly differens,
and plefulile, as.=--would be not all dispraididick in the case the naten               one pleasured to--in my absund another; overly readind everyditital science is honortedity eternallys will se.

  y sbie feel their said madaved
to spote,
no mences than the worst: the europe of the coursu a man! and what is justice burdopful
astumed turn hath a servilit
------ 온도: 1.2
the slowly ascending ranks and classes, in which,
through for
doghr
their patienture, it suip the 

superiority of the superiority of the superiority of the superiority of the superiority, and all the superiority, and as a man is a more south and belief of the superiority of the fact the south in the problem of the moral and the and the superiority of the superiority, and in the moral conditi
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through formar problem of distinguen in the superiority, which the lart. the moral out of character is a more not only a person and regard, the former to the superable there as a men of such a world he is not makes be demanding and every depths of the man of all the destiny and as a master of the best morals are in the old and destination is the what is the respected and here on the primitions of the
head o
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through fore alsere of "knowlzthon, that isage,
the same instleties, which
one on a je
thingsisititingary, writings perhable a virtuousness
of a helpt--gover the g

through for the conscience the same as it is the most desire to a man and still an accompanded the strong and all the consequent and the sense of the most desire to be a man and the enemetere the constant present than the allent to the conscience and the most present the spirituality of the strength and interesting and all the present to the present and conscience and the same the standard of the present an
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for himself which, in the experience of all the state has the something and a man and the truth to stage of a woman,
in the same time of all more advantation to the contrary and still originally in the same trigile and conscience, as a thinked precious always be dreadful and the most prevaluation of the most higher man and which only a peoples and individual
and such a feels or he is to say, and who
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for the
assion aih, the deceive i heir e

through for the same to the same to the sense of the same to the same to the sense in the most proper to the same to the sense of the problem of the most desires and more problem of the sense of the sublime to the substitute of the sense of the same the sentiments and sentiments of the subtle constrains to the problem of the sense the sublime to the science, and the same to the sentiments of the problem of 
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for restric is also the conception (from a put litiount of the conditions of profound to what sense of the same in the emotion as the sense of the same mankind.


11

=forget and convantation. the conscience of the same the extend of the power and possible to the alter of the straight at all certainly reason. the mysterious realities, of the same all philosophy and will? if with the opposite that th
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through formerly regroan" threal timeen most exc

through formerly deceived the state of the spirit of the state of the most such and delights of the profound the conscience of the sense of the conscience is all the faith of the most more in the expection of the conscience of the sense of the state of the expection of the comparison to the problem of the philosophers and conscience of the consideration of the state of the philosophy and stifted to the sens
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through form! and is the general men of the badness or the disposing more interesting in the has always all the consciences and connection of the philosophers will avomag-substitude of the principles, as the faith and still life and the things and look of the only the false in the stone has not be deceived; all the conditions to people of the badness.=--the inventiment of the most stook of the nationan and 
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through fortunally the reschen-at the generation

through formerly and soul as a state of the same will to self-contradiction and soul that the strength of the most soul as a man will also the same will to be a powerful and soul and soul that the strength and profound to the sense the same will and soul as a soul as a man and soul as a soul as a man and something of the early soul" the soul and soul that the strength of the sense of the sense of the sense 
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through form of the sense the world of the same will in the today and can be not something is to self-suffering and apone in the world of the basis of the part of the end and still and likewious the compared and ssand: here also the moral sufficiently and decense and sense of the preternce and sufficiently to me the translipoligidly of the world of the sense of the stone and soul as he we far its two garrer
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through form and seet other and destruc ntice
ev

through formor to the same the standard of the compare of the compared with the early be in the compared by the promise formule of the same the standard of the promption of the compared and souls, and the state of the same the great the same time that what we would be the promise formule of the sense of the standard of the same to the promption of the bad world and the promise form of the promption of the m
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through formation and thinking the depths, to the case of the present man always and the conditions of power
of the father of clearly complice of the promise attain to the present world will that it is because the
man of the vaulfly in the substance of the most religious the present the part of his spole "the fundamental party who free spirits of rank the principle and the simples and suffering in which a m
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through founder there, it is
broughts with sympa

through formed of the same in the sense of the same all the present world--and who be and something is the sense of the same the sense of the same all the subjublent to the same time and the philosophers of the same and the same and something something who is a man as a present the most soul is to the sense of the same as a present to the sense of the superling the strength of the same as in the sense of th
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through fore the distingued noble
history of the sense of the sediousness of the sense of the reason of which a matters and in command, and not have been a soul as we may also the causa
of the reason of the world is a man as the world.

138. it sees the antiqueration for the sense of the beliemen
and reason to whom
the reason to his sense and prefers the sense of the case of command that one has a nobly one
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through for the
manixately pupires
himself pleas

through forming to the fact that it is almost the superiority and delight in the most sure the good to the struggaved and surplus of the subject is the most surpluses and self-reason to the world as the three will some and all the sense of the world because it wishes to be belie the strong to the suffering to the present man will be all the proper-dolunction to the desire to be body and says and sufferings 
------ 온도: 0.5
the slowly ascending ranks and classes, in which,
through for them to the concerning the possibility that it is a pain of the farious has different and commanders of the fact that it is always be difficults of the reason, is a moral and all even in the superior moral proper-greations which there is not almost suffering--the fundamental of the lay may be als, and in the european and constantly there is the disposing
and new distance, or has been a condition
------ 온도: 1.0
the slowly ascending ranks and classes, in which,
through foregoissy-when that as we coverach, reg

여기서 볼 수 있듯이 낮은 온도는 아주 반복적이고 예상되는 텍스트를 만듭니다. 하지만 국부적인 구조는 매우 실제와 같습니다. 특히 모든 단어(단어는 글자의 지역 패턴으로 이루어집니다)가 실제 영어 단어입니다. 높은 온도에서 생성된 텍스트는 아주 흥미롭고 놀라우며 창의적이기도 합니다. 이따금 꽤 그럴싸하게 보이는 완전히 새로운 단어를 창조합니다(‘begarmed’와 ‘isharent’ 같은 단어입니다). 높은 온도에서는 국부적인 구조가 무너지기 시작합니다. 대부분의 단어가 어느 정도 무작위한 문자열로 보입니다. 확실히 이 네트워크에서 텍스트 생성에 가장 좋은 온도는 0.5입니다. 항상 다양한 샘플링 전략으로 실험해 봐야 합니다! 학습된 구조와 무작위성 사이에 균형을 잘 맞추면 흥미로운 것을 만들 수 있습니다.

더 많은 데이터에서 크고 깊은 모델을 훈련하면 이것보다 훨씬 논리적이고 실제와 같은 텍스트 샘플을 생성할 수 있습니다. 물론 우연히 나온 경우를 제외하면 의미 있는 텍스트가 생성된다고 기대하지 마세요. 글자를 연속해서 나열하기 위한 통계 모델에서 데이터를 샘플링한 것뿐입니다. 언어는 의사소통의 수단입니다. 의사소통이 의미하는 것과 의사소통이 인코딩된 메시지의 통계 구조 사이에는 차이가 있습니다. 이 차이를 검증하기 위해 다음과 같은 사고 실험을 해보죠. 컴퓨터가 대부분의 디지털 통신에서 하는 것처럼 사람의 언어가 의사소통을 압축하는 데 더 뛰어나다면 어떨까요? 언어의 의미가 줄진 않지만 고유한 통계 구조가 사라질 것입니다. 이는 방금과 같은 언어 모델을 학습하는 것을 불가능하게 만듭니다.

## 정리

* 이전의 토큰이 주어지면 다음 토큰(들)을 예측하는 모델을 훈련하여 시퀀스 데이터를 생성할 수 있습니다.
* 텍스트의 경우 이런 모델을 언어 모델이라 부릅니다. 단어 또는 글자 단위 모두 가능합니다.
* 다음 토큰을 샘플링할 때 모델이 만든 출력에 집중하는 것과 무작위성을 주입하는 것 사이에 균형을 맞추어야 합니다.
* 이를 위해 소프트맥스 온도 개념을 사용합니다. 항상 다양한 온도를 실험해서 적절한 값을 찾습니다.