# Text Generator using RNN
Example script to generate text from Nietzsche's writings. At least 20 epochs are required before the generated text starts sounding coherent.<br>
It is recommended to run this script on GPU, as recurrent networks are quite computationally intensive. If you try this script on new data, make sure your corpus has at least ~100k characters. ~1M is better.

In [1]:
# import random    << use np.random instead.
import numpy as np
import random
import sys
import tensorflow as tf
import chardet

from collections import Counter
from konlpy.tag import Twitter
from scipy.sparse import *
from tensorflow.python.keras.callbacks import LambdaCallback    # What is Lambda Callback??
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Activation
from tensorflow.python.keras.layers import LSTM
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.utils import get_file    # Do I have to use get_file instead of direct load?

  from ._conv import register_converters as _register_converters


In [2]:
def load_file(path, filename):
    # path = get_file('path/to/text/file.txt', origin=None)
    # filename = 'D:/Seed_Downloads/Novel Dataset/Asian fantasy/NOVEL_01001.txt'
    if path[-1] is '/':
        file = path + filename
    else:
        file = path + '/' + filename

    read_file = open(file, 'rb').read()
    encode_type = chardet.detect(read_file)['encoding']
    text = open(file, encoding=encode_type).read().lower()

    print('corpus length:', len(text))
    
    return text

In [3]:
text = load_file('D:/Seed_Downloads/Novel Dataset/Asian fantasy/', 'NOVEL_01001.txt')

print('First 100 characters: {}'.format('\n' + text[:100]))

corpus length: 2928103
First 100 characters: 
제 목:[검마전/ sword & magic story]-- 001.

< 검 마 전 : sword & magic story >

눈꺼풀이 무겁다. 머리도 띵하고. 눈을 떠야하는데.


In [4]:
def remove_indicers(text):
    temp_text = text
    
    temp_text = temp_text.replace(" ", " SPACE ")
    temp_text = temp_text.replace("\n", " ENTER ")
    
    return temp_text

In [5]:
text = remove_indicers(text)

print('Fixed first 100 characters: {}'.format('\n' + text[:100]))

Fixed first 100 characters: 
제 SPACE 목:[검마전/ SPACE sword SPACE & SPACE magic SPACE story]-- SPACE 001. ENTER  ENTER < SPACE 검 SPA


In [6]:
def split_to_word(text_replace):
    twitter = Twitter()
    text_split = twitter.morphs(text_replace)
    words = sorted(list(set(text_split)))
    min_word = min(words, key=len)
    max_word = max(words, key=len)

    print('Total words:', len(words))
    print('min word is: {} with length of {}'.format(min_word, len(min_word)))
    print('max word is: {} with length of {}'.format(max_word, len(max_word)))

    word_indices = dict((c, i) for i, c in enumerate(words))
    indices_word = dict((i, c) for i, c in enumerate(words))
    
    return text_split, words, word_indices, indices_word

In [7]:
text_split, words, word_indices, indices_word = split_to_word(text)

print('Split check: {}'.format(text_split[100]))
print('Word check: {}'.format(words[100]))

Total words: 30211
min word is: ! with length of 1
max word is: "@#%*(*^$#&*#$" with length of 15
Split check: 느낌
Word check: ..]


In [8]:
def sentence_create(text_split):
    sentences = []
    temp = ""

    twitter = Twitter()
    
    for i, word in enumerate(text_split):
        temp = temp + word + ' '
        if word == 'ENTER':
            temp = twitter.morphs(temp)
            sentences.append(temp)
            temp = ""

        else:
            continue

    print('Total number of Sentences: {}'.format(len(sentences)))
    
    return sentences

In [9]:
sentences = sentence_create(text_split)

print('Sentence check: {}'.format(sentences[0]))

Total number of Sentences: 109452
Sentence check: ['제', 'SPACE', '목', ':[', '검', '마전', '/', 'SPACE', 'sword', 'SPACE', '&', 'SPACE', 'magic', 'SPACE', 'story', ']--', 'SPACE', '001', '.', 'ENTER']


In [10]:
def generate_input_sequence(sentences):
    # cut the text in semi-redundant sequences of maxlen characters
    longest_sentence = max(sentences)
    maxlen = len(longest_sentence)
    step = 3
    next_words = []
    sentence_data = []
    seq_count_ = (len(text_split) - maxlen) // step

    for i in range(0, len(text_split) - maxlen, step):
        sentence_data.append(text_split[i: i + maxlen])    # nth char ~ n+maxlen char = Sentence
        next_words.append(text_split[i + maxlen])    # next_chars = from ith char ~ end
    
    return sentence_data, next_words, maxlen

In [11]:
sentence_data, next_words, maxlen = generate_input_sequence(sentences)

print('nb sequences:', len(sentence_data))
print('101th sentence length: {}'.format(len(sentence_data[100])))
print('max length: {}'.format(maxlen))

nb sequences: 682030
101th sentence length: 26
max length: 26


In [12]:
Memory_to_use = len(sentence_data) * maxlen * len(words) / (2^30)

print('Expected memory to prepare: {}GB'.format(Memory_to_use))

Expected memory to prepare: 19133036306.42857GB


In [13]:
print('Sequence check...')
for i in range(3):
    print('Sentence {}:\n{}'.format(i, sentence_data[100+i]))

Sequence check...
Sentence 0:
['귀', '를', 'SPACE', '기울여', '도', 'SPACE', '바람소리', '조차', 'SPACE', '들리', '지', 'SPACE', '않는', '다', '.', 'SPACE', '그럼', '..', 'SPACE', '설마', 'SPACE', '난', 'ENTER', '정말로', 'SPACE', '죽은']
Sentence 1:
['기울여', '도', 'SPACE', '바람소리', '조차', 'SPACE', '들리', '지', 'SPACE', '않는', '다', '.', 'SPACE', '그럼', '..', 'SPACE', '설마', 'SPACE', '난', 'ENTER', '정말로', 'SPACE', '죽은', '거', '?', 'SPACE']
Sentence 2:
['바람소리', '조차', 'SPACE', '들리', '지', 'SPACE', '않는', '다', '.', 'SPACE', '그럼', '..', 'SPACE', '설마', 'SPACE', '난', 'ENTER', '정말로', 'SPACE', '죽은', '거', '?', 'SPACE', '으아', '..!', 'SPACE']


In [14]:
def word_to_indices(text_split):
    indice_list = []
    for i, word in enumerate(text_split):
        add_indice = word_indices[word]
        indice_list.append(add_indice)
        
    return indice_list

In [15]:
encoded_text = word_to_indices(text_split)
encoded_next_word = word_to_indices(next_words)

print('Encoded text sample: {}'.format(str(encoded_text[:20])))
print('length of encoded list: {}'.format(len(encoded_text)))
print('Encoded trigger word sample: {}'.format(str(encoded_next_word[:20])))
print('length of encoded list: {}'.format(len(encoded_next_word)))

Encoded text sample: [23380, 1008, 11442, 987, 2873, 10386, 110, 1008, 1308, 1008, 25, 1008, 1199, 1008, 1303, 1012, 1008, 114, 66, 1007]
length of encoded list: 2046114
Encoded trigger word sample: [1008, 985, 1008, 1199, 1008, 1007, 1008, 66, 7990, 28452, 6645, 9258, 6794, 23511, 5959, 24023, 1008, 1008, 998, 21317]
length of encoded list: 682030


In [16]:
def numpy_zeros(sentence_data, encoded_next_word, maxlen, wordlen):
    empty_input = np.zeros((10000, maxlen, wordlen), dtype=np.bool)
    empty_output = np.zeros((10000, wordlen), dtype=np.bool)
    x_list = []
    y_list = []
    iter_count = 0
        
    for row, sentence in enumerate(sentence_data):
        
        if (row % 10000) is 0:
            if row is not 0:
                iter_count += 1
                inputfile = 'B{:03d}X'.format(iter_count)
                outputfile = 'B{:03d}Y'.format(iter_count)
                
                np.save(inputfile, empty_input)
                np.save(outputfile, empty_output)
                x_list.append(inputfile)
                y_list.append(outputfile)
                
                print('Iteration #{} processed...'.format(iter_count))
                
                empty_input = np.zeros((10000, maxlen, wordlen), dtype=np.bool)
                empty_output = np.zeros((10000, wordlen), dtype=np.bool)
                
        else:
            row_count = row - iter_count * 10000
            output_index = encoded_next_word[row]
            empty_output[row_count, output_index] = 1

            for col, word in enumerate(sentence):
                input_index = word_indices[word]
                empty_input[row_count, col, input_index] = 1
                
                
    return x_list, y_list

In [17]:
x_list, y_list = numpy_zeros(sentence_data, encoded_next_word, maxlen, len(words))

Iteration #1 processed...
Iteration #2 processed...
Iteration #3 processed...
Iteration #4 processed...
Iteration #5 processed...
Iteration #6 processed...
Iteration #7 processed...
Iteration #8 processed...
Iteration #9 processed...


OSError: -735074592 requested and 0 written

### 현재 문제
- 이슈: 너무 크다 (파일사이즈)
    - 모든 방법을 이용해봤지만 여전히 너무 크다
- 결론: Iteration을 통해 Feeding을 실시간으로 하는 방법을 사용한다

<br>1. 원하는 Size의 Input/Output배열을 만든다
<br>2. 실시간 생성을 통한 feeding 진행
- 0~9999 생성
<br> Train (0 Epoch)
- 10000 ~ 19999 생성
<br> Train (0 Epoch)
- 20000 ~ 29999 생성
<br> Train (0 Epoch)
- ... (이하 생략)
- 680000 ~ 682030 생성
<br> Train (0 Epoch)
<Br>3. Predict
<br>4. Loss 산출
<br>5. 2번의 과정을 다시 진행

In [None]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(words))))
model.add(Dense(len(words)))
model.add(Activation('softmax'))

In [None]:
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = np.random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

In [None]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

In [None]:
model.fit(x, y,
          batch_size=128,
          epochs=25,
          callbacks=[print_callback])