# Natural Language Generation

In [26]:
import tensorflow as tf
import os

In [40]:
train_text = open('input.txt', 'rb').read().decode(encoding='utf-8')

print("Length of text: {}characters".format(len(train_text)))
print()

train_text = train_text.split('\n')
train_text = [sentence[17:] for sentence in train_text]

print(train_text[:5])

Length of text: 1214055characters

['1\r', '마\r', 'ㅎㅇ\r', '뭐야\r', '늦었네 \r']


In [42]:
import re

def clean_str(string):    
    string = re.sub(r"[^가-힣A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    string = re.sub(r"\'{2,}", "\'", string)
    string = re.sub(r"\'", "", string)
    
    return string


train_text = [clean_str(sentence) for sentence in train_text]
train_text_X = []
for sentence in train_text:
    train_text_X.extend(sentence.split(' '))
    
train_text_X = [word for word in train_text_X if word != '']

print(train_text_X[:20])

['1', '마', '뭐야', '늦었네', '후', '버그거렸노', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '기타창에', '넣을', '아이콘', '뭐']


In [43]:
vocab = sorted(set(train_text_X))
vocab.append('UNK')
print('{} unique words'.format(len(vocab)))

32554 unique words


In [44]:
import numpy as np
word2idx = {u:i for i, u in enumerate(vocab)}
idx2word = np.array(vocab)

text_as_int = np.array([word2idx[c] for c in train_text_X])

print('{')
for word,_ in zip(word2idx, range(10)):
    print('  {:4s}: {:3d},'.format(repr(word), word2idx[word]))
print('   ...\n}')

print('index of UNK: {}'.format(word2idx['UNK']))

{
  '!' :   0,
  ',' :   1,
  '0' :   2,
  '00':   3,
  '0000':   4,
  '000000195259':   5,
  '0000115286':   6,
  '0000802645':   7,
  '0000프로':   8,
  '0004399591':   9,
   ...
}
index of UNK: 32553


In [45]:
print(train_text_X[:20])
print(text_as_int[:20])

['1', '마', '뭐야', '늦었네', '후', '버그거렸노', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '기타창에', '넣을', '아이콘', '뭐']
[  172 12263 14157  8142 32348 15093   172   417   670   789   860   962
  1024  1062  1116     2  6138  7432 19832 14113]


In [46]:
seq_length = 25
examples_per_epoch = len(text_as_int)
sentence_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sentence_dataset = sentence_dataset.batch(seq_length + 1, drop_remainder=True)

for item in sentence_dataset.take(1):
    print(idx2word[item.numpy()])
    print(item.numpy())

['1' '마' '뭐야' '늦었네' '후' '버그거렸노' '1' '2' '3' '4' '5' '6' '7' '8' '9' '0'
 '기타창에' '넣을' '아이콘' '뭐' '없노' '\\?' '그게' '먼말이노' '기타창' '버튼에']
[  172 12263 14157  8142 32348 15093   172   417   670   789   860   962
  1024  1062  1116     2  6138  7432 19832 14113 21213  1595  5268 13094
  6137 15149]


In [47]:
def split_input_target(chunk):
    return [chunk[:-1], chunk[-1]]

train_dataset = sentence_dataset.map(split_input_target)
for x, y in train_dataset.take(1):
    print(idx2word[x.numpy()])
    print(x.numpy())
    print(idx2word[y.numpy()])
    print(y.numpy())

['1' '마' '뭐야' '늦었네' '후' '버그거렸노' '1' '2' '3' '4' '5' '6' '7' '8' '9' '0'
 '기타창에' '넣을' '아이콘' '뭐' '없노' '\\?' '그게' '먼말이노' '기타창']
[  172 12263 14157  8142 32348 15093   172   417   670   789   860   962
  1024  1062  1116     2  6138  7432 19832 14113 21213  1595  5268 13094
  6137]
버튼에
15149


In [48]:
BATCH_SIZE = 128
steps_per_epoch = examples_per_epoch
BUFFER_SIZE = 10000

train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [49]:
total_words = len(vocab)

In [50]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(total_words, 100, input_length=seq_length),
    tf.keras.layers.LSTM(units=100, return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(units=100),
    tf.keras.layers.Dense(total_words, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 100)           3255400   
_________________________________________________________________
lstm_5 (LSTM)                (None, 25, 100)           80400     
_________________________________________________________________
dropout_3 (Dropout)          (None, 25, 100)           0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 32554)             3287954   
Total params: 6,704,154
Trainable params: 6,704,154
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

def testmodel(epoch, logs):
    if epoch % 5 != 0 and epoch != 49:
        return
    test_sentence = train_text[0]
    
    next_words = 100
    for _ in range(next_words):
        test_text_X = test_sentence.split(' ')[-seq_length:]
        test_text_X = np.array([word2idx[c] if c in word2idx else word2idx['UNK'] for c in test_text_X])
        test_text_X = pad_sequences([test_text_X], maxlen=seq_length, padding='pre', value=word2idx['UNK'])
        
        output_idx = model.predict_classes(test_text_X)
        test_sentence += ' ' + idx2word[output_idx[0]]
        
    print()
    print(test_sentence)
    print()
    
testmodelcb = tf.keras.callbacks.LambdaCallback(on_epoch_end=testmodel)

history = model.fit(train_dataset.repeat(), epochs=50, steps_per_epoch=steps_per_epoch, callbacks=[testmodelcb], verbose=2)
        

Train for 58929 steps
Epoch 1/50
