In [1]:
# Imports
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import numpy as np
import os
import datetime

## Dataset 

Upload the dataset and set its path below, then check that the length of the text is as expected.

In [3]:
# Location of the corpus: <current working directory>/Data/Iliad_v3.txt.
# `os` comes from the imports cell at the top of the notebook.
path = os.getcwd() + '/Data'
file_name = '/Iliad_v3.txt'

# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle as soon as the contents have been read.
with open(path + file_name, 'rb') as text_file:
    text = text_file.read().decode(encoding='utf-8')

# Tokens come from splitting on single spaces; whitespace-only tokens are
# discarded, except a token that is exactly one newline, which is counted.
words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']

# file_name[1:] removes only the leading '/', so the full file name
# (including its last character) is reported.
print("Textfile {0} is {1} words long".format(file_name[1:], len(words)))

print('Text Sample: ')
print(text[:100])


Textfile Iliad_v3.tx is 153260 words long
Text Sample: 
  achilles' wrath, to greece the direful spring
  of woes unnumber'd, heavenly goddess, sing!
  that


## Prepare the text

Prepare the text by mapping each unique character to an integer index.

In [11]:
# Character-level vocabulary: every distinct character of the corpus, sorted
vocab = sorted(set(text))
print('There are {} unique characters'.format(len(vocab)))

# Two lookups over the same ordering:
#   int2char -- index -> character (array indexed by position in `vocab`)
#   char2int -- character -> index
int2char = np.array(vocab)
char2int = {symbol: position for position, symbol in enumerate(vocab)}

# Show the complete mapping, one "repr(character): index" pair per line
print('Character vector with mapping:\n')
for char, index in char2int.items():
    print(' {:4s}: {:3d},'.format(repr(char), index))

There are 34 unique characters
Character vector with mapping:

 '\n':   0,
 ' ' :   1,
 '!' :   2,
 "'" :   3,
 ',' :   4,
 '-' :   5,
 '.' :   6,
 '?' :   7,
 'a' :   8,
 'b' :   9,
 'c' :  10,
 'd' :  11,
 'e' :  12,
 'f' :  13,
 'g' :  14,
 'h' :  15,
 'i' :  16,
 'j' :  17,
 'k' :  18,
 'l' :  19,
 'm' :  20,
 'n' :  21,
 'o' :  22,
 'p' :  23,
 'q' :  24,
 'r' :  25,
 's' :  26,
 't' :  27,
 'u' :  28,
 'v' :  29,
 'w' :  30,
 'x' :  31,
 'y' :  32,
 'z' :  33,


In [12]:
# Encode the whole corpus as int32 indices, one entry per character of `text`
text_as_int = np.fromiter(
    (char2int[ch] for ch in text),
    dtype=np.int32,
    count=len(text),
)

# Show the first 100 characters next to their integer encoding
sample_chars = text[:100]
sample_ids = text_as_int[:100]
print('{}\n mapped to integers:\n {}'.format(sample_chars, sample_ids))

  achilles' wrath, to greece the direful spring
  of woes unnumber'd, heavenly goddess, sing!
  that
 mapped to integers:
 [ 1  1  8 10 15 16 19 19 12 26  3  1 30 25  8 27 15  4  1 27 22  1 14 25
 12 12 10 12  1 27 15 12  1 11 16 25 12 13 28 19  1 26 23 25 16 21 14  0
  1  1 22 13  1 30 22 12 26  1 28 21 21 28 20  9 12 25  3 11  4  1 15 12
  8 29 12 21 19 32  1 14 22 11 11 12 26 26  4  1 26 16 21 14  2  0  1  1
 27 15  8 27]


Separate the data into a training split and a validation split.

In [14]:
# Cut point between the two splits; 704000 is divisible by the batch size (64)
split_index = 704000

# Everything before the cut is used for training, the rest for validation
tr_text, val_text = text_as_int[:split_index], text_as_int[split_index:]

## Build the model

In [7]:
# All tunables live in this one cell so they can be changed in a single place.

# --- Input pipeline ---
seq_length = 200       # input length in characters; chunks are cut at seq_length + 1
batch_size = 64        # sequences per batch
buffer_size = 10000    # buffer size handed to Dataset.shuffle
examples_per_epoch = len(text) // seq_length

# --- Model ---
vocab_size = len(vocab)    # number of distinct characters in the corpus
embedding_dim = 256        # presumably the embedding width -- confirm in the model cell
rnn_units = 1024           # presumably the recurrent layer width -- confirm in the model cell

# --- Training ---
epochs = 50
#lr = 0.001 #will use default for Adam optimizer

In [9]:
# Wrap each split as a dataset that yields one encoded character at a time
tr_char_dataset, val_char_dataset = (
    tf.data.Dataset.from_tensor_slices(encoded_split)
    for encoded_split in (tr_text, val_text)
)

# Group the characters into chunks of seq_length + 1; a trailing chunk that
# is shorter than that is dropped
tr_sequences, val_sequences = (
    char_dataset.batch(seq_length + 1, drop_remainder=True)
    for char_dataset in (tr_char_dataset, val_char_dataset)
)


In [10]:
def split_input_target(chunk):
    """Turn one chunk into an (input, target) pair offset by one position.

    Args:
        chunk: A sliceable sequence.

    Returns:
        A 2-tuple ``(input_text, target_text)``: ``input_text`` is ``chunk``
        without its last element and ``target_text`` is ``chunk`` without its
        first element, so ``target_text[i]`` is the element that follows
        ``input_text[i]`` in ``chunk``.
    """
    return chunk[:-1], chunk[1:]

# Training pipeline: chunk -> (input, target) pair, shuffle through a buffer
# of buffer_size elements, then group into batches of batch_size (a final
# partial batch is dropped)
tr_dataset = (
    tr_sequences
    .map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

# Validation pipeline: the same steps applied to the validation chunks
val_dataset = (
    val_sequences
    .map(split_input_target)
    .shuffle(buffer_size)
    .batch(batch_size, drop_remainder=True)
)

print(tr_dataset, val_dataset)

<BatchDataset shapes: ((64, 200), (64, 200)), types: (tf.int32, tf.int32)> <BatchDataset shapes: ((64, 200), (64, 200)), types: (tf.int32, tf.int32)>
