In [260]:
import tensorflow as tf
import numpy as np
import os
import time

### SOURCE TEXT LOADING
- loading .txt
- printing the beginning, number of characters, number of unique characters
    

In [261]:
source = open('stone.txt').read()
print(source[:300])


Harry Potter and the Sorcerer's Stone


CHAPTER ONE

THE BOY WHO LIVED

Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say
that they were perfectly normal, thank you very much. They were the last
people you'd expect to be involved in anything strange or mysterious,
because they ju


In [262]:
print ('{} characters'.format(len(source)))

439742 characters


In [263]:
vocab = sorted(set(source))
print ('{} unique characters'.format(len(vocab)))

79 unique characters


In [264]:
# unique characters to indices mapping
char2index = {u:i for i, u in enumerate(vocab)}
index2char = np.array(vocab)

text_as_int = np.array([char2index[c] for c in source])

print(text_as_int)

[32 52 69 ... 38 28  1]


In [265]:
print ('{} -- characters mapped to int -- > {}'.format(repr(source[:13]), text_as_int[:13]))

'Harry Potter ' -- characters mapped to int -- > [32 52 69 69 76  2 40 66 71 71 56 69  2]


### DATASET PREPROCESSING
- character to index mapping and converting text to numerical representation
- defining max input sequence length
- creating TensorFlow Dataset from the source
- creating batches from dataset
- input-target split function (:-1 is input, and last element is expected value)
- applying input-target split function to each batch

In [266]:
seq_length = 100
examples_per_epoch = len(source)//(seq_length+1)

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(5):
  print(index2char[i.numpy()])


H
a
r
r
y


In [267]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(5):
  print(repr(''.join(index2char[item.numpy()])))

"Harry Potter and the Sorcerer's Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mrs. Dursley, of numb"
'er four, Privet Drive, were proud to say\nthat they were perfectly normal, thank you very much. They w'
"ere the last\npeople you'd expect to be involved in anything strange or mysterious,\nbecause they just "
"didn't hold with such nonsense.\n\nMr. Dursley was the director of a firm called Grunnings, which made\n"
'drills. He was a big, beefy man with hardly any neck, although he did\nhave a very large mustache. Mrs'


In [268]:
def split_input_target(chunk):
    input_text = chunk[:-1]
    target_text = chunk[1:]
    return input_text, target_text

dataset = sequences.map(split_input_target)

### PARAMETERS
- training parameters & data shuffling
- model hyperparameters

In [269]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>

In [270]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 300 #256

# Number of RNN units
rnn_units1 = 1024
rnn_units2 = 1024
rnn_units=[rnn_units1, rnn_units2]
print(vocab_size)

79


### BUILDING MODEL
- GRU, LSTM, RNN, with Conv1

In [271]:
# ORIGINAL GRU
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,
                              batch_input_shape=[batch_size, None]),
    tf.keras.layers.GRU(rnn_units1,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.GRU(rnn_units2,
                        return_sequences=True,
                        stateful=True,
                        recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
  ])
  return model

In [272]:
# # LSTM
# def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
#     model = tf.keras.Sequential([
#         tf.keras.layers.Embedding(vocab_size, embedding_dim, 
#                                   batch_input_shape=[batch_size, None]),
#         tf.keras.layers.LSTM(rnn_units1,
#                              return_sequences=True,
#                              stateful=True,
#                              recurrent_initializer='glorot_uniform'),
#         tf.keras.layers.LSTM(rnn_units2,
#                              return_sequences=True,
#                              stateful=True,
#                              recurrent_initializer='glorot_uniform'),
#         tf.keras.layers.Dense(vocab_size)
#     ])
#     return model

In [273]:
# # RNN
# def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
#     model = tf.keras.Sequential([
#         tf.keras.layers.Embedding(vocab_size, embedding_dim, 
#                                   batch_input_shape=[batch_size, None]),
#         tf.keras.layers.SimpleRNN(rnn_units1,
#                                   return_sequences=True,
#                                   stateful=True,
#                                   recurrent_initializer='glorot_uniform'),
#         tf.keras.layers.SimpleRNN(rnn_units2,
#                                   return_sequences=True,
#                                   stateful=True,
#                                   recurrent_initializer='glorot_uniform'),
#         tf.keras.layers.Dense(vocab_size)
#     ])
#     return model


In [274]:
# # dodanie Conv1
# def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
#     model = tf.keras.Sequential([
#         tf.keras.layers.Embedding(vocab_size, embedding_dim, 
#                                   batch_input_shape=[batch_size, None]),
#         tf.keras.layers.Conv1D(filters=128, 
#                                kernel_size=5, 
#                                activation='relu', 
#                                padding='same'),
#         tf.keras.layers.GRU(rnn_units1,
#                             return_sequences=True,
#                             stateful=True,
#                             recurrent_initializer='glorot_uniform'),
#         tf.keras.layers.GRU(rnn_units2,
#                             return_sequences=True,
#                             stateful=True,
#                             recurrent_initializer='glorot_uniform'),
#         tf.keras.layers.Dense(vocab_size)
#     ])
#     return model


In [275]:
model = build_model(
  vocab_size = vocab_size,
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [276]:
model.summary()

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (64, None, 300)           23700     
                                                                 
 gru_20 (GRU)                (64, None, 1024)          4073472   
                                                                 
 gru_21 (GRU)                (64, None, 1024)          6297600   
                                                                 
 dense_20 (Dense)            (64, None, 79)            80975     
                                                                 
Total params: 10475747 (39.96 MB)
Trainable params: 10475747 (39.96 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### MODEL TRAINING
- optimizer & loss function declaration
- compilation
- epochs number
- launching the training

In [277]:
def loss(labels, logits):
  return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

In [278]:
model.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

In [279]:
checkpoint_dir = './checkpoints_GRU'        # TODO: remember to choose proper directory
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [280]:
EPOCHS=1

In [281]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])



### RESULTS
- bulding the model and applying saved weights
- text generation function
- generate text based on the provided beginning

In [282]:
latest_check= tf.train.latest_checkpoint(checkpoint_dir)

In [283]:
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_check)

model.build(tf.TensorShape([1, None]))

In [284]:
model.summary()

Model: "sequential_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_13 (Embedding)    (1, None, 300)            23700     
                                                                 
 gru_22 (GRU)                (1, None, 1024)           4073472   
                                                                 
 gru_23 (GRU)                (1, None, 1024)           6297600   
                                                                 
 dense_21 (Dense)            (1, None, 79)             80975     
                                                                 
Total params: 10475747 (39.96 MB)
Trainable params: 10475747 (39.96 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [285]:
def generate_text(model, start_string):

  num_generate = 1000
  input_eval = [char2index[s] for s in start_string]
  input_eval = tf.expand_dims(input_eval, 0)
  text_generated = []
  scaling = 0.5

  model.reset_states()
  for i in range(num_generate):
      predictions = model(input_eval)
      predictions = tf.squeeze(predictions, 0)
      predictions = predictions / scaling
      predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()
      input_eval = tf.expand_dims([predicted_id], 0)
      text_generated.append(index2char[predicted_id])

  return (start_string + ''.join(text_generated))

In [286]:
print(generate_text(model, start_string=u"Harry killed Ron "))

Harry killed Ron yhe bat he thh an ose sole sarere the wat ot he ole pame and the fas tone an tol the th we tot he to ter yoroy sarey om bor the silad woron werind tor oring an to the sare soros the woud gepad there sas toud the tod the thin thor bore ere foin wemot he wose wat an oros an an he lad. on ant the sos the finle wher the ang onon wead anle sot oud soo fed an the lroind hinl herg and tint an the tor
ber momet an the eod an yote yin lad an the tounr and lo wat tiuns whal the be tal oud oo fan the the the tol onus thar he thore sing tor the iny ang thele was eard alind the tacbe ad car od the fort ant for sor the the the the sas boring the the sot ang eres thed war the the pe the ferere forer the sos she the douns fead the tom sat arkeinnd sole soun les an meree sate ne won cerelr ant and anl the un ther he sot oul thete there yed sas aly ind sarg an were tot ouny saok ke mals sare wapre te ton lek aned she an he th the fint he sorethe fare the the ther anr and the sared tot c

In [None]:
def calculate_cross_entropy_loss(model, test_sequences):
    total_loss = 0
    total_steps = 0
    
    for sequence in test_sequences:
        input_seq = sequence[:-1]
        target_seq = sequence[1:]
        
        input_seq = tf.expand_dims(input_seq, 0)
        
        predictions = model(input_seq)
        
        loss = tf.keras.losses.sparse_categorical_crossentropy(target_seq, predictions[0], from_logits=True)
        
        total_loss += tf.reduce_sum(loss).numpy()
        total_steps += len(target_seq)
    
    avg_loss = total_loss / total_steps
    return avg_loss

Średni Cross-Entropy Loss na zbiorze testowym: 5.3557


In [None]:
# tutaj można załadować inną część tolkiena albo hobbita
first_10_sequences = list(sequences.take(2))

In [304]:
avg_loss = calculate_cross_entropy_loss(model, first_10_sequences)
avg_loss

2.8558943939208983

In [305]:
perplexity = np.exp(avg_loss)
perplexity

17.38998375000394