<a href="https://colab.research.google.com/github/jvgille/deep-learning-project/blob/master/training_vs_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generator - Deep Learning project

## Setup

In [10]:
import os
import tensorflow as tf
import numpy as np
import math
import nltk
nltk.download('punkt')
from collections import defaultdict
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # 0 all, 1 no info, 2 no warning, 3 no error
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Hyperparameters

In [0]:
# Base folder (on the mounted Google Drive) that contains the `datasets/` directory read by preprocess().
file_location = '/content/drive/My Drive/Colab Notebooks/DL project/'
# Corpus name: preprocess() opens `<file_location>datasets/<filename>.txt`; also used to name the checkpoint directory.
filename = 'shakespeare'
epochs = 10  # number of epochs passed to model.fit
batch_size = 64  # sequences per training batch (also the fixed batch size of the training model)
seq_length = 100  # input characters per training example; chunks of seq_length + 1 are cut so the target can be shifted by one
rnn_units = 1024 # number of units in the LSTM layer
embedding_dim = 256 # passed to build_model() but not used there at the moment
training_set_proportion = 0.999  # leading fraction of the text used for training; the remainder is the test text
shuffle_training_data = True  # whether preprocess() shuffles the training sequences

## Functions

In [0]:
def preprocess(filename, batch_size, seq_length, training_set_proportion, file_location='', shuffle_training_data=True):
    """Load a text corpus and build the character-level training dataset.

    The file ``<file_location>datasets/<filename>.txt`` is read as UTF-8 and
    split by position: the first ``training_set_proportion`` of the characters
    become the training text, the rest the test text. The vocabulary is built
    from the training text only.

    Args:
        filename: Corpus name, without directory or ``.txt`` extension.
        batch_size: Number of sequences per batch (incomplete batches are dropped).
        seq_length: Number of input characters per example. Chunks of
            ``seq_length + 1`` characters are cut so input and target can be
            offset by one character.
        training_set_proportion: Fraction (0..1) of the text used for training.
        file_location: Prefix prepended to ``datasets/``.
        shuffle_training_data: If true, sequences are shuffled before batching.

    Returns:
        A tuple ``(training_dataset, idx_to_char, char_to_idx, training_vocab,
        test_text, training_text)`` where ``training_dataset`` is a batched
        ``tf.data.Dataset`` of ``(input, target)`` pairs, ``char_to_idx`` is a
        ``defaultdict`` that yields ``-1`` for characters not seen in training,
        and ``idx_to_char`` maps ``-1`` to ``"Unknown"``.
    """
    corpus_path = file_location + 'datasets/' + filename + '.txt'
    # Context manager so the file handle is closed even if decoding fails.
    with open(corpus_path, 'rb') as corpus_file:
        text = corpus_file.read().decode(encoding='utf-8')
    text_length = len(text)
    print("Text length: ", text_length)

    split_index = int(training_set_proportion * text_length)
    training_text = text[:split_index]
    test_text = text[split_index:]

    training_vocab = sorted(set(training_text))

    print ('{} training_set unique characters'.format(len(training_vocab)))
    print ('{} test_set unique characters'.format(len(set(test_text))))

    # Characters that never occur in the training text map to the sentinel -1.
    char_to_idx = defaultdict(lambda:-1)
    idx_to_char = {-1: "Unknown"}
    for i, u in enumerate(training_vocab):
        char_to_idx[u] = i
        idx_to_char[i] = u

    training_text_as_int = np.array([char_to_idx[c] for c in training_text])

    training_char_dataset = tf.data.Dataset.from_tensor_slices(training_text_as_int)

    # seq_length + 1 so that each chunk yields an input and a one-step-shifted target.
    training_sequences = training_char_dataset.batch(seq_length+1, drop_remainder=True)

    def split_input_target(chunk):
        input_text = chunk[:-1]
        target_text = chunk[1:]
        return input_text, target_text

    training_dataset = training_sequences.map(split_input_target)

    # TODO: a buffer smaller than the number of sequences only shuffles approximately.
    BUFFER_SIZE = 10000
    if shuffle_training_data:
        training_dataset = training_dataset.shuffle(BUFFER_SIZE)

    # Batch regardless of shuffling: the model expects [batch_size, seq] inputs
    # in both cases, so batching must not depend on shuffle_training_data.
    training_dataset = training_dataset.batch(batch_size, drop_remainder=True)

    return training_dataset, idx_to_char, char_to_idx, training_vocab, test_text, training_text

def train_model(model, dataset, epochs, checkpoint_prefix):
    """Compile ``model`` with Adam and fit it on ``dataset``.

    Weights are checkpointed to ``checkpoint_prefix`` whenever the training
    loss improves (only the best weights are kept).

    Args:
        model: Keras model whose outputs are treated as logits.
        dataset: Dataset of ``(input, target)`` batches passed to ``model.fit``.
        epochs: Number of epochs to train for.
        checkpoint_prefix: File path prefix for the saved weights.

    Returns:
        The object returned by ``model.fit``.
    """
    def loss(labels, logits):
        # from_logits=True: the outputs are unnormalized scores, not probabilities.
        return tf.keras.losses.sparse_categorical_crossentropy(
            labels, logits, from_logits=True)

    model.compile(optimizer='adam', loss=loss)

    # TODO: monitor val_loss instead once a validation set is passed to fit().
    checkpoint_options = dict(
        filepath=checkpoint_prefix,
        save_weights_only=True,
        save_best_only=True,
        monitor='loss',
    )
    callbacks = [tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)]

    history = model.fit(dataset, epochs=epochs, callbacks=callbacks)
    return history


def one_hot(x):
    """One-hot encode integer token ids along a new last axis.

    The encoding depth is read from the module-level ``vocab_size``, which
    must be defined before this function is called.

    Args:
        x: Tensor (or array-like) of integer token ids.

    Returns:
        The result of ``tf.one_hot`` with ``depth=vocab_size``.
    """
    # int32 instead of uint8: uint8 wraps ids >= 256 and turns the -1
    # "unknown" id into 255, which becomes a valid class once vocab_size > 255.
    # With int32, out-of-range ids (including -1) encode as an all-zero row.
    return tf.one_hot(tf.cast(x, 'int32'), depth=vocab_size)


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: one-hot -> stateful LSTM -> Dense logits.

    Args:
        vocab_size: Number of distinct tokens; sets both the one-hot depth and
            the size of the output layer.
        embedding_dim: Currently unused; kept so existing callers keep working.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch size of the model input (the LSTM is stateful).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model producing ``vocab_size``
        logits per time step.
    """
    def encode_one_hot(token_ids):
        # Depth comes from this function's vocab_size argument rather than a
        # notebook-level global, so input and output sizes cannot disagree.
        # int32 keeps ids >= 256 and the -1 "unknown" id from wrapping around.
        return tf.one_hot(tf.cast(token_ids, 'int32'), depth=vocab_size)

    model = tf.keras.Sequential([
        tf.keras.layers.Lambda(encode_one_hot, batch_input_shape=[batch_size, None]),
        tf.keras.layers.LSTM(rnn_units,
                             return_sequences=True,
                             stateful=True,
                             recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size)
    ])
    return model


    
def generate_text(model, char_to_idx, idx_to_char,
                  start_string, num_generate=1000, temperature=1.0):
    """Sample ``num_generate`` characters from ``model``, seeded with ``start_string``.

    Args:
        model: Model called on a ``[1, n]`` batch of token ids; its states are
            reset before generation starts.
        char_to_idx: Mapping from character to token id.
        idx_to_char: Mapping from token id to character.
        start_string: Seed text fed to the model first.
        num_generate: Number of characters to sample.
        temperature: Divisor applied to the logits before sampling.

    Returns:
        ``(text, BPC)``: the seed followed by the sampled characters, and the
        average of ``-log2 p(sampled char)`` over the generated characters.
        The probability is taken from the softmax of the unscaled logits,
        while sampling uses the temperature-scaled ones. Raises
        ``ZeroDivisionError`` when ``num_generate`` is 0.
    """
    LOG_2 = math.log(2)

    # Vectorize the seed string and add the batch dimension.
    model_input = tf.expand_dims([char_to_idx[ch] for ch in start_string], 0)

    generated_chars = []

    model.reset_states()
    BPC_sum = 0
    for _ in range(num_generate):
        # Drop the batch dimension from the model output.
        logits = tf.squeeze(model(model_input), 0)

        # The logits are unnormalized log probabilities (this is how
        # tf.random.categorical treats them); softmax normalizes them.
        # Only the last time step predicts the next character.
        last_step_probs = tf.nn.softmax(logits[-1, :])

        # Draw the next character id from the temperature-scaled distribution.
        scaled_logits = logits / temperature
        sampled_id = tf.random.categorical(scaled_logits, num_samples=1)[-1,0].numpy()

        # Accumulate -log2 of the probability assigned to the sampled id.
        BPC_sum -= tf.math.log(last_step_probs[sampled_id]) / LOG_2

        # Feed the sampled character back in; the stateful model carries
        # the hidden state over from the previous call.
        model_input = tf.expand_dims([sampled_id], 0)

        generated_chars.append(idx_to_char[sampled_id])

    BPC = BPC_sum / num_generate
    return (start_string + ''.join(generated_chars), BPC)



def evaluate_model(model, test_text, char_to_idx, idx_to_char):
    """Compute bits-per-character and a word-level perplexity on ``test_text``.

    The text is fed to the model one character at a time; for each position
    the probability assigned to the actual next character is accumulated as
    ``-log2 p``. Positions where the current or next character has id ``-1``
    (not seen in training) are skipped and the model states are reset.

    Args:
        model: Model called on a ``[1, 1]`` batch of token ids; its states are
            reset before evaluation starts.
        test_text: Text to evaluate on.
        char_to_idx: Mapping from character to token id. It is not modified.
        idx_to_char: Unused; kept for interface compatibility.

    Returns:
        ``(BPC, perplexity)`` where ``perplexity`` is
        ``2 ** (len(test_text) / number_of_tokens * BPC)`` and the tokens come
        from ``nltk.word_tokenize`` (punctuation counts as a token).

    Raises:
        ZeroDivisionError: If no character transition could be scored or the
            text contains no tokens.
    """
    UNKNOWN = -1
    LOG_2 = math.log(2)

    # .get() rather than [] so that looking up unseen characters does not
    # insert them into the caller's defaultdict.
    test_text_as_int = [char_to_idx.get(s, UNKNOWN) for s in test_text]
    text_length = len(test_text)
    model.reset_states()
    BPC_sum = 0
    unknown_count = 0
    for i in range(text_length - 1):
        if i % 1000 == 0:
            print(i, " characters evaluated")
        token = test_text_as_int[i]
        next_token = test_text_as_int[i+1]
        if token == UNKNOWN or next_token == UNKNOWN: # did not exist in training
            # Could instead be avoided by making sure every test character
            # also occurs in the training set.
            unknown_count += 1
            model.reset_states()
            continue

        input_eval = tf.expand_dims([token], 0)
        unnormalized_log_predictions = model(input_eval)
        next_token_probability = tf.nn.softmax(unnormalized_log_predictions[0, 0])[next_token]
        BPC_sum -= tf.math.log(next_token_probability) / LOG_2

    scored_count = text_length - 1 - unknown_count
    if scored_count <= 0:
        # Same exception type the bare division would raise, with an explanation.
        raise ZeroDivisionError(
            "evaluate_model: no character transitions could be scored "
            "(text too short or all characters unknown)")
    BPC = BPC_sum / scored_count

    all_words = nltk.word_tokenize(test_text)  # also counts things like "," or "!" as words
    if not all_words:
        raise ZeroDivisionError("evaluate_model: test_text contains no tokens")
    # Average characters per token times bits per character = bits per token.
    perplexity = 2 ** (text_length / len(all_words) * BPC)

    return BPC, perplexity
    
    

## Test

In [13]:
# Load the corpus and build the training dataset from the hyperparameters above.
(training_dataset, idx_to_char, char_to_idx,
 vocab, test_text, training_text) = preprocess(
    filename=filename,
    batch_size=batch_size,
    seq_length=seq_length,
    training_set_proportion=training_set_proportion,
    file_location=file_location,
    shuffle_training_data=shuffle_training_data,
)


Text length:  1115394
65 training_set unique characters
45 test_set unique characters


In [0]:
# Vocabulary size (read by one_hot/build_model) and where checkpoints are written.
vocab_size = len(vocab)
checkpoint_dir = './training_checkpoints/{}'.format(filename)
checkpoint_prefix = '{}/ckpt'.format(checkpoint_dir)

In [0]:
### Train model (comment out if only generating)
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=batch_size,
)

In [16]:
# To continue training where it left off, uncomment:
# model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
history = train_model(
    model=model,
    dataset=training_dataset,
    epochs=epochs,
    checkpoint_prefix=checkpoint_prefix,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
### Generate sample (comment out if only training)
# Rebuild the network with batch_size=1 and load the trained weights into it,
# so single sequences can be fed one at a time.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
model.load_weights(latest_checkpoint)
model.build(tf.TensorShape([1, None]))

# Sample 200 characters and report the BPC of the sampled text.
output, BPC = generate_text(
    model, char_to_idx, idx_to_char,
    start_string=u"t",
    num_generate=200,
)
print("BPC: ", BPC)
print(output)

# Score the held-out test text.
BPC, perplexity = evaluate_model(
    model=model,
    test_text=test_text,
    char_to_idx=char_to_idx,
    idx_to_char=idx_to_char,
)
print("BPC: ", BPC)
print("perplexity: ", perplexity)

BPC:  tf.Tensor(2.0812533, shape=(), dtype=float32)
tO, hours of these ofjest yet secret for me,
Thou loving race perisan's by the king.
I think is but his noble deat!
What stance I learn speak pilich walping thee,
Thou didst my sweet should not unull f
0  characters evaluated
1000  characters evaluated
BPC:  tf.Tensor(2.1114361, shape=(), dtype=float32)
perplexity:  tf.Tensor(589.98816, shape=(), dtype=float32)


In [18]:
# Same evaluation on the first 1000 training characters, for comparison with the test score.
training_sample = training_text[:1000]
BPC, perplexity = evaluate_model(
    model=model,
    test_text=training_sample,
    char_to_idx=char_to_idx,
    idx_to_char=idx_to_char,
)
print("BPC: ", BPC)
print("perplexity: ", perplexity)

0  characters evaluated
BPC:  tf.Tensor(1.9914516, shape=(), dtype=float32)
perplexity:  tf.Tensor(515.9524, shape=(), dtype=float32)
