<a href="https://colab.research.google.com/github/jvgille/deep-learning-project/blob/master/mostly_jeopardy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Generator - Deep Learning project

## setup

In [0]:
# this is the one before i started trying to add regularization
# this is my current working file
import os
import tensorflow as tf
import numpy as np
import math
import datetime
import nltk
nltk.download('punkt')
from collections import defaultdict
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # 0 all, 1 no info, 2 no warning, 3 no error
from google.colab import drive
drive.mount('/content/drive')
# Load the TensorBoard notebook extension
%load_ext tensorboard
# Clear any logs from previous runs
log_dir =  r'/content/drive/My Drive/Skola/Deep Learning/tensorboard_logs/' + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# TODO

- check what a good learning rate is for the regular setup of 518 nodes one layer
- try double 518 layers and optimize learning rate for it, compare BPC between this and 1 layer
- reduce capacity of first/second layer see difference between low high and high low
- try bidirectional instead, might make more sense with text
- try skip connections if easy
- try adding even more layers, say 5 for more significant difference
- can also play with batch size see what is fastest and if it changes performance
- reduce sequence length, see if text becomes stupider but higher BPC?
- testa GRU se sista FL language engineering

## Hyperparameters

In [0]:
file_location = '/content/drive/My Drive/Colab Notebooks/DL project/'
filename = 'jeopardy2'
epochs = 8
batch_size = 64
seq_length = 100
rnn_units = 1024 # 1024
use_second_layer = False
rnn_units_layer_2 = 0 # only used if use_second_layer
embedding_dim = 256 # not used atm
training_set_proportion = 0.999

# Adam optimizer parameters:
LEARNING_RATE = 0.001 # 0.001
BETA_1 = 0.9 # 0.9
BETA_2 = 0.999 # 0.999
EPSILON = 1e-07 # 1e-07

shuffle_training_data = True

## Functions

In [0]:
def preprocess(filename, batch_size, seq_length, training_set_proportion, file_location='', shuffle_training_data=True):
    text = open(file_location + 'datasets/'+filename+'.txt', 'rb').read().decode(encoding='utf-8')
    text_length = len(text)
    print("Text length: ", text_length)

    training_text = text[:int((training_set_proportion * text_length))]
    test_text = text[int((training_set_proportion * text_length)):]

    print ('{} training_set unique characters'.format(len(sorted(set(training_text)))))
    print ('{} test_set unique characters'.format(len(sorted(set(test_text)))))

    training_vocab = sorted(set(training_text))

    char_to_idx = defaultdict(lambda:-1)
    idx_to_char = {-1: "Unknown"}
    for i, u in enumerate(training_vocab):
        char_to_idx[u] = i
        idx_to_char[i] = u

    training_text_as_int = np.array([char_to_idx[c] for c in training_text])
    #test_text_as_int = np.array([char_to_idx[c] for c in test_text])

    training_char_dataset = tf.data.Dataset.from_tensor_slices(training_text_as_int)
    #test_char_dataset = tf.data.Dataset.from_tensor_slices(test_text_as_int)

    training_sequences = training_char_dataset.batch(seq_length+1, drop_remainder=True)
    #test_sequences = test_char_dataset.batch(seq_length+1, drop_remainder=True)

    def split_input_target(chunk):
        input_text = chunk[:-1]
        target_text = chunk[1:]
        return input_text, target_text

    training_dataset = training_sequences.map(split_input_target)
    #test_dataset = test_sequences.map(split_input_target)

    BUFFER_SIZE = 10000 # TODO: make actual shuffle?
    if shuffle_training_data:
        training_dataset = training_dataset.shuffle(BUFFER_SIZE).batch(batch_size, drop_remainder=True)

    
    return training_dataset, idx_to_char, char_to_idx, training_vocab, test_text, training_text

def train_model(model, dataset, epochs, checkpoint_prefix):
    def loss(labels, logits):
        return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

    our_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=BETA_1, beta_2=BETA_2, epsilon=EPSILON)
    model.compile(optimizer='adam', loss=loss)

    # todo save best only
    checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix,
        save_weights_only=True,
        save_best_only=True,
        monitor='loss') # TODO monitor val_loss instead
    tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

    return model.fit(dataset, epochs=epochs, callbacks=[checkpoint_callback, tensorboard_callback])


def one_hot(x):
    global vocab_size
    return tf.one_hot(tf.cast(x, 'uint8'), depth=vocab_size)


def build_model(vocab_size, embedding_dim, rnn_units, batch_size):

    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Lambda(one_hot, batch_input_shape=[batch_size,None]))
    model.add(tf.keras.layers.LSTM(rnn_units,
                                return_sequences=True,
                                stateful=True,
                                recurrent_initializer='glorot_uniform'))
    
    if use_second_layer:
      model.add(tf.keras.layers.LSTM(rnn_units_layer_2,
                                return_sequences=True,
                                stateful=True,
                                recurrent_initializer='glorot_uniform',
                                dropout=0.2,
                                recurrent_dropout=0.2,
                                kernel_regularizer = tf.keras.regularizers.l2(0.2),
                                recurrent_regularizer = tf.keras.regularizers.l2(0.2),
                                bias_regularizer = tf.keras.regularizers.l2(0.2),
                                activity_regularizer = tf.keras.regularizers.l2(0.2)))
    tf.keras.layers.Dropout(0.4)
    model.add(tf.keras.layers.Dense(vocab_size))
    print(model.summary)
    return model


    
def generate_text(model, char_to_idx, idx_to_char,
                  start_string, num_generate=1000, temperature=1.0):
    # Converting our start string to numbers (vectorizing)
    input_eval = [char_to_idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    model.reset_states()
    BPC_sum = 0
    for i in range(num_generate):
        predictions = model(input_eval)
         # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # used for BPC/perplexity calculations. predictions is unnormalized log probabilities (that is how tf.random.cathegorical treats them later)
        current_prediction = tf.nn.softmax(predictions[-1, :]) # now we have normalized regular probabilities. -1 because we only care about the last output character/word

        # using a categorical distribution to predict the character returned by the model
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

        current_probability = current_prediction[predicted_id]
        BPC_sum -= tf.math.log(current_probability) / math.log(2) # aka add minus log2 of prob

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx_to_char[predicted_id])

    BPC = BPC_sum / num_generate
    return ( (start_string + ''.join(text_generated)), BPC)



def evaluate_model(model, test_text, char_to_idx, idx_to_char):
    test_text_as_int = [char_to_idx[s] for s in test_text]
    text_length = len(test_text)
    model.reset_states()
    BPC_sum = 0
    unknown_count = 0
    for i in range(text_length - 1):
        if i % 1000 == 0:
          print(i, " characters evaluated so far")
        token = test_text_as_int[i]
        next_token = test_text_as_int[i+1]
        if token == -1 or next_token == -1: # did not exist in training
            unknown_count += 1
            model.reset_states()
            continue
            # could solve this instead by making sure that all characters in the test set are in the training set, works for characters in many situations
            
        input_eval = tf.expand_dims([token], 0)
        unnormalized_log_predictions = model(input_eval)
        next_token_probability = tf.nn.softmax(unnormalized_log_predictions[0, 0])[next_token]
        BPC_sum -= tf.math.log(next_token_probability) / math.log(2)
    
    BPC = BPC_sum / (text_length - 1 - unknown_count)
    # TODO: calculate perplexity using BPC and average word length
    all_words = nltk.word_tokenize(test_text)  # also counts things like "," or "!" as words
    perplexity = 2 ** (text_length / len(all_words) * BPC)

    return BPC, perplexity
    
    

## Test

In [0]:
training_dataset, idx_to_char, char_to_idx, vocab, test_text, training_text = preprocess(filename=filename, 
                                                      batch_size=batch_size, 
                                                      seq_length=seq_length, 
                                                      file_location=file_location, 
                                                      training_set_proportion=training_set_proportion,
                                                      shuffle_training_data=shuffle_training_data)


Text length:  24147105
96 training_set unique characters
83 test_set unique characters


In [0]:
vocab_size = len(vocab)
checkpoint_dir = './training_checkpoints/' + filename
checkpoint_prefix = checkpoint_dir + '/ckpt'

In [0]:
### Train model (comment out if only generating)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

<bound method Network.summary of <tensorflow.python.keras.engine.sequential.Sequential object at 0x7f196b6f7e80>>


In [0]:
#model.load_weights(tf.train.latest_checkpoint(checkpoint_dir)) # use this to continue training where it left off
history = train_model(model, training_dataset, epochs=epochs, checkpoint_prefix=checkpoint_prefix)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [0]:
### Generate sample (comment out if only training)
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))

(output, BPC) = generate_text(model,char_to_idx,idx_to_char,
                              start_string="QUESTION:\n", num_generate=1000, temperature=0.7)

print("BPC: ", BPC)
print(output)


<bound method Network.summary of <tensorflow.python.keras.engine.sequential.Sequential object at 0x7f18ab03d5f8>>
BPC:  tf.Tensor(1.0142515, shape=(), dtype=float32)
QUESTION:
QUESTION:
'In a 1992 special news star this director of "The Pantree Gardens" was a magazine that won an Oscar for his movie "Delilah With" these behavior'
ANSWER:
Neil Simon
QUESTION:
'During this event, the power of horses, the world was also called the Grand Canyon'
ANSWER:
Arctic Ocean
QUESTION:
'This capital of Mars is named for a castle of war in 1985'
ANSWER:
Zemin
QUESTION:
'The governor of St. Martha's Church last inch of a house of the format of a parliamentary sports car school in this country'
ANSWER:
Israel
QUESTION:
'She's the teacher of a thriller for his 1939 album "Paradise Lost"'
ANSWER:
Adrian Falls
QUESTION:
'"If I Jones", following "Never Land that is called the thing that ran the ____ might be standing"'
ANSWER:
the east
QUESTION:
'The first job of this president had states with Thor Heyerda

In [0]:
BPC, perplexity = evaluate_model(model=model, 
               test_text=training_text[0:1000], 
               char_to_idx=char_to_idx, 
               idx_to_char=idx_to_char)
print("BPC: ", BPC)
print("perplexity: ", perplexity)


BPC, perplexity = evaluate_model(model=model, 
               test_text=test_text[:1000], 
               char_to_idx=char_to_idx, 
               idx_to_char=idx_to_char)
print("BPC: ", BPC)
print("perplexity: ", perplexity)




0  characters evaluated so far
BPC:  tf.Tensor(1.311261, shape=(), dtype=float32)
perplexity:  tf.Tensor(98.532715, shape=(), dtype=float32)
0  characters evaluated so far
BPC:  tf.Tensor(1.4532557, shape=(), dtype=float32)
perplexity:  tf.Tensor(200.67458, shape=(), dtype=float32)


In [0]:
#%tensorboard --logdir="/content/drive/My Drive/Skola/Deep Learning/tensorboard_logs"
print(test_text[68:100])

ON:
'The dwarves in his novels 


# Results

512 nodes standard setup, 8 epochs, LR 0.001

Test BPC:

2.5164585

2.5571847

Train BPC:

2.4705796

2.5323067

-----------------

512 nodes standard setup, 8 epochs, LR 0.005

Test BPC:

2.5267715

Train BPC:

2.4885614

--------------

512 nodes standard setup, 8 epochs, LR 0.0002

Test BPC:

2.557327

Train BPC:

2.4768913

---------------

512 - 512 nodes 2 layer, 8 epochs, LR 0.001

Test BPC:

2.1694303

Train BPC:

2.1530702

31 seconds per epoch! - how many nodes for single layer could i get for this? (time is not super comparative since it differs on google colabs)

----------------

512 - 512 nodes 2 layer, 8 epochs, LR 0.001

Test BPC:

2.1694303

Train BPC:

2.1530702

--------------

1024 nodes 8 epochs, LR 0.001

Test BPC:

2.1679668

Train BPC:

2.1070945

------------

256 nodes in 4 layers, 8 epochs LR 0.001

Test BPC:

2.64231

Train BPC:

2.624404

--------------

1024 nodes 1 layer 8 epochs SimpleRNN

Test BPC:

2.7277024

Train BPC:

2.555806

-------------------

1024 nodes 1 layer 8 epochs GRU

Test BPC:

2.197043

Train BPC:

2.1322622

-----------------

1024 nodes 1 layer 8 epochs GRU - Bidirectional. Loss went down very quickly, maybe use less epochs if overfitting. The generated text was "--------------------------"
But bidirectional makes no sense right?

Test BPC:
 
7.093867

Train BPC:

7.005799

--------------------

256 nodes 1 layer 8 epochs GRU - Bidirectional. made like MPMPMPMPMMMPPMPMP

Test BPC:
 
7.197046

Train BPC:

7.112849

--------------

1024 nodes 1 layer 8 epochs SimpleRNN - Bidirectional. 

Test BPC:
 
7.6806884

Train BPC:

7.4952

-----------------

1024 nodes 1 layer 8 epochs LSTM - Bidirectional. 

Test BPC:

6.6484833

Train BPC:

6.6869802

--------------------

1024 nodes 1 layer 8 epochs LSTM with only 20 sequence length

Test BPC:

2.0334609

Train BPC:

1.9287055

--------------------------

jeopardy2 data, 4 epochs test BPC 1.4876859 train BPC 1.4236836. Then 8 epochs gave 1.3546735 train  and 1.4685392 test so pretty much just overfitting now.

----------------------

Adding dropouts 0.1/0.2 before dense same as before
0  characters evaluated so far
BPC:  tf.Tensor(1.3314992, shape=(), dtype=float32)
perplexity:  tf.Tensor(105.76686, shape=(), dtype=float32)
0  characters evaluated so far
BPC:  tf.Tensor(1.4445932, shape=(), dtype=float32)
perplexity:  tf.Tensor(194.43198, shape=(), dtype=float32)

--------------

keeping dropout also doing l2 of 0.01:

0  characters evaluated so far
BPC:  tf.Tensor(1.3669864, shape=(), dtype=float32)
perplexity:  tf.Tensor(119.757454, shape=(), dtype=float32)
0  characters evaluated so far
BPC:  tf.Tensor(1.4631684, shape=(), dtype=float32)
perplexity:  tf.Tensor(208.06439, shape=(), dtype=float32)

--------------------------

increasing l2 to 0.1 och dropout to 0.2/0.4

0  characters evaluated so far
BPC:  tf.Tensor(1.311261, shape=(), dtype=float32)
perplexity:  tf.Tensor(98.532715, shape=(), dtype=float32)
0  characters evaluated so far
BPC:  tf.Tensor(1.4532557, shape=(), dtype=float32)
perplexity:  tf.Tensor(200.67458, shape=(), dtype=float32)
--------------
--------------
file_location = '/content/drive/My Drive/Colab Notebooks/DL project/'
filename = 'shakespeare'
epochs = 8
batch_size = 64
seq_length = 100
rnn_units = 512 # 1024
use_second_layer = False
rnn_units_layer_2 = 0 # only used if use_second_layer
embedding_dim = 256 # not used atm
training_set_proportion = 0.999

# Adam optimizer parameters:
LEARNING_RATE = 0.0002 # 0.001
BETA_1 = 0.9 # 0.9
BETA_2 = 0.999 # 0.999
EPSILON = 1e-07 # 1e-07

shuffle_training_data = True

this gave nonsense for start string single capital letter or first 1000 characters, but not first 1000 characters starting from character 1 instead of 0. BPC 14 worse than uniform random.

----------------------

'In 1995 an explorer took his first ballet "House Of The River of the World" at the 1994 Olympics'

"Text: I have a dog biscuit and can always forware my cats for heating of these cookies."

QUESTION:
'The familial term for a person who is traveling out with a group of living vents'

QUESTION: 'The name of this compoud comes from the latin for "to start a book"'

