In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('train.csv')
data.describe()

Unnamed: 0,Quotes
count,1000
unique,890
top,"Radiate acceptance, and find peace in embracin..."
freq,5


1000 quotes in the dataset, with some repeating quotes

In [3]:
# Remove repeated quotes so as to not have biases
data = data.drop_duplicates()

In [4]:
data = list(data.Quotes.values)
data[:5]

["Embrace the beauty of every sunrise; it's a fresh chance to paint your world with joy.",
 'Embrace challenges; they are the stepping stones to your greatest victories.',
 'Embrace the rhythm of life and let it dance through your soul.',
 'Embrace kindness, for it has the power to change the world one heart at a time.',
 'Embrace the journey, for it leads to the destination of your dreams.']

In [5]:
# Tokenisation.

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer() # Instantiate a Tokenizer object.
tokenizer.fit_on_texts(data) # Fit your Tokenizer onto dataset. This will map words in your dataset to integers.

In [6]:
tokenizer.word_index

{'the': 1,
 'of': 2,
 'your': 3,
 'and': 4,
 'a': 5,
 'is': 6,
 'in': 7,
 'for': 8,
 'let': 9,
 'to': 10,
 'it': 11,
 'be': 12,
 'every': 13,
 'our': 14,
 'you': 15,
 'that': 16,
 'embrace': 17,
 "life's": 18,
 'this': 19,
 'are': 20,
 'morning': 21,
 'with': 22,
 'radiate': 23,
 'dance': 24,
 'heart': 25,
 'believe': 26,
 'yourself': 27,
 'through': 28,
 "planet's": 29,
 'will': 30,
 'life': 31,
 'love': 32,
 'they': 33,
 "singapore's": 34,
 'kindness': 35,
 'power': 36,
 'from': 37,
 'dreams': 38,
 'we': 39,
 'soul': 40,
 'symphony': 41,
 'act': 42,
 'find': 43,
 'gratitude': 44,
 'singapore': 45,
 'world': 46,
 'strength': 47,
 'light': 48,
 'beauty': 49,
 'journey': 50,
 'nature': 51,
 'joy': 52,
 'planet': 53,
 'canvas': 54,
 'colors': 55,
 'way': 56,
 'whispers': 57,
 'where': 58,
 'potential': 59,
 'hope': 60,
 'testament': 61,
 'resilience': 62,
 'towards': 63,
 'true': 64,
 'new': 65,
 'compassion': 66,
 'beacon': 67,
 'actions': 68,
 'future': 69,
 'spirit': 70,
 'step': 71,


0 index is saved for paddings

Connector and filler words are most common<br>
Most common nouns/verbs/adjectives include dance, life's/life is, embrace<br>
Model is most likely to predict words related to embracing life

In [7]:
total_words = len(tokenizer.word_index) + 1 # Because index 0 is reserved for padding.
print(total_words)

1199


In [8]:
print(data[0])
print(tokenizer.texts_to_sequences([data[0]])[0])
print()

print(data[1])
print(tokenizer.texts_to_sequences([data[1]])[0])
print()

print(data[2])
print(tokenizer.texts_to_sequences([data[2]])[0]) # Extra [0] at the end, because the output is a list of list.
print()

Embrace the beauty of every sunrise; it's a fresh chance to paint your world with joy.
[17, 1, 49, 2, 13, 77, 372, 5, 163, 486, 10, 101, 3, 46, 22, 52]

Embrace challenges; they are the stepping stones to your greatest victories.
[17, 148, 33, 20, 1, 373, 487, 10, 3, 374, 687]

Embrace the rhythm of life and let it dance through your soul.
[17, 1, 119, 2, 31, 4, 9, 11, 24, 28, 3, 40]



When prompted with embrace, the model may predict 1, or 'the', a lot of the time, looking at the sample tokenised texts

In [9]:
# converts each of the 1000 quotes from texts to integers.

input_sequence = []

for datapoint in data:
    token_list = tokenizer.texts_to_sequences([datapoint])[0] 
    input_sequence.append(token_list)

print(f'Total number of quotes: {len(input_sequence)}')

Total number of quotes: 890


In [10]:
# Build input-output pairs where the input has 2 words.
# The input is 1 word and the corresponding output is the next word.
# Create a list of 2 words

input_output_pair1 = []
for datapoint in input_sequence:
    for i in range(len(datapoint) - 1): 
        input_output_pair1.append(datapoint[i:i + 2])
        
print(f'Total number of input-output pairs is: {len(input_output_pair1)}')

Total number of input-output pairs is: 9729


In [11]:
# Build input-output pairs where the input has 3 words.
# The input is 2 words and the corresponding output is the next word.
# Create a list of 3 words

input_output_pair2 = []
for datapoint in input_sequence:
    for i in range(len(datapoint) - 2): # Why -2?
        input_output_pair2.append(datapoint[i:i + 3])
        
print(f'Total number of input-output pairs is: {len(input_output_pair2)}')

Total number of input-output pairs is: 8839


In [12]:
# Build input-output pairs where the input has N-1 words.
# The input is every N-1 words and the corresponding output is the last word.
# Create a list of "N words", i.e. the entire quote.

input_output_pairN = []
for datapoint in input_sequence:
    input_output_pairN.append(datapoint) # Why no nested for-loop?
        
print(f'Total number of input-output pairs is: {len(input_output_pairN)}')

Total number of input-output pairs is: 890


In [13]:
print('Original quotes:')
print(input_sequence[0:2])
print()
print('Each quote chopped into phrases of two words:')
print(input_output_pair1[:20])
print()
print('Each quote chopped into phrases of three words:')
print(input_output_pair2[:20])
print()
print('Each quote itself:')
print(input_output_pairN[:20])

Original quotes:
[[17, 1, 49, 2, 13, 77, 372, 5, 163, 486, 10, 101, 3, 46, 22, 52], [17, 148, 33, 20, 1, 373, 487, 10, 3, 374, 687]]

Each quote chopped into phrases of two words:
[[17, 1], [1, 49], [49, 2], [2, 13], [13, 77], [77, 372], [372, 5], [5, 163], [163, 486], [486, 10], [10, 101], [101, 3], [3, 46], [46, 22], [22, 52], [17, 148], [148, 33], [33, 20], [20, 1], [1, 373]]

Each quote chopped into phrases of three words:
[[17, 1, 49], [1, 49, 2], [49, 2, 13], [2, 13, 77], [13, 77, 372], [77, 372, 5], [372, 5, 163], [5, 163, 486], [163, 486, 10], [486, 10, 101], [10, 101, 3], [101, 3, 46], [3, 46, 22], [46, 22, 52], [17, 148, 33], [148, 33, 20], [33, 20, 1], [20, 1, 373], [1, 373, 487], [373, 487, 10]]

Each quote itself:
[[17, 1, 49, 2, 13, 77, 372, 5, 163, 486, 10, 101, 3, 46, 22, 52], [17, 148, 33, 20, 1, 373, 487, 10, 3, 374, 687], [17, 1, 119, 2, 31, 4, 9, 11, 24, 28, 3, 40], [17, 35, 8, 11, 149, 1, 36, 10, 72, 1, 46, 375, 25, 488, 5, 299], [17, 1, 50, 8, 11, 247, 10, 1, 688,

In [14]:
# Generate more data
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

input_output_pair = input_output_pair1 + input_output_pair2 + input_output_pairN
max_sequence_len = max([len(x) for x in input_output_pair]) # This is to get the longest sequence of texts.
                                                            # And pad all datapoints to be of this length.
input_output_pair = np.array(pad_sequences(input_output_pair, maxlen=max_sequence_len, padding='pre'))
print(input_output_pair) # All datapoints now have the same length.

[[  0   0   0 ...   0  17   1]
 [  0   0   0 ...   0   1  49]
 [  0   0   0 ...   0  49   2]
 ...
 [  0   0   0 ...   7   1 106]
 [  0   0   0 ... 604 112  15]
 [  0   0   0 ...   2  13  81]]


In [15]:
# Now that we have an array, slice the last word as the output.
# The preceding words are the input.

X = input_output_pair[:,:-1]
y = input_output_pair[:,-1]
print(X)
print(y)

[[  0   0   0 ...   0   0  17]
 [  0   0   0 ...   0   0   1]
 [  0   0   0 ...   0   0  49]
 ...
 [  0   0   0 ... 211   7   1]
 [  0   0   0 ...  16 604 112]
 [  0   0   0 ... 588   2  13]]
[  1  49   2 ... 106  15  81]


In [16]:
# Convert y to be a one-hot array.

from tensorflow.keras import utils

y = utils.to_categorical(y, num_classes=total_words)

In [17]:
print(total_words)
print(X.shape)
print(y.shape)

1199
(19458, 34)
(19458, 1199)


In [18]:
# Create model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

import tensorflow as tf
# gpus = tf.config.experimental.list_physical_devices('GPU')
# for gpu in gpus:
#     tf.config.experimental.set_memory_growth(gpu, True)

epochs = 50

# # Input is max sequence length - 1, as we've removed the last word for the label.
# input_len = max_sequence_len - 1 

# model = Sequential()

# # Add input embedding layer.
# model.add(Embedding(total_words, 10, input_length=input_len))

# # Add LSTM layer with 256 units.
# model.add(LSTM(256))

# # Control overfitting.
# model.add(Dropout(0.3))

# # Add output layer
# model.add(Dense(total_words, activation='softmax'))

# model.summary()

In [19]:
# model.compile(loss='categorical_crossentropy', optimizer='adam')

In [20]:
# model.fit(X, y, epochs=30, verbose=1)
# model.save_weights('rnn_weights1.h5')


In [21]:
# model.load_weights('rnn_weights1.h5')
# model.summary()

In [22]:
seed_texts = ['embrace each day',               
              'radiate some',               
              'believe that',               
              "life's actual purpose is",               
              'dance through each and every',               
              'let your time and energy',               
              'every person is',               
              'our country Singapore is',               
              'planet earth is',               
              'morning and evening would make it'] 

In [23]:
# Run parametric experiment to tune number of LSTM layers
def createAndTrain_model_lstmLayers(num_layers):
    input_len = max_sequence_len - 1 

    model = Sequential()

    # Add input embedding layer.
    model.add(Embedding(total_words, 10, input_length=input_len))

    # Add N LSTM layers with 256 units.
    for i in range(num_layers):
        model.add(LSTM(256, return_sequences=True))
        model.add(tf.keras.layers.LayerNormalization())
        
    # Add output layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    history = model.fit(X, y, epochs=epochs, verbose=1, batch_size=32, shuffle=True)

    return history


In [25]:
# range of layers
layers = np.arange(2, 11)

# initialize output results matrices
lossResultsLayers = np.zeros((epochs, len(layers)))

# loop over layers
for bi in range(len(layers)):
    
    history = createAndTrain_model_lstmLayers(layers[bi])

    # store results
    lossResultsLayers[:,bi] = history.history['loss']

Epoch 1/50


ValueError: in user code:

    File "c:\Users\hazem\anaconda3\envs\gpu_env\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\hazem\anaconda3\envs\gpu_env\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\hazem\anaconda3\envs\gpu_env\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\hazem\anaconda3\envs\gpu_env\lib\site-packages\keras\engine\training.py", line 994, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "c:\Users\hazem\anaconda3\envs\gpu_env\lib\site-packages\keras\engine\training.py", line 1052, in compute_loss
        return self.compiled_loss(
    File "c:\Users\hazem\anaconda3\envs\gpu_env\lib\site-packages\keras\engine\compile_utils.py", line 265, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "c:\Users\hazem\anaconda3\envs\gpu_env\lib\site-packages\keras\losses.py", line 152, in __call__
        losses = call_fn(y_true, y_pred)
    File "c:\Users\hazem\anaconda3\envs\gpu_env\lib\site-packages\keras\losses.py", line 272, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\hazem\anaconda3\envs\gpu_env\lib\site-packages\keras\losses.py", line 1990, in categorical_crossentropy
        return backend.categorical_crossentropy(
    File "c:\Users\hazem\anaconda3\envs\gpu_env\lib\site-packages\keras\backend.py", line 5529, in categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)

    ValueError: Shapes (None, 1199) and (None, 34, 1199) are incompatible


In [None]:
import matplotlib.pyplot as plt

plt.plot(lossResultsLayers)
plt.title('Model Loss')
plt.legend(lossResultsLayers)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid()
plt.show()


In [None]:
# Run parametric experiment to tune batchsize
def createAndTrain_model_batchsize(num_layers, batchsize):
    input_len = max_sequence_len - 1 

    model = Sequential()

    # Add input embedding layer.
    model.add(Embedding(total_words, 10, input_length=input_len))

    # Add N LSTM layers with 256 units.
    for i in range(num_layers):
        model.add(LSTM(256, return_sequences=True))
        model.add(tf.keras.layers.LayerNormalization())
        
    # Add output layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    history = model.fit(X, y, epochs=epochs, verbose=1, batch_size=batchsize, shuffle=True)

    return history


In [None]:
# range of batch sizes
batchsizes = 2**np.arange(4, 9)
numLayers =

# initialize output results matrices
lossResultsBatchsizes = np.zeros((epochs, len(batchsizes)))

# loop over layers
for bi in range(len(batchsizes)):
    history = createAndTrain_model_batchsize(numLayers, batchsizes[bi])

    # store results
    lossResultsBatchsizes[:,bi] = history.history['loss']

In [None]:
plt.plot(lossResultsBatchsizes)
plt.title('Model Loss')
plt.legend(lossResultsBatchsizes)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid()
plt.show()

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

monitor_val_acc = EarlyStopping(monitor='loss', patience=15)
model_checkpoint = ModelCheckpoint('best_RNN_version1.h5', save_best_only = True)

def create_bestModel(numLayers, batchsize):
    input_len = max_sequence_len - 1 

    model = Sequential()

    # Add input embedding layer.
    model.add(Embedding(total_words, 10, input_length=input_len))

    # Add N LSTM layers with 256 units.
    for i in range(numLayers):
        model.add(LSTM(256, return_sequences=True))
        model.add(tf.keras.layers.LayerNormalization())
        
    # Add output layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    history = model.fit(X, y, epochs=epochs, verbose=1, batch_size=batchsize, shuffle=True, callbacks=[monitor_val_acc, model_checkpoint])

    return history

In [None]:
batchsize = 

final_model = create_bestModel(numLayers, batchsize)
final_model.load_weights('best_RNN_version1.h5')


In [None]:
# def predict_next_N_words(input_texts, N_words=10):
#     predictions_list = []
#     for input_text in input_texts:
#         predicted_words = ""
#         tokens = tokenizer.texts_to_sequences([input_text])[0]
#         for _ in range(N_words):
#             tokens = pad_sequences([tokens], maxlen=max_sequence_len-1, padding='pre')
#             prediction = np.argmax(final_model.predict(tokens), axis=1)
#             predicted_word = tokenizer.sequences_to_texts([prediction])[0]
#             predicted_words += predicted_word + " "
#             tokens = np.append(tokens, prediction)
#         predicted_words = input_text + " " + predicted_words
#         predictions_list.append(predicted_words)
            
#     return predictions_list

def predict_next_N_words_unique(seed_texts, top_p=1, N_words=10):
    generated_texts = []

    for seed_text in seed_texts:
        current_generated_text = seed_text
        for _ in range(N_words):
            # Tokenize the input sequence
            seed_sequence = tokenizer.texts_to_sequences([seed_text])[0]

            # Pad the sequence if needed
            padded_sequence = tf.keras.preprocessing.sequence.pad_sequences([seed_sequence], maxlen=max_sequence_len-1)

            # Get the model's prediction for the next word
            predictions = final_model.predict(padded_sequence, verbose=0)[0]

            # Apply top-p sampling
            sorted_indices = np.argsort(predictions)[::-1]
            cumulative_probs = np.cumsum(predictions[sorted_indices])
            selected_indices = sorted_indices[cumulative_probs <= top_p]

            # Normalize probabilities
            selected_probs = predictions[selected_indices] / np.sum(predictions[selected_indices])

            # Sample from the selected indices based on the normalized probabilities
            next_index = np.random.choice(selected_indices, p=selected_probs)

            # Convert the index back to a word
            next_word = tokenizer.index_word[next_index]
            # print(next_word)
            # Break if the generated text is too long or if an end token is predicted
            if next_word is None or next_word == 'end_token' or len(current_generated_text.split()) >= N_words + len(seed_text):
                break

            # Update the generated text and seed_text for the next iteration
            current_generated_text += " " + next_word
            seed_text += " " + next_word

        generated_texts.append(current_generated_text)

    return generated_texts


In [None]:
prediction = predict_next_N_words_unique(seed_texts)
prediction

['embrace each day and carry that you nature call defy wisdom beneath than',
 'radiate some is believe and our source its springs bay echoes are',
 'believe that our reside discovery a destinies of believe and carry touch',
 "life's actual purpose is soothes conservation in a they blooms reveal breath truly of",
 'dance through each and every light determination beginnings our authenticity its your inner source be',
 'let your time and energy to our truly its in a speak secrets affection and',
 'every person is the garden of carry we stands desires and journey our',
 'our country Singapore is you reminds brush and wisdom efficiency simplicity of wisdom wisdom',
 "planet earth is life's progress success touch be a guiding of joy in",
 'morning and evening would make it in the testament be the blessings soul intention and success']

In [None]:
prediction2 = predict_next_N_words_unique(seed_texts)
prediction2

['embrace each day of your they and beautiful and fuels efficiency grudges dawn',
 'radiate some is believe and our truly and beauty efficiency on should',
 'believe that our reside discovery a true be energy truly of morning',
 "life's actual purpose is the blessings of intention and when to the find that",
 'dance through each and every stories guides on be our truly determination fresh fire pursuit',
 'let your time and energy wisdom tending wisdom halls should is grass serenity chapter bay',
 'every person is your kindness source be wisdom blossoming effort our grateful hold',
 'our country Singapore is the emotions of city and deserves beautiful desires that effort',
 'planet earth is the blessings and one of create plateaus on serenity chapters',
 'morning and evening would make it sand singapore be a preciousness fresh us face a night']