In [1]:
# Read data.

import pandas as pd

# Keep the raw frame under its own name so that `data` only ever refers to the
# list of quote strings that the later cells work with.
quotes_df = pd.read_csv('lab4 data.csv')

print(quotes_df)

# Convert the `quote` column to a plain Python list of strings.
data = quotes_df['quote'].tolist()
print(f'There are {len(data)} quotes in this dataset.')
print(data[:5]) # Print only the first five quotes; printing every quote would flood the output.

                                                 quote
0       The sun sets, painting the sky with warm hues.
1       Birds chirp, announcing the dawn of a new day.
2           Waves crash, a symphony of nature's power.
3       Leaves rustle, whispering secrets to the wind.
4        Raindrops dance, a gentle rhythm on the roof.
..                                                 ...
495  Gamma-ray bursts illuminate, the cosmos in bri...
496  Solar flares surge, in fiery displays of solar...
497  Interstellar dust dances, in the cosmic ballet...
498  Cosmic strings resonate, with the symphony of ...
499  Stellar remnants whisper, the tales of ancient...

[500 rows x 1 columns]
There are 500 quotes in this dataset.
['The sun sets, painting the sky with warm hues.', 'Birds chirp, announcing the dawn of a new day.', "Waves crash, a symphony of nature's power.", 'Leaves rustle, whispering secrets to the wind.', 'Raindrops dance, a gentle rhythm on the roof.']


In [2]:
# Tokenisation.

from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer() # Instantiate a Tokenizer object.
tokenizer.fit_on_texts(data) # Fit your Tokenizer on your dataset. This will map words in your dataset to integers.
print(tokenizer.word_index) # Note that the index starts with 1, not 0.

{'the': 1, 'of': 2, 'in': 3, 'climate': 4, 'and': 5, 'cosmic': 6, 'a': 7, 'like': 8, 'change': 9, 'to': 10, 'with': 11, 'stars': 12, 'whispers': 13, 'night': 14, 'sky': 15, 'on': 16, 'sea': 17, 'time': 18, 'echoes': 19, 'energy': 20, 'shadows': 21, 'dreams': 22, 'through': 23, 'secrets': 24, 'stardust': 25, 'leaves': 26, 'for': 27, 'dance': 28, 'clouds': 29, 'their': 30, 'from': 31, 'ancient': 32, 'solar': 33, 'tales': 34, 'universe': 35, 'snowflakes': 36, 'earth': 37, 'laughter': 38, 'whisper': 39, 'carbon': 40, 'tears': 41, 'embrace': 42, 'petals': 43, 'voices': 44, 'heart': 45, 'distant': 46, 'celestial': 47, 'rising': 48, 'emissions': 49, 'ecosystems': 50, 'flames': 51, 'rivers': 52, 'carry': 53, 'thunder': 54, 'weave': 55, 'tapestry': 56, 'resonate': 57, 'water': 58, 'exoplanets': 59, 'wind': 60, 'flicker': 61, 'butterflies': 62, 'wings': 63, 'breeze': 64, 'breezes': 65, 'linger': 66, 'rainbows': 67, 'footsteps': 68, 'sunbeams': 69, 'weaves': 70, 'cradle': 71, 'life': 72, 'levels'

In [3]:
# Vocabulary size = number of distinct words + 1, because word indices start at 1
# and index 0 is reserved for padding.
total_words = len(tokenizer.word_index) + 1
print(total_words)

979


In [4]:
# Under the Tokenizer object, you can convert texts to sequences of integers.
# Be careful of the indexing of your data list, and how you use the ".texts_to_sequences" method:
# it takes a list of texts and returns a list of integer sequences (a list of lists).

# Show the first three quotes next to their integer encodings. Looping over a slice
# replaces three copy-pasted print groups and still works if there are fewer than three quotes.
for quote in data[:3]:
    print(quote)
    print(tokenizer.texts_to_sequences([quote])[0]) # The extra [0] unwraps the single inner list.
    print()

The sun sets, painting the sky with warm hues.
[1, 118, 471, 119, 1, 15, 11, 221, 159]

Birds chirp, announcing the dawn of a new day.
[472, 473, 298, 1, 299, 2, 7, 300, 301]

Waves crash, a symphony of nature's power.
[90, 474, 7, 91, 2, 160, 222]



In [5]:
# Convert every quote from text to a list of integers.
# `texts_to_sequences` takes the whole list of texts and returns one integer sequence
# per text, so a single call replaces the explicit loop and the [0] unwrapping.

input_sequence = tokenizer.texts_to_sequences(data)

print(f'Total number of quotes: {len(input_sequence)}')

Total number of quotes: 500


In [6]:
# Build input-output pairs out of consecutive words.
# Here the input is a single word and the output is the word that follows it,
# so every pair is a window of two tokens slid along a quote.

input_output_pair1 = [
    tokens[start:start + 2]
    for tokens in input_sequence
    for start in range(len(tokens) - 1) # Why -1?
]

print(f'Total number of input-output pairs is: {len(input_output_pair1)}')

Total number of input-output pairs is: 3111


In [7]:
# The same construction with a longer input.
# Here the input is two consecutive words and the output is the third word,
# so every example is a window of three tokens slid along a quote.

input_output_pair2 = [
    tokens[start:start + 3]
    for tokens in input_sequence
    for start in range(len(tokens) - 2) # Why -2?
]

print(f'Total number of input-output pairs is: {len(input_output_pair2)}')

Total number of input-output pairs is: 2611


In [8]:
# The same idea taken to the limit: the input is the first N-1 words of a quote
# and the output is its last word.
# The list of "N words" is therefore just every quote's full token sequence.

input_output_pairN = list(input_sequence) # Why no nested for-loop?

print(f'Total number of input-output pairs is: {len(input_output_pairN)}')

Total number of input-output pairs is: 500


In [9]:
# Notice each quote has been chopped into phrases comprising two words, three words, or the entire quote.

# Each entry pairs a heading with the sample of data printed underneath it.
previews = (
    ('Original quotes:', input_sequence[0:2]),
    ('Each quote chopped into phrases of two words:', input_output_pair1[:20]),
    ('Each quote chopped into phrases of three words:', input_output_pair2[:20]),
    ('Each quote itself:', input_output_pairN[:20]),
)

for position, (heading, sample) in enumerate(previews):
    if position:
        print() # Blank line between sections, but not before the first one.
    print(heading)
    print(sample)

Original quotes:
[[1, 118, 471, 119, 1, 15, 11, 221, 159], [472, 473, 298, 1, 299, 2, 7, 300, 301]]

Each quote chopped into phrases of two words:
[[1, 118], [118, 471], [471, 119], [119, 1], [1, 15], [15, 11], [11, 221], [221, 159], [472, 473], [473, 298], [298, 1], [1, 299], [299, 2], [2, 7], [7, 300], [300, 301], [90, 474], [474, 7], [7, 91], [91, 2]]

Each quote chopped into phrases of three words:
[[1, 118, 471], [118, 471, 119], [471, 119, 1], [119, 1, 15], [1, 15, 11], [15, 11, 221], [11, 221, 159], [472, 473, 298], [473, 298, 1], [298, 1, 299], [1, 299, 2], [299, 2, 7], [2, 7, 300], [7, 300, 301], [90, 474, 7], [474, 7, 91], [7, 91, 2], [91, 2, 160], [2, 160, 222], [26, 161, 475]]

Each quote itself:
[[1, 118, 471, 119, 1, 15, 11, 221, 159], [472, 473, 298, 1, 299, 2, 7, 300, 301], [90, 474, 7, 91, 2, 160, 222], [26, 161, 475, 24, 10, 1, 60], [120, 28, 7, 162, 163, 16, 1, 302], [12, 223, 8, 224, 3, 1, 14, 15], [36, 164, 476, 1, 92, 3, 93], [51, 61, 303, 21, 16, 1, 304], [477, 3

# NOTE: You should expand this part to generate more input-output pairs.

## This is a way of generating more training data.

In [10]:
# Let us combine our sequences of different lengths by padding them so that all have the same length,
# and then converting to a numpy array.

from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Pool the two-word, three-word and whole-quote examples into one list.
input_output_pair = [*input_output_pair1, *input_output_pair2, *input_output_pairN]

# The longest example fixes the common length that every datapoint is padded to.
max_sequence_len = max(map(len, input_output_pair))

# padding='pre' puts the zeros in front of each sequence.
input_output_pair = np.array(
    pad_sequences(input_output_pair, maxlen=max_sequence_len, padding='pre')
)
print(input_output_pair) # All datapoints now have the same length.

[[  0   0   0 ...   0   1 118]
 [  0   0   0 ...   0 118 471]
 [  0   0   0 ...   0 471 119]
 ...
 [  0   0   0 ... 138   2 219]
 [  0   0   0 ...   2   1  35]
 [  0   0   0 ...   2  32 978]]


In [11]:
# Now that we have an array, let us slice the last word as the output.
# The preceding words are the input.

X = input_output_pair[:,:-1] # Every column except the last: the (pre-padded) input words.
y = input_output_pair[:,-1] # The last column: the word to predict.
print(X)
print(y)

[[  0   0   0 ...   0   0   1]
 [  0   0   0 ...   0   0 118]
 [  0   0   0 ...   0   0 471]
 ...
 [  0   0   0 ...   6 138   2]
 [  0   0   0 ...  91   2   1]
 [  0   0   0 ...  34   2  32]]
[118 471 119 ... 219  35 978]


In [12]:
# Let us convert the target words to a one-hot array.

from tensorflow.keras import utils

# Derive the one-hot targets from the last column of the padded array rather than from
# `y` itself: `y = to_categorical(y)` would one-hot encode an already one-hot array if
# this cell were run a second time.
y = utils.to_categorical(input_output_pair[:, -1], num_classes=total_words)
print(y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [13]:
# Sanity check: the vocabulary size, then the shapes of the inputs and of the one-hot targets.
print(total_words)
print(X.shape)
print(y.shape)

979
(6222, 13)
(6222, 979)


In [14]:
# FINALLY. Let us create a simple model using LSTM.
# NOTE: You should try various other RNN models with other architectures.

from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

import tensorflow as tf

# Let TensorFlow grow GPU memory on demand instead of reserving it all up front.
# Memory growth has to be configured before the GPUs are initialised; if they have
# already been initialised TensorFlow raises RuntimeError, which is reported here
# instead of aborting the cell.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as error:
        print(f'Could not enable memory growth for {gpu.name}: {error}')

# Hyperparameters, named so that they are easy to find and tune.
EMBEDDING_DIM = 10
LSTM_UNITS = 256
DROPOUT_RATE = 0.3

# Input is max sequence length - 1, as we've removed the last word for the label.
input_len = max_sequence_len - 1

model = Sequential([
    # Input embedding layer: maps each word index to a dense vector.
    Embedding(total_words, EMBEDDING_DIM, input_length=input_len),
    # LSTM layer with 256 units.
    LSTM(LSTM_UNITS),
    # Control overfitting.
    Dropout(DROPOUT_RATE),
    # Output layer: one softmax probability per word in the vocabulary.
    Dense(total_words, activation='softmax'),
])

In [15]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 13, 10)            9790      
                                                                 
 lstm (LSTM)                 (None, 256)               273408    
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense (Dense)               (None, 979)               251603    
                                                                 
Total params: 534801 (2.04 MB)
Trainable params: 534801 (2.04 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
# categorical_crossentropy matches the one-hot targets and the softmax output layer.
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Train on all input-output pairs for 30 epochs, reporting progress as it goes.
model.fit(X, y, epochs=30, verbose=1)

In [None]:
# Good idea to save your weights, after spending *hours* (!?) training ...
# Note that this writes the weights only, so the model has to be defined again before they can be reloaded.

model.save_weights('lab4 weights.h5')

In [None]:
# If you want to load them later.
# `model` must already exist with the same architecture as when the weights were saved.

model.load_weights('lab4 weights.h5')

In [17]:
# Let us define a function to generate next word, given some input.

def predict_next_word(input_text):
    """Return the word the model considers most likely to follow `input_text`.

    The text is encoded with the fitted `tokenizer`, left-padded to the input
    length used by the model (`max_sequence_len - 1`) and passed through `model`.
    The index with the highest predicted score is decoded back into a word.
    """
    # Text -> list of integer word indices.
    encoded = tokenizer.texts_to_sequences([input_text])[0]

    # Left-pad to the fixed length that the model accepts.
    model_input = pad_sequences([encoded], maxlen=max_sequence_len - 1, padding='pre')

    # One row of scores per input row; argmax picks the most likely word index.
    scores = model.predict(model_input, verbose=0)
    best_index = np.argmax(scores, axis=1)

    # The model outputs an integer, so use the tokenizer to get the corresponding word.
    return tokenizer.sequences_to_texts([best_index])[0]

In [18]:
# Now let us test with some input texts.

# Short prompts to try the model on.
test_texts = ['sun sets',
              'twinkle like',
              'entwine creating',
              'stellar remnants',
              'solar flares']
# Print each prompt followed by the word predicted to come next.
for text in test_texts:
    prediction = predict_next_word(text)
    print(text, prediction)

sun sets level
twinkle like ripples
entwine creating level
stellar remnants level
solar flares souls


In [19]:
# Create your own function to predict the next N words.

def predict_next_N_words(input_text, N_words=10):
    """Extend `input_text` by repeatedly predicting the next word.

    Args:
        input_text: A prompt string, or a list/tuple of prompt strings.
        N_words: Maximum number of words to append to each prompt.

    Returns:
        For a single string, the prompt followed by the generated words, separated
        by spaces. For a collection of prompts, a list with one such string per prompt.
    """
    # A collection of prompts is handled one prompt at a time.
    if not isinstance(input_text, str):
        return [predict_next_N_words(text, N_words) for text in input_text]

    generated = input_text
    for _ in range(N_words):
        # Feed everything produced so far back in to get the following word.
        next_word = predict_next_word(generated)
        if not next_word:
            # An empty prediction means there is no word to append; stop early.
            break
        generated = f'{generated} {next_word}'
    return generated

In [20]:
# Call the N-word predictor on the list of test prompts and display whatever it returns.
prediction = predict_next_N_words(test_texts) # ???
prediction

# Your model will predict a fixed word, given a fixed input.

## How can you make your model "creative", so that it predicts different possible outputs for the same input?