# Haiku generator with LSTM


Haiku dataset https://www.kaggle.com/hjhalani30/haiku-dataset

# 1. Data preparation and cleaning

In [1]:
import matplotlib.pyplot as plt 
import numpy as np 
import os 
import pandas as pd 
import sys
import keras
from keras.models import load_model, Model
from keras.layers import Dense, Activation, Dropout, Input, LSTM, Reshape, Lambda, RepeatVector
from keras.initializers import glorot_uniform
from keras.utils import to_categorical
from keras.optimizers import Adam
from keras import backend as K
import tensorflow as tf
#tf.compat.v1.disable_eager_execution()

In [2]:
dirname = os.getcwd()
filename = "all_haiku.csv"
path_to_dataset = os.path.join(dirname, filename)
n_examples = 124295
data = pd.read_csv(path_to_dataset, delimiter=',', nrows = n_examples)
nRow, nCol = data.shape
print(f'There are {nRow} rows and {nCol} columns')

#cols 1-3 are 1st, 2nd, and 3rd lines of the haiku respectively

There are 124295 rows and 6 columns


In [3]:
data = data.to_numpy()

In [4]:
#cols 1-3 are 1st, 2nd, and 3rd lines of the haiku respectively

X_1_0 = data[:,1]
X_2_0 = data[:,2]
X_3_0 = data[:,3]

In [5]:
X_1 = []
X_2 = []
X_3 = []
for i in range(n_examples):
    if type(X_1_0[i]) == str and type(X_2_0[i]) == str and type(X_3_0[i]) == str:
        X_1.append(X_1_0[i])
        X_2.append(X_2_0[i])
        X_3.append(X_3_0[i])

In [6]:
#clean the data to get rid off "-" etc
#to make it sensitive to lines, maybe we could implement a line break as some symbol e.g. "|" and merge the X's together
punctuation = ['-', '--', '~', '—', '.', ';', ',', ':', '?', '!', '#', '(', ')', '<', '>', '*', '%', '{', '_', ']', '[', '@', '`', '\xa0', '=', '"', '&', '–', '…', '“', '”']

end_of_line = '.'
end_of_haiku = '/'

for i in range(len(X_1)):
    for p in punctuation:
        if p in X_1[i]:
            X_1[i] = X_1[i].replace(p, '')
        if p in X_2[i]:
            X_2[i] = X_2[i].replace(p, '')
        if p in X_3[i]:
            X_3[i] = X_3[i].replace(p, '')
   
    #delete redundant spaces
    if X_1[i][-3:-1] == '   ':
        X_1[i] = X_1[i].replace('   ', '')
    elif X_1[i][-2:-1] == '  ':
        X_1[i] = X_1[i].replace('  ', '')   
    elif X_1[i][-1] == ' ':
        X_1[i] = X_1[i].replace(' ', '')
    
    if X_2[i][-3:-1] == '   ':
        X_2[i] = X_2[i].replace('   ', '')
    elif X_2[i][-2:-1] == '  ':
        X_2[i] = X_2[i].replace('  ', '')   
    elif X_1[i][-1] == ' ':
        X_2[i] = X_2[i].replace(' ', '')
        
    if X_3[i][-3:-1] == '   ':
        X_3[i] = X_1[i].replace('   ', '')
    elif X_3[i][-2:-1] == '  ':
        X_3[i] = X_1[i].replace('  ', '')   
    elif X_3[i][-1] == ' ':
        X_1[i] = X_1[i].replace(' ', '')
    
    #we want lowercase letters only
    #X_1[i] = X_1[i].lower()
    #X_2[i] = X_2[i].lower()
    #X_3[i] = X_3[i].lower()
    
    X_1[i] += end_of_line
    X_2[i] += end_of_line
    X_3[i] += end_of_line
    X_3[i] += end_of_haiku
    


In [7]:
#merge the three lines together
haikus = []
for i in range(len(X_1)):
    haikus.append(X_1[i]+X_2[i]+X_3[i])

In [8]:
short_haikus = []
for i in range(len(haikus)):
    if len(haikus[i]) < 60:
        short_haikus.append(haikus[i])
n_haikus = len(short_haikus)
print(n_haikus)

haikus = short_haikus

23231


In [9]:
max_len = 0
for i in range(n_haikus):
    max_len = max(max_len, len(haikus[i]))
    
print(max_len)

59


In [10]:
#create characters dict 
token_index = {}
idx = -1
for haiku in haikus:
    for i in range(len(haiku)):
        if haiku[i] not in token_index:
            token_index[haiku[i]] = idx+1
            idx += 1

#add padding if you want all of the same length?

In [11]:
vocab_size = len(token_index)
print(vocab_size)

74


Now that we have turned characters into tokens with corresponding indexes we can vectorise our training examples by one-hot-encoding them and subsequently creating 3D numpy arrays for X and Y:

`X`: This is an (n_haikus, max_len, vocab_size) dimensional array.
- We have max_len timesteps for each training example
- At each time step the input is one of vocab_size possible values and is represented as a one-hot vector. e.g. X[i,t,:] is a one-hot vector representing the value of the i-th example at time t. 

`Y`: a (max_len, n_haikus, vocab_size) dimensional array

We're using the previous values to predict the next value. Therefore each traing example Y is just training example X moved one step ahead.

In [12]:
max_len = max_len - 1

In [13]:
X = np.zeros((n_haikus, max_len, vocab_size), dtype = 'bool')
Y = np.zeros((max_len, n_haikus, vocab_size), dtype = 'bool')

In [14]:
#vectorise haikus - turn them into a list of char indexes
haiku_vectors = []
for i, haiku in enumerate(haikus):
    haiku_vec = []
    for j in range(len(haiku)):
        haiku_vec.append(token_index.get(haiku[j]))
    haiku_vectors.append(haiku_vec)

In [15]:
X_vec = []
Y_vec = []
for i, vec in enumerate(haiku_vectors):
    X_vec.append(vec[:-1])
    Y_vec.append(vec[1:])


In [16]:
for i, vec in enumerate(X_vec):
    for j, idx in enumerate(vec):
        X[i, j, idx] = True
        
for i, vec in enumerate(Y_vec):
    for j, idx in enumerate(vec):
        Y[j, i, idx] = True

In [17]:
print(X[0, :, :])
print(X.shape)
print(Y.shape)

[[ True False False ... False False False]
 [False  True False ... False False False]
 [False False  True ... False False False]
 ...
 [False False False ... False False False]
 [False False False ... False False False]
 [False False False ... False False False]]
(23231, 58, 74)
(58, 23231, 74)


# 2. Building an LSTM model

In [18]:
#number of dimensions for hidden state (a) of each LSTM cell
n_a = 64

We implement LSTM forward pass defined by the following:

Forget gate:

$$\mathbf{\Gamma}_f^{\langle t \rangle} = \sigma(\mathbf{W}_f[\mathbf{a}^{\langle t-1 \rangle}, \mathbf{x}^{\langle t \rangle}] + \mathbf{b}_f)\tag{1} $$

Candidate value:

$$\mathbf{\tilde{c}}^{\langle t \rangle} = \tanh\left( \mathbf{W}_{c} [\mathbf{a}^{\langle t - 1 \rangle}, \mathbf{x}^{\langle t \rangle}] + \mathbf{b}_{c} \right) \tag{3}$$

Update gate:

$$\mathbf{\Gamma}_i^{\langle t \rangle} = \sigma(\mathbf{W}_i[a^{\langle t-1 \rangle}, \mathbf{x}^{\langle t \rangle}] + \mathbf{b}_i)\tag{2} $$ 

Cell state:
$$ \mathbf{c}^{\langle t \rangle} = \mathbf{\Gamma}_f^{\langle t \rangle}* \mathbf{c}^{\langle t-1 \rangle} + \mathbf{\Gamma}_{i}^{\langle t \rangle} *\mathbf{\tilde{c}}^{\langle t \rangle} \tag{4} $$

Output gate:
$$ \mathbf{\Gamma}_o^{\langle t \rangle}=  \sigma(\mathbf{W}_o[\mathbf{a}^{\langle t-1 \rangle}, \mathbf{x}^{\langle t \rangle}] + \mathbf{b}_{o})\tag{5}$$ 

Hidden state:
$$ \mathbf{a}^{\langle t \rangle} = \mathbf{\Gamma}_o^{\langle t \rangle} * \tanh(\mathbf{c}^{\langle t \rangle})\tag{6} $$

Prediction:
$$\mathbf{y}^{\langle t \rangle}_{pred} = \textrm{softmax}(\mathbf{W}_{y} \mathbf{a}^{\langle t \rangle} + \mathbf{b}_{y})$$

We want to generate new characters at test time predicting the next character based on the last character and the parameters (which can pass information about previous characters too) so we need to build LSTM architecture which implements such inference. So we use keras functional API to build a custom model.

We build a model by stacking keras layers on top of one another:
1. Input layer (specifying shape of input) 
- tensorflow.keras.layers.Input() class
2. 


LSTM_cell:

Takes hidden state as input and returns the next hidden state

dense_layer:

Dense implements the operation: output = activation(dot(input, kernel) + bias) where:
- activation = element-wise activation function passed as the activation argument
- kernel = weights matrix created by the layer
- bias = bias vector created by the layer 

Thus here we use Dense to calculated y_pred = softmax(dot(Wya, a) + by)

In [19]:
#define the layer objects of the model globally so that they are fixed when used in a model function

LSTM_cell = LSTM(n_a, return_state = True)

prediction_layer = Dense(vocab_size, activation='softmax')

reshape_layer = Reshape((1, vocab_size)) 

In [20]:
def LSTM_model(max_len, n_a, vocab_size):
    
    #specify the shape of input X
    X = Input(shape=(max_len, vocab_size))
    
    #specify the size of accepted a0 and c) inputs
    a0 = Input(shape=(n_a,), name='a0')
    c0 = Input(shape=(n_a,), name='c0')
    
    #initiliase cell state and hidden state
    a = a0
    c = c0
    
    #create an empty list to append predictions (outputs of Dense)
    outputs = []
    
    for t in range(max_len):
        
        #adds a layer for the next input (i.e. next character and next timestep)
        #lambda allows to specify the operation to be applied as a function on a tensor in a layer
        #here it just takes a slice of x for a given timestep and returns it
        x_t = Lambda(lambda x_t: X[:,t,:])(X)
        
        # reshapes x to be (1, vocab_size) 
        x_t = reshape_layer(x_t)
        
        #one step of LSTM to update hidden state and cell state
        a, _, c = LSTM_cell(x_t, initial_state=[c, a])
        
        #apply softmax to a function of hidden state to get prediction
        output = prediction_layer(a)
        
        outputs.append(output)
        
    model = Model(inputs= [X,a0,c0], outputs= outputs)
    
    return model
        
        
    

In [21]:
#def our model
model = LSTM_model(max_len = max_len , n_a = n_a, vocab_size = vocab_size)

In [22]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 58, 74)]     0                                            
__________________________________________________________________________________________________
lambda (Lambda)                 (None, 74)           0           input_1[0][0]                    
__________________________________________________________________________________________________
reshape (Reshape)               (None, 1, 74)        0           lambda[0][0]                     
                                                                 lambda_1[0][0]                   
                                                                 lambda_2[0][0]                   
                                                                 lambda_3[0][0]        

In [23]:
#compile the model

optimizer = Adam(lr=0.01, beta_1=0.9, beta_2=0.9999, decay=0.001)
#optimizer = 'rmsprop'

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [24]:
k = 23231
a0 = np.zeros((k, n_a))
c0 = np.zeros((k, n_a))

In [25]:
len(list(Y))
hg = list(Y)
hg[0].shape

print('number of training examples:', X.shape[0])
print('Tx (length of sequence):', X.shape[1])
print('total # of unique values:', vocab_size)
print('shape of X:', X.shape)
print('Shape of Y:', Y.shape)

#58 items of shape (23231, 74)]]]]

number of training examples: 23231
Tx (length of sequence): 58
total # of unique values: 74
shape of X: (23231, 58, 74)
Shape of Y: (58, 23231, 74)


In [26]:
model.fit([X, a0, c0], list(Y), epochs=100)

Epoch 1/100


_SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found [<tf.Tensor 'input_1:0' shape=(None, 58, 74) dtype=float32>]

In [None]:
def one_hot(x):
    x = K.argmax(x)
    x = tf.one_hot(indices=x, depth=78) 
    x = RepeatVector(1)(x)
    return x

In [None]:
def inference_model(LSTM_cell, prediction_layer, vocab_size = vocab_size, n_a = n_a, length = 58):
    """
    Uses the trained "LSTM_cell" and "densor" from model() to generate a sequence of values.
    
    Arguments:
    LSTM_cell -- the trained "LSTM_cell" from model(), Keras layer object
    densor -- the trained "densor" from model(), Keras layer object
    n_values -- integer, number of unique values
    n_a -- number of units in the LSTM_cell
    Ty -- integer, number of time steps to generate
    
    Returns:
    inference_model -- Keras model instance
    """
    
    # Define the input of your model with a shape 
    x0 = Input(shape=(1, vocab_size))
    
    # Define s0, initial hidden state for the decoder LSTM
    a0 = Input(shape=(n_a,), name='a0')
    c0 = Input(shape=(n_a,), name='c0')
    a = a0
    c = c0
    x = x0
    
    #densor and lstm cell are already trained

    ### START CODE HERE ###
    # Step 1: Create an empty list of "outputs" to later store your predicted values (≈1 line)
    outputs = []
    
    # Step 2: Loop over Ty and generate a value at every time step
    for t in range(length):
        
        # Step 2.A: Perform one step of LSTM_cell (≈1 line)
        a, _, c = LSTM_cell(x, initial_state=[a, c])
        
        # Step 2.B: Apply Dense layer to the hidden state output of the LSTM_cell (≈1 line)
        out = prediction_layer(a) #output y?
        #make sure sampling is applied according to distribution?

        # Step 2.C: Append the prediction "out" to "outputs". out.shape = (None, 78) (≈1 line)
        outputs.append(out)
        
        # Step 2.D: 
        # Select the next value according to "out",
        # Set "x" to be the one-hot representation of the selected value
        # See instructions above.
        x = Lambda(one_hot)(out) #update x to be one hot rep of last output y
        
    # Step 3: Create model instance with the correct "inputs" and "outputs" (≈1 line)
    inference_model = Model(inputs= [x0,a0,c0], outputs=outputs)
    
    ### END CODE HERE ###
    
    return inference_model

In [None]:
inference_model = inference_model(LSTM_cell, prediction_layer, vocab_size = vocab_size, n_a = n_a, length = 58)