# Training an LSTM for generating love poems.

Love poems were collected from several online repositories. An LSTM was trained on a few hundred of these poems. Compromises on the amount of training data and the size of the model were made due to memory limitations. Future directions for this project include streaming the data to help mitigate this issue.

The LSTM was trained in TensorFlow and leveraged my local computer's GPU. The design of the training script was heavily influenced by two training tutorials:


Current model name and parameters:
word_model_love_poems_composite_50.h5
single layer, 600 nodes, batch size of 50.

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.layers import Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import random
from keras.callbacks import ModelCheckpoint
import os
import io
from IPython.display import clear_output
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


In [2]:
# Import the Keras backend module so the visible devices can be inspected.
from keras import backend
# List the GPUs Keras can see.
# NOTE(review): `_get_available_gpus` is a private (underscore-prefixed) helper
# of the TensorFlow backend module, so it may not exist in other Keras
# versions -- confirm before upgrading.
backend.tensorflow_backend._get_available_gpus()

['/job:localhost/replica:0/task:0/device:GPU:0']

In [3]:
import os

In [4]:
os.getcwd()

'C:\\Users\\Kyle\\Documents\\Blog Posts\\Autoencoder_for_text_generation'

In [6]:
def _read_lowercased(path):
    """Return the whole contents of the text file at *path*, lower-cased.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as the text has been read.
    """
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used -- confirm the corpus files decode correctly with it.
    with open(path) as handle:
        return handle.read().lower()


# Load the four corpus files; each one is read fully into memory and
# lower-cased.
filename = 'love_poems_poem_hunter.txt'
data1 = _read_lowercased(filename)

filename = 'love_poems_book_riot.txt'
data2 = _read_lowercased(filename)

filename = 'love_poems_poetry_foundation.txt'
data3 = _read_lowercased(filename)

filename = 'lovepoetry.txt'
data4 = _read_lowercased(filename)

# File name used for checkpointing and for reloading the model weights.
#model_name='word_model_'+filename.split('.')[0]+'_composite.h5'
model_name = 'word_model_love_poems_composite_50.h5'

In [7]:
# data4 holds a lot of poems, so only a sample of them is used.
# Seed the RNG so the shuffle performed on this list is repeatable when the
# cells are run in order.
random.seed(400)
# Split on the four-newline separator and drop empty pieces.
reshuffle = list(filter(None, data4.split('\n\n\n\n')))

In [8]:
# Shuffle the pieces in place, then keep the first 100 and glue them back
# together with the four-newline separator.
random.shuffle(reshuffle)
data4 = '\n\n\n\n'.join(reshuffle[:100])

In [9]:
len(data4)

68405

In [10]:
# Merge the four corpora into one string, separated by the same four-newline
# marker that data4 was split on.
data = '\n\n\n\n'.join((data1, data2, data3, data4))

In [11]:
# Drop the references to the per-source strings so their memory can be
# reclaimed.
del data1, data2, data3, data4

In [12]:
len(data.split('\n\n\n\n'))

288

In [13]:
# Mark poem boundaries: each four-newline separator becomes an end marker, a
# newline and a start marker. The text before the first separator gets no
# <starttoken> and the text after the last separator gets no <endtoken>.
data=data.lower().replace('\n\n\n\n','<endtoken>\n<starttoken>')
# Replace every remaining newline with a <returntoken> word. This runs after
# the line above, so the newline inserted between <endtoken> and <starttoken>
# is converted as well.
# NOTE(review): that leaves a <returntoken> in front of every <starttoken> --
# confirm this is intended.
data=data.replace('\n',' <returntoken> ')

In [14]:
len(data)

302775

In [15]:
# Module-level Tokenizer instance. The functions in this file
# (dataset_preparation, generate_text, augmented_writing) read it as a global
# rather than taking it as a parameter.
tokenizer = Tokenizer()

In [16]:
def dataset_preparation(data, num_words=None):
    """Turn the marker-annotated corpus string into next-word training arrays.

    The text is split on ``<endtoken>``, the module-level ``tokenizer`` is
    fitted on the resulting chunks, and every prefix of at least two tokens
    of every tokenised chunk becomes one training sequence. Sequences are
    left-padded to a common length; the last column is used as the label
    (one-hot encoded) and the remaining columns as the predictors.

    Args:
        data: Corpus text containing ``<endtoken>`` markers.
        num_words: Optional vocabulary cap. When given, only words whose
            tokenizer index is ``<= num_words`` are kept in
            ``tokenizer.word_index``, and ``tokenizer.oov_token`` is
            registered at index ``num_words + 1``.

    Returns:
        Tuple ``(predictors, label, max_sequence_len, total_words)``.

    Raises:
        ValueError: If no chunk yields at least two tokens, so there is
            nothing to build a training sequence from.
    """
    # Add a second marker in front of each <endtoken> before splitting on
    # <endtoken>, so each chunk still ends with an end-of-poem marker.
    corpus = data.lower().replace('<endtoken>', '<endtoken2><endtoken>').split("<endtoken>")
    tokenizer.fit_on_texts(corpus)
    if num_words is not None:
        # <= because the tokenizer is 1-indexed.
        tokenizer.word_index = {
            word: index
            for word, index in tokenizer.word_index.items()
            if index <= num_words
        }
        # NOTE(review): with the default Tokenizer() the oov_token is None, so
        # this adds a None key -- confirm that is intended.
        tokenizer.word_index[tokenizer.oov_token] = num_words + 1
    total_words = len(tokenizer.word_index) + 1

    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        # Every prefix of length 2..len(token_list) is one training example.
        input_sequences.extend(
            token_list[:end] for end in range(2, len(token_list) + 1)
        )
    if not input_sequences:
        raise ValueError(
            'dataset_preparation: no chunk of the corpus contains at least '
            'two tokens, so no training sequences could be built.')

    max_sequence_len = max(len(sequence) for sequence in input_sequences)
    input_sequences = np.array(pad_sequences(input_sequences,
                                             maxlen=max_sequence_len,
                                             padding='pre'))
    # Last column is the word to predict; everything before it is the context.
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len, total_words

In [17]:
def create_model(predictors, label, max_sequence_len,
                 total_words, num_epochs=2, save=False,
                 batch_size=100, device='/cpu:0'):
    """Build, compile and train the single-layer LSTM next-word model.

    Args:
        predictors: Input sequences passed to ``model.fit``.
        label: Training targets passed to ``model.fit``.
        max_sequence_len: Padded sequence length including the label column;
            the network's input length is one less.
        total_words: Size of the embedding vocabulary and of the softmax
            output layer.
        num_epochs: Number of epochs passed to ``model.fit``.
        save: When true, try to resume from the file named by the
            module-level ``model_name``, checkpoint the best training loss
            to that same file, and stop early after two epochs without
            improvement.
        batch_size: Batch size passed to ``model.fit``.
        device: Device string handed to ``tf.device``.

    Returns:
        The trained Keras ``Sequential`` model.
    """
    print(device)
    input_len = max_sequence_len - 1
    with tf.device(device):
        model = Sequential()
        model.add(Embedding(total_words, 10, input_length=input_len))
        model.add(LSTM(600))
        model.add(Dropout(0.2))
        model.add(Dense(total_words, activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam')

        callbacks_list = None
        if save:
            checkpoint = ModelCheckpoint(model_name, monitor='loss', verbose=1,
                                         save_best_only=True, mode='min')
            callbacks_list = [EarlyStopping(monitor='loss', patience=2),
                              checkpoint]
            # Resuming is best-effort: any loading failure falls back to
            # training from scratch. Only Exception is caught so that
            # KeyboardInterrupt / SystemExit still propagate, and the reason
            # is printed so a corrupt or mismatched file is not mistaken for
            # a missing one.
            try:
                model.load_weights(model_name)
            except Exception as err:
                print('No previous model found.')
                print('(load_weights failed: %r)' % (err,))
            else:
                print('found previous model. Loading weights.')

        model.fit(predictors, label, epochs=num_epochs, verbose=1,
                  callbacks=callbacks_list, batch_size=batch_size)
    return model

In [24]:
def load_model(filename, max_sequence_len, total_words):
    """Rebuild the LSTM architecture and load saved weights from *filename*.

    The layers mirror the ones built in ``create_model`` so that the saved
    weights fit.

    Args:
        filename: Path of the weights file to load.
        max_sequence_len: Padded sequence length including the label column;
            the network's input length is one less.
        total_words: Size of the embedding vocabulary and of the softmax
            output layer.

    Returns:
        The compiled Keras ``Sequential`` model. If the weights could not be
        loaded, a message is printed and the model is returned without them.
    """
    input_len = max_sequence_len - 1

    # Build the network outside the try block: a construction error is a real
    # bug and must not be reported as "No previous model found.".
    model = Sequential()
    model.add(Embedding(total_words, 10, input_length=input_len))
    model.add(LSTM(600))
    model.add(Dropout(0.2))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')

    # Loading stays best-effort, but only Exception is caught so that
    # KeyboardInterrupt / SystemExit still propagate; the reason is printed.
    try:
        model.load_weights(filename)
    except Exception as err:
        print('No previous model found.')
        print('(load_weights failed: %r)' % (err,))
    else:
        print('found previous model. Loading weights.')

    return model

In [19]:
def generate_text(seed_text, next_words, max_sequence_len, model):
    """Extend *seed_text* with *next_words* greedily predicted words.

    On every step the current text is tokenised with the module-level
    ``tokenizer``, left-padded to the model's input length, and the word with
    the highest predicted probability is appended.

    Args:
        seed_text: Starting text.
        next_words: Number of words to append.
        max_sequence_len: Padded sequence length including the label column;
            the model input length is one less.
        model: Model whose ``predict`` returns one probability per
            vocabulary index.

    Returns:
        The seed text followed by the generated words, space-separated.
    """
    # Invert the vocabulary once instead of scanning it for every word.
    index_to_word = {index: word
                     for word, index in tokenizer.word_index.items()}
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len - 1,
                                   padding='pre')
        # argmax over predict() is what predict_classes did for a softmax
        # output, and it also works where predict_classes no longer exists.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        # An index with no vocabulary entry yields an empty word.
        output_word = index_to_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text

In [20]:
X, Y, max_len, total_words = dataset_preparation(data, num_words=None)

In [21]:
total_words

5129

In [22]:
len(data.split(' '))/total_words

9.483330083837005

In [23]:
model = create_model(X, Y, max_len, total_words,num_epochs=1,save=True, 
                     batch_size=50, device='/gpu:0')

/gpu:0
No previous model found.
Epoch 1/1

Epoch 00001: loss improved from inf to 6.45938, saving model to word_model_love_poems_composite_50.h5


Loss starting at 8.5

In [None]:
#model = create_model(X, Y, max_len, total_words,num_epochs=1,save=False, device='/gpu:0')
text = generate_text("starttoken", 150, max_len, model)
#print(text.replace('starttoken','').replace('returntoken','\n').split('endtoken2')[0])
text

In [None]:
model = create_model(X, Y, max_len, total_words,num_epochs=60,save=True, 
                     batch_size=50, device='/gpu:0')

In [31]:
model = create_model(X, Y, max_len, total_words,num_epochs=10,save=True, 
                     batch_size=50, device='/gpu:0')

/gpu:0
found previous model. Loading weights.
Epoch 1/10

Epoch 00001: loss improved from inf to 0.51051, saving model to word_model_love_poems_composite_50.h5
Epoch 2/10

Epoch 00002: loss improved from 0.51051 to 0.46732, saving model to word_model_love_poems_composite_50.h5
Epoch 3/10

Epoch 00003: loss improved from 0.46732 to 0.45343, saving model to word_model_love_poems_composite_50.h5
Epoch 4/10

Epoch 00004: loss did not improve from 0.45343
Epoch 5/10

Epoch 00005: loss did not improve from 0.45343


In [27]:
text = generate_text("starttoken", 150, max_len, model)
print(text.replace('starttoken','').replace('returntoken','\n').split('endtoken2')[0])

    
  
 i lie here in the night and dream of what will be 
 when finally i am in your arm's with you lying next to me 
 
 i dream of you do i are with my heart 
 in you in the way you are to me 
 
 why i love you because i love you 
 and because i hate you i love you 
 i love you because you are my life and my heart in my heart in my ocean 
 
 i love you because you are my love and my poems 
 love you because you are my sky and my moon in my lonely nights 
 
 i love you because i love you 
 and because you love you are my love and my poems 
 love you because you are my sky and my


In [38]:
text = generate_text("starttoken", 150, max_len, model)
print(text.replace('starttoken','').replace('returntoken','\n').split('endtoken2')[0])

       
  
 i cannot be with another girl 
 but john can't because he's her pearl 
 to her it's all a great big game 
 if you about a great part of this way 
 
 i had a man love me 
 but i thought i could do you 
 
 i love you because i love you 
 i am first you in me 
 i can't believe you pushed me away 
 i know you always be there 
 but you know where i loved you 
 i promise you to be forever 
 
 you said if me that 
 i can't know it is the way 
 that makes me want to be 
 and when i know that i love 
 
 i love you because i love you 
 and because i love you 



In [31]:
text = generate_text("starttoken", 150, max_len, model)
print(text.replace('starttoken','').replace('returntoken','\n').split('endtoken2')[0])

  i love you 
 i love love you 
 no other words could say it more 
 your handsome and clever 
 without you i'd be sore 
 
 i love you 
 i love love you 
 nothing else can say 
 your charming and loving 
 your soul delicate my inner 
 
 i love our love 
 when you was no part of my own 
 
 i love you 
 i love love you 
 you are so loving of me 
 and if you are a promise 
 i shared i never never never fool 
 i just never ask on the way 
 
 now of i have away 
 to the way of to my love 
 and the beautiful times of a mountain 
 
 i love you because i love you 
 and because i love you i love you


Option to load model:

In [29]:
model=load_model(model_name, max_len, total_words)

found previous model. Loading weights.


In [34]:
text = generate_text("starttoken", 150, max_len, model)
print(text.replace('starttoken','').replace('returntoken','\n').split('endtoken2')[0])

     
  love is love is love 
 
 so t's the sky of the goodnight 
 but you when it certain never 
 
 love is to be now to you 
 
 less who other do not on love 
 love is no heart is on in white 
 or love for love love love love 
 
 love is not happy give love 
 that is these love you will 
 love is so much love love is the way 
 
 love you want it much love love 
 
 you are you the one 
 you are the one and to give you 
 not all your love and you 
 when you just just the way you are 


In [22]:
def augmented_writing(seed_text, model, top_n=10, length=10,
                      max_sequence_len=None):
    """Interactively extend *seed_text*, letting the user pick each next word.

    On every step the ``top_n`` most probable next words are printed with
    their rounded probabilities, and the user types the number of the word to
    append.

    Args:
        seed_text: Starting text.
        model: Model whose ``predict`` returns one probability per
            vocabulary index.
        top_n: Number of candidate words shown per step.
        length: Maximum number of words to append.
        max_sequence_len: Padded sequence length including the label column.
            Defaults to the module-level ``max_len``.

    Returns:
        The seed text followed by the chosen words, space-separated.
    """
    if max_sequence_len is None:
        # Fall back to the global set when dataset_preparation was called.
        max_sequence_len = max_len
    # Invert the vocabulary once; indices without a word map to ''.
    index_to_word = {index: word
                     for word, index in tokenizer.word_index.items()}

    for _ in range(length):
        clear_output()
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list],
                                   maxlen=max_sequence_len - 1,
                                   padding='pre')
        # Series index = vocabulary index, value = predicted probability.
        predictions = pd.Series(
            model.predict(token_list, verbose=0)[0]
        ).sort_values(ascending=False).iloc[:top_n]
        candidates = [index_to_word.get(index, '')
                      for index in predictions.index]
        for rank, (weight, word) in enumerate(zip(predictions.values,
                                                  candidates)):
            print(rank, np.round(weight, 2), word)
        print('Current sentence: %s'%(seed_text))
        print('What should be the next word?')

        # Non-numeric input, an out-of-range number, end of input or an
        # interrupt all end the session and return what has been written.
        try:
            choice = int(input())
            output_word = candidates[choice]
        except (ValueError, IndexError, EOFError, KeyboardInterrupt):
            print('early end')
            break
        seed_text += " " + output_word
    return seed_text

In [31]:
seed_text='starttoken'
seed_text=augmented_writing(seed_text, model, top_n=5, length=50)

0 0.91 i
1 0.07 for
2 0.01 returntoken
3 0.0 love
4 0.0 to
Current sentence: starttoken i love not because i love you returntoken i go from loving to not loving you returntoken from waiting to not waiting for you returntoken my heart moves from cold to fire returntoken returntoken you are a reason returntoken that fills each season returntoken when i love you returntoken
What should be the next word?
0


In [32]:
print(seed_text.replace('starttoken','').replace('returntoken','\n').split('endtoken2')[0])

 i love not because i love you 
 i go from loving to not loving you 
 from waiting to not waiting for you 
 my heart moves from cold to fire 
 
 you are a reason 
 that fills each season 
 when i love you 
 i
