# Project 3 Shakespearean Sonnets RNN Part 6


In [1]:
import random
import os
import fileinput
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random

Using TensorFlow backend.


# Initial Processing
- I'm not actually changing anything about the given text I'm literally just tokenizing the input and then passing the training data into the LSTM model.
- I am making 5 char jumps between samples of 40
- I tried to use the Keras tokenizer but it wasn't recognizing all of the unique character so I wrote my own
- I also decided to remove new line characters because it ruined the format of the sonnet 
- I had a very high number of spaces, so I also removed white space from between the sonnets in the given text file and the numbers of the sonnets

In [4]:
# Turn text file into training set

# training data should be 40 character sequences from the sonnets
# take all possible subsequences of 40 consectives from the dataset
# use semi-redundant requences-> pick sequences starting every n-th character


def get_char_repr(char_to_index, char):
    unique_words = char_to_index.keys()
    # Return a vector that's zero everywhere besides the index corresponding to <word>
    feature_representation = np.zeros(len(unique_words))
    feature_representation[char_to_index[char]] = 1
    return feature_representation 

def preprocess_init(text):
    # Convert text to dataset using semi-redundant sequences
    text = text.replace("\n","") # maybe add this back? idk 
    text = text.replace("0","")
    text = text.replace("1","")
    text = text.replace("2","")
    text = text.replace("3","")
    text = text.replace("4","")
    text = text.replace("5","")
    text = text.replace("6","")
    text = text.replace("7","")
    text = text.replace("8","")
    text = text.replace("9","")
    text = text.lower()
    
    unique_chars = sorted(list(set(text)))
    skip = 2 
    char_len = 40
    vocab_dict = {}
    for i in range(len(unique_chars)):
        vocab_dict[unique_chars[i]] = i
    
    #list of sets fo 40 characters
    sequences = [] 
    # individual final characters 
    characters = []
    
    # generate seqs of 40 chars by looping through whole thing
    for i in range(0, len(text)-41, skip):
        sequences.append(text[i:i+40]) # sequence
        characters.append(text[i+41]) # char 
    
    # need to reshape because LSTM is being moody and wants a 3D thing for x
    trainX = np.zeros((len(sequences), 40, len(unique_chars)))
    trainY = np.zeros((len(sequences), len(unique_chars)))
    # put 1s into the places where things fit the correct char
    for index in range(len(sequences)):
        for seq in range(len(sequences[0])):
            trainX[index, seq] = get_char_repr(vocab_dict, sequences[index][seq])
        trainY[index] = get_char_repr(vocab_dict, characters[index])
        
    return trainX,trainY, vocab_dict

In [5]:
text = ""
for line in fileinput.FileInput('data/shakespeare.txt'):
    if line.rstrip():
        text+=line.lstrip()
        
# do feature encoding and get vocab dictionary 
seqX, chars, vocab_map = preprocess_init(text)
unique_chars = sorted(list(set(text)))
map_len = len(vocab_map)

In [13]:
# Train a character-based LSTM model

def train_rnn(trainX, characters):
    
    model = Sequential()
    # single layers of 100-200 LSTM units
    model.add(LSTM(175, input_shape=(40,map_len)))
    # fully connected dense output layer with a softmax nonlinearity
    model.add(Dense(map_len, activation='softmax'))

    # Train model to minimize categorical cross-entropy
    model.compile(optimizer = 'rmsprop', # unclear if we need this
                  loss="categorical_crossentropy",
                  metrics = ["accuracy"]    
                  # we want accuracy over 0.6 on training data
                 )

    # train for many epochs so loss converges
    print("Train model...")
    model.fit(trainX, characters, epochs = 30, batch_size = 64)
    return model
    

In [14]:
rnn_model = train_rnn(seqX, chars)

Train model...
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [19]:
def generate_next_char(z_vector,temperature):
    # implement temp parameter myself as part of the poem generation algo
    # add a Lambda layer to LSTM during prediction or write a function
    z_vector = np.asarray(z_vector)
    num = np.exp(z_vector/temperature)
    denom = np.sum(np.exp(z_vector/temperature))
    index_predictions = num/denom
    max_prediction_index = np.argmax(index_predictions)
    
    # have to find the char that goes with the index
    char = ""
    for i in vocab_map.keys():
        if vocab_map[i] == max_prediction_index:
            char = i
    return char

In [41]:
### Poetry Generation 
# to generate poems, draw softmax samples from the trained model
def generate_sonnet_without_random_seed(temp):
    generated_sonnet = ''
    seed = "shall i compare thee to a summer's day? " # first sentence 
    generated_sonnet += seed
    for line in range(13*40): # of there are 14 lines in a sonnet and our seed is the first
        # each line is about 40 characters long?
            
        features = np.zeros((1,40,map_len))
        for i in range(40):
            features[0,i] = get_char_repr(vocab_map, generated_sonnet[len(generated_sonnet)-40+i])
        
        # generate the next value 
        predictions = rnn_model.predict(features)[0]
        next_char = generate_next_char(predictions, temp) # change around the temp
        generated_sonnet += next_char 
        
    # separate poem by line and print out
    for i in range(13):
        if i==0:
            print(generated_sonnet[i*40].upper()+generated_sonnet[i*40+1:(i+1)*40])
        elif i>0 and i<12:
            print(generated_sonnet[i*40].upper()+generated_sonnet[i*40+1:(i+1)*40]+",")
        else:
            print(generated_sonnet[i*40].upper()+generated_sonnet[i*41:(i+1)*40]+".")

    return generated_sonnet


In [42]:
print('\nSample Poem:\n====================')
t1 = generate_sonnet_without_random_seed(0.25)


Sample Poem:
Shall i compare thee to a summer's day? 
Hrwmod hte' eehrd,o a eo oh aeae  yulwti,
N hn yu oferisd,t yutde ntin rmoe yu hsr,
Snnt htrvne,hr iagtn o ees frmnwnnc,btog,
Igtytosrwet  nr hm adahccccoe htrtnnes r,
Cetrn.hrsnt n o aa hseas adoi o o a erns,
 ad,eti o hsrwran,ada o a eres o al eehr,
D,o o hvr  o eu aa,hwsdflas htevr yu o o,
 a ee hsrsdae,o yu hreses h ol,adtntsrsi,
G tetrmraotdt n ube o o al e,e o a aeweo,
 o yut hsres lef.tetrnm,adtntfaetym atig,
T't' nterng o o a oehtba,adta adto hwetn,
Oesasdadysadet nto adohae frm.


In [18]:
print('\nSample Poem:\n====================')
t2 = generate_sonnet_without_random_seed(0.75)

TypeError: generate_sonnet_without_random_seed() takes 0 positional arguments but 1 was given

In [None]:
print('\nSample Poem:\n====================')
t3 = generate_sonnet_without_random_seed(1.5)