# Keats generator

In [1]:
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

try:
    # %tensorflow_version only exists in Colab.
    %tensorflow_version 2.x
    !pip install -q -U tensorflow-addons
    IS_COLAB = True
except Exception:
    IS_COLAB = False

# TensorFlow ≥2.0 is required
import tensorflow as tf
from tensorflow import keras
assert tf.__version__ >= "2.0"

#if not tf.config.list_physical_devices('GPU'):
#    print("No GPU was detected. LSTMs and CNNs can be very slow without a GPU.")
#    if IS_COLAB:
#        print("Go to Runtime > Change runtime and select a GPU hardware accelerator.")

# Common imports
import numpy as np
import os
import re

# to make this notebook's output stable across runs
np.random.seed(42)
tf.random.set_seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import RMSprop


Bad key "text.kerning_factor" on line 4 in
/anaconda3/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


# Import and clean

In [2]:
# Text cleaning functions:
def deRomanNumeral(words): #note: doesn't remove roman numeral I, as that is used often...
    pattern = '^(cm?|cd?|d?c?c?c?)(xc?|xl?|l?x?x?x?)(ix|iv|v?i?ii)$'
    words = [ w for w in words if re.search(pattern,w) is not None ]
    return words

# Text cleaning functions:
def checkRomanNumeral(word): #note: doesn't remove roman numeral I, as that is used often...
    if word == 'i':
        return None
    pattern = '^(cm?|cd?|d?c?c?c?)(xc?|xl?|l?x?x?x?)(m|d|c|x|v|l|ix|iv|v?i?i?i)$'
    return re.search(pattern,word)

In [3]:
def newlinepartition(text):
    # cut up a single entry
    septext = text.split("\n\n\n")
    if len(septext) == 2 and len(septext[0]) < 200:
        return septext[1]
    if len(septext) > 2:
        newtext = []
        for tt in septext:
            if len(tt) < 100:
                pass
            elif tt[:9]=='FOOTNOTES':
                print("removed footnotes")
                pass
            elif tt[:9]=='LINENOTES':
                print("removed linenotes")
                pass
            elif tt[:9]=='NOTE':
                print("removed note")
                pass
            else:
                newtext.append(tt)
        if(len(newtext)>1):
            print('check this text')
        return "\n\n".join(newtext)
    if len(septext) < 2:
        print("assumed empty")
        return None
    
def cleanup_text_A(text): # Keats # Coleridge
    words = text.replace("\n\n","\n")
    words = words.replace("\n"," \n ")
    words = words.replace("-"," ")
    words = words.replace("."," . ")
    words = words.replace("!"," ! ")
    words = words.replace("?"," ? ")
    words = words.split(" ")
    table = str.maketrans('', '', '…”_-.,;!:?*/()[]{}0123456789"')
    stripped = [w.translate(table).lower() for w in words]
    stripped = [w for w in stripped if not w=='' and not checkRomanNumeral(w)]
    cleanedtext = " ".join(stripped)
    cleanedtext = cleanedtext.replace("\n part \n","\n")
    cleanedtext = cleanedtext.replace("part i \n","")
    cleanedtext = cleanedtext.replace("\n book \n","\n")
    cleanedtext = cleanedtext.replace("book i \n","")
    cleanedtext = cleanedtext.replace("\n scene \n","\n")
    cleanedtext = cleanedtext.replace("scene i \n","")
    cleanedtext = cleanedtext.replace("\n \n","\n")
    cleanedtext = cleanedtext.replace("\n note \n", 'TERMINATE')
    cleanedtext = cleanedtext.replace("\n notes \n", 'TERMINATE')
    cleanedtext = cleanedtext.replace("\n footnote \n", 'TERMINATE')
    cleanedtext = cleanedtext.replace("\n footnotes \n", 'TERMINATE')
    cleanedtext = cleanedtext.split('TERMINATE')[0]
    cleanedtext = re.sub("(\n |)verse (st|nd|rd|th) \n","\n", cleanedtext)
    cleanedtext = cleanedtext.lstrip("i ")
    cleanedtext = cleanedtext.split(" ed \n")[-1] # remove editors notes in wordsworth (may be dangerous)
    cleanedtext = cleanedtext.replace("composed published ","")
    cleanedtext = cleanedtext.lstrip("\n")
    return cleanedtext
    
def split_into_works(maintext,iStart,iEnd, type='A'):
    # first identify where legal / forewardend etc, set these to iStart and iEnd (with a +!)
    textlist = []
    for i in range(iStart,iEnd):
        print(i)
        text = newlinepartition(maintext[i])
        if text is not None: 
            if type == 'A' or type == '-':
                textlist.append(cleanup_text_A(text))
            elif type == 'B':
                textlist.append(cleanup_text_B(text))
    return textlist

def split_works_into_words(works):
    return [l.split(" ") for l in works ]

## Keats test (Blake's poems not split)

In [4]:
keatsfilelist = [ file for file in os.listdir() if file[-4:]=='.txt' and file[:5]=='Keats']

In [6]:
filedictionary = {
                  'Keats_1.txt': {'zones': 2, 'len': [8, 22], 'cleanup': 'A'},
                  'Keats_2.txt':  {'zones': 2, 'len': [5, 37],'cleanup': 'A'},
                  'Keats_3.txt': {'zones': 2, 'len': [6, 10], 'cleanup': 'A'},
                  'Keats_4.txt': {'zones': 2, 'len': [0, 0], 'cleanup': 'A'}, #This is only Lamia which is in Keats_1.txt
                  }

In [7]:
def read_and_process_file(filelist,idx):
    filename = filelist[idx]
    file = open(filename, 'rt')
    text = file.read()
    file.close()
    adict = filedictionary[filename]
    # split into words by white space
    Zones = text.split("***")
    lens = adict['len']
    maintext = []
    if adict['zones'] == 'asmain':
        maintext = Zones
    else:
        maintext = Zones[adict['zones']].split("\n\n\n\n\n")
    maintext = split_into_works(maintext,lens[0],lens[1],type=adict['cleanup'])
    return maintext

In [8]:
workslist = []
for i in range(len(keatsfilelist)): 
    workslist+= read_and_process_file(keatsfilelist,i)

5
6
7
8
9
10
11
12
check this text
13
14
15
assumed empty
16
17
assumed empty
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
check this text
36
6
7
8
9
8
removed footnotes
check this text
9
10
11
assumed empty
12
13
14
15
16
17
18
19
20
21
check this text


# A keats text run (based on char)

In [9]:
# this allows to smooth out a distrubtion and equalize probabilities ()
def reweight_distribution(orig, temp=1.,eps=1e-8):
    new = np.exp(np.log(np.abs(orig)+eps) / temp)
    return new / np.sum(new)

def sample(preds, temp = 1.0):
    preds = np.asarray(preds).astype('float64')
    preds = reweight_distribution(preds, temp)
    probs = np.random.multinomial(1,preds,1)
    return np.argmax(probs)

corpus = "\n\n".join(workslist)
len(corpus)
maxlen = 80
allchars = sorted(list(set(corpus)))
charind = dict((char,allchars.index(char)) for char in allchars)

In [23]:
step = 1
sequences = []
next_char = []

for i in range(0,len(corpus)-maxlen,step):
    sequences.append(corpus[i:i+maxlen])
    next_char.append(corpus[i+maxlen])
    
print("# Sequences: "+str(len(sequences)))
print("# chars: "+str(len(allchars)))

# Sequences: 369141
# chars: 36


In [24]:
# initialize all values as zero
x = np.zeros((len(sequences),maxlen,len(allchars)),dtype=np.bool)
y = np.zeros((len(sequences),len(allchars)),dtype=np.bool)
for i, seq in enumerate(sequences):
    for j, char in enumerate(seq):
        x[i,j,charind[char]] = 1 # one hot encode
    y[i,charind[next_char[i]]] = 1

In [None]:
keatsmodel = Sequential()
keatsmodel.add(LSTM(128,input_shape=(maxlen,len(allchars))))
keatsmodel.add(Dense(len(allchars),activation="softmax"))

In [25]:
optim = RMSprop(lr=0.01)
keatsmodel.compile(loss="categorical_crossentropy",optimizer=optim)

In [30]:
for epoch in range(2):
    print('Epoch: ',3*epoch)
    keatsmodel.fit(x,y,batch_size=128,epochs = 3) # run for just three epochs
    
    start_ind = np.random.randint(0,len(corpus)-maxlen-1)
    gen_text = corpus[start_ind:start_ind+maxlen]
    
    for temp in [0.25,0.5,0.75,1.0,1.25]:
        print("--- temperature: ",temp)
        sys.stdout.write(gen_text)
        for i in range(200):
            sampled = np.zeros((1,maxlen,len(allchars)))
            for t, char in enumerate(gen_text):
                sampled[0,t,charind[char]] = 1
            preds = keatsmodel.predict(sampled, verbose=0)[0]
            nextind = sample(preds,temp)
            newchar = allchars[nextind]
            #add new char and move along string
            gen_text += newchar
            gen_text = gen_text[1:]
            sys.stdout.write(newchar)
            

Epoch:  0
Train on 369141 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
--- temperature:  0.25
bee hive casts its swarm 
 acorns ripe down pattering 
 while the autumn breezes of the brint 
 the bright and the clouds in many a sorrow 
 the soft and startled the mid and so 
 and for the bright with the bright and the bowery 
 of some soul to see the shade and seem 
 and the--- temperature:  0.5
ith the bright and the bowery 
 of some soul to see the shade and seem 
 and the courted wand the fire upon her wood 
 and morn to light for the pout thevery 
 as a some morning happy with things 
 of all the white we start with beauty bed 
 of her rich thee be a spring side 
 to--- temperature:  0.75
 all the white we start with beauty bed 
 of her rich thee be a spring side 
 to that then her maning sung 
 to could high affedden eyes 
 old sorrow changers he must spine 
 but from its worled and the 'twaster burn 
 a give i silence greet of her winger 
 the deep child as a la--- temperature:  1.0
he 'tw

In [31]:
keatsmodel.save("keats_char_gen.h5")

# Generate new Keats

In [10]:
keatsmodel = keras.models.load_model("keats_char_gen.h5")

In [14]:
start_ind = np.random.randint(0,len(corpus)-maxlen-1)
gen_text = corpus[start_ind:start_ind+maxlen]
temp = 0.1
exciteT = 0.7 #applied on first letter of a word to reduce repetition
exciteT2 = 0.5 #applied on the second letters of a word to reduce repetition
exciteT3 = 0.3 #applied on third letter of a word to reduce repetition
exciteT4 = 0.2 #applied on fourth letter of a word to reduce repetition

runenhance = True
continueval = True
printon = False
i = 0
finishline = 500
while continueval:
    sampled = np.zeros((1,maxlen,len(allchars)))
    for t, char in enumerate(gen_text):
        sampled[0,t,charind[char]] = 1
    preds = keatsmodel.predict(sampled, verbose=0)[0]
    enhanceT = 0.0
    if runenhance:
        if gen_text[-4] == ' ':
            enhanceT = exciteT4
        if gen_text[-3] == ' ':
            enhanceT = exciteT3
        if gen_text[-2] == ' ':
            enhanceT = exciteT2
        if gen_text[-1] == ' ':
            enhanceT = exciteT
    nextind = sample(preds,temp + enhanceT)
    newchar = allchars[nextind]
    #add new char and move along string
    gen_text += newchar
    gen_text = gen_text[1:]
    if printon:
        sys.stdout.write(newchar)
        i+=1
    if newchar == '\n':
        printon = True
        if i >= finishline:
            continueval = False

 more clear down steeping but the sun 
 while upon my closed and grow not to me 
 made he look'd the fancy bright of the bride 
 when at new and the tender content 
 and the vales sorrow and the eye 
 in the world a song 
 care born and the incense came 
 and while shall be found his hours of self 
 he seem'd in words and e'er ear to be seen 
 phore sat luxurial forest was an footstep 
 was fair god meadow in her green not her late 
 the portal rose and misery in the stars 
 where are ere she see of one i dead 


# Embedding

In [None]:
from sklearn.decomposition import PCA
from gensim.models import Word2Vec

wordmodel = Word2Vec(split_works_into_words(cleanedworks), min_count=1)
print(wordmodel)
# summarize vocabulary
words = list(wordmodel.wv.vocab)
print(words)
print(wordmodel['crept'])
# save model
wordmodel.save('wordmodel.bin')

In [None]:
X = wordmodel[wordmodel.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
plt.scatter(result[:, 0], result[:, 1])
words = list(wordmodel.wv.vocab)
#for i, word in enumerate(words):
#    plt.annotate(word, xy=(result[i, 0], result[i, 1]))
#plt.show()

In [None]:
layerE = Embedding(len(words), edim, input_length=50)

In [None]:
# load model
new_wordmodel = Word2Vec.load('wordmodel.bin')
print(new_wordmodel)