In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# Distribution strategy; later cells enter `strategy.scope()` when building
# and compiling the model.
strategy = tf.distribute.MirroredStrategy()

# Load the Shakespeare lines CSV (presumably from a mounted Google Drive,
# judging by the path).
data = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/767project/Shakespeare_data.csv"
)

In [None]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [None]:
# Draw a reproducible random 10% sample of the `PlayerLine` column.
# dropna() guards the later `.lower()` call against missing (NaN) entries.
all_lines = data['PlayerLine'].dropna()
subset_size = int(len(all_lines) * 0.10)

# Fixed seed so that Restart & Run All selects the same lines every time.
rng = np.random.default_rng(42)
subset_indices = rng.choice(len(all_lines), size=subset_size, replace=False)

# Positional lookup: after dropna() the index labels may contain gaps, so
# label-based `series[indices]` would no longer be safe.
subset_dataset = all_lines.iloc[subset_indices]
dataset = subset_dataset

In [None]:
# Lowercase every sampled line. This is plain Python string work, so it is
# not wrapped in `strategy.scope()`: no TensorFlow variables or models are
# created here.
corpus = [line.lower() for line in dataset]
corpus[:10]

['from all such devils, good lord deliver us!',
 'is torn from forth that pretty hollow cage,',
 'how thaliard came full bent with sin',
 "the mansion where!--'twas at a feast,--o, would",
 'we shall have him here to-morrow with his best ruff on.',
 'drawing',
 'scoff on, vile fiend and shameless courtezan!',
 "friend, look to 't.",
 'thou shalt know her, fellow, by the rest that have no heads.',
 'an into their estimation and report: but he hath so']

In [None]:
# Fit a Keras Tokenizer on the lowercased corpus to build the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# word -> integer id lookup produced by the fitted tokenizer.
word_to_token = tokenizer.word_index
def key_pair(num):
    """Print the first ``num`` entries of ``word_to_token``.

    Each entry is printed on its own line as ``'<word>': <id>,``, in the
    dictionary's iteration order. Nothing is printed when ``num`` is zero
    or negative.

    Args:
        num: Maximum number of entries to print.
    """
    for position, (word, token_id) in enumerate(word_to_token.items()):
        # Stop as soon as the requested number of entries has been shown.
        if position >= num:
            break
        print(f"'{word}': {token_id},")
# Show the first 10 vocabulary entries.
key_pair(10)

'the': 1,
'and': 2,
'i': 3,
'to': 4,
'of': 5,
'a': 6,
'you': 7,
'my': 8,
'that': 9,
'in': 10,


In [None]:

# Expand every line into its n-gram prefixes: for tokens [a, b, c] this
# yields [a, b] and [a, b, c]. The last token of each prefix is later used
# as the word to predict.
#
# The corpus is tokenized in a single call instead of once per line, and the
# loop is not wrapped in `strategy.scope()` because it is pure Python work
# that creates no TensorFlow variables.
input_sequences = [
    token_list[:end]
    for token_list in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(token_list) + 1)
]


In [None]:
# Peek at the first five n-gram sequences.
input_sequences[:5]

[[47, 33],
 [47, 33, 81],
 [47, 33, 81, 1285],
 [47, 33, 81, 1285, 44],
 [47, 33, 81, 1285, 44, 43]]

In [None]:
# Keep one raw (unpadded) sequence so it can be compared after padding.
before = input_sequences[1]

# Length of the longest n-gram sequence; used as the padded width.
max_seq_len = max(map(len, input_sequences))
print(max_seq_len)

35


In [None]:
# Vocabulary size used by the embedding and output layers: one slot per
# known word plus one extra, since word ids start at 1 and 0 is the
# padding value.
total_words = 1 + len(word_to_token)
print(total_words)

9054


In [None]:
# Left-pad every sequence with zeros up to `max_seq_len`, so that the word
# to predict always sits in the last column.
padded = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
input_sequences = np.asarray(padded)
del padded  # keep the notebook namespace unchanged

# Same sample as `before`, now padded, for a side-by-side comparison.
after = input_sequences[1]

In [None]:
# Compare the same sample sequence before and after padding.
print(f'Before: {before}')
print(f'After: {after}')

Before: [47, 33, 81]
After: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0 47 33 81]


In [None]:
# Every column except the last is the context fed to the model.
features = input_sequences[:, :-1]

# The last column is the id of the word to predict, one-hot encoded over the
# whole vocabulary to match the softmax output layer.
# NOTE(review): this materialises a (num_sequences, total_words) array, which
# can be large; confirm it fits in memory for bigger samples.
labels = tf.keras.utils.to_categorical(
    input_sequences[:, -1],
    num_classes=total_words,
)

In [None]:
#Create the model!
def generator_model():
    """Build the (uncompiled) next-word prediction network.

    Reads the notebook-level ``total_words`` and ``max_seq_len`` values.

    Returns:
        A ``Sequential`` model made of: Embedding (100 dims) ->
        Bidirectional LSTM (64 units, returning sequences) ->
        Bidirectional LSTM (32 units) -> Dense (64, relu) ->
        Dense (``total_words``, softmax).
    """
    # Seed TensorFlow's global RNG before any layer is created.
    tf.random.set_seed(42)

    vocab_size = total_words
    context_len = max_seq_len - 1  # inputs exclude the word being predicted

    layer_stack = [
        Embedding(vocab_size, 100, input_length=context_len),
        Bidirectional(LSTM(64, return_sequences=True)),
        Bidirectional(LSTM(32)),
        Dense(64, activation='relu'),
        Dense(vocab_size, activation='softmax'),
    ]
    return Sequential(layer_stack)

In [None]:
# Build and compile inside the strategy scope so the model (and the
# optimizer) are created under the distribution strategy.
with strategy.scope():
    model = generator_model()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 34, 100)           905400    
                                                                 
 bidirectional (Bidirection  (None, 34, 128)           84480     
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 64)                41216     
 onal)                                                           
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 9054)              588510    
                                                                 
Total params: 1623766 (6.19 MB)
Trainable params: 162376

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 8
EPOCHS = 50

# Train on the full (features, labels) set; `history` keeps the per-epoch
# metrics returned by `fit`.
history = model.fit(
    x=features,
    y=labels,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Persist the trained model under the name 'LSTM model'.
# NOTE(review): the path has no file extension; some Keras versions require
# a `.keras` or `.h5` suffix here — confirm against the installed version.
model.save('LSTM model')

In [None]:
def test_generator(string, num):
    if len(string)==0:
        print("Error: No word found")
        return
    for _ in range(num):
        token_list = tokenizer.texts_to_sequences([string])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding = "pre")
        probabilities = model.predict(token_list)
        choice = np.random.choice([1,2,3])
        predicted = np.argsort(probabilities, axis = -1)[0][-choice]
        if predicted !=0:
            generated_word = tokenizer.index_word[predicted]
            string += " " + generated_word
    print(string)

In [None]:
# Continue a multi-word seed phrase for 30 generation steps.
test_generator("long live the king", 30)

long live the king should go from justice so soon be an ass that's not in all margaret under this shape shall give me this night now you may not do a very spirit


In [None]:
# Continue a single-word seed for 10 generation steps.
test_generator("Life", 10)

Life like the man i' my upon is worse being no


In [None]:
# Another single-word seed, again with 10 generation steps.
test_generator("Dream", 10)

Dream on the cap of his sorrow of the man that
