In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils
import tensorflow as tf
import numpy as np
import pandas as pd

# Load Dataset

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the data file is later read from '/content/Shakespeare_data.csv',
# which is outside the mounted Drive folder — confirm whether this mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the Shakespeare lines CSV into a DataFrame so its columns can be inspected.
df = pd.read_csv('/content/Shakespeare_data.csv')

In [None]:
# Preview the first rows to see which column holds the spoken text.
df.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [None]:
import csv

# Collect the text of every data row into `corpus`.
# newline='' is what the csv module expects, so that line breaks inside quoted
# fields are parsed by the reader itself; the explicit encoding keeps the result
# independent of the platform's default.
with open('/content/Shakespeare_data.csv', newline='', encoding='utf-8') as f:
  reader = csv.reader(f, delimiter=",")
  next(reader)  # skip the header row
  # Column 5 holds the spoken line (shown as PlayerLine in df.head()).
  corpus = [row[5] for row in reader]

print(len(corpus))

111396


In [None]:
# Sanity-check the first ten raw lines before any cleaning is applied.
print(corpus[:10])

['ACT I', 'SCENE I. London. The palace.', 'Enter KING HENRY, LORD JOHN OF LANCASTER, the EARL of WESTMORELAND, SIR WALTER BLUNT, and others', 'So shaken as we are, so wan with care,', 'Find we a time for frighted peace to pant,', 'And breathe short-winded accents of new broils', 'To be commenced in strands afar remote.', 'No more the thirsty entrance of this soil', "Shall daub her lips with her own children's blood,", 'Nor more shall trenching war channel her fields,']


# Data Cleaning

In [None]:

import string

def text_cleaner(text):
    """Normalize one line of text.

    Removes every character listed in ``string.punctuation``, lower-cases the
    remainder, and finally drops any character that is not plain ASCII.
    Returns the cleaned string.
    """
    punctuation_free = text.translate(str.maketrans("", "", string.punctuation))
    lowered = punctuation_free.lower()
    # Round-trip through UTF-8 bytes: anything that cannot be decoded as ASCII is discarded.
    ascii_only = lowered.encode("utf8").decode("ascii", "ignore")
    return ascii_only

# Replace every raw line in the corpus with its cleaned form.
corpus = list(map(text_cleaner, corpus))

In [None]:
# Show only a sample of the cleaned lines; displaying the whole list floods the output.
corpus[:20]

['act i',
 'scene i london the palace',
 'enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others',
 'so shaken as we are so wan with care',
 'find we a time for frighted peace to pant',
 'and breathe shortwinded accents of new broils',
 'to be commenced in strands afar remote',
 'no more the thirsty entrance of this soil',
 'shall daub her lips with her own childrens blood',
 'nor more shall trenching war channel her fields',
 'nor bruise her flowerets with the armed hoofs',
 'of hostile paces those opposed eyes',
 'which like the meteors of a troubled heaven',
 'all of one nature of one substance bred',
 'did lately meet in the intestine shock',
 'and furious close of civil butchery',
 'shall now in mutual wellbeseeming ranks',
 'march all one way and be no more opposed',
 'against acquaintance kindred and allies',
 'the edge of war like an illsheathed knife',
 'no more shall cut his master therefore friends',
 'as far as to the sepulchre of chris

In [None]:
# Tokenization splits each line into words and maps every distinct word to an integer id.
# Only the first 5000 lines are used: the full corpus is too large to train on here.
corpus = corpus[:5000]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
word_index = tokenizer.word_index
# One more than the number of distinct words, since word ids start at 1.
total_words = 1 + len(word_index)
total_words

5411

In [None]:
# create input sequences using list of tokens
# Build the training examples: for every line, emit each prefix of its token ids
# that is at least two tokens long ([t1, t2], [t1, t2, t3], ...).
input_sequence = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(corpus)
    for end in range(2, len(token_ids) + 1)
]

In [None]:
# Show only the first few n-gram sequences; the full list is far too long to display.
input_sequence[:10]

[[495, 4],
 [153, 4],
 [153, 4, 301],
 [153, 4, 301, 1],
 [153, 4, 301, 1, 792],
 [60, 50],
 [60, 50, 93],
 [60, 50, 93, 33],
 [60, 50, 93, 33, 117],
 [60, 50, 93, 33, 117, 3],
 [60, 50, 93, 33, 117, 3, 283],
 [60, 50, 93, 33, 117, 3, 283, 1],
 [60, 50, 93, 33, 117, 3, 283, 1, 204],
 [60, 50, 93, 33, 117, 3, 283, 1, 204, 3],
 [60, 50, 93, 33, 117, 3, 283, 1, 204, 3, 270],
 [60, 50, 93, 33, 117, 3, 283, 1, 204, 3, 270, 80],
 [60, 50, 93, 33, 117, 3, 283, 1, 204, 3, 270, 80, 380],
 [60, 50, 93, 33, 117, 3, 283, 1, 204, 3, 270, 80, 380, 322],
 [60, 50, 93, 33, 117, 3, 283, 1, 204, 3, 270, 80, 380, 322, 2],
 [60, 50, 93, 33, 117, 3, 283, 1, 204, 3, 270, 80, 380, 322, 2, 381],
 [25, 2394],
 [25, 2394, 20],
 [25, 2394, 20, 34],
 [25, 2394, 20, 34, 46],
 [25, 2394, 20, 34, 46, 25],
 [25, 2394, 20, 34, 46, 25, 2395],
 [25, 2394, 20, 34, 46, 25, 2395, 13],
 [25, 2394, 20, 34, 46, 25, 2395, 13, 496],
 [197, 34],
 [197, 34, 6],
 [197, 34, 6, 100],
 [197, 34, 6, 100, 15],
 [197, 34, 6, 100, 15, 15

In [None]:
#max_len of input sequence
max_len = max([len(x) for x in input_sequence])
max_len

34

In [None]:
# pad sequences
padded_input_sequence = pad_sequences(input_sequence,maxlen=max_len,padding="pre")
padded_input_sequence

array([[  0,   0,   0, ...,   0, 495,   4],
       [  0,   0,   0, ...,   0, 153,   4],
       [  0,   0,   0, ..., 153,   4, 301],
       ...,
       [  0,   0,   0, ...,   4,  53, 877],
       [  0,   0,   0, ...,  53, 877, 235],
       [  0,   0,   0, ..., 877, 235, 666]], dtype=int32)

In [None]:
# create x and y
x = padded_input_sequence[:,:-1]
y = padded_input_sequence[:,-1]

In [None]:
# Input matrix shape: (number of examples, max_len - 1).
x.shape

(32802, 33)

In [None]:
# Target vector shape before one-hot encoding: one token id per example.
y.shape

(32802,)

In [None]:
# create one-hot encoding of the labels
from tensorflow.keras.utils import to_categorical
y = to_categorical(y,num_classes=5412)

In [None]:
# Target shape after one-hot encoding: (number of examples, num_classes).
y.shape

(32802, 5412)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [None]:

# Derive the layer sizes from the training arrays instead of repeating literals,
# so the model stays in sync if the corpus slice or the tokenizer changes.
vocab_size = y.shape[1]    # width of the one-hot targets
context_len = x.shape[1]   # number of tokens fed in per example

model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=context_len))  # 100-dimensional word vectors
model.add(LSTM(250))
model.add(Dense(vocab_size, activation="softmax"))  # one probability per target class

In [None]:
# Categorical cross-entropy pairs with the one-hot targets and the softmax output;
# the optimizer is Adam selected by name, and accuracy is tracked as a metric.
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:

# Print the layer-by-layer summary with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 33, 100)           541200    
                                                                 
 lstm (LSTM)                 (None, 250)               351000    
                                                                 
 dense (Dense)               (None, 5412)              1358412   
                                                                 
Total params: 2250612 (8.59 MB)
Trainable params: 2250612 (8.59 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train for 50 epochs on all n-gram examples.
# NOTE(review): no validation data is passed, so the reported accuracy is measured
# on the training set only.
model.fit(x,y,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7eed2ef28ac0>

In [None]:
# Show a sample of the training lines to pick a seed phrase from; the full list floods the output.
corpus[:20]

['act i',
 'scene i london the palace',
 'enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others',
 'so shaken as we are so wan with care',
 'find we a time for frighted peace to pant',
 'and breathe shortwinded accents of new broils',
 'to be commenced in strands afar remote',
 'no more the thirsty entrance of this soil',
 'shall daub her lips with her own childrens blood',
 'nor more shall trenching war channel her fields',
 'nor bruise her flowerets with the armed hoofs',
 'of hostile paces those opposed eyes',
 'which like the meteors of a troubled heaven',
 'all of one nature of one substance bred',
 'did lately meet in the intestine shock',
 'and furious close of civil butchery',
 'shall now in mutual wellbeseeming ranks',
 'march all one way and be no more opposed',
 'against acquaintance kindred and allies',
 'the edge of war like an illsheathed knife',
 'no more shall cut his master therefore friends',
 'as far as to the sepulchre of chris

In [None]:
import time

seed_text = "we must have bloody"
next_words = 20

# Greedy generation: repeatedly predict the most likely next word and append it
# to the running text, which then becomes the input for the next step.
for _ in range(next_words):
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  # Same left-padding and length (max_len - 1 tokens) as the training inputs.
  token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')
  # verbose=0 keeps Keras from printing a progress bar for every single-step prediction.
  pos = int(np.argmax(model.predict(token_list, verbose=0)))

  # index_word is the tokenizer's id -> word mapping (the inverse of word_index),
  # so no scan over the whole vocabulary is needed.
  next_word = tokenizer.index_word.get(pos)
  if next_word is None:
    # The predicted id (e.g. 0, the padding value) has no word; the input would
    # not change, so every remaining step would predict the same id again.
    break

  seed_text = seed_text + " " + next_word
  print(seed_text)
  time.sleep(2)  # pause so the text visibly grows one word at a time

we must have bloody noses
we must have bloody noses and
we must have bloody noses and crackd
we must have bloody noses and crackd crowns
we must have bloody noses and crackd crowns upon
we must have bloody noses and crackd crowns upon their
we must have bloody noses and crackd crowns upon their own
we must have bloody noses and crackd crowns upon their own heads
we must have bloody noses and crackd crowns upon their own heads thou
we must have bloody noses and crackd crowns upon their own heads thou shalt
we must have bloody noses and crackd crowns upon their own heads thou shalt struck
we must have bloody noses and crackd crowns upon their own heads thou shalt struck me
we must have bloody noses and crackd crowns upon their own heads thou shalt struck me upon
we must have bloody noses and crackd crowns upon their own heads thou shalt struck me upon her
we must have bloody noses and crackd crowns upon their own heads thou shalt struck me upon her year
we must have bloody noses and crac