<a href="https://colab.research.google.com/github/kaispace30098/WordSeqenceDecoding/blob/main/Sonnet_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Input, GRU  # GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.optimizers import Adam

In [2]:
# Configuration: every tunable value used by the cells below.

# --- data / vocabulary limits ---
Tlimit = 100         # upper bound on the padded sequence length
VocabMax = 3000      # vocabulary cap handed to the tokenizer

# --- model sizes ---
Embedding_dim = 100  # width of each row of the embedding matrix
Latent_dim = 25      # number of GRU units

# --- training ---
Val_split = 0.2      # fraction of the data passed as validation_split
Batch_size = 128
EPOCHS = 1000

In [3]:
# Load the corpus line by line and build input/target pairs that are offset by
# one token: the input line starts with '<sos>', the target line ends with '<eos>'.
input_texts=[]
target_texts=[]
# A context manager guarantees the file handle is closed once reading is done
# (the bare open() in a for-loop never closed it).
with open('/content/sonnets.txt') as corpus_file:
  for line in corpus_file:

    line_=line.rstrip()  # drop trailing whitespace, including the "\n"
    input_line='<sos> '+line_
    target_line=line_+' <eos>'

    input_texts.append(input_line)
    target_texts.append(target_line)

# Inputs followed by targets, in one list.
all_lines=input_texts+target_texts

In [4]:
# Build the tokenizer with the vocabulary cap from the configuration.
# filters='' is passed so no characters are stripped from the text.
tokenizer = Tokenizer(filters='', num_words=VocabMax)
tokenizer.fit_on_texts(all_lines)

# Turn inputs and targets into integer sequences, each list separately.
input_sequences, target_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (input_texts, target_texts)
)

In [5]:
# Word -> integer index mapping produced by the fitted tokenizer.
word2idx = tokenizer.word_index
print('found %s unique tokens.' % len(word2idx))

# Both sequence markers must be present in the vocabulary;
# an AssertionError is raised if either one is missing.
for marker in ('<sos>', '<eos>'):
  assert marker in word2idx

found 3477 unique tokens.


In [6]:
# Length of the longest tokenized input, shown next to the configured cap.
Tactual = max(map(len, input_sequences))
print(f'Max seq length:{Tactual}')
print(f'Max seq length limit: {Tlimit}')
# The sequences still have to be padded to a common length afterwards.


Max seq length:20
Max seq length limit: 100


In [7]:
# Common sequence length: the longest observed input, capped at Tlimit.
T = min(Tactual, Tlimit)

# Inputs and targets are padded with the same settings (zeros appended at the end).
pad_kwargs = dict(maxlen=T, padding='post')
padded_input_sequences = pad_sequences(input_sequences, **pad_kwargs)
padded_target_sequences = pad_sequences(target_sequences, **pad_kwargs)
print(padded_input_sequences.shape)

# Input and target sequences are padded to the same length T.

(1291, 20)


In [8]:
# Show the first example at each stage (raw text, token ids, padded ids),
# first for the input and then for the target.
for stage in (input_texts, input_sequences, padded_input_sequences,
              target_texts, target_sequences, padded_target_sequences):
  print(stage[0])

<sos> Somewhile before the dawn I rose, and stept
[1, 989, 134, 4, 512, 6, 990, 3, 991]
[  1 989 134   4 512   6 990   3 991   0   0   0   0   0   0   0   0   0
   0   0]
Somewhile before the dawn I rose, and stept <eos>
[989, 134, 4, 512, 6, 990, 3, 991, 2]
[989 134   4 512   6 990   3 991   2   0   0   0   0   0   0   0   0   0
   0   0]


In [9]:
# Vocabulary size used by the model: the number of known words plus one
# (the padded sequences use index 0 as filler), capped at VocabMax.
vocab_total = len(word2idx) + 1
print(f'total number of vocabulary:{vocab_total}')
print(f'Max vocab limit: {VocabMax}')
num_words = min(VocabMax, vocab_total)
print(num_words)


total number of vocabulary:3478
Max vocab limit: 3000
3000


In [10]:
# One-hot encode the targets for the categorical cross-entropy loss.
# Array layout: (sequence, timestep, word index).
one_hot_targets = np.zeros((len(padded_input_sequences), T, num_words))

# Find every position holding a real token (id > 0) and set its class to 1.
# Padding positions (id 0) are skipped, so their rows stay all-zero.
seq_idx, step_idx = np.nonzero(padded_target_sequences > 0)
one_hot_targets[seq_idx, step_idx, padded_target_sequences[seq_idx, step_idx]] = 1

In [11]:
# Mount Google Drive and read the pre-trained word vectors into word2vec,
# a dict mapping each token to a float32 numpy vector.
from google.colab import drive
drive.mount('/content/gdrive')
word2vec={}
with open('/content/gdrive/MyDrive/glove.6B.100d.txt') as glove_file:
  for row in glove_file:
    # Each row is whitespace-separated: the token first, then its vector components.
    fields = row.split()
    word2vec[fields[0]] = np.asarray(fields[1:], dtype='float32')
print(len(word2vec),'word vectors')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
400000 word vectors


In [12]:
# Embedding weights: row i receives the pre-trained vector of the word whose
# tokenizer index is i. Rows stay all-zero when the word has no entry in word2vec.
embedding_matrix = np.zeros((num_words, Embedding_dim))
for word, i in word2idx.items():
  # Words whose index reaches the vocabulary cap are skipped.
  if i < VocabMax:
    embedding_vector = word2vec.get(word)
    if embedding_vector is not None:
      embedding_matrix[i] = embedding_vector


In [13]:
# Embedding layer initialised from embedding_matrix and frozen (trainable=False),
# so the pre-trained vectors are not updated during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=Embedding_dim,
    weights=[embedding_matrix],
    trainable=False,
    # input_length=Tlimit,
)

In [14]:
# Training model: a padded sequence of length T plus an initial GRU state go in;
# a softmax over num_words comes out for every timestep.
input_ = Input(shape=(T,))
initial_h = Input(shape=(Latent_dim,))

# The layers are held in their own variables (gru, dense) rather than created inline.
gru = GRU(Latent_dim, return_sequences=True, return_state=True)  # Latent_dim units
dense = Dense(num_words, activation='softmax')

embedded = embedding_layer(input_)
# return_state=True makes the GRU return (sequence output, final state);
# the final state is discarded here.
gru_outputs, _ = gru(embedded, initial_state=initial_h)
#x=GlobalMaxPooling1D()(x)
output = dense(gru_outputs)
model = Model([input_, initial_h], output)

In [15]:
# Print the layer-by-layer summary of the training model (output shapes, parameter counts, connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 20, 100)      300000      ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 25)]         0           []                               
                                                                                                  
 gru (GRU)                      [(None, 20, 25),     9525        ['embedding[0][0]',              
                                 (None, 25)]                      'input_2[0][0]']            

In [16]:
#from tensorflow.keras.layers import LSTM
## create an LSTM network with a single LSTM
#input_ = Input(shape=(T,))
#initial_h = Input(shape=(Latent_dim,))
#initial_c = Input(shape=(Latent_dim,))
#x = embedding_layer(input_)
#lstm = LSTM(Latent_dim, return_sequences=True, return_state=True)
#x, _, _ = lstm(x, initial_state=[initial_h, initial_c]) # don't need the states here
#dense = Dense(num_words, activation='softmax')
#output = dense(x)

#model = Model([input_, initial_h, initial_c], output)

In [17]:
# Print the model summary again. The LSTM variant in the preceding cell is fully
# commented out, so `model` is still the GRU model built earlier.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 20, 100)      300000      ['input_1[0][0]']                
                                                                                                  
 input_2 (InputLayer)           [(None, 25)]         0           []                               
                                                                                                  
 gru (GRU)                      [(None, 20, 25),     9525        ['embedding[0][0]',              
                                 (None, 25)]                      'input_2[0][0]']            

In [18]:
# Configure training: categorical cross-entropy loss, Adam with a learning rate
# of 0.01, and accuracy as the reported metric.
# `learning_rate` is used instead of the deprecated `lr` keyword, which triggers a
# deprecation warning and is not accepted by newer optimizer implementations.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.01),
    metrics=['accuracy']
)



  super(Adam, self).__init__(name, **kwargs)


In [19]:
# Train the model; every sequence starts from an all-zero initial GRU state.
z = np.zeros((len(padded_input_sequences), Latent_dim))
r = model.fit(
    x=[padded_input_sequences, z],
    y=one_hot_targets,
    batch_size=Batch_size,
    epochs=EPOCHS,
    validation_split=Val_split,
)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

In [20]:
# Sampling model used for generation: it takes ONE token per call together with
# a GRU state, and returns the next-word distribution plus the updated state.
# It is wired from the same embedding_layer, gru and dense objects (and the same
# initial_h input) that the training model uses.
input2 = Input(shape=(1,))
step_embedded = embedding_layer(input2)
step_output, h = gru(step_embedded, initial_state=[initial_h])
output2 = dense(step_output)
sampling_model = Model(inputs=[input2, initial_h], outputs=[output2, h])

In [21]:
# Reverse the word -> index mapping so an index can be looked up as a word.
inv_map = dict(zip(word2idx.values(), word2idx.keys()))

In [23]:
# Generate 14 lines, sampling one word at a time from the sampling model.
sos_idx = word2idx['<sos>']
eos_idx = word2idx['<eos>']
for k in range(14):
  # Each line starts from '<sos>' with an all-zero GRU state.
  np_input = np.array([[sos_idx]])
  h = np.zeros((1, Latent_dim))
  sentence = []
  for i in range(T - 1):
    yi, h = sampling_model.predict([np_input, h])
    words_dist = yi[0, 0]  # strip the two outer axes of the prediction
    # Index 0 never stands for a word: zero it out and renormalise before sampling.
    words_dist[0] = 0
    words_dist /= words_dist.sum()
    idx = np.random.choice(len(words_dist), p=words_dist)

    sentence.append(inv_map[idx])
    if idx == eos_idx:
      break  # end-of-sentence marker sampled; the line is finished
    # The sampled word becomes the next input.
    np_input = np.array([[idx]])
  print([[' '.join(sentence)]])


[['now her wrong chase loss, <eos>']]
[['unready to forego void for death, of cold lids apart, realities? me <eos>']]
[['o my verse both them qualify. <eos>']]
[['admit impediments. and on you"--and it fears, them <eos>']]
[["my tongue-tied patience unmoved--and my eyes body's heart and kiss <eos>"]]
[['and death travel even your helpless and circumstances, yea, be, do so <eos>']]
[['oh may me; of alas is i was it! sleep true, out <eos>']]
[['no answer, with an this wondrous delivery. <eos>']]
[["because i am cannot upon never crown'd, to or show thy peering eyes. <eos>"]]
[['reason, that might sleep away and he have the you"--and <eos>']]
[['ogni con across softly horse, never greater. knot brave treasure, dream, kill, before like him proud, decay? thirst sonnes']]
[['and satan hates of too thanks all then a wrong. <eos>']]
[['the itchy policy dogs despair, hast allow seek or woe.batter <eos>']]
[['the longing of to it that to make me, <eos>']]
