In [1]:
import tensorflow as tf
import string
import requests
import pandas as pd

In [2]:
response = requests.get('https://github.com/kamalsai369/TextgenerationusingLSTM/blob/main/TextGeneration/Datasets/doc.txt')


In [3]:
data=response.text.splitlines()
print(data[:10]) # 10 sample

['Looking for some education', 'Made my way into the night', 'All that bullshit conversation', "Baby, can't you read the signs? I won't bore you with the details, baby", "I don't even wanna waste your time", "Let's just say that maybe", 'You could help me ease my mind', "I ain't Mr. Right But if you're looking for fast love", "If that's love in your eyes", "It's more than enough"]


In [4]:
len(data)
#there is 2400 lines in this poetry

2400

In [5]:
# Total word number is:
len(" ".join(data))

91330

In [6]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences


#### LSTM model and train test preparation

In [7]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(data) #it's going to fit on the data in the forms of lines.

In [8]:
# help(tokenizer)

In [9]:
encoded_text=tokenizer.texts_to_sequences(data)
encoded_text[:10]
# These number actually comes from vocabulary
# it assigns number to each words

[[254, 21, 219, 725],
 [117, 8, 80, 153, 3, 133],
 [14, 10, 726, 727],
 [41, 56, 2, 603, 3, 728, 1, 68, 517, 2, 40, 3, 518, 41],
 [1, 23, 107, 189, 300, 9, 57],
 [286, 35, 46, 10, 230],
 [2, 83, 134, 4, 519, 8, 120],
 [1, 37, 520, 102, 19, 27, 25, 254, 21, 328, 11],
 [27, 209, 11, 13, 9, 124],
 [42, 67, 210, 125]]

In [10]:
wc=tokenizer.word_counts
# word frequency

In [11]:
wi=tokenizer.word_index

In [12]:
print(f"Number of unique words and total vocab size: {len(tokenizer.word_counts)+1}")
vocab_size=len(tokenizer.word_counts)+1 # always adding plus one for tensorflow

Number of unique words and total vocab size: 1396


In [13]:
x=["play this song"]

In [14]:
tokenizer.texts_to_sequences(x)

[[241, 44, 409]]

### Prepare data for training

In [15]:
 encoded_text[:10] # 10 sample

[[254, 21, 219, 725],
 [117, 8, 80, 153, 3, 133],
 [14, 10, 726, 727],
 [41, 56, 2, 603, 3, 728, 1, 68, 517, 2, 40, 3, 518, 41],
 [1, 23, 107, 189, 300, 9, 57],
 [286, 35, 46, 10, 230],
 [2, 83, 134, 4, 519, 8, 120],
 [1, 37, 520, 102, 19, 27, 25, 254, 21, 328, 11],
 [27, 209, 11, 13, 9, 124],
 [42, 67, 210, 125]]

In [16]:
data_list=[]
for i in encoded_text:
    if len(i)>1:
        for j in range(2,len(i)):
            data_list.append(i[:j])
#             print(i[:j]) # if you want to check data

#### Paddding

In [17]:
max_length=20
#max length of line is 20 token per line in our poetry

In [18]:
sequences=pad_sequences(data_list,maxlen=max_length,padding="pre") # we set the lenght size equal to 20
sequences[:5] # 5 sample

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0, 254,  21],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0, 254,  21, 219],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0, 117,   8],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0, 117,   8,  80],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0, 117,   8,  80, 153]])

In [19]:
sequences.shape

(14231, 20)

In [20]:
X=sequences[:,:-1]
y=sequences[:,-1]
print("X values")
print(X[:5]) # 5 sample
print("-"*30)
print("X values")
print(y[:5]) # 5 sample

X values
[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
  254]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 254
   21]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
  117]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 117
    8]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 117   8
   80]]
------------------------------
X values
[ 21 219   8  80 153]


In [21]:
X.shape,y.shape

((14231, 19), (14231,))

In [22]:
y=to_categorical(y,num_classes=vocab_size)
#since unique word number is vocab_size, thus there is vocab_size classes
print(y[:5]) # 5 sample
print("Shape of y",y.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Shape of y (14231, 1396)


In [23]:
print("Shape of X",X.shape)
seq_length=X.shape[1]
seq_length

Shape of X (14231, 19)


19

#### Build Model
- We will build a simple LSTM model

In [24]:
model=Sequential()
model.add(Embedding(vocab_size,50,input_length=seq_length)) 
#The first layer is the Embedded layer that uses 50-length vectors
#return_sequences=True because we add another LSTM
model.add(LSTM(100,return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100,activation="relu"))
model.add(Dense(vocab_size,activation="softmax")) # we use softmax because there is multiclasses



In [25]:
# help(Embedding)

#### You can use dropout() , You will see that Training will be slightly slower trend in convergence, maybe lower final accuracy

In [26]:
# model=Sequential()
# model.add(Embedding(vocab_size,50,input_length=seq_length)) 
# #The first layer is the Embedded layer that uses 50-length vectors
# #return_sequences=True because we add another LSTM
# model.add(Dropout(0.2))
# model.add(LSTM(100,return_sequences=True))
# model.add(Dropout(0.2))
# model.add(LSTM(100))
# model.add(Dropout(0.2))
# model.add(Dense(100,activation="relu"))
# model.add(Dense(vocab_size,activation="softmax")) # we use softmax because there is multiclasses

####  Dropout can be applied to the input and recurrent connections of the memory units with the LSTM precisely and separately.
#### LSTM-specific dropout may have more pronounced effect on the convergence of the network than the layer-wise dropout.

In [27]:
# model=Sequential()
# model.add(Embedding(vocab_size,50,input_length=seq_length)) 
# #The first layer is the Embedded layer that uses 50-length vectors
# #return_sequences=True because we add another LSTM
# model.add(LSTM(100,return_sequences=True,dropout=0.2,recurrent_dropout=0.2))
# model.add(LSTM(100,dropout=0.2,recurrent_dropout=0.2))
# model.add(Dense(100,activation="relu"))
# model.add(Dense(vocab_size,activation="softmax")) # we use softmax because there is multiclasses

#### LSTM and CNN can be used together

In [28]:
# model=Sequential()
# model.add(Embedding(vocab_size,50,input_length=seq_length)) 
# #The first layer is the Embedded layer that uses 50-length vectors
# #return_sequences=True because we add another LSTM
# model.add(Conv1D(filters=32,kernel_size=3, padding="same",activation="relu"))
# model.add(MaxPooling1D(pool_size=2))
# model.add(LSTM(100,return_sequences=True))
# model.add(LSTM(100))
# model.add(Dense(100,activation="relu"))
# model.add(Dense(vocab_size,activation="softmax")) # we use softmax because there is multiclasses

In [29]:
model.summary()

In [30]:
model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])

In [31]:
model.fit(X,y,batch_size=32,epochs=15)

Epoch 1/15
[1m130/445[0m [32m━━━━━[0m[37m━━━━━━━━━━━━━━━[0m [1m4s[0m 15ms/step - accuracy: 0.0329 - loss: 6.4201

KeyboardInterrupt: 

### Text Generation

In [32]:
text_lenght= 15 # 15 words per line

def generate_text(input_text, no_lines):
    general_text=[]
    for i in range(no_lines):
        text=[]
        for _ in range(text_lenght):
            encoded=tokenizer.texts_to_sequences([input_text])
            encoded=pad_sequences(encoded,maxlen=seq_length,padding="pre")
            y_pred=np.argmax(model.predict(encoded),axis=-1) # it will generate a word index, loop up into dictionary containing word index
            
            predicted_word=""
            for word,index in tokenizer.word_index.items():
                if index==y_pred:
                    predicted_word=word
                    break
                    
            input_text=input_text +' '+ predicted_word
            text.append(predicted_word)
        
        input_text=text[-1]
        text=" ".join(text) # input text will be the last word of first created line
        general_text.append(text)
    
    return general_text

In [33]:
input_text="me"
text_produced=generate_text(input_text,6)
text_produced

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

['i i i i i i i i i i i i i i i',
 'i i i i i i i i i i i i i i i',
 'i i i i i i i i i i i i i i i',
 'i i i i i i i i i i i i i i i',
 'i i i i i i i i i i i i i i i',
 'i i i i i i i i i i i i i i i']

In [None]:
input_text="i want to see you"
text_produced=generate_text(input_text,6)
text_produced

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26

['never had met me down in the end with you to love me ease my',
 "god this reminds that i won't bore you to love me ease my love to",
 'prove i am it be someone like you want me to love me ease my',
 "god this reminds that i won't bore you to love me ease my love to",
 'prove i am it be someone like you want me to love me ease my',
 "god this reminds that i won't bore you to love me ease my love to"]

In [None]:
input_text="i want to see you"
text_produced=generate_text(input_text,6)
text_produced

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35

['never had met me down in the end with you to love me ease my',
 "god this reminds that i won't bore you to love me ease my love to",
 'prove i am it be someone like you want me to love me ease my',
 "god this reminds that i won't bore you to love me ease my love to",
 'prove i am it be someone like you want me to love me ease my',
 "god this reminds that i won't bore you to love me ease my love to"]

In [None]:
input_text="i think"
text_produced=generate_text(input_text,6)
text_produced

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29

['i left you apart anymore yeah oh send the sky fall in my heart inside',
 "my way remind me of the last time that i won't let the sky fall",
 'up to you wish i was my place should i leave it on the river',
 "to you road that you're gonna wish you never had met me down in the",
 "scars of your love remind me of the last time that i won't let the",
 "scars of your love remind me of the last time that i won't let the"]