In [1]:
# Importing Libraries
import spacy
import numpy as np
import random
import tensorflow
from pickle import dump,load
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM,Embedding

### Function Definition

In [2]:
def read_file(filepath):
    '''
    Function to read file from a given path
    '''
    with open(filepath) as f:
        str_text = f.read()
    return str_text

In [3]:
def seperate_punc(doc_text):
    '''
    Function to take only document words that are not punctuation
    '''
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [4]:
def create_model(vocabulary_size, seq_len):
    '''
    Function to create a LSTM model
    '''
    model = Sequential()
    model.add(Embedding(vocabulary_size, seq_len, input_length=seq_len))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(vocabulary_size, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    return model

In [5]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model
    '''
    
    # Final Output
    output_text = []
    
    # Intial Seed Sequence
    input_text = seed_text
    
    # Create num_gen_words
    for i in range(num_gen_words):
        
        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        # Pad sequences to our trained rate (50 words in the video)
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        
        # Predict Class Probabilities for each word
        pred_word_ind = model.predict(pad_encoded)
#         print(pred_word_ind)
        pred_word_ind = np.argmax(pred_word_ind,axis=1)[0]
#         pred_word_ind = np.round(pred_word_ind).astype(int)[0]
#         pred_word_ind = model.predict_classes(pad_encoded, verbose=0)[0]
        print(pred_word_ind)
        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind] 
        
        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word
        
        output_text.append(pred_word)
        
    # Make it look like a sentence.
    return ' '.join(output_text)

In [6]:
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipes('ner', 'tagger', 'parser')
print(nlp.pipe_names)



['tok2vec', 'attribute_ruler', 'lemmatizer']


In [7]:
nlp.max_lenght = 1198623

## Loading the dataset 

In [8]:
dataset = read_file(r"C:\Users\prakash kotian\Desktop\Data_Science\Deep Learning\NLP\NLP Basic\Text Generation with LSTM\moby_dick_four_chapters.txt")

In [9]:
print("\nThe text from our Dataset : \n")
print(dataset[0:500])
print("\n")


The text from our Dataset : 

Call me Ishmael.  Some years ago--never mind how long
precisely--having little or no money in my purse, and nothing
particular to interest me on shore, I thought I would sail about a
little and see the watery part of the world.  It is a way I have of
driving off the spleen and regulating the circulation.  Whenever I
find myself growing grim about the mouth; whenever it is a damp,
drizzly November in my soul; whenever I find myself involuntarily
pausing before coffin warehouses, and bringing up t




In [10]:
tokens = seperate_punc(dataset)





In [11]:
tokens[:10]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long']

In [12]:
## creating a batch of sequence with 26 words

In [13]:
training_length = 25+1
text_sequences = []

for i in range(training_length, len(tokens)):
    seq = tokens[i-training_length:i]
    text_sequences.append(seq)
print("Dividing the chapters into sequences of 26 words - \n\n")
print(f"Text Sequence 1 - \n {' '.join(text_sequences[0])} \n")
print(f"Text Sequence 2 - \n {' '.join(text_sequences[1])} \n")


Dividing the chapters into sequences of 26 words - 


Text Sequence 1 - 
 call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on 

Text Sequence 2 - 
 me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore 



In [14]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)
sequences = tokenizer.texts_to_sequences(text_sequences)
vocabulary_size = len(tokenizer.word_counts)
sequences = np.array(sequences)

In [15]:
sequences[1]

array([  14,  263,   51,  261,  408,   87,  219,  129,  111,  954,  260,
         50,   43,   38,  314,    7,   23,  546,    3,  150,  259,    6,
       2713,   14,   24,  957])

In [16]:
# Divinding into X and y data
X = sequences[:, :-1]
y = sequences[:, -1]
y = to_categorical(y, num_classes=vocabulary_size+1)
seq_len = X.shape[1]

In [17]:
# Creating a model by calling create_model method
model = create_model(vocabulary_size+1, seq_len)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 25)            67975     
                                                                 
 lstm (LSTM)                 (None, 25, 50)            15200     
                                                                 
 lstm_1 (LSTM)               (None, 50)                20200     
                                                                 
 dense (Dense)               (None, 50)                2550      
                                                                 
 dense_1 (Dense)             (None, 2719)              138669    
                                                                 
Total params: 244,594
Trainable params: 244,594
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Fitting the model
model.fit(X, y, batch_size=128, epochs=8, verbose=1)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x2473aa16748>

In [22]:
# Saving the model and tokenizer
model.save('my_mobydick_model.h5')
dump(tokenizer, open('my_simpletokenizer', 'wb'))

In [23]:
# creating a random sequence
random.seed(101)
random_pick = random.randint(0, len(text_sequences))
random_seed_text = text_sequences[random_pick]
seed_text = ' '.join(random_seed_text)
pred_words = generate_text(model, tokenizer, seq_len, seed_text, num_gen_words=15)
print(f"\n\n The input text : \n {seed_text}")
print(f"\n\n The predicted text (next 15 words) is : \n {pred_words}")
seed_text = ' '.join(text_sequences[2565])
pred_words = generate_text(model, tokenizer, seq_len, seed_text, num_gen_words=25)
print(f"\n\n The input text : \n {seed_text}")
print(f"\n\n The predicted text (next 25 words) is : \n {pred_words}")

1
50
65
4
1
50
65
4
1
50
65
4
1
50
65


 The input text : 
 thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have


 The predicted text (next 15 words) is : 
 the little night of the little night of the little night of the little night
50
65
4
1
50
65
4
1
50
65
4
1
50
65
4
1
50
65
4
1
50
65
4
1
50


 The input text : 
 had sounded my pocket and only brought up a few pieces of silver,--so wherever you go ishmael said i to myself as i stood in the


 The predicted text (next 25 words) is : 
 little night of the little night of the little night of the little night of the little night of the little night of the little
