In [1]:
def read_file(filepath, encoding=None):
    """Read a whole text file into a single string.

    Parameters
    ----------
    filepath : str or path-like
        Location of the file to read.
    encoding : str, optional
        Text encoding handed to ``open``. The default ``None`` keeps the
        platform's default encoding, i.e. the behaviour of the original
        one-argument call; pass e.g. ``'utf-8'`` to decode the file the
        same way on every machine.

    Returns
    -------
    str
        Full contents of the file.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [2]:
import spacy
# disable= switches off pipeline components that are not needed for plain tokenisation
nlp = spacy.load('en', disable = ['parser', 'tagger', 'ner'])
# raise the maximum accepted text length above the default limit of 1,000,000 characters
nlp.max_length = 1198623

In [3]:
def seperate_func(doc_text):
    """Tokenise ``doc_text`` with the module-level ``nlp`` pipeline and return lower-cased tokens.

    A token is dropped when its text occurs as a substring of the filter
    string below (punctuation marks and whitespace runs); every remaining
    token is returned lower-cased, in document order.
    """
    # Membership is a substring test against this string, so multi-character
    # runs are filtered only when they appear in it verbatim.
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '
    kept = []
    for token in nlp(doc_text):
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept

In [4]:
# A forward slash works on Windows and POSIX alike; the previous 'sources\m...' literal
# only worked because '\m' is an unrecognised (and deprecated) escape sequence.
text1 = read_file('sources/moby_dick_four_chapters.txt')
tokens = seperate_func(text1)
len(tokens)

11338

In [5]:
# Sliding windows of 26 consecutive tokens: the first 25 are the model input,
# the 26th is the word to predict.
train_len = 25 + 1
# One window per start offset. NOTE: the range stops one short, so the window
# that ends on the very last token is not produced (same as the index loop it replaces).
text_sequences = [
    tokens[start:start + train_len]
    for start in range(len(tokens) - train_len)
]
len(text_sequences), type(text_sequences)

(11312, list)

In [6]:
# consecutive windows overlap: each one is the previous window shifted by one word
tuple(' '.join(text_sequences[k]) for k in (0, 1))

('call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on',
 'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore')

In [7]:
# converting sequences of words to sequences of integer IDs
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
# build the word <-> ID vocabulary from the token windows
tokenizer.fit_on_texts(text_sequences)
# same windows, with every word replaced by its ID
sequences = tokenizer.texts_to_sequences(text_sequences)
# first encoded window (train_len = 26 IDs)
sequences[0]

[956,
 14,
 263,
 51,
 261,
 408,
 87,
 219,
 129,
 111,
 954,
 260,
 50,
 43,
 38,
 315,
 7,
 23,
 546,
 3,
 150,
 259,
 6,
 2712,
 14,
 24]

In [8]:
# mapping of every ID to its word (IDs start at 1)
# NOTE: this displays the entire vocabulary dictionary
tokenizer.index_word

{1: 'the',
 2: 'a',
 3: 'and',
 4: 'of',
 5: 'i',
 6: 'to',
 7: 'in',
 8: 'it',
 9: 'that',
 10: 'he',
 11: 'his',
 12: 'was',
 13: 'but',
 14: 'me',
 15: 'with',
 16: 'as',
 17: 'at',
 18: 'this',
 19: 'you',
 20: 'is',
 21: 'all',
 22: 'for',
 23: 'my',
 24: 'on',
 25: 'be',
 26: "'s",
 27: 'not',
 28: 'from',
 29: 'there',
 30: 'one',
 31: 'up',
 32: 'what',
 33: 'him',
 34: 'so',
 35: 'bed',
 36: 'now',
 37: 'about',
 38: 'no',
 39: 'into',
 40: 'by',
 41: 'were',
 42: 'out',
 43: 'or',
 44: 'harpooneer',
 45: 'had',
 46: 'then',
 47: 'have',
 48: 'an',
 49: 'upon',
 50: 'little',
 51: 'some',
 52: 'old',
 53: 'like',
 54: 'if',
 55: 'they',
 56: 'would',
 57: 'do',
 58: 'over',
 59: 'landlord',
 60: 'thought',
 61: 'room',
 62: 'when',
 63: 'could',
 64: "n't",
 65: 'night',
 66: 'here',
 67: 'head',
 68: 'such',
 69: 'which',
 70: 'man',
 71: 'did',
 72: 'sea',
 73: 'time',
 74: 'other',
 75: 'very',
 76: 'go',
 77: 'these',
 78: 'more',
 79: 'though',
 80: 'first',
 81: 'sort',


In [9]:
# listing "ID : word" pairs for the first encoded window, to check the mapping round-trips
for i in sequences[0]:
    print(f"{i} : {tokenizer.index_word[i]}")

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
315 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2712 : interest
14 : me
24 : on


In [10]:
# word counts
# NOTE(review): the tokenizer was fitted on overlapping windows, so these counts are
# tallied across windows and exceed raw corpus frequencies (e.g. 'the' is counted more
# often than there are tokens in the text) - do not read them as word frequencies.
print(len(tokenizer.word_counts)) # number of unique words
tokenizer.word_counts # counts for each word

2717


OrderedDict([('call', 27),
             ('me', 2471),
             ('ishmael', 133),
             ('some', 758),
             ('years', 135),
             ('ago', 84),
             ('never', 449),
             ('mind', 164),
             ('how', 321),
             ('long', 374),
             ('precisely', 37),
             ('having', 142),
             ('little', 767),
             ('or', 950),
             ('no', 1003),
             ('money', 120),
             ('in', 5647),
             ('my', 1786),
             ('purse', 71),
             ('and', 9646),
             ('nothing', 281),
             ('particular', 152),
             ('to', 6497),
             ('interest', 24),
             ('on', 1716),
             ('shore', 26),
             ('i', 7150),
             ('thought', 676),
             ('would', 702),
             ('sail', 104),
             ('about', 1014),
             ('a', 10377),
             ('see', 416),
             ('the', 15540),
             ('watery', 26),
  

In [11]:
# converting the list of encoded windows to a 2-D array (one row per window)
import numpy as np
sequences_arr = np.array(sequences)
sequences_arr

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

In [12]:
# re-display of the encoded array (same value as shown when it was created)
sequences_arr

array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

In [13]:
# to predict the 26th word of each window from its first 25 words
X = sequences_arr[:,:-1] # every column except the last: the 25 input word IDs
y = sequences_arr[:,-1] # last column: the ID of the word to predict (the label)

In [14]:
# converting label to categorical (one-hot) data
from keras.utils import to_categorical
vocabulary_size = len(tokenizer.word_counts)
# Encode straight from the last column of sequences_arr rather than from y itself,
# so re-running this cell cannot one-hot an already one-hot y.
# Word IDs start at 1, hence vocabulary_size + 1 classes.
y = to_categorical(sequences_arr[:, -1], num_classes=vocabulary_size + 1)
seq_len = X.shape[1]
X.shape

(11312, 25)

In [15]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding

In [16]:
def create_model(vocabulary_size, seq_len):
    """Build, compile and summarise the stacked-LSTM next-word classifier.

    Parameters
    ----------
    vocabulary_size : int
        Number of embedding rows and of softmax output classes.
    seq_len : int
        Length of each input sequence; also used as the embedding width
        and (doubled) as the size of the first LSTM layer.

    Returns
    -------
    The compiled ``Sequential`` model (its summary is printed as a side effect).
    """
    layer_stack = [
        # word IDs -> dense vectors of width seq_len
        Embedding(vocabulary_size, seq_len, input_length=seq_len),
        # first recurrent layer keeps the full sequence for the next LSTM
        LSTM(seq_len * 2, return_sequences=True),
        LSTM(150),
        Dense(50, activation='relu'),
        # one output probability per vocabulary entry
        Dense(vocabulary_size, activation='softmax'),
    ]
    model = Sequential(layer_stack)
    # each vocabulary word is treated as its own category
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    return model

In [17]:
# word IDs run from 1 to vocabulary_size, so the model needs vocabulary_size + 1 slots
# (this matches the num_classes used for the one-hot labels)
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            67950     
_________________________________________________________________
lstm (LSTM)                  (None, 25, 50)            15200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 150)               120600    
_________________________________________________________________
dense (Dense)                (None, 50)                7550      
_________________________________________________________________
dense_1 (Dense)              (None, 2718)              138618    
Total params: 349,918
Trainable params: 349,918
Non-trainable params: 0
_________________________________________________________________


In [18]:
# batch_size: how many sequences are passed through the network at a time;
# only 2 epochs are run here, so this model is barely trained
model.fit(X, y, batch_size = 128, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1c6d2c25640>

In [19]:
from pickle import dump, load
model.save('my_mobydick_model.h5') # saving the model
# the context manager closes the file handle once the tokenizer is written
# (a bare open() passed to dump() was never closed explicitly)
with open('my_simpletokenizer', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file) # saving the tokenizer

In [20]:
from keras.preprocessing.sequence import pad_sequences

In [21]:
# generating new word
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate new words by repeatedly predicting the next word and feeding it back in.

    INPUTS:
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    RETURNS:
    str : the generated words joined by single spaces (the seed text is not included)
    '''

    output_text = []
    input_text = seed_text # defining initial seed text

    for _ in range(num_gen_words): # number of words to be generated
        # transforming text data to a sequence of word IDs
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        # padding / truncating at the front so exactly the last seq_len IDs are fed in
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')
        # Sequential.predict_classes is deprecated; take the argmax over the
        # predicted class probabilities instead, as the deprecation notice advises.
        probabilities = model.predict(pad_encoded, verbose=0)
        pred_word_ind = int(np.argmax(probabilities, axis=-1)[0])
        # NOTE: word IDs start at 1, so a predicted class 0 has no entry and raises KeyError
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' ' + pred_word # adding predicted word to the running text
        output_text.append(pred_word)

    return ' '.join(output_text)

In [22]:
# picking a random seed text from the text
import random
random.seed(101)
# randrange excludes its upper bound; randint(0, len(text_sequences)) could return
# len(text_sequences) itself and raise IndexError on the lookup below
random_pick = random.randrange(len(text_sequences))
random_seed_text = ' '.join(text_sequences[random_pick])
random_seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [23]:
# running the briefly trained model (2 epochs) on the random seed text
generate_text(model, tokenizer, seq_len, seed_text=random_seed_text, num_gen_words=25)
# the generated text consists only of 'the' - presumably because this model is under-trained;
# the same seed text is reused later with a different, pre-trained model

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


'the the the the the the the the the the the the the the the the the the the the the the the the the'

In [24]:
# running a pre-trained model (presumably trained for many more epochs) on the same seed text
from keras.models import load_model

model = load_model('epochBIG.h5')
# NOTE: unpickling can execute arbitrary code - only load tokenizer files you trust.
# The context manager closes the file handle after loading.
with open('epochBIG', 'rb') as tokenizer_file:
    tokenizer2 = load(tokenizer_file)
generate_text(model, tokenizer2, seq_len, seed_text=random_seed_text, num_gen_words=25)

"to be seen there was no bad olfactories my own letter was cheerily listening over his hearers who 's more can go have a wearing"