# Text Generation with Neural Networks

In [1]:
import spacy
import numpy as np
from pickle import dump, load

  return torch._C._cuda_getDeviceCount() > 0


## Tokenize and Clean Text

In [2]:
# Load the small English spaCy pipeline with the parser, tagger and NER
# components disabled; only the token texts are used further down.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'tagger','ner'])
# Raise spaCy's maximum text length so the whole corpus can be processed in a
# single nlp(...) call.
# NOTE(review): 1198623 presumably matches the character count of the corpus file - confirm.
nlp.max_length = 1198623

In [3]:
def separate_punc(doc_text):
    '''
    Tokenize text and drop punctuation / whitespace tokens.

    Parameters
    ----------
    doc_text : raw string to tokenize with the module-level spaCy pipeline `nlp`

    Outputs
    -------
    List of lower-cased token texts. A token is skipped when its text occurs
    as a substring of the filter string below.
    '''
    # Characters (and character runs) that should not become tokens
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '

    kept = []
    for token in nlp(doc_text):
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept
    
    
# Read the full corpus text into memory.
# NOTE(review): open() is called without an explicit encoding, so the platform
# default is used - confirm it matches the file.
filepath =  'Data/melville-moby_dick.txt'
with open(filepath) as f:
    d = f.read()

# Tokenization
# separate_punc returns lower-cased token texts with punctuation and
# whitespace tokens removed.
tokens = separate_punc(d)
print('[INFO] Number of tokens: ', len(tokens))



[INFO] Number of tokens:  214708


## Create Sequences of Tokens

In [4]:
# Organize into sequences of tokens
train_len = 25+1 # 25 training words, then one target word

# Slide a window of train_len tokens over the corpus, advancing one token at a
# time. The upper bound is len(tokens) + 1 so that the final window - the one
# ending on the very last token - is included as well.
text_sequences = [tokens[end - train_len:end]
                  for end in range(train_len, len(tokens) + 1)]

# Preview the first window as plain text
' '.join(text_sequences[0])

'chapter 1 loomings call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to'

# Build Text-Generation Model

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Conv1D, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [6]:
# Fit a Keras tokenizer on the token windows so each word is mapped to an integer id
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

# Encode every window as integer ids and stack the result into a NumPy array
sequences = np.array(tokenizer.texts_to_sequences(text_sequences))

# Number of distinct words seen by the tokenizer, plus one
# (presumably because word ids start at 1, leaving id 0 unused - confirm)
vocabulary_size = 1 + len(tokenizer.word_counts)
print('[INFO] Vocabulary size: ', vocabulary_size)

[INFO] Vocabulary size:  17527


### Prepare Training Data

In [7]:
# Create inputs/outputs
# Every column except the last is the model input; the last column is the
# word the model has to predict.
X = sequences[:, :-1]
y = sequences[:, -1]

# One-hot encode the target ids over the whole vocabulary
y = to_categorical(y, num_classes=vocabulary_size)

# Calculate sequence length
# (number of input words per sample, taken from the second axis of X)
seq_length = X.shape[1]
print('[INFO] Sequence length: ', seq_length)

[INFO] Sequence length:  25


### Training the Model

In [8]:
# Network: embedding -> strided 1-D convolution -> flatten -> dense ->
# softmax over the vocabulary (one probability per candidate next word).
# NOTE(review): the embedding width is set to train_len; that looks
# coincidental rather than required - confirm the intended dimension.
model = Sequential([
    Embedding(vocabulary_size, train_len, input_length=seq_length),
    Conv1D(256, kernel_size=4, strides=2, padding="same", activation='relu'),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(vocabulary_size, activation='softmax'),
])

# Targets are one-hot vectors, hence the categorical cross-entropy loss
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# model.summary()

In [9]:
# Train the model on the full set of windows; the value returned by fit() is
# kept in `score`.
# NOTE(review): the recorded run was stopped by hand (KeyboardInterrupt) well
# before epoch 300, in which case `score` is never assigned - consider a
# smaller epoch count or an early-stopping callback.
score = model.fit(X, y,
                  batch_size=128, 
                  verbose=True, 
                  epochs=300)


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300

KeyboardInterrupt: 

# Generating New Text

In [10]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate text by repeatedly predicting the next word from a seed.

    On every step the running text is encoded, cut/padded to `seq_len` ids,
    fed to the model, and the most probable word is appended to the running
    text before the next step.

    Parameters
    ----------
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    Outputs
    -------
    Generated text: the predicted words joined by single spaces (the seed
    itself is not included).

    Raises
    ------
    KeyError if the predicted index has no entry in `tokenizer.index_word`.
    '''

    # Final Output
    output_text = []

    # Initial Seed Sequence
    input_text = seed_text

    # Create num_gen_words
    for _ in range(num_gen_words):

        # Take the input text string and encode it to a sequence
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Pad/truncate to the length the model was trained on; truncating='pre'
        # keeps the most recent seq_len words
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Predict class probabilities and take the most likely word index.
        # Sequential.predict_classes is deprecated (and removed in newer
        # TensorFlow releases); argmax over predict() is the replacement.
        pred_probs = model.predict(pad_encoded, verbose=0)
        pred_word_ind = int(np.argmax(pred_probs, axis=-1)[0])

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Update the sequence of input text (shifting one over with the new word)
        input_text += ' ' + pred_word

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

### Grab a random seed sequence

In [11]:
import random
random.seed(101)

# Pick a random index into text_sequences. randrange() excludes its upper
# bound, so the pick is always a valid index; randint(0, len(text_sequences))
# includes the upper bound and could raise IndexError on the lookup below.
random_pick = random.randrange(len(text_sequences))

# Select random text
random_seed_text = text_sequences[random_pick]

# Join the tokens back into a plain string to use as the generation seed
seed_text = ' '.join(random_seed_text)
print(f'Select sentence: "{seed_text}"')

Select sentence: "he recognised his cutting spade pole entangled in the lines that were knotted round the tail of one of these whales there 's a pretty fellow"


In [12]:
# Generate 50 words that continue the randomly selected seed text
generate_text(model,tokenizer,seq_length,seed_text=seed_text,num_gen_words=50)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


'had not been deemed profound usage i should fain not proceeded to get himself at the same hand how it had not given to the matter to keep them ascending and knowing the capstan and lo giving the stranger sails!--aye aye to sleep that from him that holding faith asleep'

# Exploring Generated Sequence

In [13]:
# Load a second text file (an excerpt of the book, judging by its name) to
# look words up in their original context.
# Note: this reuses - and overwrites - the `filepath` variable set earlier.
filepath = 'Data/moby_dick_four_chapters.txt'

with open(filepath) as f:
    full_text = f.read()

In [14]:
# Show every occurrence of 'inkling' with up to 20 words of context on each side.
# Split once up front instead of re-splitting the whole text for every match.
words = full_text.split()
for i, word in enumerate(words):
    if word == 'inkling':
        # Clamp the window start at 0: a negative start would be read as an
        # offset from the end of the list and yield a wrong (usually empty) slice.
        print(' '.join(words[max(0, i-20):i+20]))
        print('\n')

were stains of some sort or other. At first I knew not what to make of this; but soon an inkling of the truth occurred to me. I remembered a story of a white man--a whaleman too--who, falling among the


