# Text Generation with Neural Networks

In [None]:
import spacy
import numpy as np
from pickle import dump, load

## Tokenize and Clean Text

In [None]:
# Load the small English spaCy pipeline. Only token texts are used later on,
# so the parser, tagger and NER components are switched off.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'tagger', 'ner'],
)

# Allow documents of up to 1198623 characters to be processed in one call.
nlp.max_length = 1198623

In [None]:
def separate_punc(doc_text):
    '''
    Tokenize text with spaCy and return the lower-cased token strings.

    A token is dropped when its text occurs as a substring of the
    punctuation/whitespace filter string defined below.

    Parameters
    ----------
    doc_text : raw string to tokenize

    Outputs
    -------
    List of lower-cased token strings
    '''
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '

    kept = []
    for token in nlp(doc_text):
        # Substring test against the filter string (not a per-character test)
        if token.text in unwanted:
            continue
        kept.append(token.text.lower())
    return kept
    
    
# Read the complete text of the novel into memory
filepath = 'Data/melville-moby_dick.txt'
with open(filepath) as f:
    d = f.read()

# Turn the raw text into a list of cleaned, lower-cased tokens
tokens = separate_punc(d)
print('[INFO] Number of tokens: ', len(tokens))

## Create Sequences of Tokens

In [None]:
# Organize into overlapping sequences of tokens
train_len = 25+1 # 25 training words, then one target word

# Slide a window of train_len tokens over the corpus, one token at a time.
# The upper bound is len(tokens) + 1 so that the final window (the one that
# ends on the very last token) is included as well.
text_sequences = [tokens[end - train_len:end]
                  for end in range(train_len, len(tokens) + 1)]


' '.join(text_sequences[0])

# Build the Text-Generation Model

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Conv1D, Flatten
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Fit a Keras tokenizer on the token sequences to build the word index
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

# Replace every token by its integer id and stack the rows into an array
sequences = np.array(tokenizer.texts_to_sequences(text_sequences))

# Number of distinct words plus one (Keras word ids start at 1, so id 0
# is never assigned to a word)
vocabulary_size = 1 + len(tokenizer.word_counts)
print('[INFO] Vocabulary size: ', vocabulary_size)

### Prepare Training Data

In [None]:
# Split each encoded sequence: every id but the last is the model input,
# the last id is the word to be predicted
X, y = sequences[:, :-1], sequences[:, -1]

# One-hot encode the target ids
y = to_categorical(y, num_classes=vocabulary_size)

# The input length is the number of columns of X
seq_length = X.shape[1]
print('[INFO] Sequence length: ', seq_length)

### Training the Model

In [None]:
# Embedding -> strided 1D convolution -> dense classifier over the vocabulary
model = Sequential([
    Embedding(vocabulary_size, train_len, input_length=seq_length),
    Conv1D(256, kernel_size=4, strides=2, padding="same", activation='relu'),
    Flatten(),
    Dense(100, activation='relu'),
    Dense(vocabulary_size, activation='softmax'),
])

# The targets are one-hot vectors, hence categorical cross-entropy
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# model.summary()

In [None]:
# Train for 300 epochs in batches of 128; the value returned by fit() is
# kept in `score`
score = model.fit(X, y, epochs=300, batch_size=128, verbose=True)


# Generating New Text

In [None]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    '''
    Generate text by repeatedly predicting the next word.

    Parameters
    ----------
    model : model that was trained on text data
    tokenizer : tokenizer that was fit on text data
    seq_len : length of training sequence
    seed_text : raw string text to serve as the seed
    num_gen_words : number of words to be generated by model

    Outputs
    -------
    Generated words joined by single spaces (the seed text is not included)
    '''

    # Final Output
    output_text = []

    # Encode the seed once. Predicted word ids are appended to this list
    # directly instead of re-tokenizing an ever-growing string on every
    # step, which was quadratic and could alter or drop predicted words that
    # contain characters removed by the tokenizer's string filters.
    encoded_text = tokenizer.texts_to_sequences([seed_text])[0]

    # Create num_gen_words
    for _ in range(num_gen_words):

        # Keep only the last seq_len ids; shorter inputs are padded
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # Sequential.predict_classes() is gone from current TensorFlow
        # releases; the arg-max over the predicted probabilities is the
        # equivalent class index.
        class_probs = model.predict(pad_encoded, verbose=0)
        pred_word_ind = int(np.argmax(class_probs, axis=-1)[0])

        # Grab word
        pred_word = tokenizer.index_word[pred_word_ind]

        # Extend the input sequence with the new word for the next step
        encoded_text.append(pred_word_ind)

        output_text.append(pred_word)

    # Make it look like a sentence.
    return ' '.join(output_text)

### Grab a random seed sequence

In [None]:
import random
random.seed(101)
# randint() includes both end points, so the upper bound has to be the last
# valid index; using len(text_sequences) could raise an IndexError below.
random_pick = random.randint(0, len(text_sequences) - 1)

# Select random text
random_seed_text = text_sequences[random_pick]

seed_text = ' '.join(random_seed_text)
print(f'Select sentence: "{seed_text}"')

In [None]:
# Generate 50 new words that continue the selected seed text
generate_text(model, tokenizer, seq_length, seed_text=seed_text, num_gen_words=50)

# Exploring Generated Sequence

In [None]:
# Read the four-chapter excerpt into memory
filepath = 'Data/moby_dick_four_chapters.txt'
with open(filepath) as f:
    full_text = f.read()

In [None]:
# Print every occurrence of 'inkling' together with its surrounding words
# (up to 20 words before it and 19 after it).
words = full_text.split()  # split once instead of once per match
for i, word in enumerate(words):
    if word == 'inkling':
        # Clamp the start at 0: a negative start would be counted from the
        # end of the list and yield a wrong (usually empty) context.
        print(' '.join(words[max(0, i - 20):i + 20]))
        print('\n')