##**WikiGen**
####by Jason Ohanaga

# Import libraries

In [None]:
import tensorflow as tf

import numpy as np
import os
import time

!pip install wikipedia
import wikipedia
import re

# necessary to use colab TPUs
from google.colab import auth
auth.authenticate_user()

# Get text

---

Provide the topic and length of the text to be generated.

In [2]:
TOPIC = input("Input the topic: ")
WORD_COUNT = int(input("Input the word count: "))
NUM_OF_TEXT = int(input("Input the number of text to be generated: "))

Input the topic: elvis presley
Input the word count: 500
Input the number of text to be generated: 5


# Process the text

In [3]:
wiki = wikipedia.page(TOPIC)
text = wiki.content
text = re.sub(r'==.*?==+', '', text)
text = text.replace('\n', '')
text[:50]

'Elvis Aaron Presley (January 8, 1935 – August 16, '

In [4]:
print(f'Length of text: {len(text)} characters')

vocab = sorted(set(text))
print(f'Unique chars: {len(vocab)} ')

Length of text: 115952 characters
Unique chars: 83 


In [5]:
ids_from_chars = tf.keras.layers.experimental.preprocessing.StringLookup( # create ids from chars
    vocabulary=list(vocab), mask_token=None, invert=False)

chars_from_ids = tf.keras.layers.experimental.preprocessing.StringLookup( # create chars from ids
    vocabulary=ids_from_chars.get_vocabulary(), invert=True, mask_token=None)

def text_from_ids(ids):
    ''' Converts chars ids back to chars text sequence'''
    return tf.strings.reduce_join(chars_from_ids(ids), axis=-1) # convert id to char and add to back of list

# Create dataset

In [6]:
# vectorize whole dataset
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))

# creates a dataset from lists
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

for ids in ids_dataset.take(18):
    print(chars_from_ids(ids).numpy().decode('utf-8'), end='')

Elvis Aaron Presle

In [7]:
# create sequences of length 100+1 | +1 is for that extra target char
seq_length = 100
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

In [8]:
# create input and label pairs
def split_input_target(sequence):
    ''' At each time step the input is the 
    current character and the label is the next character'''
    
    input = sequence[:-1]
    target = sequence[1:]
    return input, target

dataset = sequences.map(split_input_target)

for input, label in dataset.take(1):
    print("Input:", text_from_ids(input).numpy())
    print("Label:", text_from_ids(label).numpy())

Input: b'Elvis Aaron Presley (January 8, 1935 \xe2\x80\x93 August 16, 1977), also known simply as Elvis, was an American'
Label: b'lvis Aaron Presley (January 8, 1935 \xe2\x80\x93 August 16, 1977), also known simply as Elvis, was an American '


In [9]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

dataset = dataset.repeat().shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)
dataset

<PrefetchDataset shapes: ((1024, 100), (1024, 100)), types: (tf.int64, tf.int64)>

# Config TPU

In [None]:
tf.keras.backend.clear_session()
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)

# Build model

In [11]:
class MyModel(tf.keras.Model):
    def __init__(self, vocab_size, embed_dimn, rnn_units, stateful=True):
      super().__init__(self)
      self.embed = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dimn)
      self.gru = tf.keras.layers.GRU(rnn_units, stateful=stateful, return_sequences=True)
      self.gru1 = tf.keras.layers.GRU(rnn_units, stateful=stateful, return_sequences=True)
      self.dense = tf.keras.layers.Dense(vocab_size, activation='softmax')
        
    def call(self, inputs):
      x = self.embed(inputs)
      x = self.gru(x)
      x = self.gru1(x)
      output = self.dense(x)
      return output

# Train model

In [12]:
vocab_size = len(ids_from_chars.get_vocabulary())
embed_dimn = 512
rnn_units = 1024

with strategy.scope():
  model = MyModel(
      vocab_size=vocab_size, 
      embed_dimn=embed_dimn, 
      rnn_units=rnn_units,
      stateful=False,)

  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
                loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                metrics=['sparse_categorical_accuracy'])


In [13]:
model.fit(
    dataset,
    steps_per_epoch=100,
    epochs=10,
    callbacks = [tf.keras.callbacks.EarlyStopping(monitor='loss', patience=2)]
)

model.save_weights('./training_model.h5', overwrite=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Generate text

In [14]:
def generate_text(content_txt, predict_len, batch_size):
  # build model with appropriate shape based off of batch size
  tf.keras.backend.clear_session()
  pred_model = MyModel(
      vocab_size=vocab_size, 
      embed_dimn=embed_dimn, 
      rnn_units=rnn_units,
      stateful=True,)
  pred_model.build(input_shape=[batch_size, None])
  pred_model.load_weights('training_model.h5')

  # preprocess initial content string
  content = tf.strings.unicode_split(content_txt, 'UTF-8')
  content = ids_from_chars(content).numpy()
  content = np.repeat(np.expand_dims(content, 0), batch_size, axis=0)

  # run the content forward to prime the state of model
  pred_model.reset_states()
  for i in range(len(content_txt) - 1):
    pred_model.predict(content[:, i:i + 1])

  # get predictions starting from last char of content string
  predictions = [content[:, -1:]]
  for i in range(predict_len):
    last_word = predictions[-1]
    next_probits = pred_model.predict(last_word)[:, 0, :] # pred.shape -> [batch size, seq len, vocab size]

    # sample from our outputs probability distribution and add to predictions array
    next_idx = [np.random.choice(vocab_size, p=next_probits[b]) for b in range(batch_size)]
    predictions.append(np.array(next_idx, dtype=np.int32))

  # convert predicted text ids back to chars and save
  for b in range(batch_size):
    p = np.array([predictions[j][b] for j in range(predict_len)], dtype=np.int32)
    generate = content_txt[:-1]+text_from_ids(p).numpy().decode('utf-8')
    
    # save to file
    path = "generated_{0}_{1}.txt".format(TOPIC.replace(' ', '_'), b+1)
    f = open(path, "w")
    f.write(generate)
    f.close()
    print(f'GENERATED TEXT {b+1}/{batch_size} Complete\n\n')

  return


In [15]:
average_chars_per_word = 5
content_txt = wiki.title
predict_len = WORD_COUNT*average_chars_per_word - len(content_txt) # convert word count to average amount of chars

generate_text(content_txt, predict_len, batch_size=NUM_OF_TEXT)

GENERATED TEXT 1/5 Complete


GENERATED TEXT 2/5 Complete


GENERATED TEXT 3/5 Complete


GENERATED TEXT 4/5 Complete


GENERATED TEXT 5/5 Complete


