In [6]:
# Third-party dependencies, grouped in one place and listed alphabetically.
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

# Fetch the 'punkt' tokenizer data for NLTK (presumably what the
# nltk.word_tokenize calls in this notebook need — confirm for your NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Overview

In this notebook we train a next-word prediction model on the text of *Anne of Green Gables*. Our approach is to use the Universal Sentence Encoder to embed contexts of random length (1 to 9 consecutive words) and to predict the word that follows each context with a neural network that has one hidden layer.

# Dataset Preparation

Mount Google Drive and read your training corpus from it (this notebook expects `anne.txt` at the top level of My Drive).

In [1]:
# Mount Google Drive so that anne.txt can be read from it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Read the whole corpus into memory as a single string.
# The encoding is stated explicitly so that decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes anne.txt is UTF-8 encoded — confirm for your copy.
with open('/content/drive/My Drive/anne.txt', 'r', encoding='utf-8') as f:
  raw_text = f.read()

In [0]:
def preprocess_text(raw_text):
  """Lowercase and tokenize `raw_text`, keeping only alphabetic tokens.

  Args:
    raw_text: The text to process, as a single string.

  Returns:
    A list of the tokens produced by `nltk.word_tokenize` on the lowercased
    text, restricted to tokens for which `str.isalpha()` is true (so tokens
    containing punctuation or digits are dropped).
  """
  tokens = nltk.word_tokenize(raw_text.lower())
  # The original body built this list but never returned it, so every call
  # evaluated to None.
  return [token for token in tokens if token.isalpha()]

In [0]:
# Tokenize the lowercased corpus into a flat list of tokens.
lowercased_text = raw_text.lower()
words = nltk.word_tokenize(lowercased_text)

In [0]:
# Keep only purely alphabetic tokens. Dropping every token that contains
# punctuation may be too aggressive, but it cleans up the dataset
# significantly.
words = list(filter(str.isalpha, words))

In [0]:
# The unique words in the dataset are the candidate outputs for prediction.
# They are sorted because the iteration order of a set of strings can differ
# between interpreter runs, which would change which output index maps to
# which word from one run to the next.
unique_words = sorted(set(words))
# Row j of one_hot corresponds to unique_words[j], so one_hot[word].values is
# the one-hot vector for `word` expressed in unique_words order.
# The dtype is pinned so the label arrays do not depend on the default dtype of
# the installed pandas version.
one_hot = pd.get_dummies(pd.Series(unique_words), dtype=np.uint8)

In [0]:
# TF Hub handle of the Universal Sentence Encoder (version 4) used to embed text.
encoder_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
# Load the encoder once; it is reused for both training data and the demo.
universal_sentence_encoder = hub.load(handle=encoder_url)

In [14]:
# Number of tokens left after tokenizing and filtering the corpus.
len(words)

104921

In [0]:
# Build the training pairs: X holds the sentence embedding of a short context
# (1 to 9 consecutive words) and y holds the one-hot vector of the word that
# follows that context.

# Seeded generator so the sampled context lengths, and therefore the training
# set, are the same on every run.
rng = np.random.default_rng(0)

contexts = []
y = []
i = 2500 # start somewhere in the middle, there's > 100,000 words
# We stop at word 5000 to reduce training time; the bound is clamped to
# len(words) so a shorter corpus cannot index past the end of the list.
stop = min(5000, len(words))
while i < stop:
  context = int(rng.integers(1, 10))  # upper bound is exclusive: 1..9 words
  contexts.append(' '.join(words[i-context:i]))
  y.append(one_hot[words[i]].values)
  i += context

if not contexts:
  raise ValueError('the corpus is too short to build any training examples')

# Encode the contexts in batches instead of one encoder call per example.
# Each call yields one embedding row per input string, so X stays 2-D even when
# there is a single example (np.squeeze on a lone row would drop that axis).
encode_batch_size = 256
X = np.concatenate([
    np.asarray(universal_sentence_encoder(contexts[start:start + encode_batch_size]))
    for start in range(0, len(contexts), encode_batch_size)
])
y = np.asarray(y)

# Modeling

In [0]:
# USE maps each text to a 512-dimensional vector, hence the input shape.
inputs = tf.keras.Input(shape=(512,))

# A single hidden layer of 50 ReLU units.
hidden_layer = tf.keras.layers.Dense(units=50, activation=tf.nn.relu)
hidden = hidden_layer(inputs)

# Softmax output with one unit per unique word in the vocabulary.
output_layer = tf.keras.layers.Dense(
    units=len(unique_words),
    activation=tf.nn.softmax,
)
outputs = output_layer(hidden)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [20]:
# The targets are one-hot vectors, so categorical cross-entropy is the loss.
model.compile(loss='categorical_crossentropy', optimizer='adam')
# Keep the History object returned by fit() so the per-epoch loss can be
# inspected after training instead of being discarded.
history = model.fit(X, y, epochs=200)
history

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fee31af19b0>

# Demo

In [35]:
unfinished = "He put his coffee next "
# we follow the same preprocessing steps as done in training
unfinished_words = nltk.word_tokenize(unfinished.lower())
unfinished_words = [word for word in unfinished_words if word.isalpha()]
# Training contexts are between 1 and 9 words long (the upper bound of the
# random draw is exclusive), so the demo uses at most the last 9 words to stay
# within the context lengths the model was trained on.
max_context_words = 9
demo_context = ' '.join(unfinished_words[-max_context_words:])
X_unfinished = np.asarray(universal_sentence_encoder([demo_context]))
# predict() returns one row of word probabilities per input; take the single
# row and map its most probable index back to a word.
probabilities = model.predict(X_unfinished)[0]
prediction = unique_words[np.argmax(probabilities)]
prediction

'house'