#### Copyright 2019 Google LLC.

In [1]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Text Generation

## Overview

### Learning Objectives

* Understand how to generate text using a LSTM


### Prerequisites

* T08-07 Sequence Prediction

### Estimated Duration

40 minutes

### Grading Criteria

Each exercise is worth 3 points. The rubric for calculating those points is:

| Points | Description |
|--------|-------------|
| 0      | No attempt at exercise |
| 1      | Attempted exercise, but code does not run |
| 2      | Attempted exercise, code runs, but produces incorrect answer |
| 3      | Exercise completed successfully |

There is 1 exercise in this Colab so there are 3 points available. The grading scale will be 3 points.

## Setup and Data Gathering



Before starting, we recommend enabling GPU acceletation. Go to Runtime -> Change runtime type.

In [2]:
# Set random seeds for reproducible results
import numpy as np
import pandas as pd

Before running the next block, go to [this link](https://www.gutenberg.org/ebooks/11) and download the plain text version of *Alice in Wonderland* to your computer. Then rename it 'alice.txt', and upload to the files in this colab.

In [3]:
filename = "alice.txt"

raw_text = open(filename).read()

The most important hyperparameter for the LSTM that we will train on this text is the length of our input sequences: 

In [4]:
n_steps = 10

## Data Preprocessing



In [5]:
raw_text=pd.Series(raw_text.split(' '))

print(raw_text[99:99+n_steps])


99            get
100          very
101         tired
102            of
103       sitting
104            by
105           her
106        sister
107            on
108    the\nbank,
dtype: object


# Tokenization

Spacy is a useful package with includes tools for tokenization (converting word to numbers), and embeddings (converting word-numbers to vectors, thereby avoiding one-hot endcoding.)

In [6]:
from tensorflow import keras

tokenizer = keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+-/:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(raw_text)

#Encode words as integers:
text_tokens = tokenizer.texts_to_sequences(raw_text)

vocab_size = len(tokenizer.word_index) + 1


In [7]:
x=[]
for i in range(len(text_tokens)):
  if len(text_tokens[i])>0:
    x.append(text_tokens[i][0])
    
print(x[99:99+n_steps+1])

[27, 516, 5, 373, 52, 19, 592, 23, 1, 2, 5]


In [8]:

n_samples = len(x)-n_steps

X = np.zeros((n_samples, n_steps))
y = np.zeros(n_samples)

for i in range(len(x)-n_steps-1):
    X[i] = x[i:i + n_steps] #lists of n_steps words
    y[i] = x[i + n_steps] #next word in list

print(X[99],y[99])

[ 27. 516.   5. 373.  52.  19. 592.  23.   1.   2.] 5.0


# Setting up and Training Model

In [9]:
from keras.utils import to_categorical

#Create one-hot encoding of target, y:
y_binary = to_categorical(y, num_classes=vocab_size)

Using TensorFlow backend.


In [10]:
# setup model
embedding_dim=300

model = keras.Sequential([
  keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    trainable=True,
    mask_zero=True, 
    input_length=n_steps
  ),
  keras.layers.LSTM(100, dropout=0.1),
  keras.layers.Dense(vocab_size, activation='softmax')  
])

model.summary()

W0809 19:42:40.895305 4445066688 deprecation.py:506] From /Users/dorishuang/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0809 19:42:40.898790 4445066688 deprecation.py:506] From /Users/dorishuang/anaconda3/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0809 19:42:41.260365 4445066688 deprecation.py:323] From /Users/dorishuang/anaconda3/lib/python3.6/site-packages/tensorflow/python/keras/backend.py:3794: add_dispatch_support.<lo

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 300)           1419900   
_________________________________________________________________
lstm (LSTM)                  (None, 100)               160400    
_________________________________________________________________
dense (Dense)                (None, 4733)              478033    
Total params: 2,058,333
Trainable params: 2,058,333
Non-trainable params: 0
_________________________________________________________________


With GPU acceleration, training the model will take about 5 minutes. You can decrease this by reducing the number of epochs below, but this will lead to less accuracy.

In [11]:
model.compile(
  loss='categorical_crossentropy',
  optimizer='adam',
  metrics=['categorical_accuracy']
)

history = model.fit(
    X,
    y_binary,
    epochs=40,
    batch_size=64
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40

KeyboardInterrupt: 

#Using the model to generate text

In [None]:

def generate_text(input_text,n):
  output_text=input_text
  for i in range(n):
    token_list = tokenizer.texts_to_sequences([output_text])[0][-n_steps:]
    predicted = model.predict_classes(np.array(token_list)[np.newaxis,:])
    next_word=tokenizer.index_word[predicted[0]]
    output_text += " "+next_word
    
    if i%10==0:
      output_text +='\n'
     
  return output_text


Feel free to change the "seed" text below, but make sure that there are at least n_steps words.

In [None]:
print(generate_text('Alice fell down the rabbit hole and hit her head on the',100))

Notice that the generated text is non-sensical, but is still somewhat Alice-in-Wonderland-esq. Also, basic grammatical rules seem to be roughly followed, in the sense that we don't see any strings of words like "the the a the an".

# Exercises

## Exercise 1

Train the model above on some other body of text you find on the internet. Some interesting results can be obtaind from song lyrics, Shakespeare, presidential tweets,  religous texts, and scientific papers. You'll almost certainly have to play with hyperparameters to get decent results.

### Student Solution

In [None]:
# Your solution here.
filename = "Fur Farming.txt"

raw_text = open(filename).read()
n_steps = 10
raw_text=pd.Series(raw_text.split(' '))

print(raw_text[99:99+n_steps])

In [None]:
from tensorflow import keras

tokenizer = keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+-/:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(raw_text)

#Encode words as integers:
text_tokens = tokenizer.texts_to_sequences(raw_text)

vocab_size = len(tokenizer.word_index) + 1

x=[]
for i in range(len(text_tokens)):
  if len(text_tokens[i])>0:
    x.append(text_tokens[i][0])
    
print(x[99:99+n_steps+1])

In [None]:
n_samples = len(x)-n_steps

X = np.zeros((n_samples, n_steps))
y = np.zeros(n_samples)

for i in range(len(x)-n_steps-1):
    X[i] = x[i:i + n_steps] #lists of n_steps words
    y[i] = x[i + n_steps] #next word in list

print(X[99],y[99])

In [None]:
from keras.utils import to_categorical

#Create one-hot encoding of target, y:
y_binary = to_categorical(y, num_classes=vocab_size)

# setup model
embedding_dim=400

model = keras.Sequential([
  keras.layers.Embedding(
    vocab_size,
    embedding_dim,
    trainable=True,
    mask_zero=True, 
    input_length=n_steps
  ),
  keras.layers.LSTM(200, dropout=0.1),
  keras.layers.Dense(vocab_size, activation='softmax')  
])

model.summary()

In [None]:
model.compile(
  loss='categorical_crossentropy',
  optimizer='adam',
  metrics=['categorical_accuracy']
)

history = model.fit(
    X,
    y_binary,
    epochs=20,
    batch_size=64
)

In [None]:
def generate_text(input_text,n):
  output_text=input_text
  for i in range(n):
    token_list = tokenizer.texts_to_sequences([output_text])[0][-n_steps:]
    predicted = model.predict_classes(np.array(token_list)[np.newaxis,:])
    next_word=tokenizer.index_word[predicted[0]]
    output_text += " "+next_word
    
    if i%10==0:
      output_text +='\n'
     
  return output_text

In [None]:
print(generate_text('Furs are essential to the entire commercialization and the recent price changes have impacted ',200))