In [1]:
from __future__ import absolute_import, division, print_function

# Standard library
import os
import time

# Third-party
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.contrib.eager as tfe

# Switch TensorFlow to eager mode before any TensorFlow op is created.
tf.enable_eager_execution()

print("TensorFlow version: {}".format(tf.VERSION))
print("Eager execution: {}".format(tf.executing_eagerly()))

TensorFlow version: 1.11.0
Eager execution: True


# Load and Flatten Statements

In [2]:
# Load the 2017 debate data; the path is relative to the notebook's working directory.
debate_data = pd.read_csv('data/data-2017.csv')

In [3]:
# Show the first rows to check which columns were loaded.
debate_data.head()

Unnamed: 0,Date,Party,Previous Statement,Statement
0,2017-01-30,,,"The House resumed from November 17, 2016, cons..."
1,2017-01-30,Liberal,"The House resumed from November 17, 2016, cons...","Mr. Speaker, it is a pleasure to be back in th..."
2,2017-01-30,NDP,"Mr. Speaker, it is a pleasure to be back in th...","Mr. Speaker, usually this would be a time when..."
3,2017-01-30,Liberal,"Mr. Speaker, usually this would be a time when...","Mr. Speaker, I am proud to rise in the House t..."
4,2017-01-30,Liberal,"Mr. Speaker, I am proud to rise in the House t...","Mr. Speaker, I am very pleased to have the opp..."


In [4]:
# Keep only rows that actually contain a statement and force every entry to str:
# pandas reads empty CSV fields as NaN (a float), which would make the
# " ".join(statements) step raise a TypeError.
statements = debate_data['Statement'].dropna().astype(str).tolist()

In [5]:
# Flatten all statements into one long string, separated by single spaces.
text = " ".join(statements)

In [6]:
# Inspect the first 800 characters of the corpus
text[:800]

'The House resumed from November 17, 2016, consideration of the motion that Bill C-309, An Act to establish Gender Equality Week, be read the second time and referred to a committee. Mr. Speaker, it is a pleasure to be back in the House today and to speak in favour of the important legislation of Bill C-309, which would establish a gender equality week in Canada. This would provide a week to reflect on the importance of gender equality and the ongoing need to advance the cause of equality in Canada.I am proud that our government will support the passage of Bill C-309, with amendments that will be brought at committee. I would like to thank my friend the hon. member for Mississauga—Lakeshore for bringing this important legislation forward. This is an opportunity to remind ourselves of the wo'

In [7]:
# Character-level vocabulary: every distinct character in the corpus,
# sorted so each character gets a stable position.
unique_chars = set(text)
vocab = sorted(unique_chars)
print('{} unique characters'.format(len(vocab)))

130 unique characters


## Process Text (Vectorizing etc.)

In [8]:
# Two-way mapping between characters and integer ids:
# char2idx looks up the id of a character, idx2char looks up the character of an id.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# Encode the whole corpus as an array of character ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

In [9]:
# The maximum length sentence we want for a single input in characters
seq_length = 100

# Create training examples / targets.
# Each chunk holds seq_length+1 characters so that it can later be split into an
# input and a target that are shifted by one character.
# drop_remainder=True discards the final, shorter chunk: a chunk of a different
# length cannot be stacked with full-length chunks when the dataset is batched
# again for training.
chunks = tf.data.Dataset.from_tensor_slices(text_as_int).batch(seq_length+1, drop_remainder=True)

# for item in chunks.take(5):
#   print(repr(''.join(idx2char[item.numpy()])))

In [10]:
def split_input_target(chunk):
    """Split one chunk into a training pair offset by a single position.

    Returns a tuple ``(input_text, target_text)``: ``chunk`` without its last
    element and ``chunk`` without its first element, so that ``target_text[i]``
    is the element that follows ``input_text[i]`` in ``chunk``.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair.
dataset = chunks.map(split_input_target)

In [11]:
# Decode the first (input, target) pair back to text to check the one-character shift.
for input_example, target_example in dataset.take(1):
  decoded_input = ''.join(idx2char[input_example.numpy()])
  decoded_target = ''.join(idx2char[target_example.numpy()])
  print ('Input data: ', repr(decoded_input))
  print ('Target data:', repr(decoded_target))


Input data:  'The House resumed from November 17, 2016, consideration of the motion that Bill C-309, An Act to est'
Target data: 'he House resumed from November 17, 2016, consideration of the motion that Bill C-309, An Act to esta'


In [12]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Rebuild the pipeline from `chunks` rather than re-wrapping `dataset`, so that
# re-running this cell does not batch an already batched dataset.
dataset = (chunks
           .map(split_input_target)
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE, drop_remainder=True))

## Modeling

In [13]:
class Model(tf.keras.Model):
  """Character-level sequence model: Embedding -> stateful GRU -> Dense.

  Args:
    vocab_size: number of distinct input ids; also the width of the output layer.
    embedding_dim: size of the embedding vector looked up for each input id.
    units: number of units in the recurrent layer.
  """

  def __init__(self, vocab_size, embedding_dim, units):
    super(Model, self).__init__()
    self.units = units

    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

    # Options shared by both recurrent implementations. The layer returns an
    # output for every time step and is stateful.
    shared_gru_options = dict(return_sequences=True,
                              recurrent_initializer='glorot_uniform',
                              stateful=True)
    if tf.test.is_gpu_available():
      # Use the cuDNN implementation when a GPU is available.
      self.gru = tf.keras.layers.CuDNNGRU(self.units, **shared_gru_options)
    else:
      self.gru = tf.keras.layers.GRU(self.units,
                                     recurrent_activation='sigmoid',
                                     **shared_gru_options)

    self.fc = tf.keras.layers.Dense(vocab_size)

  def call(self, x):
    """Run ids through embedding, GRU and dense layer; return the dense output.

    No reshape is applied, so the result keeps one vocab_size-wide vector per
    time step of the recurrent output.
    """
    return self.fc(self.gru(self.embedding(x)))

In [14]:
# Size of the embedding vector for each character
embedding_dim = 256

# Number of units in the recurrent layer
units = 1024

# One output class per distinct character in the corpus
vocab_size = len(vocab)

model = Model(vocab_size, embedding_dim, units)

In [15]:
# Adam with its default hyper-parameters
optimizer = tf.train.AdamOptimizer()


def loss_function(real, preds):
    """Softmax cross-entropy between integer labels and unnormalised predictions.

    The sparse variant takes class indices directly, so the labels do not need
    to be converted to one-hot vectors.
    """
    cross_entropy = tf.losses.sparse_softmax_cross_entropy
    return cross_entropy(labels=real, logits=preds)

In [16]:
# Directory where the checkpoints will be saved (relative to the working directory)
checkpoint_dir = './training_checkpoints'
# Common path prefix passed to checkpoint.save(); files are written under checkpoint_dir
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Checkpoint instance tracking both the optimizer and the model, so both are saved together
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

In [17]:
# Build the model for inputs of shape (BATCH_SIZE, seq_length).
model.build(tf.TensorShape([BATCH_SIZE, seq_length]))

In [None]:
# Print the layers of the model with their parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  33280     
_________________________________________________________________
gru (GRU)                    multiple                  3935232   
_________________________________________________________________
dense (Dense)                multiple                  133250    
Total params: 4,101,762
Trainable params: 4,101,762
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Training step
EPOCHS = 30

for epoch in range(EPOCHS):
    start = time.time()
    
    # initializing the hidden state at the start of every epoch
    # initally hidden is None
    hidden = model.reset_states()
    
    for (batch, (inp, target)) in enumerate(dataset):
          with tf.GradientTape() as tape:
              # feeding the hidden state back into the model
              # This is the interesting step
              predictions = model(inp)
              loss = loss_function(target, predictions)
              
          grads = tape.gradient(loss, model.variables)
          optimizer.apply_gradients(zip(grads, model.variables))

          if batch % 100 == 0:
              print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch+1,
                                                            batch,
                                                            loss))
    # saving (checkpoint) the model every 5 epochs
    if (epoch + 1) % 5 == 0:
      checkpoint.save(file_prefix = checkpoint_prefix)

    print ('Epoch {} Loss {:.4f}'.format(epoch+1, loss))
    print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
    
! notify-send "Job finished!"

Epoch 1 Batch 0 Loss 4.8701
Epoch 1 Batch 100 Loss 2.3005
Epoch 1 Batch 200 Loss 1.9366


In [None]:
# Save a checkpoint of the current model and optimizer state.
checkpoint.save(file_prefix = checkpoint_prefix)

In [None]:
# Evaluation step (generating text using the learned model)

# Number of characters to generate
num_generate = 1000

# You can change the start string to experiment
start_string = 'Q'

# Low temperatures result in more predictable text.
# Higher temperatures result in more surprising text.
# Experiment to find the best setting.
temperature = 1.0

# The training model's stateful GRU was built for batches of BATCH_SIZE sequences,
# so its recurrent state does not fit a single sequence. Build a second copy of the
# network for batch size 1 and load the trained weights from the latest checkpoint.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise IOError('No checkpoint found in {}; train and save the model first.'
                  .format(checkpoint_dir))

generator = Model(vocab_size, embedding_dim, units)
generator.build(tf.TensorShape([1, None]))
tf.train.Checkpoint(model=generator).restore(latest_checkpoint)

# Converting our start string to numbers (vectorizing)
input_eval = [char2idx[s] for s in start_string]
input_eval = tf.expand_dims(input_eval, 0)

# List of generated characters, joined into one string at the end
text_generated = []

# Here batch size == 1
generator.reset_states()
for _ in range(num_generate):
    predictions = generator(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Scale the logits by the temperature, then sample the next character id
    # from the distribution predicted for the last time step.
    predictions = predictions / temperature
    predicted_id = tf.multinomial(predictions, num_samples=1)[-1,0].numpy()

    # Feed only the sampled character back in; the stateful GRU keeps the
    # context of everything generated so far in its own state.
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

print (start_string + ''.join(text_generated))