# Assignment 2 : Follow-up Question Generator

In [1]:
#imports
import pandas as pd
import csv
import json
import pickle
import spacy
import numpy as np
import tensorflow as tf
from tensorflow.python.layers.core import Dense
from tensorflow.contrib.seq2seq.python.ops import beam_search_ops
import time
import model
import os
from random import randint

In [2]:
with open('your_csv_file.csv', 'r') as file:  
    reader = csv.reader(file)
    result = list(reader) #Putting them all in a list

paragraph_list = []
for i in range(len(result)-1):
    paragraph1 = ' '.join(map(str, result[i+1][1:3]))
    paragraph_list.append(paragraph1)

followupque_list = []
for j in range(len(result)-1):
    paragraph2 = ''.join(map(str, result[j+1][3]))
    followupque_list.append(paragraph2)

df = pd.DataFrame(data={"paragraph": paragraph_list, "followup question": followupque_list})
df.to_csv("./newdata.csv", sep=',',index=False)

In [3]:
#Pickle up the extracted lists of questions/answers pairs
def savepickle(data, filename):
    """Saves the data into pickle format"""
    save_documents = open(filename +'.pickle', 'wb')
    pickle.dump(data, save_documents)
    save_documents.close()
    
savepickle(paragraph_list, 'train_paragraphs')
savepickle(followupque_list, 'train_questions')
savepickle(paragraph_list, 'dev_paragraphs')
savepickle(followupque_list, 'dev_questions')

In [4]:
# Load up spacy model and import stop words
nlp = spacy.load('en_core_web_sm')
from spacy.lang.en import STOP_WORDS
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True
    
def loadpickle(data_filepath):
    #Loads up the pickled dataset for further parsing and preprocessing
    documents_f = open(data_filepath+'.pickle', 'rb')
    data = pickle.load(documents_f)
    documents_f.close()
    
    return data

In [5]:
def preprocessData(data, remove_stopwords=True, replace_entities=False):
    parsed_data = []
    for index in range(len(data)):
        text = data[index]
        
        if replace_entities:
            spacy_text = nlp(text)
            text_ents = [(str(ent), str(ent.label_)) for ent in spacy_text.ents]
            
            text = text.lower()
            for ent in text_ents:
                replacee = str(ent[0].lower())
                replacer = str(ent[1])
                try:
                    text = text.replace(replacee, replacer)
                except:
                    pass
        else:
            text = text.lower()
            
        text = nlp(text)
        if remove_stopwords:
            text = [str(token.orth_) for token in text 
                    if not token.is_stop and not token.is_punct]
            text = ' '.join(text)
        else:
            text = [str(token.orth_) for token in text if not token.is_punct]
            text = ' '.join(text)
            
        parsed_data.append(text)
        
        if index % 100 == 0 and index > 0:
            print('Preprocessing {}/{}'.format(index, len(data)))
            
        if index % 1000 == 0 and index > 0:
            print('Pickling progress so far.')
            savepickle(parsed_data, 'parsed_data')
         
        if index % 2000 == 0:
            try:
                print(text)
            except:
                pass

    return parsed_data
    
def loadEmbeddings(embeddings_index, filepath):
    print('Loading Conceptnet Numberbatch word embeddings')
    with open(filepath, encoding='utf-8') as f:
        for line in f:
            values = line.split(' ')
            word = values[0]
            embedding = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = embedding
    
    print('Word embeddings:', len(embeddings_index))
    
def countWordFreq(word_frequency, data):
    for text in data:
        for token in text.split():
            if token not in word_frequency:
                word_frequency[token] = 1
            else:
                word_frequency[token] += 1

In [6]:
def conversionDicts(word_frequency, embeddings_index, threshold=10):
    missing_words = 0
    
    for token, freq in word_frequency.items():
        if freq > threshold:
            if token not in embeddings_index:
                missing_words += 1
                
    missing_ratio = round(missing_words/len(word_frequency), 4) * 100
    print('Number of words missing from Conceptnet Numberbatch:', missing_words)
    print('Percent of words that are missing from vocabulary: ', missing_ratio, '%')

    #Dictionary to convert words to integers
    print('Creating vocab_to_int dictionary')
    vocab2int = {}
    
    value = 0
    for token, freq in word_frequency.items():
        if freq >= threshold or token in embeddings_index:
            vocab2int[token] = value
            value += 1
    
    # Special tokens that will be added to our vocab. Those tokens will guide the sequence to sequence model
    codes = ['<UNK>', '<PAD>', '<EOS>', '<GO>']   
    
    print('Adding special tokens to vocab_to_int dictionary.')
    for code in codes:
        vocab2int[code] = len(vocab2int)
    
    #Dictionary to convert integers to words
    print('Creating int_to_vocab dictionary.')
    int2vocab = {}
    for token, index in vocab2int.items():
        int2vocab[index] = token
    
    usage_ratio = round(len(vocab2int) / len(word_frequency), 4) * 100
    print("Total number of unique words:", len(word_frequency))
    print("Number of words we will use:", len(vocab2int))
    print("Percent of words we will use: {}%".format(usage_ratio))
    
    return vocab2int, int2vocab

def embeddingMatrix(vocab2int, embeddings_index, embedding_dimensions=300):
    num_words = len(vocab2int)
    #Creating a default matrix with all values set to zero and fill it out
    print('Creating word embedding matrix with all the tokens and their corresponding vectors.')
    word_embedding_matrix = np.zeros((num_words, embedding_dimensions), dtype=np.float32)
    for token, index in vocab2int.items():
        if token in embeddings_index:
            word_embedding_matrix[index] = embeddings_index[token]
        else:
            new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dimensions))
            word_embedding_matrix[index] = new_embedding
            
    return word_embedding_matrix

In [7]:
def convData2Int(data, vocab2int, word_count, unk_count, eos=True):
    converted_data = []
    for text in data:
        converted_text = []
        for token in text.split():
            word_count += 1
            if token in vocab2int:
                converted_text.append(vocab2int[token])
            else:
                converted_text.append(vocab2int['<UNK>'])
                unk_count += 1
        if eos:
            converted_text.append(vocab2int['<EOS>'])
            
        converted_data.append(converted_text)
    
    assert len(converted_data) == len(data)
    return converted_data, word_count, unk_count

def summary(data):
    summary = []
    for text in data:
        summary.append(len(text))
    return pd.DataFrame(summary, columns=['counts'])

def unkCounter(data, vocab2int):
    unk_count = 0
    for token in data:
        if token == vocab2int['<UNK>']:
            unk_count += 1
    return unk_count

In [8]:
def remWrongLenData(coverted_inputs, converted_targets, vocab2int,
                             start_inputs_length, max_inputs_length, max_targets_length, 
                             min_inputs_length=10, min_targets_lengths=5,
                             unk_inputs_limit=1, unk_targets_limit=0):
    sorted_inputs = []
    sorted_targets = []
    
    for length in range(start_inputs_length, max_inputs_length): 
        for index, words in enumerate(converted_targets):
            if (len(converted_targets[index]) >= min_targets_lengths and
                len(converted_targets[index]) <= max_targets_length and
                len(coverted_inputs[index]) >= min_inputs_length and
                unkCounter(converted_targets[index], vocab2int) <= unk_targets_limit and
                unkCounter(coverted_inputs[index], vocab2int) <= unk_inputs_limit and
                length == len(coverted_inputs[index])
               ):
                sorted_targets.append(converted_targets[index])
                sorted_inputs.append(coverted_inputs[index])
        
    #Ensuring the length of sorted paragraph and questions match
    assert len(sorted_inputs) == len(sorted_targets)
    print('Got {} inputs/targets pairs!'.format(len(sorted_inputs)))
    
    return sorted_inputs, sorted_targets

In [9]:
#Loading the dataset
data_inputs = loadpickle('train_squad_paragraphs')
data_targets = loadpickle('train_squad_questions') 
assert len(data_targets) == len(data_inputs)
print('Loaded {} question/answer pairs.'.format(len(data_inputs)))

Loaded 1086 question/answer pairs.


In [10]:
#If parsed dataset is found, we try loading it.
try:
    parsed_inputs = loadpickle('parsed_inputs')
    parsed_targets = loadpickle('parsed_targets')
except:
    print('Preprocessing inputs, this may take a while...')
    parsed_inputs = preprocessData(data_inputs, remove_stopwords=True,
                                    replace_entities=True)
    savepickle(parsed_inputs, 'parsed_inputs')
    print('Preprocessing targets, this may take a while...')
    parsed_targets = preprocessData(data_targets, remove_stopwords=False,
                                     replace_entities=True)
    savepickle(parsed_targets, 'parsed_targets')
    
    assert len(parsed_inputs) == len(parsed_targets)
    print('Loaded up {} parsed inputs/targets pairs'.format(len(parsed_inputs)))

if 1:
    savepickle(parsed_inputs, 'parsed_inputs')
    savepickle(parsed_targets, 'parsed_targets')

In [11]:
#Load Numberbatch word embeddings
filepath = 'numberbatch-en-17.06.txt'
embeddings_index = {}
loadEmbeddings(embeddings_index, filepath)

#Calculate word frequency
word_frequency = {}
countWordFreq(word_frequency, parsed_targets)
countWordFreq(word_frequency, parsed_inputs)

#Get the usable only tokens and their integer conversion
vocab2int, int2vocab = conversionDicts(word_frequency, embeddings_index)
savepickle(vocab2int, 'vocab2int')
savepickle(int2vocab, 'int2vocab')

#Create embedding matrix
word_embedding_matrix = embeddingMatrix(vocab2int, embeddings_index)
del embeddings_index
savepickle(word_embedding_matrix, 'word_embedding_matrix')

Loading Conceptnet Numberbatch word embeddings
Word embeddings: 417195
Removing token which frequency in the corpus is under specified threshold
Number of words missing from Conceptnet Numberbatch: 17
Percent of words that are missing from vocabulary:  0.44999999999999996 %
Creating vocab_to_int dictionary
Adding special tokens to vocab_to_int dictionary.
Creating int_to_vocab dictionary.
Total number of unique words: 3751
Number of words we will use: 3244
Percent of words we will use: 86.48%
Creating word embedding matrix with all the tokens and their corresponding vectors.


In [12]:
#Convert words to integers and pickle the data
word_count = 0
unk_count = 0

print('Converting text to integers')
converted_inputs, word_count, unk_count = convData2Int(parsed_inputs, vocab2int, word_count, unk_count)
converted_targets, word_count, unk_count = convData2Int(parsed_targets, vocab2int, word_count,  unk_count)
assert len(converted_inputs) == len(converted_targets)

unk_percent = round(unk_count/word_count, 4) * 100
print('Total number of words:', word_count)
print('Total number of UNKs:', unk_count)
print('Percent of words that are UNK:', unk_percent)

savepickle(converted_inputs, 'converted_inputs')
savepickle(converted_targets, 'converted_targets')

Converting text to integers
Total number of words: 38009
Total number of UNKs: 1294
Percent of words that are UNK: 3.4000000000000004


In [13]:
#Build summary and sort the data to keep only the appropriate length
assert len(converted_inputs) == len(converted_targets)

summary_inputs = summary(converted_inputs)
summary_targets = summary(converted_targets)

print('Inputs:')
print(summary_inputs.describe())
print('#' * 50)
print('Targets')
print(summary_targets.describe())

sorted_inputs, sorted_targets = remWrongLenData(converted_inputs,
                                                         converted_targets,
                                                         vocab2int,
                                                         start_inputs_length=min(summary_inputs.counts),
                                                         max_inputs_length=int(np.percentile(summary_inputs.counts, 100)),
                                                         max_targets_length=int(np.percentile(summary_targets.counts, 100)),
                                                         min_inputs_length=10,
                                                         min_targets_lengths=5,
                                                         unk_inputs_limit=1,
                                                         unk_targets_limit=0)

#print('Pickling the final files.')
savepickle(sorted_inputs, 'sorted_inputs')
savepickle(sorted_targets, 'sorted_targets')

Inputs:
            counts
count  1086.000000
mean     25.988029
std      16.116707
min       2.000000
25%      15.000000
50%      22.000000
75%      34.000000
max     143.000000
##################################################
Targets
            counts
count  1086.000000
mean     11.011050
std       4.729344
min       2.000000
25%       8.000000
50%      10.000000
75%      13.000000
max      42.000000
Got 763 inputs/targets pairs!


In [14]:
#Building the model

def model_inputs():
    
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    target_length = tf.placeholder(tf.int32, (None,), name='target_length')
    max_target_length = tf.reduce_max(target_length, name='max_dec_len')
    input_length = tf.placeholder(tf.int32, (None,), name='input_length')

    return input_data, targets, lr, keep_prob, \
           target_length, max_target_length, input_length
           
def processEncodingInput(target_data, vocab2int, batch_size):
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], vocab2int['<GO>']), ending], 1)

    return dec_input

def encodingLayer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob):
    #Create the encoding layer
    
    for layer in range(num_layers):
        with tf.variable_scope('encoder_{}'.format(layer)):
            cell_fw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, 
                                                    input_keep_prob=keep_prob)

            cell_bw = tf.contrib.rnn.LSTMCell(rnn_size,
                                              initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, 
                                                    input_keep_prob=keep_prob)

            enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 
                                                                    cell_bw, 
                                                                    rnn_inputs,
                                                                    sequence_length,
                                                                    dtype=tf.float32)
    #Join outputs since we are using a bidirectional RNN
    enc_output = tf.concat(enc_output, 2)
    
    return enc_output, enc_state

In [15]:
def decodingAttentionTraining(rnn_size, enc_output, enc_state, input_length, 
                                dec_cell, batch_size):
    
    attn_mech_training = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                              enc_output,
                                                              input_length,
                                                              normalize=False,
                                                              name='BahdanauAttention')
    dec_cell_training = tf.contrib.seq2seq.AttentionWrapper(cell=dec_cell,
                                                            attention_mechanism=attn_mech_training,
                                                            attention_layer_size=rnn_size)
    initial_state_training = dec_cell_training.zero_state(batch_size, tf.float32)
    initial_state_training = initial_state_training.clone(cell_state=enc_state[0])
    
    return dec_cell_training, initial_state_training

def decodingLayerTraining(dec_embed_input, target_length, dec_cell, initial_state, 
                            output_layer, vocab_size, max_target_length):
    
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                        sequence_length=target_length,
                                                        time_major=False)
    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                       training_helper,
                                                       initial_state,
                                                       output_layer) 
    training_logits = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                           output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=max_target_length)
    return training_logits[0]

def decodingAttentionInference(enc_output, enc_state, input_length, rnn_size, dec_cell,
                                batch_size, beam_width):
    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(enc_output, multiplier=beam_width)
    tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(enc_state[0], multiplier=beam_width)
    tiled_sequence_length = tf.contrib.seq2seq.tile_batch(input_length, multiplier=beam_width)
    
    attn_mech_sample = tf.contrib.seq2seq.BahdanauAttention(num_units=rnn_size,
                                                            memory=tiled_encoder_outputs,
                                                            memory_sequence_length=tiled_sequence_length)
    dec_cell_inference = tf.contrib.seq2seq.AttentionWrapper(cell=dec_cell,
                                                             attention_mechanism=attn_mech_sample,
                                                             attention_layer_size=rnn_size)
    decoder_initial_state_inference = dec_cell_inference.zero_state(dtype=tf.float32, 
                                                                   batch_size=batch_size*beam_width)
    decoder_initial_state_inference = decoder_initial_state_inference.clone(cell_state=tiled_encoder_final_state)
    
    return dec_cell_inference, decoder_initial_state_inference

def decodingLayerInference(embeddings, start_token, end_token, dec_cell, initial_state, 
                             output_layer, max_target_length, batch_size, beam_width):

    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), 
                           [batch_size], 
                           name='start_tokens')    
    inference_decoder = tf.contrib.seq2seq.BeamSearchDecoder(cell=dec_cell,
                                                             embedding=embeddings,
                                                             start_tokens=start_tokens,
                                                             end_token=end_token,
                                                             initial_state=initial_state,
                                                             beam_width=beam_width,
                                                             output_layer=output_layer)
    inference_logits = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                         output_time_major=False,
                                                         impute_finished=False,
                                                         maximum_iterations=max_target_length)
    
    return inference_logits[0]

def decodingLayer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, 
                   input_length, target_length, max_target_length, rnn_size, 
                   vocab2int, keep_prob, batch_size, num_layers, beam_width):

    for layer in range(num_layers):
        with tf.variable_scope('decoder_{}'.format(layer)):
            lstm = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
            dec_cell = tf.contrib.rnn.DropoutWrapper(lstm, 
                                                     input_keep_prob = keep_prob)
    
    output_layer = Dense(vocab_size,
                         kernel_initializer=tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    #Training decode using standard decoder  
    dec_cell_training, initial_state_training = decodingAttentionTraining(rnn_size, 
                                                                            enc_output, 
                                                                            enc_state, 
                                                                            input_length, 
                                                                            dec_cell, 
                                                                            batch_size)
    
    with tf.variable_scope("decode"):
        training_logits = decodingLayerTraining(dec_embed_input, 
                                                  target_length, 
                                                  dec_cell_training, 
                                                  initial_state_training,
                                                  output_layer,
                                                  vocab_size, 
                                                  max_target_length)
    #Inference decoding using beam search
    dec_cell_inference, decoder_init_state_inference = decodingAttentionInference(enc_output, 
                                                                                    enc_state, 
                                                                                    input_length, 
                                                                                    rnn_size, 
                                                                                    dec_cell,
                                                                                    batch_size, 
                                                                                    beam_width)
        
    with tf.variable_scope("decode", reuse=True):
        inference_logits = decodingLayerInference(embeddings,  
                                                    vocab2int['<GO>'], 
                                                    vocab2int['<EOS>'],
                                                    dec_cell_inference, 
                                                    decoder_init_state_inference, 
                                                    output_layer,
                                                    max_target_length,
                                                    batch_size,
                                                    beam_width)

    return training_logits, inference_logits

In [16]:
def padTextBatch(data_batch, vocab2int):
    max_text = max([len(text) for text in data_batch])
    return [text + [vocab2int['<PAD>']] * (max_text - len(text)) for text in data_batch]

def get_batches(targets, inputs, vocab2int, batch_size):
    for batch_i in range(0, len(inputs)//batch_size):
        start_i = batch_i * batch_size
        targets_batch = targets[start_i:start_i + batch_size]
        inputs_batch = inputs[start_i:start_i + batch_size]
        pad_targets_batch = np.array(padTextBatch(targets_batch, vocab2int))
        pad_inputs_batch = np.array(padTextBatch(inputs_batch, vocab2int))
        pad_targets_lenghts = []
        for target in pad_targets_batch:
            pad_targets_lenghts.append(len(target))
        pad_inputs_lenghts = []
        for text in pad_inputs_batch:
            pad_inputs_lenghts.append(len(text))
        yield pad_targets_batch, pad_inputs_batch, pad_targets_lenghts, pad_inputs_lenghts     

In [17]:
#Load data
print('Loading and preparing data for training...')
enc_inputs = loadpickle('sorted_inputs')
dec_targets = loadpickle('sorted_targets')
vocab2int = loadpickle('vocab2int')
int2vocab = loadpickle('int2vocab')
word_embedding_matrix = loadpickle('word_embedding_matrix')
assert len(enc_inputs) == len(dec_targets)
assert len(vocab2int) == len(int2vocab)

Loading and preparing data for training...


In [18]:
#Setting the parameters
epochs = 100
batch_size = 32
rnn_size = 256
num_layers = 2
learning_rate = 0.005
keep_probability = 0.8     
beam_width = 20

print('Building graph')
# Build the graph
train_graph = tf.Graph()
# Set the graph to default to ensure that it is ready for training
with train_graph.as_default():
    
    # Load the model inputs    
    input_data, targets, lr, keep_prob, target_length, max_target_length, input_length = model.model_inputs()

    # Create the training and inference logits
    training_logits, inference_logits = model.seq2seq_model(tf.reverse(input_data, [-1]),
                                                            targets, 
                                                            keep_prob,   
                                                            input_length,
                                                            target_length,
                                                            max_target_length,
                                                            len(vocab2int)+1,
                                                            rnn_size, 
                                                            num_layers, 
                                                            vocab2int,
                                                            word_embedding_matrix,
                                                            batch_size,
                                                            beam_width)
    
    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.predicted_ids, name='predictions')
    
    # Create the weights for sequence_loss
    masks = tf.sequence_mask(target_length, max_target_length, dtype=tf.float32, name='masks')

    with tf.name_scope("optimization"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(
            training_logits,
            targets,
            masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Graph is built.")

Building graph
Graph is built.


In [19]:
#Training the model

learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 23 # Check training loss after every 20 batches
stop_early = 0 
stop = 3 # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 3 # Make 3 update checks per epoch
update_check = (len(enc_inputs)//batch_size//per_epoch)-1

update_loss = 0 
batch_loss = 0
# Record the update losses for saving improvements in the model
question_update_loss = [] 
checkpoint_dir = 'ckpt' 
checkpoint_path = os.path.join(checkpoint_dir, 'model.ckpt')


restore = 0

print('Initializing session and training')
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver() 
    
    # If we want to continue training a previous session
    ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
    if ckpt and restore:
        print('Restoring old model parameters from %s...' % ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)
    
    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (targets_batch, inputs_batch, targets_lengths, inputs_lengths) in enumerate(
                model.get_batches(dec_targets, enc_inputs, vocab2int, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: inputs_batch,
                 targets: targets_batch,
                 lr: learning_rate,
                 target_length: targets_lengths,
                 input_length: inputs_lengths,
                 keep_prob: keep_probability})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(enc_inputs) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0

            #print (batch_i)
            if batch_i % (update_check) == 0 and batch_i > 0:
                print("Mean loss for this update:", round(update_loss/update_check, 3))
                question_update_loss.append(update_loss)
                
                # If the update loss is at a new minimum, save the model
                if update_loss <= min(question_update_loss):
                    stop_early = 0
                    saver.save(sess, checkpoint_path)

                else:
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0
            
        # Reduce learning rate, but not below its minimum value
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate
        
        if stop_early == stop:
            print("Stopping Training since minimum loss is achieved")
            break

Initializing session and training
Epoch   1/100 Batch    0/23 - Loss:  0.355, Seconds: 35.93
Mean loss for this update: 4.721
Mean loss for this update: 2.347
Mean loss for this update: 2.786
Epoch   2/100 Batch    0/23 - Loss:  0.135, Seconds: 5.11
Mean loss for this update: 2.781
Mean loss for this update: 2.066
Mean loss for this update: 2.523
Epoch   3/100 Batch    0/23 - Loss:  0.128, Seconds: 4.96
Mean loss for this update: 2.666
Mean loss for this update: 1.984
Mean loss for this update: 2.401
Epoch   4/100 Batch    0/23 - Loss:  0.119, Seconds: 5.40
Mean loss for this update: 2.524
Mean loss for this update: 1.889
Mean loss for this update: 2.304
Epoch   5/100 Batch    0/23 - Loss:  0.114, Seconds: 5.16
Mean loss for this update: 2.416
Mean loss for this update: 1.826
Mean loss for this update: 2.186
Epoch   6/100 Batch    0/23 - Loss:  0.110, Seconds: 5.04
Mean loss for this update: 2.291
Mean loss for this update: 1.749
Mean loss for this update: 2.114
Epoch   7/100 Batch    

In [20]:
def cleanText(text, replace_entities=True):
    """Cleans the text in the same way as in data preprocessing part before training"""
    if replace_entities:
        spacy_text = nlp(text)
        text_ents = [(str(ent), str(ent.label_)) for ent in spacy_text.ents]
        
        text = text.lower()
        # Replace entities
        for ent in text_ents:
            replacee = str(ent[0].lower())
            replacer = str(ent[1])
            try:
                text = text.replace(replacee, replacer)
            except:
                pass
    else:
        text = text.lower()
        
    spacy_text = nlp(text)
    spacy_text = [str(token.orth_) for token in spacy_text 
                  if not token.is_punct and not token.is_stop]
    spacy_text = ' '.join(spacy_text)

    return spacy_text
        
def text2seq(input_sequence):
    """Prepare the text for the model"""
    text = cleanText(input_sequence)
    return [vocab2int.get(word, vocab2int['<UNK>']) for word in text.split()]

In [34]:
int2vocab = loadpickle('int2vocab')
vocab2int = loadpickle('vocab2int')
dev_squad_paragraphs = loadpickle('dev_squad_paragraphs')
dev_squad_paragraphs = list(set(dev_squad_paragraphs))

random_example = randint(0, len(dev_squad_paragraphs))
input_sequence = dev_squad_paragraphs[random_example]

In [22]:
# Setting parameters same as that of training
epochs = 100
batch_size = 32
rnn_size = 512
num_layers = 2
learning_rate = 0.005
keep_probability = 0.75     
beam_width = 3

In [35]:
text = text2seq(input_sequence)
checkpoint_path = 'ckpt/model.ckpt'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    try:
        print('Restoring old model from %s...' % checkpoint_path)
        loader = tf.train.import_meta_graph(checkpoint_path + '.meta')
        loader.restore(sess, checkpoint_path)
    except: 
        raise 'Checkpoint directory not found!'

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    input_length = loaded_graph.get_tensor_by_name('input_length:0')
    target_length = loaded_graph.get_tensor_by_name('target_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    
    #Multiply by batch_size to match the model's input parameters
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      target_length: [25], 
                                      input_length: [len(text)]*batch_size,
                                      keep_prob: 1.0})



Restoring old model from ckpt/model.ckpt...
INFO:tensorflow:Restoring parameters from ckpt/model.ckpt


In [24]:
# Removing the padding
pad = vocab2int["<PAD>"] 
new_logits = []
for i in range(batch_size):
    new_logits.append(answer_logits[i].T)

print('Original Text:', input_sequence.encode('utf-8').strip())

print('\nGenerated Questions:')
for index in range(beam_width):
    print(' -- : {}'.format(" ".join([int2vocab[i] for i in new_logits[1][index] if i != pad and i != -1])))

Original Text: b'What was the toughest decision you ever had to make?   When I was in the 12th standard, I was asked to choose between Biology or Statistics. We had an orientation about each of the streams; and of-course, biology(being biology) seemed way more difficult and rich a subject than statistics. I remember comparing the sizes of the books in both sections. Up until the orientation, I had always wanted to be a cardio-surgeon. My parents are both engineering profesors and so they were nudging me towards medicine. But I chose statistics, because it seemed like the easy way. It was one of the hardest decisions I have had to make.'

Generated Questions:
 -- : do you regret with your choice <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
 -- : do you succumb with your choice <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
 -- : was this pure of your choice <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>


In [30]:
# Removing the padding
pad = vocab2int["<PAD>"] 
new_logits = []
for i in range(batch_size):
    new_logits.append(answer_logits[i].T)

print('Original Text:', input_sequence.encode('utf-8').strip())

print('\nGenerated Questions:')
for index in range(beam_width):
    print(' -- : {}'.format(" ".join([int2vocab[i] for i in new_logits[1][index] if i != pad and i != -1])))

Original Text: b'What is the difference between hard work and smart work? Hard work and smart work are entirely different things. Although they are independent things, having existence on their own, smart work can result from hard work. So the major difference is that even though someone works really hard on something, there is necessarily no need that his/her resultant work should be something worth considering smart. Smart work comes as a result of careful and intelligent thinking followed by precise application of those ideas. This might involve a lot of hard work even though it is not necessary.'

Generated Questions:
 -- : what are you saying there is no distinction between your life <EOS> <EOS> <EOS> <EOS> <EOS>
 -- : what are you saying there is no distinction between them <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
 -- : how are you saying there is no distinction between your life <EOS> <EOS> <EOS> <EOS> <EOS>


In [33]:
# Removing the padding
pad = vocab2int["<PAD>"] 
new_logits = []
for i in range(batch_size):
    new_logits.append(answer_logits[i].T)

print('Original Text:', input_sequence.encode('utf-8').strip())

print('\nGenerated Questions:')
for index in range(beam_width):
    print(' -- : {}'.format(" ".join([int2vocab[i] for i in new_logits[1][index] if i != pad and i != -1])))

Original Text: b'If you could relive the last 10 years of your life, what would you do differently? I am happy with what my life has turned out to be. I would not make any big changes but may be change small small things like be more proactive and be helpful and polite.'

Generated Questions:
 -- : how does your self motivation can play self role <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
 -- : how does your self motivation <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
 -- : how does your self motivation can play your role <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>


In [36]:
# Removing the padding
pad = vocab2int["<PAD>"] 
new_logits = []
for i in range(batch_size):
    new_logits.append(answer_logits[i].T)

print('Original Text:', input_sequence.encode('utf-8').strip())

print('\nGenerated Questions:')
for index in range(beam_width):
    print(' -- : {}'.format(" ".join([int2vocab[i] for i in new_logits[1][index] if i != pad and i != -1])))

Original Text: b'What are your passions? Apart from my studies, i want to visit new places in the world. I also have a diehard feeling to go on a roadtrip in spain just like the movie zindigi na milegi dubaara. \n\\\nI also wish to develop games and other software in computer science. Apart from these i also want to continue with building various others stuffs like the water level sensor i discussed about in my intro. thses electronic stuff fascinate me alot!!'

Generated Questions:
 -- : how strong related your machine <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
 -- : how have you identify creativity related <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
 -- : what strong related your machine <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS> <EOS>
