[View in Colaboratory](https://colab.research.google.com/github/hamil168/Chatbots/blob/master/Seq2Seq.ipynb)

### TODO:
* refactor data processing (padding, bucketing)
* Aggregate CV loss/acc into arrays for early stopping
* create config file

### Completed:
*  Train test split, fixed random seed
*  Validation scores to training loop
*  Determine CV loss/acc calculations 

#### distant todo:
* beamsearch decoder

In [None]:
# For a fresh Colab instance, clone fresh:
#!pip install -q xlrd
#!git clone https://github.com/hamil168/Chatbots

In [None]:
# Change to Colab directory:
#cd Chatbots/

In [None]:
# For an existing Colab instance, pull from master, uncomment this:

#!git pull https://github.com/hamil168/Chatbots master

In [None]:
# Files as they appear in the repo clone
#ls


In [None]:
#!pip install tqdm   ### use later when in .py files

In [None]:
#!pip install pandas

In [None]:
#!pip install time

In [None]:
#!pip install re

In [None]:
#!pip install sklearn

In [None]:
#!pip install scipy

In [2]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15015454074497344846
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3177234432
locality {
  bus_id: 1
  links {
  }
}
incarnation: 5302878735416894368
physical_device_desc: "device: 0, name: GeForce GTX 970, pci bus id: 0000:01:00.0, compute capability: 5.2"
]


In [1]:
import numpy as np
import tensorflow as tf
import time
import re
from tensorflow.python.layers.core import Dense
#from tqdm import tqdm    ### use later when in .py files

from sklearn.cross_validation import train_test_split

global graph1, model

graph1 = tf.get_default_graph()



In [3]:
#my preproc.py
from preproc import *

In [4]:
lines = open('movie_lines.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')
conversations = open('movie_conversations.txt', encoding = 'utf-8', errors = 'ignore').read().split('\n')

In [12]:
id2l, cid, questions, answers, clean_questions, clean_answers, word2count, sorted_clean_questions, sorted_clean_answers = preproc_steps(lines,conversations)

questionswords2int, answerswords2int = map_questions_and_answers_to_integers(word2count)

In [13]:
#conversations[0:5]

In [14]:
#get_conversations_ids(conversations[0:5])

In [15]:
#print(questions[0:3])
#print(clean_questions[0:3])
#print(sorted_clean_questions[0:3])

# PREPROC LENGTH LIMITER CHECK:          ######### NOTE TO SELF: Do we go to length 25 what? <EOS consideration>
# this should be snipped at len() = MAX_LENGTH (25 for testing); it is 550+ otherwise
max_sca = max([len(s) for s in sorted_clean_answers])
print(max_sca)
max_idx = [i for i in range(len(sorted_clean_answers)) if len(sorted_clean_answers[i]) == max_sca]
print(sorted_clean_answers[max_idx[0]]) 

25
[16, 49, 26, 48, 19, 50, 51, 16, 52, 53, 46, 54, 55, 56, 53, 57, 42, 58, 19, 0, 29, 59, 60, 61, 62]


In [16]:
# Create placeholder for inputs and the targets
# in TF, all variables are tensors
# need to go from NP --> TF tensors
# need placeholders for every TF variables inputs and targets

def model_inputs():
  #inputs and targets are 2D matrices
  inputs = tf.placeholder(tf.int32, [None, None], name = 'inputs') 
  targets = tf.placeholder(tf.int32, [None, None], name = 'targets')
  keep_prob = tf.placeholder(tf.float32, name = 'dropout_rate') #dropout
  
  #lr = tf.placeholder(tf.float32, name = 'learning_rate')
  
  encoder_sequence_length = tf.placeholder(tf.int32, (None, ), name='encoder_sequence_length')
  decoder_sequence_length = tf.placeholder(tf.int32, (None, ), name='decoder_sequence_length')
  max_sequence_length = tf.reduce_max(decoder_sequence_length, name='max_sequence_length')
  
  return inputs, targets, keep_prob, encoder_sequence_length, decoder_sequence_length, max_sequence_length

In [17]:
# Create encoder RNN layer
def encoder_rnn(rnn_inputs, rnn_size, num_layers, 
                encoder_sequence_length, keep_prob, encoder_embedding_size, encoder_word_count):
  
  
  # LSTM cell class
  # rnn_size: number of input tensors
  # sequence_length: length of each question in the atch
  
  
  def cell(units, rate):
    layer = tf.contrib.rnn.BasicLSTMCell(units)
    return tf.contrib.rnn.DropoutWrapper(layer, rate)

  encoder_cell_fw = tf.contrib.rnn.MultiRNNCell([cell(rnn_size, keep_prob) for _ in range(num_layers)])
  #encoder_cell_bw = tf.contrib.rnn.MultiRNNCell([cell(rnn_size, keep_prob) for _ in range(num_layers)])

  encoder_embeddings = tf.contrib.layers.embed_sequence(rnn_inputs, encoder_word_count, encoder_embedding_size)
  
  # bidirection rnn function (creates dynamic bidirectional network)
  # builds independent forward and backward rnn
  # need ot make sure the ends match
  # (first element is encoder_output)
  #encoder_outputs, encoder_states = tf.nn.bidirectional_dynamic_rnn(encoder_cell_fw,
  encoder_outputs, encoder_states = tf.nn.dynamic_rnn(encoder_cell_fw,                                                                    
                                                   #cell_bw = encoder_cell_bw,
                                                   inputs = encoder_embeddings,
                                                   sequence_length = encoder_sequence_length,
                                                   dtype = tf.float32)
  
  return encoder_outputs, encoder_states


In [18]:
# preprocessing the targets
# need batches, 
# need each to start with <SOS> token

def preprocess_decoder_inputs(targets, word2int_dict, batch_size):
  """

  Prepares the decoder inputs (i.e. the 'targets') for use
     
     Inputs: 
        targets: the input for the decoder for training.
        word2int_dict: one of the dictionaries used to map a word to its integer
        batch_size: size of each batch for model training
        
     Outputs:
        preprocessed_targets: the processed version of the decoder inputs
  
  """
  
  # Using <SOS> for "start of string", create a tensor with one per batch element
  left_side = tf.fill([batch_size, 1], word2int_dict['<SOS>'])
    
  # Take the targets and remove the last member of each sample (it is blank)
  right_side = tf.strided_slice(targets, [0,0], [batch_size, -1], [1,1])
  
  # Add the <SOS> to the left side of every target phrase
  return tf.concat([left_side, right_side], 1)

### Attention
- (warning for later, when I add Beam Search) **NOTE** If you are using the `BeamSearchDecoder` with a cell wrapped in
`AttentionWrapper`
- will also need to return here with DeviceWrapper for multiple GPUs

In [19]:
def decoder(decoder_inputs, encoder_state, decoder_cell, decoder_embedding_size,
            vocabulary_size, decoder_sequence_length, max_sequence_length,
            word2id_dict, batch_size):
  
  
  embedding_layer = tf.Variable(tf.random_uniform([vocabulary_size, decoder_embedding_size]))
  embeddings = tf.nn.embedding_lookup(embedding_layer, decoder_inputs)
  
  output_layer = Dense(vocabulary_size, kernel_initializer=tf.truncated_normal_initializer(0.0, 0.1))
    
  with tf.variable_scope('decoder'):
  
    train_helper = tf.contrib.seq2seq.TrainingHelper(embeddings, sequence_length = decoder_sequence_length)

    train_decoder = tf.contrib.seq2seq.BasicDecoder(cell = decoder_cell,
                                             helper = train_helper,
                                             initial_state = encoder_state, 
                                                    output_layer = output_layer)


    # returns (final_outputs, final_state, final_sequence_lengths)
    train_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder = train_decoder,
                                                             impute_finished = True,
                                                             maximum_iterations = max_sequence_length)
    ###########################
    #decoder_output_dropout is handled in a attention wrapper function outside of this functinon                                                                
                                                   
         
  with tf.variable_scope('decoder', reuse=True):
  
    starting_id_vector = tf.tile(tf.constant([word2id_dict['<SOS>']], dtype=tf.int32), [batch_size], name = 'starting_id_vector')                                               
                                                   
    infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding_layer, 
                                                            starting_id_vector,
                                                           word2id_dict['<EOS>'])                                                   

    infer_decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell,
                                                    helper = infer_helper,
                                                    initial_state = encoder_state,
                                                    output_layer=output_layer)


    # returns (final_outputs, final_state, final_sequence_lengths)
    infer_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(infer_decoder,
                                                             impute_finished = True,
                                                             maximum_iterations = max_sequence_length)
    
                                                   
  return train_decoder_output, infer_decoder_output
                                                   
                                                   
                                              
  

In [20]:

def attention_mechanism(rnn_size, keep_prob, encoder_outputs, encoder_states, encoder_sequence_length, batch_size):
  
  
  def cell(units, probs):
    layer = tf.contrib.rnn.BasicLSTMCell(units)
    return tf.contrib.rnn.DropoutWrapper(layer, probs)
  
  decoder_cell = cell(rnn_size, keep_prob)
  
  attention_mechanism = tf.contrib.seq2seq.BahdanauAttention(rnn_size, 
                                                            encoder_outputs,
                                                            encoder_sequence_length)
  
  decoder_cell_wrapped = tf.contrib.seq2seq.AttentionWrapper(decoder_cell,
                                                            attention_mechanism,
                                                            rnn_size / 2)
  
  attention_ought = decoder_cell_wrapped.zero_state(batch_size = batch_size, dtype = tf.float32)
  
  encoder_state_new = attention_ought.clone(cell_state = encoder_states[-1])
  
  return decoder_cell_wrapped, encoder_state_new
  


In [21]:
def optimizer_loss(outputs, targets, decoder_sequence_length, max_sequence_length, learning_rate, clip_rate):
    '''
	
		Function used to define optimizer and loss function
		Inputs:
			outputs - outputs got from decoder part of the network
			targets - expected outputs/ labels
			dec_seq_len -
			max_seq_len - 
			learning_rate - small nubmer used to decrease value of gradients used to update our network
			clip_rate - tolerance boundries for clipping gradients
		Outputs:
			loss -
			trained_opt - optimizer with clipped gradients
    '''
    logits = tf.identity(outputs.rnn_output)
    
    mask_weights = tf.sequence_mask(decoder_sequence_length, max_sequence_length, dtype=tf.float32)
    
    with tf.variable_scope('opt_loss'):
        #using sequence_loss to optimize the seq2seq model
        loss = tf.contrib.seq2seq.sequence_loss(logits, 
                                                targets, 
                                                mask_weights)
        
        #Define optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        #Next 3 lines used to clip gradients {Prevent gradient explosion problem}
        gradients = tf.gradients(loss, tf.trainable_variables())
        clipped_grads, _ = tf.clip_by_global_norm(gradients, clip_rate)
        trained_opt = optimizer.apply_gradients(zip(clipped_grads, tf.trainable_variables()))
        
    return loss, trained_opt


In [22]:
class Seq2Seq_Model(object):
    
    def __init__(self, learning_rate, batch_size, encoder_embedded_size, decoder_embedded_size, rnn_size, 
                 number_of_layers, vocab_size, word2id_dict, clip_rate):
        
        #tf.reset_default_graph()
        
        self.inputs, self.targets, self.keep_prob, self.encoder_sequence_length, self.decoder_sequence_length, max_sequence_length = model_inputs()
        
        
        enc_outputs, enc_states = encoder_rnn(self.inputs, 
                                          rnn_size,
                                          number_of_layers, 
                                          self.encoder_sequence_length, 
                                          self.keep_prob, 
                                          encoder_embedded_size, 
                                          vocab_size)
        
        dec_inputs = preprocess_decoder_inputs(self.targets, 
                                                  word2id_dict, 
                                                  batch_size)
        
        
        decoder_cell, encoder_states_new = attention_mechanism(rnn_size, 
                                                          self.keep_prob, 
                                                          enc_outputs, 
                                                          enc_states, 
                                                          self.encoder_sequence_length, 
                                                          batch_size)
        
        train_outputs, inference_output = decoder(dec_inputs, 
                                                  encoder_states_new, 
                                                  decoder_cell,
                                                  decoder_embedded_size, 
                                                  vocab_size, 
                                                  self.decoder_sequence_length, 
                                                  max_sequence_length, 
                                                  word2id_dict, 
                                                  batch_size)
        
        self.predictions  = tf.identity(inference_output.sample_id, name='preds')
        
        self.loss, self.opt = optimizer_loss(train_outputs, 
                                       self.targets, 
                                       self.decoder_sequence_length, 
                                       max_sequence_length, 
                                       learning_rate, 
                                       clip_rate)

In [23]:
def get_accuracy(target, logits):
    """
    Calculate accuracy
    """
    max_seq = max(target.shape[1], logits.shape[1])
    if max_seq - target.shape[1]:
        target = np.pad(
            target,
            [(0,0),(0,max_seq - target.shape[1])],
            'constant')
    if max_seq - logits.shape[1]:
        logits = np.pad(
            logits,
            [(0,0),(0,max_seq - logits.shape[1])],
            'constant')

    return np.mean(np.equal(target, logits))

In [24]:
# Fxn to split data itno batches for batch gradient descent                                            
def split_into_batches(questions, answers, batch_size):
  padded_questions_in_batch = []
  padded_answers_in_batch = []
  final_question_batches = []
  final_answer_batches = []
  
  for batch_index in range(0,len(questions) // batch_size):
      start_index = batch_index * batch_size
                                            
      questions_in_batch = questions[start_index : start_index + batch_size]
      answers_in_batch = answers[start_index : start_index + batch_size]
      
                         
      final_question_batches.append(questions_in_batch)
      final_answer_batches.append(answers_in_batch)
       
  return final_question_batches, final_answer_batches
   

In [25]:
# Next up: hyper parameters
epochs = 5 #100
batch_size = 20  #64 make bigger to make faster
rnn_size = 64 # 512
num_layers = 2  #3
encoding_embedding_size = 64 #512  # 512 col in embedding matrix
decoding_embedding_size = 64 #512
learning_rate = 0.05 # 0.01
learning_rate_decay = 0.9
min_learning_rate = 0.003 #0.0001
keep_prob = 0.5
keep_probability = 0.5  # based on hinton paper '14'
clip= 5

In [27]:
# Split data into training and validation sets

TTS_TEST_SIZE = 0.2
TTS_SEED = 12345

train_Q, val_Q, train_A, val_A = \
        train_test_split(sorted_clean_questions, sorted_clean_answers,
                     test_size = TTS_TEST_SIZE, random_state = TTS_SEED)

padded_train_Q_batches, padded_train_A_batches = \
             split_into_batches(apply_padding(train_Q,questionswords2int),
                                apply_padding(train_A,questionswords2int), 
                                batch_size)

#padded_val_Q = apply_padding(val_Q, questionswords2int)
#padded_val_A = apply_padding(val_A, questionswords2int)


padded_val_Q_batches, padded_val_A_batches = \
            split_into_batches(apply_padding(val_Q, questionswords2int),
                              apply_padding(val_A, questionswords2int),
                              batch_size)


ValueError: Found input variables with inconsistent numbers of samples: [203602, 5540400]

In [26]:
# Apply padding to the sequences so the question and answer sequences are the same
# THERE IS AN ERROR HERE THAT MAKES A TON OF PADS GET ADDED, CAUSING OOM 
# Need to force it to 25 to parallel the limits ont he input strings
def apply_padding(batch_of_sequences, word2int):
    max_sequence_length = max([len(sequence) for sequence in batch_of_sequences])
    return [sequence + [word2int['<PAD>']] * (max_sequence_length - len(sequence)) for sequence in batch_of_sequences]

In [10]:
padded_train_A_batches[0]

NameError: name 'padded_train_A_batches' is not defined

In [59]:
#np.array([sequence + questionswords2int['<PAD>']] * (25 - len(sequence)) for sequence in sorted_clean_answers])

SyntaxError: invalid syntax (<ipython-input-59-e4a92eb603d9>, line 1)

In [70]:
# Need these sizes to help debug OOM error
print(np.asarray(padded_train_Q_batches).shape)
print(np.asarray(padded_train_A_batches).shape)

(8144, 20, 25)
(8144, 20, 554)


In [93]:
# skimming this, there is only one end (8625) per line ... so the problem must be in the padding
sorted_clean_answers

[[16,
  49,
  26,
  48,
  19,
  50,
  51,
  16,
  52,
  53,
  46,
  54,
  55,
  56,
  53,
  57,
  42,
  58,
  19,
  0,
  29,
  59,
  60,
  61,
  62,
  8625],
 [8626,
  64,
  61,
  65,
  66,
  67,
  68,
  69,
  70,
  61,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  61,
  79,
  80,
  53,
  75,
  81,
  82,
  8625],
 [103, 8625],
 [1514, 78, 102, 1534, 34, 149, 599, 8625],
 [28, 153, 226, 3, 6283, 8625],
 [27, 28, 8, 160, 252, 66, 1264, 98, 66, 606, 8625],
 [726, 134, 8625],
 [28, 238, 133, 193, 225, 75, 8625],
 [195, 8625],
 [21, 28, 125, 604, 33, 46, 1496, 48, 8625],
 [42,
  2541,
  156,
  19,
  45,
  79,
  46,
  104,
  1100,
  51,
  957,
  16,
  8626,
  144,
  218,
  512,
  8625],
 [112, 8626, 94, 46, 270, 8625],
 [179, 76, 78, 46, 271, 272, 8625],
 [278,
  25,
  19,
  279,
  19,
  21,
  111,
  95,
  280,
  77,
  117,
  8626,
  8626,
  76,
  263,
  75,
  5,
  19,
  281,
  282,
  283,
  94,
  8626,
  152,
  5,
  42,
  8626,
  8626,
  284,
  8625],
 [117, 61, 243, 4523, 377, 295, 862

In [76]:
print(np.asarray(padded_train_A_batches)[0].shape)
print(np.asarray(padded_train_A_batches)[0][1].shape)

(20, 554)
(554,)


In [81]:
# there appears to be an error in padding!
np.asarray(padded_train_A_batches)[0][1][0:500]

array([  35,   28,    8, 1472,  458,  615, 8625, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624, 8624,
       8624, 8624, 8624, 8624, 8624, 8624, 8624, 86

In [71]:
# Onesuch error is (299, 20, 69xxx) ... 20 batches of 299 "words" of 69xxx possible words
print(np.asarray(padded_val_Q_batches).shape)
print(np.asarray(padded_val_A_batches).shape)

(2036, 20, 25)
(2036, 20, 299)


In [118]:
np.array(padded_train_A_batches).shape

(8144, 20, 1)

In [132]:
# With adjustments:
print(np.asarray(padded_val_A_batches).shape)
print(np.asarray(padded_val_A_batches)[0])

(2036, 20)
[None None None None None None None None None None None None None None
 None None None None None None]


In [63]:
# If needed during testing
session.close()

In [64]:
# instantiate the Seq2Seq model using graph1
# starts with resetting graph1 for debugging purposes

tf.reset_default_graph()
graph1 = tf.get_default_graph()

with graph1.as_default():
  
  model = Seq2Seq_Model(learning_rate, batch_size, encoding_embedding_size, 
                        decoding_embedding_size,
                        rnn_size, num_layers, len(word2count), 
                        questionswords2int, clip)
  

In [65]:
padded_train_Q_batches = padded_train_Q_batches[0:99]
padded_train_A_batches = padded_train_Q_batches[0:99]

In [66]:
# establish session using graph1

session = tf.Session(graph = graph1)

In [67]:
run_opts = tf.RunOptions(report_tensor_allocations_upon_oom = True)
saver = tf.train.Saver()

In [68]:
# training loop

# initialize global variables
session.run(tf.global_variables_initializer())
PRINT_ERROR = True

# empty lists
epoch_accuracy = []
epoch_loss = []

# Start training loop
for i in range(epochs):
  
  # empty lists to collect loss and acc for bucketd training sets
  batch_accuracy = []
  batch_loss = []
  
  # for bucket index
  # tqdm is a progress bar that does not look good in notebooks
  # but looks good in command line
  #for b_idx in tqdm(len(padded_train_Q_batches)): ### commented out for debugging
  
  for b_idx in range(len(padded_train_A_batches)): #<-- for debugging
  
    # convert the python arrays to numpy arrays
    X_batch = np.asarray(padded_train_Q_batches[b_idx])
    y_batch = np.asarray(padded_train_A_batches[b_idx])
    ### print(X_batch.shape, y_batch.shape)

    # create the feed_dict for the model creation steps
    feed_dict = {model.inputs:X_batch, 
         model.targets:y_batch, 
         model.keep_prob:keep_prob, 
         model.decoder_sequence_length:[len(y_batch[0])]*batch_size,
         model.encoder_sequence_length:[len(X_batch[0])]*batch_size}

    
    
    # a single step of batch gradient descent
    cost, _, preds = session.run([model.loss, model.opt, model.predictions], feed_dict=feed_dict, options = run_opts)

    # collect loss/acc for each batch
    batch_loss.append(cost)
    batch_accuracy.append(get_accuracy(y_batch, preds))

       
    #if(PRINT_ERROR == True and b_idx%100 == 0): 
    if(PRINT_ERROR == True and b_idx%5 == 0):
      print(" Bucket {}:".format(b_idx), 
          " | Loss: {}".format(np.mean(batch_loss)), 
          " | Accuracy: {}".format(np.mean(batch_accuracy)))

  epoch_loss.append(np.mean(batch_loss))
  epoch_accuracy.append(np.mean(batch_accuracy))
  
  # Print epoch and CV loss/accuracy:
  #if(PRINT_ERROR == True and i%100 == 0):
  if(PRINT_ERROR == True and i%1 == 0):
      val_losses = []
      val_acc = []
      
      for v_idx in range(2):#len(padded_val_Q_batches)):
        X_val = np.asarray(padded_val_Q_batches[v_idx])
        
        y_val = np.asarray(padded_val_A_batches[v_idx])
        ### print(X_val.shape, y_val.shape)
        
        # validation feed_dict
        val_feed_dict = {model.inputs:X_val, 
                         model.targets:y_val, 
                         model.keep_prob:1, 
                         model.decoder_sequence_length:[len(y_val[0])]*batch_size,
                         model.encoder_sequence_length:[len(X_val[0])]*batch_size}

        
        # run model loss and predictions, but not optimization -- scoring, not training!
        val_loss, val_preds = session.run([model.loss, model.predictions], feed_dict = val_feed_dict)
                
        val_losses.append(val_loss)
        val_acc.append(get_accuracy(y_val, val_preds))
        #val_acc.append(tf.metrics.accuracy(y_val, val_preds)) ##<-- causes attribute error in np.mean()
           
      print("EPOCH[{}]: {}/{}".format(i, i+1, epochs), 
          "\n --->| loss: {} val: {}".format(np.mean(epoch_loss), np.mean(val_losses)), 
          "\n --->| acc: {} val: {}".format(np.mean(epoch_accuracy), np.mean(val_acc)))

  saver.save(session, "checkpoint/chatbot_{}.ckpt".format(i))
    
session.close()

    

 Bucket 0:  | Loss: 11.163359642028809  | Accuracy: 0.078
 Bucket 5:  | Loss: 5.612247943878174  | Accuracy: 0.596
 Bucket 10:  | Loss: 4.491631507873535  | Accuracy: 0.6236363636363637
 Bucket 15:  | Loss: 3.8540849685668945  | Accuracy: 0.6475000000000001
 Bucket 20:  | Loss: 3.4660143852233887  | Accuracy: 0.6619047619047619
 Bucket 25:  | Loss: 3.2382938861846924  | Accuracy: 0.655846153846154
 Bucket 30:  | Loss: 3.072011709213257  | Accuracy: 0.6518709677419356
 Bucket 35:  | Loss: 2.9535036087036133  | Accuracy: 0.6491111111111112
 Bucket 40:  | Loss: 2.866523265838623  | Accuracy: 0.6529756097560977
 Bucket 45:  | Loss: 2.779364824295044  | Accuracy: 0.6590869565217392
 Bucket 50:  | Loss: 2.715663433074951  | Accuracy: 0.6625490196078431
 Bucket 55:  | Loss: 2.6710174083709717  | Accuracy: 0.6644285714285714
 Bucket 60:  | Loss: 2.635213851928711  | Accuracy: 0.6655409836065573
 Bucket 65:  | Loss: 2.589855194091797  | Accuracy: 0.6692121212121211
 Bucket 70:  | Loss: 2.572109

ResourceExhaustedError: OOM when allocating tensor with shape[299,20,69561] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: decoder/decoder/TensorArrayStack/TensorArrayGatherV3 = TensorArrayGatherV3[dtype=DT_FLOAT, element_shape=[20,69561], _device="/job:localhost/replica:0/task:0/device:GPU:0"](decoder/decoder/TensorArray, decoder/decoder/TensorArrayStack/range, decoder/decoder/while/Exit_2)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Node: decoder/decoder/while/BasicDecoderStep/decoder/attention_wrapper/Select/_553 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_811_d...per/Select", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](^_cloopdecoder/decoder/while/BasicDecoderStep/decoder/attention_wrapper/assert_equal/Assert/Assert/data_0/_367)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'decoder/decoder/TensorArrayStack/TensorArrayGatherV3', defined at:
  File "D:\Dev\Anaconda3\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "D:\Dev\Anaconda3\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\ipykernel\kernelapp.py", line 486, in start
    self.io_loop.start()
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tornado\platform\asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "D:\Dev\Anaconda3\lib\asyncio\base_events.py", line 421, in run_forever
    self._run_once()
  File "D:\Dev\Anaconda3\lib\asyncio\base_events.py", line 1431, in _run_once
    handle._run()
  File "D:\Dev\Anaconda3\lib\asyncio\events.py", line 145, in _run
    self._callback(*self._args)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tornado\platform\asyncio.py", line 122, in _handle_events
    handler_func(fileobj, events)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\zmq\eventloop\zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\zmq\eventloop\zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\zmq\eventloop\zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tornado\stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\ipykernel\kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\ipykernel\ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\ipykernel\zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\IPython\core\interactiveshell.py", line 2662, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\IPython\core\interactiveshell.py", line 2785, in _run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\IPython\core\interactiveshell.py", line 2901, in run_ast_nodes
    if self.run_code(code, result):
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\IPython\core\interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-64-a51dbe1bcb1b>", line 12, in <module>
    questionswords2int, clip)
  File "<ipython-input-16-bf3612450fc8>", line 39, in __init__
    batch_size)
  File "<ipython-input-13-1ba126d07b25>", line 24, in decoder
    maximum_iterations = max_sequence_length)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\contrib\seq2seq\python\ops\decoder.py", line 328, in dynamic_decode
    final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\python\util\nest.py", line 377, in map_structure
    structure[0], [func(*x) for x in entries])
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\python\util\nest.py", line 377, in <listcomp>
    structure[0], [func(*x) for x in entries])
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\contrib\seq2seq\python\ops\decoder.py", line 328, in <lambda>
    final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\python\ops\tensor_array_ops.py", line 856, in stack
    return self._implementation.stack(name=name)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\python\ops\tensor_array_ops.py", line 289, in stack
    return self.gather(math_ops.range(0, self.size()), name=name)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\python\ops\tensor_array_ops.py", line 303, in gather
    element_shape=element_shape)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\python\ops\gen_data_flow_ops.py", line 6479, in tensor_array_gather_v3
    flow_in=flow_in, dtype=dtype, element_shape=element_shape, name=name)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\python\util\deprecation.py", line 454, in new_func
    return func(*args, **kwargs)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\python\framework\ops.py", line 3155, in create_op
    op_def=op_def)
  File "d:\python_virtual_environments\tf19_chatbot\lib\site-packages\tensorflow\python\framework\ops.py", line 1717, in __init__
    self._traceback = tf_stack.extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[299,20,69561] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[Node: decoder/decoder/TensorArrayStack/TensorArrayGatherV3 = TensorArrayGatherV3[dtype=DT_FLOAT, element_shape=[20,69561], _device="/job:localhost/replica:0/task:0/device:GPU:0"](decoder/decoder/TensorArray, decoder/decoder/TensorArrayStack/range, decoder/decoder/while/Exit_2)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.

	 [[Node: decoder/decoder/while/BasicDecoderStep/decoder/attention_wrapper/Select/_553 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_811_d...per/Select", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](^_cloopdecoder/decoder/while/BasicDecoderStep/decoder/attention_wrapper/assert_equal/Assert/Assert/data_0/_367)]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



In [None]:
### Debugging Parameters
# Q's | BS | bmod | epochs
# OK: 50 | 10 | %2 | 5 
# NOT OK: 100 | 20 | %2 | 5
# error: OOM [299, 20, 69k] <--- an answer. Why is that 299? 2GB


