In [21]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import torch
import numpy as np
from torch.jit import script, trace
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import contractions
import csv
import random
import json
import re
import os
import unicodedata
import operator

import codecs
from io import open
import itertools
import math
from queue import PriorityQueue
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load & Preprocess Data

## Class Voc

In [4]:
PAD_token = 0  # Used for padding short sentences
SOS_token = 1  # Start-of-sentence token
EOS_token = 2  # End-of-sentence token
class Voc: # Word - Index Mapping
    def __init__(self, name, version):
        self.name = name
        self.version = version
        self.trimmed = False
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"} 
        self.num_words = 3  # Count SOS, EOS, PAD
        if version == "word2vec" or version=='word2vec_small':
            from torchnlp.word_to_vector import GloVe
            if version == "word2vec":
                self.dim = 300 
                self.glove = GloVe()
                self.weights_matrix = np.zeros((10000,self.dim))
            else:
                self.dim = 100 
                self.glove = GloVe(name='6B', dim=self.dim)
            self.weights_matrix = np.zeros((10000,self.dim))
            self.weights_matrix[0] = self.glove[str(PAD_token)]
            self.weights_matrix[1] = self.glove[str(SOS_token)]
            self.weights_matrix[2] = self.glove[str(EOS_token)]
        elif version == "bpemb":
            from bpemb import BPEmb
            self.dim = 100
            self.bpemb = BPEmb(lang="en", dim=self.dim)
            self.index2word = {PAD_token: self.bpemb.decode_ids([PAD_token]), 
                               SOS_token: self.bpemb.decode_ids([SOS_token]), 
                               EOS_token: self.bpemb.decode_ids([EOS_token])} 
            self.weights_matrix = self.bpemb.vectors.copy()
    
    def unicodeToAscii(self,s):
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
        )
    
    def tokenizer(self,s):
        s = self.unicodeToAscii(s.lower().strip())
        s = contractions.fix(s)
        s = re.sub(
            r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", 
            s)
        s = re.sub(r"[ ]+", " ", s)
        s = re.sub(r"\!+", "!", s)
        s = re.sub(r"\,+", ",", s)
        s = re.sub(r"\?+", "?", s)

        if self.version  == "bpemb":
            return self.bpemb.encode(s)
        else: 
            import spacy
            NLP = spacy.load('en')
            return [x.text for x in NLP.tokenizer(s) if x.text != " "]

    def addSentence(self, sentence):
        words = self.tokenizer(sentence) if  type(sentence) == str else sentence
        for word in words:
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
            if self.version in ["word2vec","word2vec_small"]:
                self.weights_matrix[self.num_words] = self.glove[word]
        else:
            self.word2count[word] += 1
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True
        keep_words = []
        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)
        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))
        # Reinitialize dictionaries
        self.word2index = {}
        self.word2count = {}
        self.index2word = {PAD_token: "PAD", SOS_token: "SOS", EOS_token: "EOS"}
        if version == "bpemb":
            self.index2word = {PAD_token: bpemb.decode_ids([PAD_token]), 
                               SOS_token: bpemb.decode_ids([SOS_token]), 
                               EOS_token: bpemb.decode_ids([EOS_token])} 
        self.num_words = 3 # Count default tokens
        if version in ["word2vec","word2vec_small"]:
            self.weights_matrix = np.zeros((len(keep_words),self.dim))
            self.weights_matrix[0] = self.glove[PAD_token]
            self.weights_matrix[1] = self.glove[SOS_token]
            self.weights_matrix[2] = self.glove[EOS_token]                     
        for word in keep_words:
            self.addWord(word)

## Loading Functions

In [28]:
## Convert the json file to dataset of the format [post,[response,emotion],pos_emotion,res_emotion] for number of bucket
def read_data(path,voc,max_size=None):
    data_set = []
    data = json.load(open(path,'r'))
    counter = 0
    size_max = 0
    for pair in data:
        post,emo1,emo2 = pair[0]
        response,res_emo1,res_emo2 = pair[1][0]
        post_word_list = voc.tokenizer(post)
        res_word_list = voc.tokenizer(response)
        if len(post_word_list) < MAX_LENGTH and len(res_word_list) < MAX_LENGTH:
            voc.addSentence(post)
            voc.addSentence(response)
            counter += 1
            if counter % 10000 == 0:
                print("    reading data pair %d" % counter)
                print(post_word_list)
                print(res_word_list)
            data_set.append([post, response, int(emo1), int(res_emo1)])
    return data_set

def getword2index(word):
    if word not in voc.word2index:
        voc.addWord(word)
    return voc.word2index[word]    
def indexesFromSentence(voc, sentence):
#     return [voc.word2index[word] for word in sentence.split(' ')] + [EOS_token]
    if voc.version == "bpemb":
        return voc.bpemb.encode_ids(sentence) + [EOS_token]
    return [getword2index(word) for word in voc.tokenizer(sentence)] + [EOS_token]
def zeroPadding(l, fillvalue=PAD_token):
    return list(itertools.zip_longest(*l, fillvalue=fillvalue))
def binaryMatrix(l, value=PAD_token):
    m = []
    for i, seq in enumerate(l):
        m.append([])
        for token in seq:
            if (type(token) == int and token == PAD_token) or (type(token)!=int and torch.equal(token,value)):
                m[i].append(0)
            else:
                m[i].append(1)
    return m
# Returns padded input sequence tensor and lengths
def inputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
    padList = zeroPadding(indexes_batch)
    padVar = torch.LongTensor(padList)    
    return padVar, lengths
# Returns padded target sequence tensor, padding mask, and max target length
def outputVar(l, voc):
    indexes_batch = [indexesFromSentence(voc, sentence) for sentence in l]
    max_target_len = max([len(indexes) for indexes in indexes_batch])    
    padList = zeroPadding(indexes_batch)
    mask = binaryMatrix(padList)
    mask = torch.ByteTensor(mask)
    padVar = torch.LongTensor(padList)    
    return padVar, mask, max_target_len
  
# Returns all items for a given batch of pairs
def batch2TrainData(voc, pair_batch):  
    pair_batch.sort(key=lambda x: len(indexesFromSentence(voc,x[0])), reverse=True)
#     pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
    input_batch, output_batch,emo_in,emo_out = [],[],[],[]
    for pair in pair_batch:
        input_batch.append(pair[0])
        output_batch.append(pair[1])
        pair[2] = 0 if pair[2] in [0,1,2] else pair[2]
        pair[3] = 0 if pair[3] in [0,1,2] else pair[3]
#         pair[2] = 0 if pair[2] in [1,2,3,5] else 1
#         pair[3] = 0 if pair[2] in [1,2,3,5] else 1
        emo_in.append(pair[2])
        emo_out.append(pair[3])
    inp, lengths = inputVar(input_batch, voc)
    output, mask, max_target_len = outputVar(output_batch, voc)
    return inp, lengths, output, mask, max_target_len,torch.LongTensor(emo_in),torch.LongTensor(emo_out)

## Testing

In [25]:
pair_batch = [random.choice(dev_set) for _ in range(small_batch_size)]
print(pair_batch[0])
pair_batch.sort(key=lambda x: len(x[0].split(" ")), reverse=True)
print(pair_batch[0])
input_batch, output_batch,emo_in,emo_out = [],[],[],[]
pair = pair_batch[0]
input_batch.append(pair[0])
output_batch.append(pair[1])
emo_in.append(pair[2])
emo_out.append(pair[3])

['hello . smith s resident .', 'hello . this is the operator . can i speak to mr . smith please ?', 0, 0]
['he says he ll write a letter soon . he hopes we are all well . love jimmy .', 'is that all ? he doesn t say very much does he ?', 0, 0]


# Seq2Seq Model
**With emotion-embedding, internal memory, external memory**

## Encoder, Decoder and Attention

In [8]:
class EncoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.embedding = embedding

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True)
        
    def forward(self, input_seq, input_lengths, hidden=None):
        # Convert word indexes to embeddings
        embedded = self.embedding(input_seq)
        # Pack padded batch of sequences for RNN module
        packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(packed, hidden)
        # Unpack padding
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:]
        # Return output and final hidden state
        return outputs, hidden
# Luong attention layer
class Attn(torch.nn.Module):
    def __init__(self, method, hidden_size):
        super(Attn, self).__init__()
        self.method = method
        if self.method not in ['dot', 'general', 'concat']:
            raise ValueError(self.method, "is not an appropriate attention method.")
        self.hidden_size = hidden_size
        if self.method == 'general':
            self.attn = torch.nn.Linear(self.hidden_size, hidden_size)
        elif self.method == 'concat':
            self.attn = torch.nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = torch.nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)

    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)

    def forward(self, hidden, encoder_outputs):
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden, encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden, encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden, encoder_outputs)

        # Transpose max_length and batch_size dimensions
        attn_energies = attn_energies.t()

        # Return the softmax normalized probability scores (with added dimension)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)
      
class AttnDecoderRNN(nn.Module):
    def __init__(self,attn_model,embedding,emotion_embedding,hidden_size,output_size,n_layers=1,dropout=0.1,use_emb=False,use_imemory=False,use_ememory=False):
        super(AttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.use_emb = use_emb
        self.use_imemory = use_imemory
        self.use_ememory = use_ememory

        # Define layers
        self.embedding = embedding
        self.emotion_embedding = emotion_embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat_1 = nn.Linear(hidden_size * 2, hidden_size)
        self.concat_3 = nn.Linear(hidden_size * 2, hidden_size)

        self.out = nn.Linear(hidden_size, output_size)
        
        
        # DIY layers
        self.read_linear = nn.Linear(hidden_size*(self.n_layers+1),hidden_size)
        self.write_linear = nn.Linear(hidden_size,hidden_size)
        self.gru_concat = nn.Linear(hidden_size*2,hidden_size)
        

        self.attn = Attn(attn_model, hidden_size)
    def forward(self, input_step, emotion, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step) # 1,64,300
        embedded = self.embedding_dropout(embedded)
        
        if self.use_emb and self.use_imemory:
          
            if emotion.size()!=embedded.size():
                emotion = self.emotion_embedding(emotion).unsqueeze(dim=0) # mem_write
            
            _,tmp_size,_ = last_hidden.size()

            read_gate = torch.sigmoid(self.read_linear(torch.cat([embedded,torch.reshape(last_hidden,(1,tmp_size,-1))],dim=2)))
            mem_read = torch.mul(emotion,read_gate)
            
            gru_input = self.gru_concat(torch.cat([embedded,mem_read],dim=2))
            rnn_output, hidden = self.gru(gru_input, last_hidden)
            
            write_gate = torch.sigmoid(self.write_linear(rnn_output))
            emotion = torch.mul(write_gate,mem_read)
          
          
          

            attn_weights = self.attn(rnn_output, encoder_outputs)
            # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
            context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
            # Concatenate weighted context vector and GRU output using Luong eq. 5
            rnn_output = rnn_output.squeeze(0)
            context = context.squeeze(1)
            concat_input = torch.cat((rnn_output, context), 1)
            concat_output = torch.tanh(self.concat_3(concat_input))
            # Predict next word using Luong eq. 6
            output = self.out(concat_output)
            output = F.softmax(output, dim=1)
            return output,hidden,emotion
        elif self.use_emb:
            emotion_embedded = self.emotion_embedding(emotion).unsqueeze(dim=0)
            embedded = self.concat_1(torch.cat([embedded,emotion_embedded],dim = 2))

        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat_3(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output,hidden,input_emotion      

## Train and Loss Function

In [13]:
def maskNLLLoss(inp,emotion,target,mask,decoder):
    nTotal = mask.sum()
    crossEntropy = -torch.log(torch.gather(inp, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    if decoder.use_imemory:
        emo_loss = torch.norm(emotion)
        if math.isnan(emo_loss): loss+= emo_loss
    loss = loss.to(device)
    return loss, nTotal.item()
MAX_LENGTH = 30
def train(input_variable,input_emotion,lengths, target_variable, mask, max_target_len, encoder, decoder,
          encoder_optimizer, decoder_optimizer, batch_size, clip, max_length=MAX_LENGTH):

    # Zero gradients
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

#     Set device options
    input_variable = input_variable.to(device)
    lengths = lengths.to(device)
    target_variable = target_variable.to(device)
    mask = mask.to(device)

    # Initialize variables
    loss = 0
    print_losses = []
    n_totals = 0

    # Forward pass through encoder
    encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

    # Create initial decoder input (start with SOS tokens for each sentence)
    decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
    decoder_input = decoder_input.to(device)
    input_emotion = input_emotion.to(device)
    

    # Set initial decoder hidden state to the encoder's final hidden state
    decoder_hidden = encoder_hidden[:decoder.n_layers]

    # Determine if we are using teacher forcing this iteration
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    # Forward batch of sequences through decoder one time step at a time
    if use_teacher_forcing:
        for t in range(max_target_len):
            decoder_output, decoder_hidden,input_emotion = decoder(
                decoder_input,input_emotion,decoder_hidden, encoder_outputs
            )
            # Teacher forcing: next input is current target
            decoder_input = target_variable[t].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output,input_emotion,target_variable[t], mask[t],decoder)
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal
    else:
        for t in range(max_target_len):
            decoder_output, decoder_hidden,input_emotion = decoder(
                decoder_input,input_emotion,decoder_hidden, encoder_outputs
            )
            # No teacher forcing: next input is decoder's own current output
            _, topi = decoder_output.topk(1)
            decoder_input = torch.LongTensor([[topi[i][0] for i in range(batch_size)]])
            decoder_input = decoder_input.to(device)
            # Calculate and accumulate loss
            mask_loss, nTotal = maskNLLLoss(decoder_output,input_emotion,target_variable[t], mask[t],decoder)
            loss += mask_loss
            print_losses.append(mask_loss.item() * nTotal)
            n_totals += nTotal

    # Perform backpropatation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = torch.nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = torch.nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return sum(print_losses) / n_totals

def trainIters(model_name, voc, pairs, dev_pairs, encoder, decoder, encoder_optimizer, 
               decoder_optimizer, embedding,emo_embedding, encoder_n_layers, decoder_n_layers, n_iteration, batch_size, print_every, save_every, clip, corpus_name, loadFilename=None):

    # Load batches for each iteration
    training_batches = [batch2TrainData(voc, [random.choice(pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]
    dev_batches = [batch2TrainData(voc, [random.choice(dev_pairs) for _ in range(batch_size)])
                      for _ in range(n_iteration)]
    # Initializations
    print('Initializing ...')
    start_iteration = 1
    print_loss = 0
    if loadFilename:
        start_iteration = checkpoint['iteration'] + 1

    # Training loop
    print("Training...")
    for iteration in range(start_iteration, n_iteration + 1):
        training_batch = training_batches[iteration - 1]
        # Extract fields from batch
        input_variable, lengths, target_variable, mask, max_target_len,emo_in,emo_out = training_batch
        # Run a training iteration with batch
        loss = train(input_variable,emo_out,lengths, target_variable, mask, max_target_len, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, batch_size, clip)

        print_loss += loss

        # Print progress
        if iteration % print_every == 0:
            print_loss_avg = print_loss / print_every
            print("Iteration: {}; Percent complete: {:.1f}%; Average loss: {:.4f}".format(iteration, iteration / n_iteration * 100, print_loss_avg))
            print_loss = 0

        # Save checkpoint
        if (iteration % save_every == 0):
            dev_batch = dev_batches[iteration-1]
            input_variable2, lengths2, target_variable2, mask2, max_target_len2,emo_in2,emo_out2 = dev_batch
            
            dev_loss = train(input_variable2,emo_out2, lengths2, target_variable2, mask2, max_target_len2, encoder,
                     decoder,encoder_optimizer, decoder_optimizer, batch_size, clip)
            print("Iteration: {}; Dev loss: {:.4f}".format(iteration, dev_loss))
            directory = os.path.join(model_name, _corpus_name, '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size))
            if not os.path.exists(directory):
                os.makedirs(directory)
            torch.save({
                'iteration': iteration,
                'en': encoder.state_dict(),
                'de': decoder.state_dict(),
                'en_opt': encoder_optimizer.state_dict(),
                'de_opt': decoder_optimizer.state_dict(),
                'loss': loss,
                'voc_dict': voc.__dict__,
                'embedding': embedding.state_dict(),
                'emo_embedding':emo_embedding.state_dict()
            }, os.path.join(directory, '{}_{}.tar'.format(iteration, 'checkpoint')))

## Searcher

In [14]:
class MyTopKDecoder(nn.Module):
    def __init__(self, encoder, decoder,k):
        super(MyTopKDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.k = k
    def forward(self, input_seq,target_emotion,input_length,num_output, max_length):
        # Forward input through encoder model
        encoder_output, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
        # Initialize decoder input with SOS_token
        
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        emotion = torch.LongTensor([target_emotion]).to(device)
        return beam_decode(decoder, emotion, decoder_hidden, encoder_output=encoder_output,topk=self.k,num_output = num_output,debug=True)     
      
class GreedySearchDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(GreedySearchDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq,target_emotion, input_length, max_length):
        # Forward input through encoder model
        encoder_outputs, encoder_hidden = self.encoder(input_seq, input_length)
        # Prepare encoder's final hidden layer to be first hidden input to the decoder
        decoder_hidden = encoder_hidden[:decoder.n_layers]
#         # Initialize decoder input with SOS_token
#         decoder_input = torch.ones(1, 1, dtype=torch.long) * SOS_token
#         # Initialize tensors to append decoded words to
#         all_tokens = torch.zeros([0], dtype=torch.long)
#         all_scores = torch.zeros([0])
        # Initialize decoder input with SOS_token
        decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
        # Initialize tensors to append decoded words to
        all_tokens = torch.zeros([0], device=device, dtype=torch.long)
        all_scores = torch.zeros([0], device=device)
        # Iteratively decode one word token at a time
        decoder_emotion = torch.LongTensor([target_emotion]).to(device)
        for _ in range(max_length):
            # Forward pass through decoder
            decoder_output, decoder_hidden, decoder_emotion = self.decoder(decoder_input,decoder_emotion, decoder_hidden, encoder_outputs)
            # Obtain most likely word token and its softmax score
            decoder_scores, decoder_input = torch.max(decoder_output, dim=1)
            # Record token and score
            all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
            all_scores = torch.cat((all_scores, decoder_scores), dim=0)
            # Prepare current token to be next decoder input (add a dimension)
            decoder_input = torch.unsqueeze(decoder_input, 0)
        # Return collections of word tokens and scores
        return all_tokens, all_scores
      

In [15]:
beam_width = 15
max_qsize = 1000
max_past_word = 20
similar_word_len = 2
class BeamSearchNode(object):
    def __init__(self, hiddenstate,hiddenemotion,previousNode, wordId, logProb, length, past=[]):
        '''
        :param hiddenstate:
        :param previousNode:
        :param wordId:
        :param logProb:
        :param length:
        '''
        self.h = hiddenstate
        self.e = hiddenemotion
        self.prevNode = previousNode
        self.wordid = wordId
        self.logp = logProb
        self.leng = length
        self.past = past
        
    def __lt__(self, other):      
#         return self.logp < other.logp
        return self.eval() < other.eval()
    def eval(self, alpha=1.0):
        reward = 0
        # Add here a function for shaping a reward
        return self.logp / float(self.leng - 1 + 1e-6) + alpha * reward
def beam_decode(decoder,decoder_emotion,decoder_hidden,encoder_output,bram_width, num_output,debug=False):
    sent_breaker = ['.','.','!','?',':']
    # Start with the start of the sentence token
    decoder_input = torch.ones(1, 1, device=device, dtype=torch.long) * SOS_token
    endnodes = []

    # starting node -  hidden vector, previous node, word id, logp, length
    node = BeamSearchNode(decoder_hidden, decoder_emotion, None, decoder_input, 0, 1)
    nodes = PriorityQueue()
    # start the queue
    nodes.put((-node.eval(), node))
    qsize = 1
    # start beam search
    while True:
        # give up when decoding takes too long
        if nodes.qsize() > max_qsize or nodes.empty(): break
        # fetch the best node
        score, n = nodes.get()
        decoder_input = n.wordid
        decoder_hidden = n.h
        if n.wordid.item() == EOS_token and n.prevNode != None:
            endnodes.append((score, n))
            # if we reached maximum # of sentences required
            if len(endnodes) >= num_output:break
            else:
                continue
        # decode for one step using decoder
        decoder_output,decoder_hidden,decoder_emotion = decoder(decoder_input,decoder_emotion,decoder_hidden,encoder_output)
        # PUT HERE REAL BEAM SEARCH OF TOP
        log_prob, indexes = torch.topk(decoder_output, beam_width)
        
        nextnodes = []
        word_past = n.past.copy()
        for new_k in range(beam_width):
            decoded_t = indexes[0][new_k].view(1, -1)
            eval_1 = voc.bpemb.decode_ids([decoded_t.item()]) in sent_breaker
            eval_2 = decoded_t.item() in word_past
            eval_3 = len(voc.bpemb.decode_ids([decoded_t.item()]))>similar_word_len
            if (eval_1 and n.leng < max_past_word ) or (eval_2 and eval_3):
                continue
            else:   
#                 print("current word is {}".format(voc.bpemb.decode_ids([decoded_t.item()])))
                log_p = log_prob[0][new_k].item()
                word_past.append(decoded_t.item())
                if len(word_past)>max_past_word:
                    word_past = word_past[-max_past_word:]
                node = BeamSearchNode(decoder_hidden,decoder_emotion, n,decoded_t,n.logp + log_p, n.leng + 1,word_past)
                score = -node.eval()
                nextnodes.append((score, node))
        # put them into queue
        for i in range(len(nextnodes)):
            score, nn = nextnodes[i]
            nodes.put((score, nn))

    
    # choose nbest paths, back trace them
    if len(endnodes) == 0:
        endnodes = [nodes.get() for _ in range(min(num_output,nodes.qsize()))]
    utterances = []
#     print(len(endnodes))
    for score, n in sorted(endnodes, key=operator.itemgetter(0)):

        utterance = []
        utterance.append(n.wordid)
        # back trace
        while n.prevNode != None:
            n = n.prevNode
            utterance.append(n.wordid)
        
        utterance = utterance[::-1]
        utterances.append(utterance)


    return utterances

## Evaluation

In [16]:
def evaluateInput(encoder, decoder, searcher, voc, num_output=5, max_length=10):
    input_sentence = ''
    while(1):
        try:
            # Get input sentence
            input_sentence = input('> ')
            # Check if it is quit case
            if input_sentence == 'q' or input_sentence == 'quit': break
            # Normalize sentence
#             input_sentence = normalizeString(input_sentence)
            # words -> indexes
            indexes_batch = [indexesFromSentence(voc, input_sentence)]
            # Create lengths tensor
            lengths = torch.tensor([len(indexes) for indexes in indexes_batch])
            # Transpose dimensions of batch to match models' expectations
            input_batch = torch.LongTensor(indexes_batch).transpose(0, 1)            
            # Use appropriate device
            input_batch = input_batch.to(device)
            lengths = lengths.to(device)
            # Decode sentence with searcher
            output = []
            for e in range(num_emotion):
                if type(searcher) == MyTopKDecoder:  
                    tmp = searcher(input_batch,e,lengths, num_output, max_length)
#                     res = tmp[e][:num_output]
#                     print(tmp)
                    for i in range(min(len(tmp),num_output)):
                        if voc.version == "bpemb":
                            padding = [0,1,2]
                            words = [token.item() for token in tmp[i]]
                            filtered = filter(lambda x: True if x not in padding else False, words)
                            print('{}: '.format(int2emotion[e]), voc.bpemb.decode_ids(list(filtered)))
                        else:
                            decoded_words = [voc.index2word[token.item()] for token in tmp[i]]
                            decoded_words[:] = [x for x in decoded_words if not (x == 'EOS' or x == 'PAD' or x == 'SOS')]
                            print('{}: '.format(int2emotion[e]), ' '.join(decoded_words))
                        
                elif type(searcher) == GreedySearchDecoder:
                    tokens, scores = searcher(input_batch,e,lengths, max_length)
                    if voc.version == "bpemb":
                        padding = [0,1,2]
                        
                        words = [token.item() for token in tokens]
                        filtered = filter(lambda x: True if x not in padding else False, words)
                        print('{}: '.format(int2emotion[e]), voc.bpemb.decode_ids(list(filtered)))
                    else:
                        decoded_words = [voc.index2word[token.item()] for token in tokens]
                        decoded_words[:] = [x for x in decoded_words if not (x == 'EOS' or x == 'PAD' or x == 'SOS')]
                        print('{}: '.format(int2emotion[e]), ' '.join(decoded_words))

        except KeyError:
            print("Error: Encountered unknown word.")

# Run Model

## Load Data

In [23]:
_corpus_name = "DailyDialogue"
_voc_name = "bpemb"

train_path = os.path.join(_corpus_name,'train')
dev_path = os.path.join(_corpus_name,'train')

small_batch_size = 5
MAX_LENGTH = 30  # Maximum sentence length to consider
int2emotion = ['Anger','Happiness','Sadness','Surpise','Other']
num_emotion = 5


voc = Voc(_corpus_name,_voc_name) # Need to run the cell down below first
train_set = read_data(train_path,voc)
dev_set = read_data(dev_path,voc)

downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.model


100%|██████████| 400869/400869 [00:02<00:00, 162051.54B/s]


downloading https://nlp.h-its.org/bpemb/en/en.wiki.bpe.vs10000.d100.w2v.bin.tar.gz


100%|██████████| 3784656/3784656 [00:07<00:00, 531507.27B/s] 
paramiko missing, opening SSH/SCP/SFTP paths will be disabled.  `pip install paramiko` to suppress


    reading data pair 10000
['▁y', 'es', '▁i', '▁d', '▁like', '▁another', '▁cup', '▁of', '▁coffee', '▁afterwards', '▁.', '▁make', '▁it', '▁hot', '▁ple', 'ase', '▁.']
['▁you', '▁ve', '▁got', '▁it', '▁sir', '▁.']
    reading data pair 20000
['▁there', '▁are', '▁hundreds', '▁and', '▁hundreds', '▁.', '▁english', '▁is', '▁particularly', '▁rich', '▁in', '▁id', 'i', 'om', 'atic', '▁express', 'ions', '▁.']
['▁can', '▁you', '▁give', '▁us', '▁an', '▁example', '▁', '?']
    reading data pair 30000
['▁why', '▁did', '▁you', '▁lie', '▁to', '▁me', '▁in', '▁the', '▁em', 'ail', '▁', '?']
['▁i', '▁didn', '▁t', '▁lie', '▁.', '▁you', '▁just', '▁didn', '▁t', '▁ask', '▁me', '▁my', '▁real', '▁name', '▁.']
    reading data pair 10000
['▁y', 'es', '▁i', '▁d', '▁like', '▁another', '▁cup', '▁of', '▁coffee', '▁afterwards', '▁.', '▁make', '▁it', '▁hot', '▁ple', 'ase', '▁.']
['▁you', '▁ve', '▁got', '▁it', '▁sir', '▁.']
    reading data pair 20000
['▁there', '▁are', '▁hundreds', '▁and', '▁hundreds', '▁.', '▁english'

## Config Model

In [41]:
model_name = 'ecm_model_imemory_bpemb_notfix'

use_embedding =  True
use_imemory = True
use_ememory = False
emb_learnable = True

attn_model = 'dot'
#attn_model = 'general'
#attn_model = 'concat'

hidden_size = 100 # must match pretrained word2vec embedding size!!!!
encoder_n_layers = 2
decoder_n_layers = 2
dropout = 0.1
batch_size = 64

# Set checkpoint to load from; set to None if starting from scratch
loadFilename = None
checkpoint_iter = 2000
# filepath = "content/ecm_model_withimemory_bpemb/DailyDialogue/2-2_100/2000_checkpoint.tar"
# loadFilename = os.path.join('ecm_model_imemory_bpemb_notfix/DailyDialogue/2-2_100/2000_checkpoint.tar')

# Load model if a loadFilename is provided
if loadFilename:
    # If loading on same machine the model was trained on
    checkpoint = torch.load(loadFilename)    
    encoder_sd = checkpoint['en']
    decoder_sd = checkpoint['de']
    encoder_optimizer_sd = checkpoint['en_opt']
    decoder_optimizer_sd = checkpoint['de_opt']
    embedding_sd = checkpoint['embedding']
    voc.__dict__ = checkpoint['voc_dict']    
print('Building encoder and decoder ...')

# Initialize word embeddings
wm = voc.weights_matrix if voc.version == "bpemb" else voc.weights_matrix[:voc.num_words]
num_embeddings, embedding_dim = wm.shape
embedding = nn.Embedding(num_embeddings, embedding_dim)
embedding.load_state_dict({'weight': torch.Tensor(wm)})
emo_embedding = nn.Embedding(num_emotion,embedding_dim)


if use_embedding and not emb_learnable:
        embedding.weight.requires_grad = False

if loadFilename:
    embedding.load_state_dict(embedding_sd)
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, embedding, encoder_n_layers, dropout)
decoder = AttnDecoderRNN(attn_model, embedding, emo_embedding, hidden_size, voc.weights_matrix.shape[0], decoder_n_layers, 
                         dropout,use_emb=use_embedding, use_imemory=use_imemory, use_ememory=use_ememory)
if loadFilename:
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [40]:
voc.weights_matrix.shape[0]

10000

## Run training

In [26]:
# Configure training/optimization
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0
n_iteration = 10000
print_every = 5
save_every = 100

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)
if loadFilename:
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Run training iterations
print("Starting Training!")
trainIters(model_name, voc, train_set, dev_set, encoder, decoder, encoder_optimizer, decoder_optimizer,
           embedding, emo_embedding, encoder_n_layers, decoder_n_layers, n_iteration, batch_size,
           print_every, save_every, clip, _corpus_name, loadFilename)

Building optimizers ...
Starting Training!


KeyboardInterrupt: 

# Debug Training and Train Iter

In [78]:
## TrainIter
pairs = train_set
dev_pairs = dev_set
corpus_name = _corpus_name

pair_batch = batch2TrainData(voc,[random.choice(dev_set) for _ in range(small_batch_size)])
input_variable, lengths, target_variable, mask, max_target_len,emo_in,emo_out = pair_batch
loss = train(input_variable,emo_out,lengths, target_variable, mask, max_target_len, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, batch_size, clip)

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 2. Got 64 and 5 in dimension 1 at /Users/administrator/nightlies/pytorch-1.0.0/wheel_build_dirs/conda_3.6/conda/conda-bld/pytorch_1544137972173/work/aten/src/TH/generic/THTensorMoreMath.cpp:1333

In [69]:
## Train
input_emotion = emo_out
encoder_optimizer.zero_grad()
decoder_optimizer.zero_grad()
input_variable = input_variable.to(device)
lengths = lengths.to(device)
target_variable = target_variable.to(device)
mask = mask.to(device)
encoder_outputs, encoder_hidden = encoder(input_variable, lengths)

In [75]:
## Encoder
embedded = embedding(input_variable)
# Pack padded batch of sequences for RNN module
packed = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths)
# Forward pass through GRU
outputs, hidden = gru(packed,hidden)
# Unpack padding
outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
# Sum bidirectional GRU outputs
# outputs = outputs[:, :, :hidden_size] + outputs[:, : ,hidden_size:]

# decoder_input = torch.LongTensor([[SOS_token for _ in range(batch_size)]])
# decoder_input = decoder_input.to(device)
# input_emotion = input_emotion.to(device)
# decoder_hidden = encoder_hidden[:decoder.n_layers]


# decoder_output, decoder_hidden,input_emotion = decoder(
#                 decoder_input,input_emotion,decoder_hidden, encoder_outputs
#             )

In [76]:
outputs.size()

torch.Size([18, 5, 100])

In [68]:
print(outputs[:, :, :hidden_size].size(),outputs[:, : ,hidden_size:].size())

torch.Size([18, 5, 100]) torch.Size([18, 5, 0])


In [53]:
n_layers = 2 
output_size = num_embeddings
gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
concat_1 = nn.Linear(hidden_size * 2, hidden_size)
concat_3 = nn.Linear(hidden_size * 2, hidden_size)
out = nn.Linear(hidden_size, output_size)
read_linear = nn.Linear(hidden_size*(n_layers+1),hidden_size)
write_linear = nn.Linear(hidden_size,hidden_size)
gru_concat = nn.Linear(hidden_size*2,hidden_size)

In [56]:
## Decoder
# decoder = AttnDecoderRNN(attn_model,embedding,emo_embedding,hidden_size,num_embeddings,decoder_n_layers, 
#                          dropout,use_emb=use_embedding, use_imemory=use_imemory, use_ememory=use_ememory)

embedded = embedding(decoder_input) # 1,64,300
# emo_embedding = nn.Embedding(num_emotion,embedding_dim)
emotion = emo_embedding(input_emotion).unsqueeze(dim=0) 
_,tmp_size,_ = decoder_hidden.size()
print(embedded.size(),emotion.size(),tmp_size)

read_gate = torch.sigmoid(read_linear(torch.cat([embedded,torch.reshape(decoder_hidden,(1,tmp_size,-1))],dim=2)))

# mem_read = torch.mul(emotion,read_gate)
# gru_input = gru_concat(torch.cat([embedded,mem_read],dim=2))
# rnn_output, hidden = gru(gru_input, decoder_hidden)
# write_gate = torch.sigmoid(write_linear(rnn_output))
# emotion = torch.mul(write_gate,mem_read)

torch.Size([1, 64, 100]) torch.Size([1, 5, 100]) 5


RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 2. Got 64 and 5 in dimension 1 at /Users/administrator/nightlies/pytorch-1.0.0/wheel_build_dirs/conda_3.6/conda/conda-bld/pytorch_1544137972173/work/aten/src/TH/generic/THTensorMoreMath.cpp:1333

In [62]:
embedded.size()

torch.Size([1, 64, 100])

In [None]:
class AttnDecoderRNN(nn.Module):
    def __init__(self,attn_model,embedding,emotion_embedding,hidden_size,output_size,n_layers=1,dropout=0.1,use_emb=False,use_imemory=False,use_ememory=False):
        super(AttnDecoderRNN, self).__init__()

        # Keep for reference
        self.attn_model = attn_model
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout = dropout
        self.use_emb = use_emb
        self.use_imemory = use_imemory
        self.use_ememory = use_ememory

        # Define layers
        self.embedding = embedding
        self.emotion_embedding = emotion_embedding
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.concat_1 = nn.Linear(hidden_size * 2, hidden_size)
        self.concat_3 = nn.Linear(hidden_size * 2, hidden_size)

        self.out = nn.Linear(hidden_size, output_size)
        
        
        # DIY layers
        self.read_linear = nn.Linear(hidden_size*(self.n_layers+1),hidden_size)
        self.write_linear = nn.Linear(hidden_size,hidden_size)
        self.gru_concat = nn.Linear(hidden_size*2,hidden_size)
        

        self.attn = Attn(attn_model, hidden_size)
    def forward(self, input_step, emotion, last_hidden, encoder_outputs):
        # Note: we run this one step (word) at a time
        # Get embedding of current input word
        embedded = self.embedding(input_step) # 1,64,300
        embedded = self.embedding_dropout(embedded)
        
        if self.use_emb and self.use_imemory:
          
            if emotion.size()!=embedded.size():
                emotion = self.emotion_embedding(emotion).unsqueeze(dim=0) # mem_write
            
            _,tmp_size,_ = last_hidden.size()

            read_gate = torch.sigmoid(self.read_linear(torch.cat([embedded,torch.reshape(last_hidden,(1,tmp_size,-1))],dim=2)))
            mem_read = torch.mul(emotion,read_gate)
            
            gru_input = self.gru_concat(torch.cat([embedded,mem_read],dim=2))
            rnn_output, hidden = self.gru(gru_input, last_hidden)
            
            write_gate = torch.sigmoid(self.write_linear(rnn_output))
            emotion = torch.mul(write_gate,mem_read)
          
          
          

            attn_weights = self.attn(rnn_output, encoder_outputs)
            # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
            context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
            # Concatenate weighted context vector and GRU output using Luong eq. 5
            rnn_output = rnn_output.squeeze(0)
            context = context.squeeze(1)
            concat_input = torch.cat((rnn_output, context), 1)
            concat_output = torch.tanh(self.concat_3(concat_input))
            # Predict next word using Luong eq. 6
            output = self.out(concat_output)
            output = F.softmax(output, dim=1)
            return output,hidden,emotion
        elif self.use_emb:
            emotion_embedded = self.emotion_embedding(emotion).unsqueeze(dim=0)
            embedded = self.concat_1(torch.cat([embedded,emotion_embedded],dim = 2))

        # Forward through unidirectional GRU
        rnn_output, hidden = self.gru(embedded, last_hidden)
        # Calculate attention weights from the current GRU output
        attn_weights = self.attn(rnn_output, encoder_outputs)
        # Multiply attention weights to encoder outputs to get new "weighted sum" context vector
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))
        # Concatenate weighted context vector and GRU output using Luong eq. 5
        rnn_output = rnn_output.squeeze(0)
        context = context.squeeze(1)
        concat_input = torch.cat((rnn_output, context), 1)
        concat_output = torch.tanh(self.concat_3(concat_input))
        # Predict next word using Luong eq. 6
        output = self.out(concat_output)
        output = F.softmax(output, dim=1)
        # Return output and final hidden state
        return output,hidden,input_emotion      