In [1]:
from utils import *
from datetime import datetime
import time
import math

In [2]:
# Fixed length that every word/slot sequence is padded or truncated to.
MAX_LENGTH = 50
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `torch` is not imported explicitly in this notebook; presumably it is
# re-exported by `from utils import *` - confirm.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Reading the Data In

In [3]:
# Load the training fold. Later cells read atis_json['data'] (samples with 'words',
# 'slots' and 'intent') and atis_json['meta'] ('slot_types', 'intent_types').
# NOTE(review): `json` is presumably provided by `from utils import *` - confirm.
with open('data/preprocessed/fold_train.json', 'r') as f:
    atis_json = json.load(f)

# Data Preprocessing

In [4]:
def flatten(list_of_lists):
    """Concatenate the inner lists of a two-level list into a single flat list."""
    flat = []
    for inner in list_of_lists:
        flat.extend(inner)
    return flat

In [5]:
def _fit_sequence(tokens, length):
    """Return a new list built from `tokens` that is terminated with '<EOS>' and `length` items long."""
    if len(tokens) < length:
        # shorter than the target: terminate with <EOS>, then fill up with <PAD>
        fitted = list(tokens) + ['<EOS>']
        fitted.extend(['<PAD>'] * (length - len(fitted)))
        return fitted
    # otherwise truncate and overwrite the last kept position with <EOS>
    fitted = list(tokens[:length])
    fitted[-1] = '<EOS>'
    return fitted


def adjust_sequences(data, length=MAX_LENGTH):
    """
    Fixes the input and output sequences length, adding padding or truncating if necessary.

    The 'words' and 'slots' lists of every sample in data['data'] are replaced by
    lists of exactly `length` items: shorter sequences get '<EOS>' followed by
    '<PAD>' fillers, sequences of `length` items or more are cut to `length` and
    their last item becomes '<EOS>'.

    The samples are modified in place and the same `data` object is returned.
    Note that calling this again on already adjusted data is not a no-op: a padded
    sequence has exactly `length` items, so its last '<PAD>' is overwritten with '<EOS>'.

    :param data json file containing entries from atis dataset.
    :param length the fixed length of the sentence.
    :return the same `data` object with adjusted sequences.
    """
    for sample in data['data']:
        # words first, then slots, both handled by the same rule
        for key in ('words', 'slots'):
            sample[key] = _fit_sequence(sample[key], length)

    return data

In [None]:
def _unique_in_order(items):
    """Return the distinct elements of `items` in order of first occurrence."""
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


def get_vocabularies(train_data):
    """
    Collect the input vocabulary, the slot vocabulary and the intent vocabulary.

    Duplicates are removed while keeping first-occurrence order. The word
    vocabulary always starts with '<PAD>', '<SOS>', '<EOS>' and the slot
    vocabulary with '<PAD>', '<EOS>'.

    :param train_data the training data containing words,slots and intent:
           train_data['data'] is a list of samples with a 'words' list,
           train_data['meta'] holds the 'slot_types' and 'intent_types' lists.
    :return (vocab, slot_tag, intent_tag), three lists of unique entries.
    """
    samples = train_data['data']
    # all words of all samples, in corpus order
    words = [word for sample in samples for word in sample['words']]

    # single linear pass instead of sorting by list.index (which is quadratic)
    vocab = _unique_in_order(['<PAD>', '<SOS>', '<EOS>'] + words)
    slot_tag = _unique_in_order(['<PAD>', '<EOS>'] + train_data['meta']['slot_types'])
    intent_tag = _unique_in_order(train_data['meta']['intent_types'])

    return vocab, slot_tag, intent_tag

In [None]:
# Pad/truncate every sample to MAX_LENGTH. adjust_sequences returns the object it was
# given, so adjusted_atis and atis_json refer to the same (now adjusted) data.
adjusted_atis = adjust_sequences(atis_json)#padded

In [None]:
# Unique words, slot tags and intent labels of the adjusted training data.
atis_vocab,atis_slots,atis_intents = get_vocabularies(adjusted_atis)

In [None]:
# Sizes of the three vocabularies (words, slots, intents).
len(atis_vocab),len(atis_slots),len(atis_intents)

# Next we map the data and set it up for PyTorch
Remember that the words, slots and intents each have their own vocabulary and therefore their own embeddings, which are vectors of different sizes. The network will try to learn a mapping between these different vector spaces.

In [None]:
def create_mappings(vocab,forward_map,intent=False):
    """
    This function takes the words in the vocabulary and creates a unique mapping to a number.

    Every entry of `vocab` that is not yet a key of `forward_map` is assigned the
    next free index (the current size of the map). Entries that are already
    mapped keep their index, so indices stay unique and contiguous even when
    `vocab` contains duplicates.

    :param vocab contains all the words in the corpus.
    :param forward_map a dictionary that will be populated with mappings (modified in place).
    :param intent kept for backward compatibility; intents are now mapped with the
           same rule as words and slots (previously a repeated intent was re-assigned
           a new index, which could collide with the index of a later entry).
    returns populated forward_map
    """
    for sample in vocab:
        if sample not in forward_map:
            forward_map[sample] = len(forward_map)

    return forward_map

In [None]:
# Word -> index table, seeded with the special tokens. create_mappings adds every
# vocabulary word that is not already a key, so the special tokens keep the ids below.
word2index = {'<PAD>': 0, '<UNK>':1,'<SOS>':2,'<EOS>':3}
create_mappings(atis_vocab,word2index)
# Inverse table: index -> word.
index2word = {v:k for k,v in word2index.items()}

In [None]:
# Slot tag -> index table, seeded with the special tags; remaining tags are appended
# by create_mappings in the order they appear in atis_slots.
tag2index = {'<PAD>' : 0,'<UNK>':1,'<EOS>':2}
create_mappings(atis_slots,tag2index)
# Inverse table: index -> slot tag.
index2tag = {v:k for k,v in tag2index.items()}

In [None]:
# Intent -> index table. It starts empty, so the first intent gets index 0 and there
# is no '<UNK>' entry for intents.
intent2index={}
create_mappings(atis_intents,intent2index,intent=True)
# Inverse table: index -> intent.
index2intent = {v:k for k,v in intent2index.items()}

In [None]:
# Sizes of the word and intent tables. Written as one tuple because a cell only
# displays its last expression, so a separate first line would never be shown.
len(word2index), len(intent2index)
#word2index

### Next we create a Tensor where each row is a mapped/embedded sequence

In [None]:
def prepare_sequence(seq_data, mapping,map_type):
    """
    Convert one field of a sample into a tensor of indices.

    :param seq_data a sample; seq_data[map_type] is the field that gets converted.
    :param mapping, a dictionary which contains how each element of the field is mapped to a number.
    :param map_type 'words','slots' or 'intent'
    returns a Pytorch Tensor: for 'intent' a scalar tensor holding the intent index
    (-1 when the intent is not in `mapping`), otherwise a LongTensor with one index
    per element, using mapping['<UNK>'] for elements missing from `mapping`.
    """
    field = seq_data[map_type]

    if map_type != 'intent':
        indices = []
        for token in field:
            # '<UNK>' is only looked up when a token is actually unknown
            indices.append(mapping[token] if token in mapping.keys() else mapping['<UNK>'])
        return torch.LongTensor(indices)

    # a single intent label: unknown labels are encoded as -1
    if field in mapping.keys():
        label_index = mapping[field]
    else:
        label_index = -1
    return torch.tensor(label_index)

In [None]:
def create_training_set(padded_atis):
    """
    Convert every padded sample into a triple of tensors.

    :param padded_atis, this is padded sequence data whose 'data' entry is a list of
           samples with 'words', 'slots' and 'intent'. Each field is converted with
           prepare_sequence using word2index, tag2index and intent2index respectively.
    return train_data; [(seq_tensor,slot_tensor,intent_tensor)]
    """
    return [
        (
            prepare_sequence(sample, word2index, 'words'),
            prepare_sequence(sample, tag2index, 'slots'),
            prepare_sequence(sample, intent2index, 'intent'),
        )
        for sample in padded_atis['data']
    ]

In [None]:
# List of (seq_tensor, slot_tensor, intent_tensor) triples, one per training sample.
train_data = create_training_set(adjusted_atis)

## Batching the data

In [None]:
def concatenate_batch(batch):
    """
    Stack a list of (seq, slots, intent) tensor triples into three batch tensors.

    :param batch list of triples; element 0, 1 and 2 of each triple are stacked
           separately along a new first dimension.
    returns (seqs, slots, intents)
    """
    seqs, slots, intents = (
        torch.stack([example[column] for example in batch])
        for column in range(3)
    )
    return seqs, slots, intents

In [None]:
def get_batches(batch_size, train_data):
    """
    Generator over consecutive batches of `train_data`.

    `train_data` is shuffled in place with random.shuffle when iteration starts.
    Each batch is a slice of at most `batch_size` samples passed through
    concatenate_batch; the last batch can be smaller if the total size is not a
    multiple of the batch size.
    """
    random.shuffle(train_data)
    start = 0
    while start < len(train_data):
        stop = start + batch_size
        # slice of up to batch_size (seq, slots, intent) tuples
        yield concatenate_batch(train_data[start:stop])
        start = stop

# Building an RNN to reconstruct a query

In [None]:
class Encoder(nn.Module):
    """
    Embeds a batch of index sequences, runs the embeddings through a single-layer
    RNN and maps the final hidden state of every sequence through a fully
    connected layer, giving one score vector per sequence.

    :param input_dim number of rows of the embedding table (vocabulary size).
    :param hidden_dim size of the RNN hidden state.
    :param emb_dim size of each embedding vector.
    :param slot_dim stored on the module; not used by the layers.
    :param intent_dim accepted for interface compatibility; not used.
    :param batch_size optional, stored on the module only. It used to be read from
           a global variable of the same name, which failed when that global was
           not defined.
    :param output_dim width of the output of the fully connected layer; defaults
           to MAX_LENGTH (the reconstruction target is the input sequence itself).
    """
    def __init__(self,input_dim,hidden_dim,emb_dim,slot_dim,intent_dim,batch_size=None,output_dim=None):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim

        self.slot_dim = slot_dim
        self.batch_size = batch_size

        if output_dim is None:
            # resolved at construction time so the class itself does not need the constant
            output_dim = MAX_LENGTH

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim,batch_first=True)

        self.slot_fc = nn.Linear(hidden_dim,output_dim)# if reconstruction output should be seq len

    def forward(self,src):
        """
        :param src LongTensor of token indices, [batch size, seq len] (the RNN is batch_first).
        returns tensor of scores, [batch size, output_dim].
        """
        # [batch size, seq len] -> [batch size, seq len, emb_dim]
        embedded_seq = self.embedding(src)
        # hidden is [num_layers, batch size, hidden_dim] regardless of batch_first
        _, hidden = self.rnn(embedded_seq)
        # final hidden state of the (only) layer: [batch size, hidden_dim]
        last_hidden = hidden[-1]
        slot_scores = self.slot_fc(last_hidden)

        return slot_scores

In [None]:
print("Testing models...")
n_layers = 1
input_size = len(word2index)
slot_size = len(tag2index)
intent_size = len(intent2index)
embed_size = 5
hidden_size = 5
output_size = MAX_LENGTH
batch_size = 1
# Dimensions are passed by keyword so the hidden and embedding sizes cannot be
# swapped by position, and no extra positional argument is handed to the constructor.
encoder = Encoder(input_size, hidden_dim=hidden_size, emb_dim=embed_size,
                  slot_dim=slot_size, intent_dim=intent_size).to(device)

In [None]:
# Smoke test: push three single-sample batches through the untrained encoder.
for batch in get_batches(1,train_data[:3]):
    inputs = batch[0]
    inputs = inputs.to(device)
    # note: `slots` holds the encoder output here, not the gold slot tensor (batch[1])
    slots = encoder(inputs)
# Number of non-zero entries in the intent tensor of the last batch.
len(batch[2].nonzero())
#batch[2]

# Training the model    

In [None]:
input_size = len(word2index)
slot_size = len(tag2index)
intent_size = len(intent2index)
embed_size = 128
hidden_size = 128
output_size = MAX_LENGTH
batch_size = 256
# Keyword arguments: the constructor expects hidden_dim before emb_dim, so the
# positional call silently swapped the two (harmless only while both are equal).
encoder = Encoder(input_size, hidden_dim=hidden_size, emb_dim=embed_size,
                  slot_dim=slot_size, intent_dim=intent_size).to(device)
#-----------------------
# Mean squared error between the encoder output and the input index sequence.
criterion = nn.MSELoss()#reduce='sum'
#nn.CrossEntropyLoss(ignore_index=0,reduction='elementwise_mean')#ignore <pad> remember this ignoring 0 in intent as well, need 2 loss
optimizer = optim.Adam(encoder.parameters(), lr=0.01)


In [None]:
num_epochs = 50
start = time.time()
for epoch in range(num_epochs):
    # a fresh generator per epoch; get_batches reshuffles train_data each time
    inp = get_batches(batch_size,train_data)
    encoder.train()
    for data in inp:
        # get the inputs
        inputs = data[0].to(device)
        # zero the parameter gradients
        optimizer.zero_grad()
        # forward + backward + optimize
        out_slots = encoder(inputs)
        # The reconstruction target is the input index sequence itself. .float() keeps
        # the tensor on its current device, whereas .type(torch.FloatTensor) would
        # produce a CPU tensor and fail against a CUDA output.
        slot_loss = criterion(out_slots, inputs.float())

        slot_loss.backward()
        optimizer.step()

    if epoch%10==0:
        # loss of the last batch of this epoch
        print('Reconstruction Loss after epoch '+str(epoch)+':',slot_loss.item())
        print('-----------------------------------------------')
end = time.time()
elapsed = (end-start)/60.
print('Time elapsed: %.4f mins ' % (elapsed))

# What is an RNN actually doing?
TODO: write this explanation here before moving on; it is discussed in my draft.
Next, attention will be added to this simple architecture.

In [134]:
class SimpleEncoderRNN(nn.Module):
    """Single-layer, batch-first RNN that returns its output at every time step."""

    def __init__(self,input_size, hidden_size):
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size
        self.rnn = nn.RNN(
            input_size=self.input_size,
            hidden_size=self.hidden_size,
            batch_first=True,
        )

    def forward(self,src):
        """Run `src` through the RNN and return the per-step outputs; the final hidden state is dropped."""
        per_step_outputs, _ = self.rnn(src)
        return per_step_outputs

class SimpleDecoderRNN(nn.Module):
    """
    Decoder with dot-product attention over the encoder outputs.

    Input size is same as hidden size: the encoder states are scored against the
    decoder hidden state and the resulting context vector is fed back through the
    same RNN, so both must have `hidden_size` features.

    :param input_size feature size of the RNN input.
    :param hidden_size size of the RNN hidden state.
    :param output_size width of the reconstructed output; defaults to 8, the
           sequence length used in this notebook.
    """
    def __init__(self,input_size, hidden_size, output_size=8):
        super(SimpleDecoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.rnn = nn.RNN(input_size=self.input_size,
                          hidden_size=self.hidden_size,
                          batch_first=True)
        # The fully connected is needed because output from RNN is in range [-1,1] because of tanh activation.
        # Hence the fc allows for linear combinations to get the reconstruction values we need.
        self.fc_out = nn.Linear(self.hidden_size, output_size)

    def forward(self,src):
        """
        :param src encoder outputs, [1, seq_len, hidden_size].
        returns the reconstruction, [1, 1, output_size].
        raises ValueError if the batch dimension is not 1; the reshapes below are
        only a valid attention computation for a single sequence.
        """
        if src.size(0) != 1:
            raise ValueError(
                'SimpleDecoderRNN processes one sequence at a time, got batch size %d' % src.size(0))

        # Only the final hidden state of this first pass is needed for scoring.
        _, hidden = self.rnn(src)

        # Attn 1: score every encoder state against the decoder hidden state (dot product).
        # src [1, seq_len, hidden] -> [seq_len, hidden]; hidden [1, 1, hidden] -> [hidden, 1]
        enc_states = src.contiguous().view(-1,self.hidden_size)
        dec_hidden = hidden.contiguous().view(self.hidden_size, 1)
        enc_hs_score = torch.matmul(enc_states, dec_hidden)  # [seq_len, 1]

        # Attn 2: normalise the scores over the sequence positions.
        normalized_scores = F.softmax(enc_hs_score,dim=0)

        # Attn 3: weight each encoder state by its score (broadcast over the features).
        weighted_enc_hs = torch.mul(normalized_scores, enc_states)

        # Attn 4: sum the weighted states into one context vector and feed it
        # back to the decoder RNN as a sequence of length one.
        context_vector = torch.sum(weighted_enc_hs, dim=0)
        reshaped_cv = context_vector.view(1,1,self.hidden_size)
        decoded_input_w_attn,_ = self.rnn(reshaped_cv)

        # [1, 1, hidden] -> [1, 1, output_size]
        return self.fc_out(decoded_input_w_attn)

class SimpleEncDecRNN(nn.Module):
    """Autoencoder that chains SimpleEncoderRNN and SimpleDecoderRNN."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        # the decoder consumes the encoder outputs, so its input size is hidden_size
        self.encoder = SimpleEncoderRNN(input_size, hidden_size)
        self.decoder = SimpleDecoderRNN(hidden_size, hidden_size)

    def forward(self, src):
        """Encode `src`, then decode the encoder outputs and return the result."""
        return self.decoder(self.encoder(src))

In [135]:
torch.manual_seed(423)
inp_size = 1
hidden_size = 4
rnn_ae = SimpleEncDecRNN(inp_size,hidden_size).to(device)

# Two random sequences of eight integers in [0, 8), shaped [batch, seq_len, 1]: the
# trailing dimension is needed because the Elman RNN consumes one element of the
# sequence per time step. torch.randint yields an integer tensor by default on
# current PyTorch while nn.RNN needs floating point input, hence the explicit cast.
sequence = torch.randint(8,(2,8,1)).float().to(device)

In [59]:
# #can I reconstruct that sequence?
# enc_rnn = nn.RNN(1,4,batch_first=True).to(device)
# output,hidden = enc_rnn(sequence)# output=[batch,seq_len,hidden_size]
# output.size(),hidden.size(),output,hidden

In [60]:
# dec_rnn = nn.RNN(4,4,batch_first=True).to(device)
# dec_out,dec_hidden = dec_rnn(output)
# dec_out.size(),dec_hidden.size()

In [61]:
# all_enc_out = output.contiguous().view(-1)
# all_dec_out = dec_hidden.contiguous().view(-1)
# #torch.dot(all_enc_out,all_dec_out)

In [140]:
# training?
# Train the autoencoder to reproduce each of the two sequences.
reconstruct_criterion= nn.MSELoss()
ae_optimizer = optim.Adam(rnn_ae.parameters(), lr=0.01)
for i in range(2000):# 2000 passes over the sequences, one optimizer step per sequence
    for seq in sequence:
        # add the batch dimension back: [seq_len, 1] -> [1, seq_len, 1]
        src = seq.unsqueeze(0)
        rnn_ae.train()
        # zero the parameter gradients
        ae_optimizer.zero_grad()
        # forward + backward + optimize
        src_hat = rnn_ae(src)
        # the target is reshaped from [1, seq_len, 1] to [1, 1, seq_len] to line up with the model output
        rec_loss = reconstruct_criterion(src_hat,src.squeeze(2).unsqueeze(0))
        rec_loss.backward()
        ae_optimizer.step()
    if i%500==0:
        # loss of the last sequence processed in this pass
        print('Reconstruction Loss after epoch '+str(i)+':',rec_loss.item())
        print('-----------------------------------------------')

Reconstruction Loss after epoch 0: 0.0001411706325598061
-----------------------------------------------
Reconstruction Loss after epoch 500: 1.9542374662950834e-11
-----------------------------------------------
Reconstruction Loss after epoch 1000: 1.1116901177388172e-08
-----------------------------------------------
Reconstruction Loss after epoch 1500: 1.4000502979172325e-09
-----------------------------------------------


In [143]:
sequence[0]

tensor([[2.],
        [6.],
        [2.],
        [5.],
        [7.],
        [0.],
        [0.],
        [7.]])

In [142]:
rnn_ae(sequence[0].unsqueeze(0))

tensor([[[2.0000e+00, 6.0000e+00, 2.0000e+00, 5.0000e+00, 7.0000e+00,
          5.9009e-06, 1.0081e-05, 7.0000e+00]]], grad_fn=<ThAddBackward>)