In [1]:
from utils import *
from datetime import datetime
import time
import math

In [2]:
MAX_LENGTH = 50
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## Reading the Data In

In [3]:
with open('data/preprocessed/fold_train.json', 'r') as f:
    atis_json = json.load(f)

# Data Preprocessing

In [4]:
def flatten(list_of_lists):
    """Flattens from two-dimensional list to one-dimensional list"""
    return [item for sublist in list_of_lists for item in sublist]

In [5]:
def adjust_sequences(data, length=MAX_LENGTH):
    """
    Fixes the input and output sequences length, adding padding or truncating if necessary
    :param data json file containing entries from atis dataset.
    :param length the fixed length of the sentence.
    """
    for sample in data['data']:
        # adjust the sequence of input words
        if len(sample['words']) < length:
            # add <EOS> and <PAD> if sentence is shorter than maximum length
            sample['words'].append('<EOS>')
            while len(sample['words']) < length:
                sample['words'].append('<PAD>')
        else:
            # otherwise truncate and add <EOS> at last position
            sample['words'] = sample['words'][:length]
            sample['words'][-1] = '<EOS>'

        # adjust in the same way the sequence of output slots
        if len(sample['slots']) < length:
            sample['slots'].append('<EOS>')
            while len(sample['slots']) < length:
                sample['slots'].append('<PAD>')
        else:
            sample['slots'] = sample['slots'][:length]
            sample['slots'][-1] = '<EOS>'

    return data

In [6]:
def get_vocabularies(train_data):
    """
    Collect the input vocabulary, the slot vocabulary and the intent vocabulary
    :param train_data the training data containing words,slots and intent.
    """
    # from a list of training examples, get three lists (columns)
    data = train_data['data']
    seq_in = [sample['words'] for sample in data]
    vocab = flatten(seq_in)
    # removing duplicated but keeping the order
    v = ['<PAD>', '<SOS>', '<EOS>'] + vocab
    vocab = sorted(set(v), key=lambda x: v.index(x)) # https://docs.python.org/3.3/howto/sorting.html
    s = ['<PAD>','<EOS>'] + train_data['meta']['slot_types']
    slot_tag = sorted(set(s), key=lambda x: s.index(x))
    i = train_data['meta']['intent_types']
    intent_tag = sorted(set(i), key=lambda x: i.index(x))

    return vocab, slot_tag, intent_tag

In [7]:
adjusted_atis = adjust_sequences(atis_json)#padded

In [8]:
atis_vocab,atis_slots,atis_intents = get_vocabularies(adjusted_atis)

In [9]:
len(atis_vocab),len(atis_slots),len(atis_intents)

(870, 122, 21)

# Next we map the data and set it up for Pytorch
Remember each vocabulary for each sentence, slot and intent will have different embeddings. They are different sized vectors. The Network will try to figure out a mapping from these different vector spaces.

In [10]:
def create_mappings(vocab,forward_map):
    """
    This function takes the words in the vocabulary and creates a unique mapping to a number.
    :param vocab contains all the words in the corpus.
    :param forward_map a dictionary that will be populated with mappings.
    returns populated forward_map
    """
    for sample in vocab:
        if sample not in forward_map.keys():
            forward_map[sample]= len(forward_map)
            
    return forward_map

In [11]:
SOS_token = 2
EOS_token = 3
word2index = {'<PAD>': 0, '<UNK>':1,'<SOS>':2,'<EOS>':3}
create_mappings(atis_vocab,word2index)
index2word = {v:k for k,v in word2index.items()}

In [12]:
tag2index = {'<PAD>' : 0,'<UNK>':1,'<EOS>':1}
create_mappings(atis_slots,tag2index)
index2tag = {v:k for k,v in tag2index.items()}

In [13]:
intent2index={}
create_mappings(atis_intents,intent2index)
index2intent = {v:k for k,v in intent2index.items()}

In [14]:
len(word2index)

871

### Next we create a Tensor where each row is a mapped/embedded sequence

In [15]:
def prepare_sequence(seq_data, mapping,map_type):
    """
    :param seq a sequnce which will be embedded as a vector
    :param mapping, a dictionary which contains how each element in the seq will be mapped to a number.
    :param map_type 'words','slots' or 'intent'
    returns a Pytorch Tensor.
    """
    if map_type=='intent':
        intent = seq_data[map_type]
        embeddings = mapping[intent] if intent in mapping.keys() else -1 #mapping["<UNK>"]
        return torch.tensor(embeddings)   
    else:
        embed_fnc = lambda word: mapping[word] if word in mapping.keys() else mapping['<UNK>']
        embeddings = list(map(embed_fnc, seq_data[map_type])) 
        return torch.LongTensor(embeddings)

In [16]:
def create_training_set(padded_atis):
    """
    :param padded_atis, this is padded sequence data.
           Of the form seq,slots,intent. This function coverts 
           these into tensors.
    return train_data; [(seq_tensor,slot_tensor,intent_tensor)]
    """
    train_data = []
    atis_data = padded_atis['data']
    for i in range(len(atis_data)):
        seq_tensor = prepare_sequence(atis_data[i],word2index,'words')
        slot_tensor = prepare_sequence(atis_data[i],tag2index,'slots')
        intent_tensor = prepare_sequence(atis_data[i],intent2index,'intent')
        train_data.append((seq_tensor,slot_tensor,intent_tensor))
    return train_data

In [17]:
train_data = create_training_set(adjusted_atis)

## Batching the data

In [18]:
def concatenate_batch(batch):
#     print(len(batch), len(batch[0]))
    #     print([ex[0] for ex in batch])
    seqs = torch.stack([ex[0] for ex in batch])
    slots = torch.stack([ex[1] for ex in batch])
    intents = torch.stack([ex[2] for ex in batch])
   
    return seqs,slots,intents

In [19]:
def get_batches(batch_size, train_data):
    """
    Returns iteratively a batch of specified size on the data. 
    The last batch can be smaller if the total size is not multiple of the batch
    """
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while sindex < len(train_data):
        batch = train_data[sindex:eindex] #list of batch_size num of tuples.
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        #print('returning', len(batch), 'samples')
        yield concatenate_batch(batch)

# Building Joint-RNN Model
---Ignore---

Will use encoder-decoer model because of:
Sutskever, Ilya, Oriol Vinyals, and Quoc V. Le. "Sequence to sequence learning with neural networks." Advances in neural information processing systems. 2014.

i) one for the input sequence and another for the output sequence, because doing
so increases the number model parameters at negligible computational cost and makes it natural to
train the LSTMon multiple language pairs simultaneously 

ii) deep LSTMs significantly outperformed shallow LSTMs, so we chose an LSTM with four layers. 

iii) valuable to reverse the order of the words of the input sentence. 

---Ignore---

In [56]:
class Encoder(nn.Module):
    """
    An encoder/decoder that
    takes a batch of sequences embeds the sequence and
    then runs it through a fully connected layer to predict slots and intent.
    """
    def __init__(self,input_dim,hidden_dim,emb_dim,slot_dim,intent_dim):
        super(Encoder, self).__init__()
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
       
        self.slot_dim = slot_dim
        self.intent_dim = intent_dim
       
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim,batch_first=True)
        
        self.slot_fc = nn.Linear(hidden_dim,slot_dim)
        self.intent_fc = nn.Linear(hidden_dim,intent_dim)
        
    def forward(self,src):
        #src [seq len,batch size] -> [seq len, batch size, emb_dim]
        embedded_seq = self.embedding(src)
#         print('embedded_seq', embedded_seq.size())
        outputs, hidden = self.rnn(embedded_seq)
#         print('hidden size is ', hidden.size())
#         print('outputs size is ', outputs.size())
        outputs = outputs.contiguous().view(-1,self.emb_dim)
        #print('out size',outputs.size())
        slot_space = self.slot_fc(outputs)
        slot_scores = slot_space
#         print(slot_scores[0,:])
#         slot_scores = F.softmax(slot_space, dim=1)
#         print('after softmax',print(slot_scores))
        #print(slot_scores.size())
#         intent_space = self.intent_fc(hidden)
#         intent_scores = intent_space
#         print('intent_space size is ', intent_space.size())
        #print(intent_space.size())
#         intent_scores = F.softmax(intent_space, dim=1)
        #print(intent_scores.size())
        return slot_scores#, intent_scores, hidden

In [57]:
train_data[0]

(tensor([ 63, 287,  24, 147,  33,  34,  38,  25,  58,  44,  50, 228, 170,   3,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]),
 tensor([122, 122, 122, 122,  48, 107, 122, 122, 122,  77, 122, 122, 122,   1,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]),
 tensor(11))

In [58]:
print("Testing models...")
n_layers = 1
input_size = len(word2index)
slot_size = len(tag2index)
intent_size = len(intent2index)
embed_size = 5
hidden_size = 5
output_size = MAX_LENGTH
encoder = Encoder(input_size, embed_size,hidden_size,slot_size,intent_size).to(device)

Testing models...


In [59]:
for batch in get_batches(100,train_data[:]):
    inputs = batch[0]
    inputs = inputs.to(device)
    slots = encoder(inputs)
#     slots, intents,_ = encoder(inputs)
    #print(torch.argmax(slots,dim=0))

# Training the model
    

In [60]:
n_layers = 1
input_size = len(word2index)
slot_size = len(tag2index)
intent_size = len(intent2index)
embed_size = 256
hidden_size = 256
output_size = MAX_LENGTH
encoder = Encoder(input_size, embed_size,hidden_size,slot_size,intent_size).to(device)
#-----------------------
criterion = nn.CrossEntropyLoss(ignore_index=0)#ignore <pad> remember this ignoring 0 in intent as well, need 2 loss
optimizer = optim.Adam(encoder.parameters(), lr=0.01)
#batch_size = 100

In [64]:
num_epochs = 1000
start = time.time()
for epoch in range(num_epochs):
    inp = get_batches(256,train_data)
    encoder.train()
    for data in inp:
        # get the inputs
        inputs, labels, intents = data[0],data[1],data[2]
        inputs = inputs.type(torch.LongTensor).to(device)
        labels = labels.type(torch.LongTensor).view(-1).to(device) #squashing
        intents = intents.to(device)#.type(torch.FloatTensor)
        
        # zero the parameter gradients
        optimizer.zero_grad()
        
        out_slots = encoder(inputs)
#         out_slots, out_intents,_ = encoder(inputs)
        slot_loss = criterion(out_slots, labels)
    
#         out_intents = out_intents.view(-1,intent_size)
#         intent_loss = criterion(out_intents, intents) 
        

        # forward + backward + optimize
#         out_slots, out_intents,_ = encoder(inputs)
#         print(out_slots.size())
#         print(intents.size())
#         print(out_intents.size())
        
#         out_slots = out_slots.view(-1,slot_size)
#         correct_label = labels[5]
#         label_score = float(out_slots[5, correct_label])
#         highest_score = float(out_slots[5].max())
#         total_score = float(out_slots[5].sum())
#         print(int(inputs[0,5]))
#         print(correct_label, label_score, highest_score, total_score)
#         print(out_slots.size(), labels.size())
#         slot_loss = criterion(out_slots, labels)
        
#         out_intents = out_intents.view(-1,intent_size)
#         intent_loss = criterion(out_intents, intents) 

        slot_loss.backward()        
#         slot_loss.backward(retain_graph=True)
#         intent_loss.backward()
        optimizer.step()

    if epoch%50==0:
        print('Slot Loss after epoch '+str(epoch)+':',slot_loss.item())
#         print('Intent Loss after epoch '+str(epoch)+':',intent_loss.item())
        print('-----------------------------------------------')
end = time.time()
elapsed = (end-start)/60.
print('Time elapsed: %.4f mins ' % (elapsed))

Slot Loss after epoch 0: 0.011933607049286366
-----------------------------------------------
Slot Loss after epoch 50: 0.0029529081657528877
-----------------------------------------------
Slot Loss after epoch 100: 0.015183504670858383
-----------------------------------------------
Slot Loss after epoch 150: 0.04848215728998184
-----------------------------------------------
Slot Loss after epoch 200: 0.0021547030191868544
-----------------------------------------------
Slot Loss after epoch 250: 0.052389465272426605
-----------------------------------------------
Slot Loss after epoch 300: 0.03923773393034935
-----------------------------------------------
Slot Loss after epoch 350: 0.013141809962689877
-----------------------------------------------
Slot Loss after epoch 400: 0.0273713618516922
-----------------------------------------------
Slot Loss after epoch 450: 0.016797255724668503
-----------------------------------------------
Slot Loss after epoch 500: 0.0131195336580276

In [83]:
# effects of double softmax?
for batch in get_batches(1,train_data[:1]):
    inputs = batch[0]
    labels = batch[1]
    labels = labels.to(device)
    inputs = inputs.to(device)
    slots = encoder(inputs)
s_scores = torch.argmax(F.softmax(slots, dim=1),dim=1)
print(slots ,labels,
     s_scores )

tensor([[ -0.2000,  -9.7143,  -0.0846,  ...,  -9.6479,   4.6154,  61.4233],
        [ -0.0900, -17.8721,  -0.1140,  ..., -16.9577,   6.8846,  55.2109],
        [  0.1297,   9.2990,   0.1128,  ...,  -2.2889, -15.0664,  13.6827],
        ...,
        [ -2.8455,  -1.5662,  -2.3077,  ...,   1.3982,  -2.9453,  60.5977],
        [ -2.8455,  -1.5662,  -2.3077,  ...,   1.3982,  -2.9453,  60.5977],
        [ -2.8455,  -1.5662,  -2.3077,  ...,   1.3982,  -2.9453,  60.5977]],
       device='cuda:0', grad_fn=<ThAddmmBackward>) tensor([[122, 122,  22, 122, 122,  48, 122,  77, 122, 122, 122,  37,  36, 122,
         122,  34,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]], device='cuda:0') tensor([122, 122,  22, 122, 122,  48, 122,  77, 122, 122, 122,  37,  36, 122,
        122,  34,   1, 107,  94, 122, 122, 122, 122, 122, 122, 122, 122, 122,
        122, 122, 

# Evaluation

In [72]:
with open('data/preprocessed/fold_test.json', 'r') as f:
    atis_test_json = json.load(f)

In [73]:
adjusted_atis_test = adjust_sequences(atis_test_json)#padded

In [74]:
test_data = create_training_set(adjusted_atis_test) 

In [75]:
#Exclude pad 
def calc_bat_fscores(y_pred,y_true,predict_type='slot'):
    """
    Calc average f score for a batch.
    compares each predicted output in a batch to actual output.
    Then averages that.
    """
    batch_avg_f = []
    keep_slots = [i for i in tag2index.values() if i > 2]
    #print(y_true.size())
    if predict_type=='slot':
#         for pred_i in range(len(y_pred)):# range batch size
            #print(y_pred[pred_i].size())
        f_score = f1_score(y_true, y_pred,labels=keep_slots, average ='micro')  
        batch_avg_f.append(f_score)
    else:
        return f1_score(y_true,y_pred,average ='micro')
    return np.mean(batch_avg_f)

In [76]:
import warnings
warnings.filterwarnings('ignore')# get a lot of warnings because some labels are not predicted...

In [80]:
inp = get_batches(128,test_data)
encoder.eval()
f_slot_scores = []
f_intent_scores = []
for data in inp:
    # get the inputs
    inputs, labels , true_intents = data[0],data[1],data[2]
    inputs = inputs.type(torch.LongTensor).to(device)
    labels = labels.type(torch.LongTensor).view(-1).to(device) #squashing
    true_intents = true_intents.to(device)
    slots = encoder(inputs)
    print(slots.size())
#     slots ,intents, slot_embedding = encoder(inputs)
    y_pred_slots = torch.argmax(slots,dim=1)
    #print(y_pred_slots.size())
    f_slot_scores.append(calc_bat_fscores(y_pred_slots,labels.view(-1)))
   #print(intents.size())
#     y_pred_intents = torch.argmax(intents,dim=2)
#     f_intent_scores.append(calc_bat_fscores(y_pred_intents[0],true_intents,predict_type='intent'))
print('Mean Slot F Metric :',np.mean(f_slot_scores))
print('-----------------------------------------------')
# print('Mean Intent F Metric :',np.mean(f_intent_scores))

torch.Size([6400, 123]) torch.Size([6400])
torch.Size([6400, 123]) torch.Size([6400])
torch.Size([6400, 123]) torch.Size([6400])
torch.Size([5800, 123]) torch.Size([5800])
Mean Slot F Metric : 0.36487735848963476
-----------------------------------------------
