# Goal
This notebook contains code for the purpose of implementing a joint-intent-slot-filling RNNN described in (put paper here).
Code is based off of: https://github.com/DSKSD/RNN-for-Joint-NLU
 and https://github.com/D2KLab/botcycle/blob/master/brain/botcycle/nlu/joint/data.py
 
**test tube**: https://github.com/williamFalcon/test-tube


# Importing Libraries

In [1]:
import json
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import random 
import numpy as np
import pandas as pd
USE_CUDA = torch.cuda.is_available()

In [2]:
MAX_LENGTH = 50
SEED = 423

random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

## Reading the Data in
We are using theAir Travel Information System (ATIS) dataset(I believe it is originally from UPenn, need to verify)
It uses the Inside-Out-Beginning tag format(IOB) described here : https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)

The typical structure is:

Sentence (tokens), Slots to match each token, Intent.

Found a preprocessed atis file from, thanks to: https://github.com/D2KLab/botcycle/tree/master/nlu/data/atis


In [3]:
with open('data/preprocessed/fold_train.json', 'r') as f:
    atis_json = json.load(f)

In [4]:
atis_json['data'][0]

{'words': ['i',
  'want',
  'to',
  'fly',
  'from',
  'baltimore',
  'to',
  'dallas',
  'round',
  'trip'],
 'slots': ['O',
  'O',
  'O',
  'O',
  'O',
  'B-fromloc.city_name',
  'O',
  'B-toloc.city_name',
  'B-round_trip',
  'I-round_trip'],
 'length': 10,
 'intent': 'atis_flight'}

# Data Preprocessing

In [5]:
def flatten(list_of_lists):
    """Flattens from two-dimensional list to one-dimensional list"""
    return [item for sublist in list_of_lists for item in sublist]

In [6]:
def adjust_sequences(data, length=MAX_LENGTH):
    """
    Fixes the input and output sequences length, adding padding or truncating if necessary
    :param data json file containing entries from atis dataset.
    :param length the fixed length of the sentence.
    """
    for sample in data['data']:
        # adjust the sequence of input words
        if len(sample['words']) < length:
            # add <EOS> and <PAD> if sentence is shorter than maximum length
            sample['words'].append('<EOS>')
            while len(sample['words']) < length:
                sample['words'].append('<PAD>')
        else:
            # otherwise truncate and add <EOS> at last position
            sample['words'] = sample['words'][:length]
            sample['words'][-1] = '<EOS>'

        # adjust in the same way the sequence of output slots
        if len(sample['slots']) < length:
            sample['slots'].append('<EOS>')
            while len(sample['slots']) < length:
                sample['slots'].append('<PAD>')
        else:
            sample['slots'] = sample['slots'][:length]
            sample['slots'][-1] = '<EOS>'

    return data

In [7]:
def get_vocabularies(train_data):
    """
    Collect the input vocabulary, the slot vocabulary and the intent vocabulary
    :param train_data the training data containing words,slots and intent.
    """
    # from a list of training examples, get three lists (columns)
    data = train_data['data']
    seq_in = [sample['words'] for sample in data]
    vocab = flatten(seq_in)
    # removing duplicated but keeping the order
    v = ['<PAD>', '<SOS>', '<EOS>'] + vocab
    vocab = sorted(set(v), key=lambda x: v.index(x)) # https://docs.python.org/3.3/howto/sorting.html
    s = ['<PAD>','<EOS>'] + train_data['meta']['slot_types']
    slot_tag = sorted(set(s), key=lambda x: s.index(x))
    i = train_data['meta']['intent_types']
    intent_tag = sorted(set(i), key=lambda x: i.index(x))

    return vocab, slot_tag, intent_tag

In [8]:
#split_atis(new_atis0)

In [9]:
adjusted_atis = adjust_sequences(atis_json)#padded

In [10]:
atis_vocab,atis_slots,atis_intents = get_vocabularies(adjusted_atis)

In [11]:
len(atis_vocab),len(atis_slots),len(atis_intents)
#type(atis_vocab[0])

(870, 122, 21)

# Next we need to embed the data and set it up for Pytorch
Remember each vocabulary for each sentence, slot and intent will have different embeddings. They are different sized vectors. The Network will try to figure out a mapping from these different vector spaces.

In [12]:
def create_mappings(vocab,forward_map):
    """
    This function takes the words in the vocabulary and creates a unique mapping to a number.
    :param vocab contains all the words in the corpus.
    :param forward_map a dictionary that will be populated with mappings.
    returns populated forward_map
    """
    for sample in vocab:
        if sample not in forward_map.keys():
            forward_map[sample]= len(forward_map)
    return forward_map

In [13]:
SOS_token = 2
EOS_token = 3
word2index = {'<PAD>': 0, '<UNK>':1,'<SOS>':2,'<EOS>':3}
create_mappings(atis_vocab,word2index)
index2word = {v:k for k,v in word2index.items()}

In [14]:
tag2index = {'<PAD>' : 0,'<UNK>':1,'<EOS>':2}
create_mappings(atis_slots,tag2index)
index2tag = {v:k for k,v in tag2index.items()}

In [15]:
intent2index={'<UNK>':0}
create_mappings(atis_intents,intent2index)
index2intent = {v:k for k,v in intent2index.items()}

In [16]:
word2index['<UNK>']
#tag2index['<SOS>']

1

### Next we create a Tensor where each row is a mapped/embedded sequence

In [17]:
def prepare_sequence(seq_data, mapping,map_type):
    """
    :param seq a sequnce which will be embedded as a vector
    :param mapping, a dictionary which contains how each element in the seq will be mapped to a number.
    :param map_type 'words','slots' or 'intent'
    returns a Pytorch Tensor.
    """
    embed_fnc = lambda word: mapping[word] if word in mapping.keys() else mapping["<UNK>"]
    embeddings = list(map(embed_fnc, seq_data[map_type]))
    tensor = Variable(torch.LongTensor(embeddings)).cuda() if USE_CUDA else Variable(torch.LongTensor(embeddings))
    return tensor


In [18]:
def create_training_set(padded_atis):
    train_data = []
    atis_data = padded_atis['data']
    #print(len(atis_data))
    for i in range(len(atis_data)):
        #print(len(atis_data[i]['words']))
        seq_tensor = prepare_sequence(atis_data[i],word2index,'words').view(1,-1)
        slot_tensor = prepare_sequence(atis_data[i],tag2index,'slots').view(1,-1)
        intent_tensor = prepare_sequence(atis_data[i],intent2index,'intent')#.view(1,-1)
        train_data.append((seq_tensor,slot_tensor,intent_tensor))
    return train_data

In [19]:
train_data = create_training_set(adjusted_atis)


In [20]:
len(train_data)
#train_data[0]

4478

### Batching the data

In [21]:
def get_batch(batch_size, train_data):
    """
    Returns iteratively a batch of specified size on the data. 
    The last batch can be smaller if the total size is not multiple of the batch
    """
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while sindex < len(train_data):
        batch = train_data[sindex:eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        #print('returning', len(batch), 'samples')
        yield batch

# Building the Joint-RNN Model
Gives an explanation of encoder-decoder models using RNNs: https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

**Note the inputs:**

* input_dim is the size/dimensionality of the one-hot vectors that will be input to the encoder. This is equal to the input (source) vocabulary size.
* emb_dim is the dimensionality of the embedding layer. This layer converts the one-hot vectors into dense vectors with emb_dim dimensions.
* hid_dim is the dimensionality of the hidden and cell states.
* n_layers is the number of layers in the RNN.
* dropout is the amount of dropout to use. This is a regularization parameter to prevent overfitting. Check out this for more details about dropout.
Remember to use progress bar


In [41]:
class Encoder(nn.Module):
    """"BiDir-LSTM"""
    def __init__(self, input_dim, emb_dim, hid_dim,n_layers=1):
        super(Encoder, self).__init__()

        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
       
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, n_layers,bidirectional=True)
        
    def forward(self, src):
        # batch size is determined from dimensions of the input.
        #src = [sent len, batch size]
        embedded = self.embedding(src)

        #embedded = [sent len, batch size, emb dim]
        outputs, (hidden, cell) = self.lstm(embedded)

        #outputs = [sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]

        #outputs are always from the top hidden layer

        return hidden , cell # makes up the context vector
   

In [42]:
class Decoder(nn.Module):
    """
    Decoder 
    """
    def __init__(self, slot_dim, intent_dim , emb_dim, hid_dim, n_layers=1):
        super().__init__()
        
        self.slot_dim = slot_dim 
        self.intent_dim = intent_dim
        
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(slot_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers)
        
        self.out = nn.Linear(hid_dim, slot_dim) #slot output
        self.intent_out = nn.Linear(hid_dim,intent_dim) # intent output
        
    def forward(self, input_dec, hidden, cell):
        
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        #input_enc = [1, batch size]   
        #input_dec = input_dec # current word
        print(input_dec.shape)
        
        embedded = self.embedding(input_dec)
        #embedded = [1, batch size, emb dim]
        #print('embed ',embedded)
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [sentence len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #sentence len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        slot_prediction = self.out(output.squeeze(0))
        intent_prediction = self.intent_out(output.squeeze(0))
        #prediction = [batch size, output dim]
        
        return slot_prediction, intent_prediction
    


# Need to figure out how data flows through model, dimensions

In [59]:
print("Testing models...")
n_layers = 1
INPUT_DIM = len(atis_vocab)
OUTPUT_DIM = len(atis_slots)
#print(INPUT_DIM,OUTPUT_DIM)
intent_size = 1
embed_size = 5
hidden_size = 5
output_size = MAX_LENGTH
encoder = Encoder(INPUT_DIM, embed_size,hidden_size,n_layers=n_layers)
decoder = Decoder(OUTPUT_DIM,intent_size,embed_size ,hidden_size, n_layers=2)

# Test encoder
inp = get_batch(1,train_data[:1])
for i in inp:
    print(i[0][0].shape) # [batch size , sent len] needs to be [sent len, batch size] cause pytorch accepts that
    print(i[0][1].shape)
    encoder_hidden, encoder_cell = encoder(i[0][0])
print('encoder_cell', encoder_cell.size())
print('encoder_hidden', encoder_hidden.size())

# Test decoder
# First feed Decoder <SOS> token
decoder_input = i[0][0]#Variable(torch.LongTensor([[0]])) # SOS
decoder_hidden = encoder_hidden
decoder_cell = encoder_cell
dec_slot_output, dec_intent_output = decoder(decoder_input, decoder_hidden, decoder_cell)
# print('dec_slot_output dim : ' ,dec_slot_output.size())
# print('dec_intent_output dim: ', dec_intent_output.size())



Testing models...
torch.Size([1, 50])
torch.Size([1, 50])
encoder_cell torch.Size([2, 50, 5])
encoder_hidden torch.Size([2, 50, 5])
torch.Size([1, 50])


# Training

In [None]:
LEARNING_RATE=0.001
EMBEDDING_SIZE=64
HIDDEN_SIZE=64
BATCH_SIZE=16
LENGTH=50
STEP_SIZE=10

In [None]:
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    

    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input

        loss += criterion(decoder_output, target_tensor[di])
        if decoder_input.item() == EOS_token:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [None]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np


def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)