In [None]:
! git clone https://github.com/sz128/slot_filling_and_intent_detection_of_SLU.git

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!ls -lat

In [108]:
from torch import nn
import torch

'''
A simple bidirectional lstm for joint slot filling and intent classification
'''
class IntentSlot(nn.Module):
  def __init__(self, embedding_size, vocab_size, hidden_size, intent_size, slot_size):
    super(IntentSlot, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
    self.lstm = nn.LSTM(input_size=embedding_size,
                         hidden_size=hidden_size,
                         num_layers=1,
                         bidirectional=True)
    print(intent_size, slot_size)
    self.classifier_slot = nn.Linear(2*hidden_size, slot_size)
    self.classifier_intent = nn.Linear(2*hidden_size, intent_size)
  
  def forward(self, x, mask):
    x = self.embedding(x)
    x, _ = self.lstm(x)
    agg  = torch.sum(x * mask.unsqueeze(-1), dim=1)
    agg = agg / (1e-9 + torch.sum(mask, dim=1, keepdim=True))
    slots = self.classifier_slot(x)
    intent = self.classifier_intent(agg)
    return slots, intent

if __name__ == '__main__':
   model = IntentSlot(100, 10, 10, 4, 2)
   x = torch.tensor([[1, 4], [5, 6]])
   mask = torch.tensor([[1, 1], [1, 1]])
   slots, intent = model(x, mask)
   print(slots.size(), intent.size())

4 2
torch.Size([2, 2, 2]) torch.Size([2, 4])


In [99]:
import numpy as np
import random
from torch.nn.utils.rnn import pad_sequence
import torch

'''
create a tokenized dataset using the glove embeddings with a custom iterator
'''
class dataset:
  def __init__(self, train_file, val_file, test_file, embedding_dim=100):
    self.glove_vocab = self._build_glove_vocab(embedding_dim)
    self.w_to_t, self.t_to_w, self.s_to_l, self.l_to_s, self.i_to_l, self.l_to_i \
     = {'<PAD>': 0, '<UNK>': 1}, {0: '<PAD>', 1: '<UNK>'}, {'<PAD>': 0}, {0: '<PAD>'} \
     , {}, {}
    self._create_vocabulary(train_file, embedding_dim)
    self._create_vocabulary(val_file, embedding_dim)
    self._create_vocabulary(test_file, embedding_dim)
    self.intent_size = len(self.i_to_l)
    self.slot_size = len(self.s_to_l)
    self.embedding_size = embedding_dim
    self.vocab_size = len(self.t_to_w)
    self.embedding = [np.zeros((1, embedding_dim)), np.random.randn(1, embedding_dim)]
    for i in range(2, len(self.t_to_w)):
      self.embedding.append(
          self.glove_vocab[self.t_to_w[i]]
          )
    tokenized_train = self._process(train_file)
    random.shuffle(tokenized_train)
    tokenized_val = self._process(val_file)
    tokenized_test = self._process(test_file)

    self.splits = {'train': tokenized_train,
                   'val': tokenized_val,
                   'test': tokenized_test}

  def iterate(self, mode, batch_size=64):
    data = self.splits[mode]
    idx = np.arange(len(data))
    n_chunks = idx.shape[0] // batch_size + 1
    for chunk_id, chunk in enumerate(np.array_split(idx, n_chunks)):
        sentences = [torch.tensor(data[idx][0]) for idx in chunk]
        slots = [torch.tensor(data[idx][1]) for idx in chunk]
        intents = [torch.tensor(data[idx][2]).reshape(1) for idx in chunk]
        padded_sentences = pad_sequence(sentences, batch_first=True, padding_value=0)
        padded_slots = pad_sequence(slots, batch_first=True, padding_value=0)
        mask = (padded_slots != 0).float()
        intents = torch.cat(intents, 0)
        yield padded_sentences, padded_slots, intents, mask
  
  def _process(self, input_file):
    with open(input_file, 'rt') as fi:
      sentences = fi.read().strip().split('\n')
      tokenized = []
      for i, sentence in enumerate(sentences):
        text, intent = sentence.split(' <=> ')
        tokens, slots = [], []
        for word_slot in text.split(' '):
            word, slot = word_slot.split(":")
            if word not in self.w_to_t:
              tokens.append(1)
            else:
              tokens.append(self.w_to_t[word])
            slots.append(self.s_to_l[slot])
        tokenized.append((tokens, slots, self.i_to_l[intent]))
      return tokenized

    
  def _create_vocabulary(self, train_file, embedding_dim):
    with open(train_file, 'rt') as fi:
      sentences = fi.read().strip().split('\n')
      for sentence in sentences:
        text, intent = sentence.split(' <=> ')
        for idx, word_slot in enumerate(text.split(' ')):
          word, slot = word_slot.split(":")
          if slot not in self.s_to_l:
            x = self.s_to_l[slot] = len(self.s_to_l)
            self.l_to_s[x] = slot
          if intent not in self.i_to_l:
            x = self.i_to_l[intent] = len(self.i_to_l)
            self.l_to_i[x] = intent
          if word not in self.w_to_t and word in self.glove_vocab:
            x = self.w_to_t[word] = len(self.w_to_t)
            self.t_to_w[x] = word
    

  def _build_glove_vocab(self, dimension=100):
    vocab = {}
    with open('glove.6B.{}d.txt'.format(dimension),'rt') as fi:
      full_content = fi.read().strip().split('\n')
      for line in full_content:
        splits = line.split()
        word, embedding = splits[0], list(map(float, splits[1:]))
        vocab[word] = np.array(embedding)
    print('finished reading glove', len(vocab))
    return vocab
    
# if __name__ == '__main__':
#   directory = 'slot_filling_and_intent_detection_of_SLU/data/atis-2/'
#   d = dataset(directory+'train', directory+'valid', directory+'test')
#   for x,y1,y2 in d.iterate('train'):
#     print(x.size(), y1.size(), y2.size())

In [104]:
import copy

def eval(model, mode='val'):
  model.eval()
  total_slot, total_intent, correct_slot, correct_intent = 0, 0, 0, 0
  for x, y_s, y_i, mask in d.iterate(mode):
    logits_slots, logits_intent = model(x, mask)
    pred_slot = torch.argmax(logits_slots, dim=-1)
    pred_intent = torch.argmax(logits_intent, dim=-1)
    total_intent += y_i.size(0)
    total_slot += torch.sum((y_s != 0).float()).item()
    correct_slot += torch.sum(((pred_slot == y_s) & (y_s != 0)).float()).item()
    correct_intent += torch.sum((pred_intent == y_i).float()).item()
  return float(correct_slot) / total_slot, float(correct_intent) / total_intent 

directory = 'slot_filling_and_intent_detection_of_SLU/data/atis-2/'
d = dataset(directory+'train', directory+'valid', directory+'test')
model = IntentSlot(embedding_size=d.embedding_size, vocab_size=d.vocab_size, 
                   hidden_size = 512, intent_size=d.intent_size, slot_size=d.slot_size)
loss_slot = nn.CrossEntropyLoss(ignore_index=0)
loss_intent = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())
eval_period, num_epochs, steps, best_acc = 20, 10, 0, 0

for epoch in range(num_epochs):
  for x, y_s, y_i, mask in d.iterate('train'):
    model.train()
    optimizer.zero_grad()
    logits_slots, logits_intent = model(x, mask)
    l = loss_intent(logits_intent, y_i)
    l = l + loss_slot(logits_slots.view(-1, d.slot_size), y_s.view(-1))
    l.backward()
    optimizer.step()
    steps += 1
    if steps % eval_period == 0:
      slot_acc, intent_acc = eval(model, 'val')
      print('val accuracy of slot filling is {0:.2f}  and accuracy of intent \
       classification is {1:.2f}'.format(slot_acc, intent_acc))
      if slot_acc > best_acc:
        best_model = copy.deepcopy(model)
        best_acc = slot_acc
      
slot_acc, intent_acc = eval(best_model, 'test')
print('test accuracy of slot filling is {0:.2f}  and accuracy of intent \
  classification is {1:.2f}'.format(slot_acc, intent_acc))




finished reading glove 400000
26 128
val accuracy of slot filling is 0.63  and accuracy of intent        classification is 0.71
val accuracy of slot filling is 0.68  and accuracy of intent        classification is 0.71
val accuracy of slot filling is 0.72  and accuracy of intent        classification is 0.72
val accuracy of slot filling is 0.77  and accuracy of intent        classification is 0.77
val accuracy of slot filling is 0.80  and accuracy of intent        classification is 0.77
val accuracy of slot filling is 0.83  and accuracy of intent        classification is 0.80
val accuracy of slot filling is 0.84  and accuracy of intent        classification is 0.80
val accuracy of slot filling is 0.85  and accuracy of intent        classification is 0.79
val accuracy of slot filling is 0.86  and accuracy of intent        classification is 0.83
val accuracy of slot filling is 0.86  and accuracy of intent        classification is 0.84
val accuracy of slot filling is 0.86  and accuracy of

# New Section