# 0.0 Preprocessing

In [1]:
import glob
import json
import torch.utils.data as D

In [2]:
class DstcDataset(D.Dataset):
    
    def __init__(self, filename):
        with open(filename) as f:
            self.dialogues = json.load(f)
            
    def __len__(self):
        return len(self.dialogues)
    
    def __getitem__(self, idx):
        return self.dialogues[int(idx)]

In [10]:
train_ds = []
for count, filename in enumerate(glob.glob("data/train/dialogues_*.json")):
    train_ds.append(DstcDataset(filename))
    if count > 20:
        break

train_ds = D.ConcatDataset(train_ds)

In [11]:
def text_to_ids(tokenizer, text):
    tokens = tokenizer.tokenize(text)
    indices = tokenizer.convert_tokens_to_ids(tokens)
    return indices
    
def tokenize_dialogue(tokenizer, dialogue):
    # for every tokenizable field-value, a new t_field_name is created with tokens
    for turn in dialogue.get("turns", []):
        # tokenize turn.utterance
        turn["t_utterance"] = text_to_ids(tokenizer, turn["utterance"])
        for frame in turn.get("frames", []):
            # tokenize turn.frame.state.slot_values
            if frame.get("state"):
                tokenized = {}
                for slot in frame["state"].get("slot_values", []):
                    tokenized[slot] = []
                for slot, values in frame["state"].get("slot_values", {}).items():
                    for val in values:
                        tokenized[slot].append(text_to_ids(tokenizer, val))
                frame["state"]["t_slot_values"] = tokenized
            # tokenize.turn.frame.action.values
            if frame.get("actions"):
                for action in frame["actions"]:
                    tokenized = []
                    for val in action.get("values", []):
                        tokenized.append(text_to_ids(tokenizer, val))
                    action["t_values"] = tokenized
    return dialogue


def tokenize_schema(tokenizer, schema):
    schema["t_description"] = text_to_ids(tokenizer, schema["description"])
    for slot in schema.get("slots", []):
        slot["t_description"] = text_to_ids(tokenizer, slot["description"])
        slot["t_is_categorical"] = int(slot["is_categorical"])
        slot["t_possible_values"] = [text_to_ids(tokenizer, v) for v in slot["possible_values"]]
    for intent in schema.get("intents", []):
        intent["t_description"] = text_to_ids(tokenizer, intent["description"])
        intent["t_is_transactional"] = int(intent["is_transactional"])
        intent["t_optional_slots"] = dict((k, text_to_ids(tokenizer, v)) for k, v in intent["optional_slots"].items())
    return schema

In [12]:
from pytorch_transformers import BertTokenizer

In [13]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")    

In [14]:
for dialogue in train_ds:
    tokenize_dialogue(tokenizer, dialogue)

In [15]:
with open("data/train/schema.json") as f:
    train_schemas = json.load(f)
for each_schema in train_schemas:
    tokenize_schema(tokenizer, each_schema)

# 0.1 Setup Features

In [16]:
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

from collections import OrderedDict
from pytorch_transformers import BertModel, BertConfig

In [17]:
def convert_dialog_to_sample(dialogue, schemas):
    # Note: try to keep this padding and tensor free..
    # 1-1 mapping of intent, desc, service in dialog
    y_intents = []
    y_descriptions = []
    y_descriptions_tokens = []
    y_descriptions_mask = []
    y_services = []
    for service in dialogue["services"]:
        for schema_item in schemas:
            if schema_item["service_name"] == service:
                for intent_item in schema_item["intents"]:
                    y_intents.append(intent_item["name"])
                    y_descriptions.append(intent_item["description"])
                    y_descriptions_tokens.append(intent_item["t_description"])
                    y_descriptions_mask.append([1] * len(intent_item["t_description"]))
                    y_services.append(service)
    
    # indexed by turn ID
    sample = {
        "x_turn_mask": [],
        "x_utterance_mask": [], 
        "x_utterance_tokens": [],
        "x_utterance": [],

        "y_descriptions_mask": [],
        "y_descriptions_tokens": [],
        "y_descriptions": [],
        
        "y_intents": [],
        "y_scores": [], # score = 1 => valid intent
    }
    
    for turn in dialogue["turns"]:
        if turn["speaker"] == "USER":
            sample["x_turn_mask"].append(1)
            sample["x_utterance_tokens"].append(turn["t_utterance"])
            sample["x_utterance_mask"].append([1] * len(turn["t_utterance"]))
            sample["x_utterance"].append(turn["utterance"])
            sample["y_descriptions_tokens"].append(y_descriptions_tokens.copy())
            sample["y_descriptions_mask"].append(y_descriptions_mask.copy())
            sample["y_descriptions"].append(y_descriptions.copy())
            sample["y_intents"].append(y_intents.copy())
            
            # initialize scores: service, intent = 0
            scores_kw = OrderedDict()
            for service, intent in zip(y_services, y_intents):
                scores_kw[service, intent] = 0

            # if intent exists in frame set to one
            for service, intent in zip(y_services, y_intents):
                for frame in turn["frames"]:
                    if frame["service"] == service and frame["state"]["active_intent"] == intent:
                        scores_kw[service, intent] = 1
            
            # add to the sample in order
            scores = list(scores_kw.values())
            sample["y_scores"].append(scores)
            
    return sample

In [18]:
def recursive_pad(tensors, value=0, mode="constant", batch_first=True):
    # all tensors have same shape
    max_num_dims = max(len(t.shape) for t in tensors)
    for i in range(len(tensors)):
        diff = max_num_dims - len(tensors[i].shape)
        shape = [...] + [None] * diff
        tensors[i] = tensors[i][shape]

    # max dim across tensors
    max_dims = [0] * max_num_dims
    for t in tensors:
        for i, d in enumerate(t.shape):
            max_dims[i] = max(max_dims[i], d)

    # pad unequal dims
    result = []
    for t in tensors:
        diff = []
        for max_d, t_d in zip(max_dims, t.shape):
            diff = [0, max_d - t_d] + diff
        result.append(F.pad(t, diff, mode=mode, value=value))

    return pad_sequence(result, batch_first=batch_first)

In [19]:
def convert_mini_batch_to_tensors(batch):
    # y_scores: [batch, turn, desc]
    for b, sample in enumerate(batch["y_scores"]):
        batch["y_scores"][b] = torch.tensor(sample)
    batch["y_scores"] = recursive_pad(batch["y_scores"]).to("cpu")  
    
    # x_turn_mask: [batch, turn]
    for b, sample in enumerate(batch["x_turn_mask"]):
        batch["x_turn_mask"][b] = torch.tensor(sample)
    batch["x_turn_mask"] = recursive_pad(batch["x_turn_mask"]).to("cpu")
    
    # x_utterance_tokens: [batch, turn, seq]
    for b, sample in enumerate(batch["x_utterance_tokens"]):
        for t, turn in enumerate(sample):
            sample[t] = torch.tensor(turn)
        batch["x_utterance_tokens"][b] = recursive_pad(sample)
    batch["x_utterance_tokens"] = recursive_pad(batch["x_utterance_tokens"]).to("cpu")
    
    # x_utterance_mask: [batch, turn, seq]
    for b, sample in enumerate(batch["x_utterance_mask"]):
        for t, turn in enumerate(sample):
            sample[t] = torch.tensor(turn)
        batch["x_utterance_mask"][b] = recursive_pad(sample)
    batch["x_utterance_mask"] = recursive_pad(batch["x_utterance_mask"]).to("cpu")  
        
    # y_descriptions_tokens: [batch, turn, desc, seq]
    for b, sample in enumerate(batch["y_descriptions_tokens"]):
        for t, turn in enumerate(sample):
            for d, desc in enumerate(turn):
                turn[d] = torch.tensor(desc)
            sample[t] = recursive_pad(turn)
        batch["y_descriptions_tokens"][b] = recursive_pad(sample)
    batch["y_descriptions_tokens"] = recursive_pad(batch["y_descriptions_tokens"]).to("cpu")  

    # y_descriptions_mask: [batch, turn, desc, seq]
    for b, sample in enumerate(batch["y_descriptions_mask"]):
        for t, turn in enumerate(sample):
            for d, desc in enumerate(turn):
                turn[d] = torch.tensor(desc)
            sample[t] = recursive_pad(turn)
        batch["y_descriptions_mask"][b] = recursive_pad(sample)
    batch["y_descriptions_mask"] = recursive_pad(batch["y_descriptions_mask"]).to("cpu")

    return batch

def create_mini_batch(dialogues):
    # indexed by turn
    batch = {
        "x_turn_mask": [],
        "x_utterance_tokens": [],
        "x_utterance_mask": [],
        "x_utterance": [],
        "y_descriptions_tokens": [],
        "y_descriptions_mask": [],
        "y_intents": [],
        "y_scores": [],
    }
    
    # create a batch
    for d in dialogues:
        sample = convert_dialog_to_sample(d, train_schemas)
        for f in batch:
            batch[f].append(sample[f])

    # dimension [batch, turn, ...]
    return convert_mini_batch_to_tensors(batch)

In [20]:
batch = create_mini_batch([train_ds[0], train_ds[1]])

# 1. Intent Prediction

In [21]:
import torch.optim as optim

In [60]:
class UserIntentPredictor(nn.Module):
    
    def __init__(self, config={}):
        super().__init__()
        self.config = config
        
        # pretrained embedding/language model
        lm_config = BertConfig.from_pretrained("bert-base-uncased")
        self.lm = BertModel(lm_config).eval()
        
        # layers
        self.l0 = nn.Linear(config["embed_size"], config["embed_size"])
        self.l1 = nn.Linear(config["embed_size"], config["embed_size"])
        self.l2 = nn.Linear(1, 1)
        
        # freeze lm
        for name, param in self.named_parameters():
            if name.startswith("lm."):
                param.requires_grad = False
                
        # init weights
        for name, param in self.named_parameters():
            if not name.startswith("lm."):
                param.data.fill_(0)
    
    def forward(self, batch, turn):
        # encode current utterance
        # utter = [batch, seq, emb]
        utter, _ = self.lm(
            input_ids=batch["x_utterance_tokens"][:,turn,:],
            attention_mask=batch["x_utterance_mask"][:,turn,:]
        )
        utter = self.l0(utter)

        # individually encode description
        # desc = [desc, batch, seq, emb]
        desc = []
        num_descriptions = batch["y_descriptions_tokens"].shape[2]
        for i in range(num_descriptions):
            each_desc, _ = self.lm(
                input_ids=batch["y_descriptions_tokens"][:,turn,i,:],
                attention_mask=batch["y_descriptions_mask"][:,turn,i,:]
            )
            each_desc = self.l1(each_desc)
            desc.append(each_desc)

        # score = [batch, desc, emb]
        scores = []
        for each_desc in desc:
            utter = utter.sum(1, keepdim=True) # [b,1,e] collapse across sequence length
            each_desc = each_desc.sum(1, keepdim=True).permute(0, 2, 1) # [b,e,1]
            each_score = torch.bmm(utter, each_desc) # [b,1,1]
            scores.append(each_score)
        scores = torch.cat(scores, dim=1)

        # [batch]
        turn_mask = batch["x_turn_mask"][:,turn:turn+1]

        # score = [batch, desc]
        scores = self.l2(scores).squeeze(2)
        scores = scores * turn_mask.float()
        return scores


def calc_accuracy(predicted_scores, target_scores):
    # tensor score: [batch, turn, desc]
    # turn_len padding shows acc worse than actual!
    with torch.no_grad():
        batch_size = target_scores.shape[0]
        turn_len = target_scores.shape[1]
        binarized = predicted_scores >= predicted_scores.max(dim=2).values
        matched = binarized.float() * target_scores.float()
        matched = matched.sum().item()
    return matched / (batch_size * turn_len)

    
def predict(model, batch):
    with torch.no_grad():
        num_turns = batch["y_scores"].shape[1]
        batch_size = batch["y_scores"].shape[0]
        
        # convert onehot to actual labels
        y_target = []
        indices = torch.argmax(batch["y_scores"], dim=2)
        for i, sample_indices in enumerate(indices):
            intents = []
            for j, turn_index in enumerate(sample_indices):
                if batch["x_turn_mask"][i][j] == 1:
                    intents.append(batch["y_intents"][i][j][turn_index])
            y_target.append(intents)
        
        y_predicted = [[] for i in range(batch_size)]

        for turn in range(num_turns):
            scores = model(batch, turn)
            indices = torch.argmax(scores, dim=1)
            for b, i in enumerate(indices):
                if batch["x_turn_mask"][b][turn] == 1:
                    intent = batch["y_intents"][b][turn][i]
                    y_predicted[b].append(intent)
        
        # calc acc
        accuracy = 0
        for tgt, pred in zip(y_target, y_predicted):
            accuracy = sum([i == j for i, j in zip(tgt, pred)]) / len(tgt)
        accuracy /= batch_size
        
        return {
            "predicted": y_predicted,
            "target": y_target,
            "accuracy": accuracy
        }


def eval_step(model, batch):
    with torch.no_grad():
        num_turns = batch["y_scores"].shape[1]
        loss_value = 0
        for turn in range(num_turns):
            # make predictions
            target_scores = batch["y_scores"][:,turn,:].float()
            predicted_scores = model(batch, turn).float()

            # calculate loss
            loss = F.mse_loss(
                predicted_scores,
                target_scores,
            )
            loss_value += loss.item()
    return loss_value # loss per dialogue


def train_step(model, batch, model_optim):
    loss_value = 0
    num_turns = batch["y_scores"].shape[1]
    
    for turn in range(num_turns):
        # zero gradients
        model_optim.zero_grad()
        
        # make predictions
        target_scores = batch["y_scores"][:,turn,:].float()
        predicted_scores = model(batch, turn).float()

        # calculate loss
        loss = F.mse_loss(
            predicted_scores,
            target_scores,
        )
        
        loss.backward()
        model_optim.step()
        
        loss_value += loss.item()
    return loss_value # loss per dialogue
    


def train(params):
    train_loader = D.DataLoader(
        params["train_ds"], 
        batch_size=params["batch_size"],
        collate_fn=params["collate_fn"],
    )
    
    test_ds = D.DataLoader(
        params["test_ds"],
        batch_size=params["batch_size"],
        collate_fn=params["collate_fn"],
    )
    
    model = params["model"]
    model = model.to(params["device"])
    model_optim = optim.Adam(model.parameters(), lr=params["lr"])
    
    for epoch in range(params["epochs"]):
        for count, batch in enumerate(train_loader):
            for field in batch:
                if type(batch[field]) is torch.Tensor:
                    batch[field] = batch[field].to(params["device"])
            loss = train_step(model, batch, model_optim)
            if count % params["print_every"] == 0:
                acc = predict(model, batch)["accuracy"]
                print("Epoch {}, Batch: {}, Loss: {} Acc: {}".format(epoch, count, loss, acc))

In [61]:
model = UserIntentPredictor({
    "embed_size": 768,
    "max_history": 5,
})

tr_config = {
    "model": model,
    "train_ds": train_ds,
    "test_ds": [],
    "batch_size": 50,
    "collate_fn": create_mini_batch,
    "lr": 0.0001,
    "device": "cuda",
    "epochs": 10,
    "print_every": 100,
}

train(tr_config)

Epoch 0, Batch: 0, Loss: 1.5582641558721662 Acc: 0.007272727272727273
Epoch 1, Batch: 0, Loss: 1.408170253969729 Acc: 0.007272727272727273
Epoch 2, Batch: 0, Loss: 1.3632856030017138 Acc: 0.007272727272727273
Epoch 3, Batch: 0, Loss: 1.3772158781066537 Acc: 0.007272727272727273


KeyboardInterrupt: 

> [0;32m/home/suryak/anaconda3/lib/python3.7/site-packages/torch/nn/modules/module.py[0m(525)[0;36m__getattr__[0;34m()[0m
[0;32m    523 [0;31m            [0mself[0m[0;34m.[0m[0m_load_state_dict_pre_hooks[0m [0;34m=[0m [0mOrderedDict[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    524 [0;31m[0;34m[0m[0m
[0m[0;32m--> 525 [0;31m    [0;32mdef[0m [0m__getattr__[0m[0;34m([0m[0mself[0m[0;34m,[0m [0mname[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    526 [0;31m        [0;32mif[0m [0;34m'_parameters'[0m [0;32min[0m [0mself[0m[0;34m.[0m[0m__dict__[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m    527 [0;31m            [0m_parameters[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m__dict__[0m[0;34m[[0m[0;34m'_parameters'[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  q
