# Training a dialog model using HuggingFace's pytorch-transformers and the ConvAI Dataset 

### Incorporating migration notes from pytorch_pretrained_bert -> pytorch_transformers
Updating the implementations in: https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313

## Basic Setup Overview

In [1]:
'''
requirements.txt: 
torch
pytorch-ignite
#pytorch-pretrained-bert >= 0.6.2 -> replaced with pytorch-transformers
pytorch-transformers
tensorboardX
tensorflow  # for tensorboardX
'''

'\nrequirements.txt: \ntorch\npytorch-ignite\n#pytorch-pretrained-bert >= 0.6.2 -> replaced with pytorch-transformers\npytorch-transformers\ntensorboardX\ntensorflow  # for tensorboardX\n'

In [6]:
from pytorch_transformers import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer

model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [4]:
# We will use 5 special tokens:
# - <bos> to indicate the start of the sequence
# - <eos> to indicate the end of the sequence
# - <speaker1> to indicate the beginning and the tokens of an utterance from the user
# - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
# - <pad> as a padding token to build batches of sequences
SPECIAL_TOKENS = {"bos_token": "<bos>", 
                  "eos_token": "<eos>",
                  "speaker1_token": "<speaker1>", 
                  "speaker2_token": "<speaker2>",
                  "pad_token": "<pad>"}

# We can add these special tokens to the vocabulary and the embeddings of the model:
num_added_token = tokenizer.add_special_tokens(SPECIAL_TOKENS)
print("Number of added tokens: ", num_added_token)
model.resize_token_embeddings(len(tokenizer))

Number of added tokens:  5


Embedding(40483, 768)

In [5]:
print(tokenizer.all_special_tokens)
tokenizer.speaker1_token

['<pad>', '<bos>', '<eos>', '<unk>']


'<speaker1>'

In [75]:
from itertools import chain

# Let's define our contexts and special tokens
persona = [["i", "like", "playing", "football", "."],
           ["i", "am", "from", "NYC", "."]]
history = [["hello", "how", "are", "you", "?"],
           ["i", "am", "fine", "thanks", "."]]
reply = ["great", "to", "hear"]
bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"

def build_inputs(persona, history, reply):
    # Build our sequence by adding delimiters and concatenating
    sequence = [[bos] + list(chain(*persona))] + history + [reply + [eos]]
    sequence = [sequence[0]] + [ [speaker2 if (len(sequence)-i) % 2 else speaker1] + s
                                for i, s in enumerate(sequence[1:])]
    # Build our word, segments and position inputs from the sequence
    words = list(chain(*sequence))                          # word tokens
    
    ''' fixed i -> i+1 to make it consistent with the labels in `sequence`'''
    segments = [speaker2 if (i+1) % 2 else speaker1             # segment tokens
                for i, s in enumerate(sequence) for _ in s]
    position = list(range(len(words)))                      # position tokens
    return words, segments, position, sequence

words, segments, position, sequence = build_inputs(persona, history, reply)

# >>> print(sequence)  # Our inputs looks like this:
# [['<bos>', 'i', 'like', 'playing', 'football', '.', 'i', 'am', 'from', 'NYC', '.'],
#  ['<speaker1>', 'hello', 'how', 'are', 'you', '?'],
#  ['<speaker2>', 'i', 'am', 'fine', 'thanks', '.'],
#  ['<speaker1>', 'great', 'to', 'hear', '<eos>']]



In [76]:
# Tokenize words and segments embeddings:
words = tokenizer.convert_tokens_to_ids(words)
segments = tokenizer.convert_tokens_to_ids(segments)

# don't use tokenizer.encode(x) because the sequence is already tokenized. 
# If given as a string, use tokenizer.encode(x), which is equivalent to tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x))

In [77]:
import torch

In [78]:
distractor = ["sorry", "to", "hear", "that"]

words_distractor, segments_distractor, _, _ = build_inputs(persona, history, distractor)
words_distractor, segments_distractor = tokenizer.convert_tokens_to_ids(words_distractor),  tokenizer.convert_tokens_to_ids(segments_distractor)

In [79]:
# prepare language modeling targets: keep only the reply segment, -1 on the rest 
lm_targets = [-1] * sum(len(s) for s in sequence[:-1]) + [-1] + tokenizer.convert_tokens_to_ids(sequence[-1][1:])
lm_distractor = [-1]*len(words_distractor)

In [80]:
last_token_idx = len(words) - 1
last_token_distractor = len(words_distractor) -1

# pad reply and distractor inputs and targets to same length 
padding_length = max(len(words), len(words_distractor))

def pad(x, padding, padding_length): 
    return x + [padding] * (padding_length - len(x))

words, words_distractor, segments, segments_distractor = [pad(x, tokenizer.convert_tokens_to_ids('<pad>'), padding_length) for x in (words, words_distractor, segments, segments_distractor)]

assert len(words) == len(words_distractor)

In [81]:
lm_targets, lm_distractor = [pad(x, -1, padding_length) for x in (lm_targets, lm_distractor)]

In [82]:
input_ids = torch.tensor([[words, words_distractor]], dtype = torch.long)

In [83]:
input_ids.size()

torch.Size([1, 2, 29])

In [88]:
token_type_ids = torch.tensor([[segments, segments_distractor]], dtype=torch.long)
mc_token_ids = torch.tensor([[last_token_idx, last_token_distractor]])

lm_labels = torch.tensor([[lm_targets, lm_distractor]], dtype=torch.long)
mc_labels = torch.tensor([0], dtype=torch.long) # first one is the gold label. Index of first one is 0 

In [117]:
# need to change it to new loss return format (comes in tuples, with loss and the prediction logits)
lm_loss, mc_loss, lm_predict, mc_predict = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids) 

lm_coef = 2.0 
mc_coef = 1.0 
total_loss = lm_loss*lm_coef + mc_loss*mc_coef

In [119]:
import numpy as np 

In [97]:
total_loss

tensor(19.2084, grad_fn=<AddBackward0>)

# Training on entire dataset (ConvAI)

Original code referenced: [github repo](https://github.com/huggingface/transfer-learning-conv-ai/blob/master/train.py)

In [2]:
import json
from pytorch_transformers import cached_path

In [3]:
url = "s3://datasets.huggingface.co/personachat/personachat_self_original.json"

In [65]:
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f: 
    dataset = json.load(f)
    
print(personachat_file)

/Users/justincho/.cache/torch/pytorch_transformers/738e4d3f264f46d2c9161a43c7389d03a34fb336ae842a4337014123c68e744e.bb42905dd6e1098e87c24845469ee12018cfd142e10fcc50f97b28e002a9ac02


In [50]:
personachat = dataset

In [12]:


def tokenize_dataset(obj): 
    if isinstance(obj, str): 
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    if isinstance(obj, dict): 
        return {n: tokenize_dataset(o) for n, o in obj.items()}
    return list(tokenize_dataset(o) for o in obj)

dataset_cache = './dataset_cache'
dataset_cache = dataset_cache + '_' + type(tokenizer).__name__

if dataset_cache and os.path.isfile(dataset_cache):
    dataset = torch.load(dataset_cache)
else: 
    dataset = tokenize_dataset(dataset)
    torch.save(dataset, dataset_cache)


In [66]:
dataset['train'][0]

{'personality': ['i like to remodel homes .',
  'i like to go hunting .',
  'i like to shoot a bow .',
  'my favorite holiday is halloween .'],
 'utterances': [{'candidates': ['my mom was single with 3 boys , so we never left the projects .',
    'i try to wear all black every day . it makes me feel comfortable .',
    'well nursing stresses you out so i wish luck with sister',
    'yeah just want to pick up nba nfl getting old',
    'i really like celine dion . what about you ?',
    'no . i live near farms .',
    "i wish i had a daughter , i'm a boy mom . they're beautiful boys though still lucky",
    'yeah when i get bored i play gone with the wind my favorite movie .',
    "hi how are you ? i'm eating dinner with my hubby and 2 kids .",
    'were you married to your high school sweetheart ? i was .',
    'that is great to hear ! are you a competitive rider ?',
    "hi , i'm doing ok . i'm a banker . how about you ?",
    "i'm 5 years old",
    'hi there . how are you today ?',
  

[{'candidates': [[547,
    1631,
    509,
    2433,
    556,
    281,
    2590,
    240,
    620,
    606,
    868,
    999,
    481,
    12810,
    239],
   [249,
    1302,
    485,
    2985,
    589,
    1301,
    1099,
    850,
    239,
    507,
    2191,
    510,
    1064,
    2589,
    239],
   [862, 12618, 33292, 512, 551, 620, 249, 2275, 3754, 556, 1971],
   [1439, 668, 823, 485, 2572, 609, 9, 6987, 39731, 1381, 1122],
   [249, 976, 649, 22044, 17082, 239, 599, 670, 512, 257],
   [664, 239, 249, 1894, 1957, 17367, 239],
   [249,
    2275,
    249,
    558,
    246,
    2332,
    240,
    249,
    256,
    258,
    246,
    1541,
    1631,
    239,
    600,
    256,
    716,
    1871,
    2590,
    998,
    843,
    3172],
   [1439,
    669,
    249,
    727,
    5271,
    249,
    2200,
    1374,
    556,
    481,
    2272,
    547,
    3898,
    4121,
    239],
   [3569,
    718,
    640,
    512,
    257,
    249,
    256,
    258,
    3658,
    2340,
    556,
    547,
    884

# End of code content shown in Medium post 

# Actual training code and interaction code is in forked repo 

# Reformatting yes-and dataset for fine tuning Huggingface implementation of ConvAI 

In [80]:
with open('../../ISI_exchange/yesand_data/yes-and-data.json', 'r') as f: 
    yesand_data = json.load(f)

In [81]:

all_yesands = []

for k, v in yesand_data['yes-and'].items(): 
    all_yesands += v 
    
reformatted_yesands = []
for idx, yesand in enumerate(all_yesands): 
    instance = {"personality": "", "utterances": []}
    utterance = {"history": [yesand['p']], "candidates": [all_yesands[(idx+1)%len(all_yesands)]['r'], yesand['r']]}
    instance["utterances"].append(utterance)
    reformatted_yesands.append(instance)
    
    

In [82]:
reformatted_yesands[0]

{'personality': '',
 'utterances': [{'history': ["It's good to see you!  What've you been up to?"],
   'candidates': ['Hey, speaking of rude, would you mind climbing out for this conversation?  Because my arms are really getting tired.',
    'Uh, well, I was sleeping, uh, most recently, and then you guys opened the lid and then it was like a rude awakening for me.']}]}

In [84]:
from sklearn.model_selection import train_test_split

In [85]:
train, valid = train_test_split(reformatted_yesands, test_size=0.2, random_state=42)

In [91]:
reformatted_yesands = {'train': train, 'valid': valid}

with open("reformatted_yesands.json", 'w') as f: 
    json.dump(obj=reformatted_yesands, fp=f)

In [87]:
tokenized_yesands = tokenize_dataset(reformatted_yesands)

In [None]:
import os
import math
import logging
from pprint import pformat
from argparse import ArgumentParser
from collections import defaultdict
from itertools import chain

import torch
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, TensorDataset
from ignite.engine import Engine, Events
from ignite.handlers import ModelCheckpoint
from ignite.metrics import Accuracy, Loss, MetricsLambda, RunningAverage
from ignite.contrib.handlers import ProgressBar, PiecewiseLinear
from ignite.contrib.handlers.tensorboard_logger import TensorboardLogger, OutputHandler, OptimizerParamsHandler
# Migration Notes: pytorch_pretrained_bert -> pytorch_transformers. Also, there is no OpenAIAdam. OpenAIAdam -> AdamW
from pytorch_transformers import (AdamW, OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
                                     GPT2DoubleHeadsModel, GPT2Tokenizer, WEIGHTS_NAME, CONFIG_NAME)

In [57]:
num_added_token = tokenizer.add_special_tokens(SPECIAL_TOKENS)

In [58]:
bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(list(SPECIAL_TOKENS.values())[:-1])

In [59]:
bos, eos, speaker1, speaker2

(40478, 40479, 40480, 40481)

In [88]:

SPECIAL_TOKENS = {"bos_token": "<bos>", 
                  "eos_token": "<eos>",
                  "speaker1_token": "<speaker1>", 
                  "speaker2_token": "<speaker2>",
                  "pad_token": "<pad>"}

MODEL_INPUTS = ["input_ids", "mc_token_ids", "lm_labels", "mc_labels", "token_type_ids"]
PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]


def pad_dataset(dataset, padding=0):
    """ Pad the dataset. This could be optimized by defining a Dataset class and pad only batches but this is simpler. """
    max_l = max(len(x) for x in dataset["input_ids"])
    for name in PADDED_INPUTS:
        dataset[name] = [x + [padding if name != "lm_labels" else -1] * (max_l - len(x)) for x in dataset[name]]
    return dataset


def build_input_from_segments(persona, history, reply, tokenizer, lm_labels=False, with_eos=True):
    """ Build a sequence of input from 3 segments: persona, history and last reply """
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(list(SPECIAL_TOKENS.values())[:-1])

    instance = {}
    sequence = [[bos] + list(chain(*persona))] + history + [reply + ([eos] if with_eos else [])]
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) % 2 else speaker1] + s for i, s in enumerate(sequence[1:])]

    instance["input_ids"] = list(chain(*sequence))
    instance["token_type_ids"] = [speaker2 if i % 2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    instance["mc_token_ids"] = len(instance["input_ids"]) - 1
    instance["lm_labels"] = [-1] * len(instance["input_ids"])
    if lm_labels:
        instance["lm_labels"] = ([-1] * sum(len(s) for s in sequence[:-1])) + [-1] + sequence[-1][1:]
    return instance, sequence


def get_data_loaders(tokenizer):
    """ Prepare the dataset for training and evaluation """
#     dataset_cache = './dataset_cache'
#     dataset_cache = dataset_cache + '_' + type(tokenizer).__name__

#     if dataset_cache and os.path.isfile(dataset_cache):
#         personachat = torch.load(dataset_cache)

    personachat = tokenized_yesands

    import pdb; pdb.set_trace()
    print("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for dataset_name, dataset in personachat.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if 2 > 0 and dataset_name == 'train':
            num_candidates = min(2, num_candidates)
        for dialog in dataset:
            persona = dialog["personality"].copy()
            for _ in range(1):
                for utterance in dialog["utterances"]:
                    history = utterance["history"][-(2*2+1):]
                    for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                        lm_labels = bool(j == num_candidates-1)
                        instance, _ = build_input_from_segments(persona, history, candidate, tokenizer, lm_labels)
                        for input_name, input_array in instance.items():
                            datasets[dataset_name][input_name].append(input_array)
                    datasets[dataset_name]["mc_labels"].append(num_candidates - 1)
                    datasets[dataset_name]["n_candidates"] = num_candidates
                persona = [persona[-1]] + persona[:-1]  # permuted personalities
                
            break 
#         break 

    print("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS["pad_token"]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            if input_name != "mc_labels":
                tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
            tensor_datasets[dataset_name].append(tensor)
            
#         break

    print("Build train and validation dataloaders")
    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    train_sampler = None
    valid_sampler = None
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=4, shuffle=(not False))
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=4, shuffle=False)

    print("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    print("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
    
    return train_loader, valid_loader, train_sampler, valid_sampler




In [89]:
get_data_loaders(tokenizer)

> <ipython-input-88-7cbd7b890c89>(48)get_data_loaders()
-> print("Build inputs and labels")
(Pdb) n
Build inputs and labels
> <ipython-input-88-7cbd7b890c89>(49)get_data_loaders()
-> datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
(Pdb) n
> <ipython-input-88-7cbd7b890c89>(50)get_data_loaders()
-> for dataset_name, dataset in personachat.items():
(Pdb) n
> <ipython-input-88-7cbd7b890c89>(51)get_data_loaders()
-> num_candidates = len(dataset[0]["utterances"][0]["candidates"])
(Pdb) n
> <ipython-input-88-7cbd7b890c89>(52)get_data_loaders()
-> if 2 > 0 and dataset_name == 'train':
(Pdb) n
> <ipython-input-88-7cbd7b890c89>(53)get_data_loaders()
-> num_candidates = min(2, num_candidates)
(Pdb) n
> <ipython-input-88-7cbd7b890c89>(54)get_data_loaders()
-> for dialog in dataset:
(Pdb) n
> <ipython-input-88-7cbd7b890c89>(55)get_data_loaders()
-> persona = dialog["personality"].copy()
(Pdb) n
> <ipython-input-88-7cbd7b890c89>(56)get_data_loaders()
-> for _ in range(1):
(Pdb) n

BdbQuit: 