# Training a dialog model using HuggingFace's pytorch-transformers and the ConvAI Dataset 

### Incorporating migration notes from pytorch_pretrained_bert -> pytorch_transformers
This notebook updates the implementations from the following post so that they run with pytorch-transformers: https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313

## Basic Setup Overview

In [1]:
'''
requirements.txt: 
torch
pytorch-ignite
#pytorch-pretrained-bert >= 0.6.2 -> replaced with pytorch-transformers
pytorch-transformers
tensorboardX
tensorflow  # for tensorboardX
'''

'\nrequirements.txt: \ntorch\npytorch-ignite\n#pytorch-pretrained-bert >= 0.6.2 -> replaced with pytorch-transformers\npytorch-transformers\ntensorboardX\ntensorflow  # for tensorboardX\n'

In [3]:
from pytorch_transformers import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer

# Load the pretrained 'openai-gpt' checkpoint for the double-heads model and the
# tokenizer published under the same name. Both objects are reused by the cells below.
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.


In [4]:
# Five extra tokens are introduced on top of the pretrained vocabulary:
#   <bos>       start of the whole input sequence
#   <eos>       end of the whole input sequence
#   <speaker1>  marks the beginning and the tokens of an utterance from the user
#   <speaker2>  marks the beginning and the tokens of an utterance from the bot
#   <pad>       filler used to build batches of equal-length sequences
SPECIAL_TOKENS = dict(
    bos_token="<bos>",
    eos_token="<eos>",
    speaker1_token="<speaker1>",
    speaker2_token="<speaker2>",
    pad_token="<pad>",
)

# Register the tokens with the tokenizer, then resize the model's token
# embeddings to the new vocabulary size (the resize call is the cell's output).
num_added_token = tokenizer.add_special_tokens(SPECIAL_TOKENS)
print("Number of added tokens: ", num_added_token)
model.resize_token_embeddings(len(tokenizer))

Number of added tokens:  5


Embedding(40483, 768)

In [5]:
# Show the special tokens the tokenizer reports, then read back one of the
# speaker attributes.
# NOTE(review): in the recorded output the two speaker tokens are missing from
# all_special_tokens although tokenizer.speaker1_token resolves — confirm this
# is the intended behaviour for the installed pytorch-transformers version.
print(tokenizer.all_special_tokens)
tokenizer.speaker1_token

['<pad>', '<bos>', '<eos>', '<unk>']


'<speaker1>'

In [75]:
from itertools import chain

# Let's define our contexts and special tokens
persona = [["i", "like", "playing", "football", "."],
           ["i", "am", "from", "NYC", "."]]
history = [["hello", "how", "are", "you", "?"],
           ["i", "am", "fine", "thanks", "."]]
reply = ["great", "to", "hear"]
bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"

def build_inputs(persona, history, reply):
    """Assemble the word, segment and position inputs for one candidate reply.

    Args:
        persona: list of tokenized persona sentences (each a list of str).
        history: list of tokenized dialog utterances (each a list of str).
        reply: tokenized candidate reply (list of str).

    Returns:
        A tuple ``(words, segments, position, sequence)``:
        ``sequence`` is the list of delimited segments (persona, each history
        utterance, reply); ``words`` is its flattened token list; ``segments``
        holds one speaker tag per word; ``position`` is ``0..len(words)-1``.
    """
    # Build our sequence by adding delimiters and concatenating
    sequence = [[bos] + list(chain(*persona))] + history + [reply + [eos]]
    # `len(sequence)` below still refers to the untagged list built just above.
    sequence = [sequence[0]] + [[speaker2 if (len(sequence) - i) % 2 else speaker1] + s
                                for i, s in enumerate(sequence[1:])]
    # Build our word, segments and position inputs from the sequence
    words = list(chain(*sequence))                          # word tokens

    # Each utterance is typed with its own leading speaker tag, so segments and
    # speaker tokens agree for any history length. Index parity only matched
    # the tags when the history had an even number of utterances.
    # The persona segment keeps the <speaker2> type it had before.
    segment_tags = [speaker2] + [s[0] for s in sequence[1:]]
    segments = [tag for tag, s in zip(segment_tags, sequence) for _ in s]  # segment tokens
    position = list(range(len(words)))                      # position tokens
    return words, segments, position, sequence

words, segments, position, sequence = build_inputs(persona, history, reply)

# >>> print(sequence)  # Our inputs looks like this:
# [['<bos>', 'i', 'like', 'playing', 'football', '.', 'i', 'am', 'from', 'NYC', '.'],
#  ['<speaker1>', 'hello', 'how', 'are', 'you', '?'],
#  ['<speaker2>', 'i', 'am', 'fine', 'thanks', '.'],
#  ['<speaker1>', 'great', 'to', 'hear', '<eos>']]



In [76]:
# Map the word tokens and the per-word segment tags to vocabulary ids.
# NOTE(review): `words` and `segments` are overwritten in place, so this cell is
# meant to run once, right after the build_inputs cell.
words = tokenizer.convert_tokens_to_ids(words)
segments = tokenizer.convert_tokens_to_ids(segments)

# Don't use tokenizer.encode(x) here because the sequence is already tokenized.
# For a raw string, use tokenizer.encode(x), which is equivalent to
# tokenizer.convert_tokens_to_ids(tokenizer.tokenize(x)).

In [77]:
import torch

In [78]:
# A second candidate reply, built on the same persona and history as the first.
distractor = ["sorry", "to", "hear", "that"]

words_distractor, segments_distractor, _, _ = build_inputs(persona, history, distractor)

# Convert the distractor's word tokens and segment tags to vocabulary ids.
words_distractor = tokenizer.convert_tokens_to_ids(words_distractor)
segments_distractor = tokenizer.convert_tokens_to_ids(segments_distractor)

In [79]:
# Prepare language modeling targets: keep only the reply segment, -1 on the rest.
# The first term puts -1 on every segment before the reply, the extra [-1] covers
# the reply's leading speaker token, and sequence[-1][1:] supplies the real target ids.
lm_targets = [-1] * sum(len(s) for s in sequence[:-1]) + [-1] + tokenizer.convert_tokens_to_ids(sequence[-1][1:])
# The distractor carries no language modeling targets: every position is -1.
lm_distractor = [-1]*len(words_distractor)

In [80]:
# Index of the last token of each candidate, taken before any padding is
# appended (these become mc_token_ids further down).
last_token_idx = len(words) - 1
last_token_distractor = len(words_distractor) -1

# Pad reply and distractor inputs and targets to the same length:
# the longer of the two candidates sets the common length.
padding_length = max(len(words), len(words_distractor))

def pad(x, padding, padding_length):
    """Return a new list: `x` followed by enough copies of `padding` to reach `padding_length`.

    When `x` already has `padding_length` items or more, nothing is appended.
    """
    shortfall = padding_length - len(x)
    filler = [padding for _ in range(shortfall)]
    return x + filler

# Right-fill the word ids and the segment ids of both candidates with the
# <pad> token id until they all have `padding_length` entries.
words, words_distractor, segments, segments_distractor = (
    pad(ids, tokenizer.convert_tokens_to_ids('<pad>'), padding_length)
    for ids in (words, words_distractor, segments, segments_distractor)
)

assert len(words) == len(words_distractor)

In [81]:
# Extend both target lists to the common length with -1, the same value used
# for the non-reply positions of the targets.
lm_targets, lm_distractor = (
    pad(targets, -1, padding_length)
    for targets in (lm_targets, lm_distractor)
)

In [82]:
# Stack the reply and the distractor sequences inside one extra outer list;
# the recorded size of the resulting tensor is (1, 2, 29).
input_ids = torch.tensor([[words, words_distractor]], dtype = torch.long)

In [83]:
input_ids.size()

torch.Size([1, 2, 29])

In [88]:
# Segment ids for both candidates, nested the same way as input_ids.
token_type_ids = torch.tensor([[segments, segments_distractor]], dtype=torch.long)
# Index of each candidate's last pre-padding token.
# NOTE(review): presumably the position the multiple-choice head reads from — confirm against the model docs.
mc_token_ids = torch.tensor([[last_token_idx, last_token_distractor]])

# Language modeling targets for both candidates (-1 everywhere except the reply tokens of the first one).
lm_labels = torch.tensor([[lm_targets, lm_distractor]], dtype=torch.long)
mc_labels = torch.tensor([0], dtype=torch.long) # first one is the gold label. Index of first one is 0 

In [117]:
# New return format: the model returns a tuple. With both label tensors
# supplied, the first four entries are unpacked here as
# (lm loss, mc loss, lm prediction logits, mc prediction logits).
# Everything after input_ids is passed by keyword so the call does not depend
# on the positional order of `forward`.
# NOTE(review): that order is believed to differ between pytorch-transformers
# releases — confirm against the installed version.
outputs = model(input_ids,
                mc_token_ids=mc_token_ids,
                lm_labels=lm_labels,
                mc_labels=mc_labels,
                token_type_ids=token_type_ids)
# Slice before unpacking so extra trailing outputs do not break the assignment.
lm_loss, mc_loss, lm_predict, mc_predict = outputs[:4]

# Weighted sum of the two objectives; the language modeling loss counts double.
lm_coef = 2.0
mc_coef = 1.0
total_loss = lm_loss*lm_coef + mc_loss*mc_coef

In [119]:
import numpy as np 

In [97]:
total_loss

tensor(19.2084, grad_fn=<AddBackward0>)

# Training on entire dataset (ConvAI)

Original code referenced: [github repo](https://github.com/huggingface/transfer-learning-conv-ai/blob/master/train.py)

In [98]:
import json
from pytorch_transformers import cached_path

In [99]:
# Location of the PersonaChat json file that is fetched through cached_path below.
# NOTE(review): an s3:// URL presumably makes cached_path go through boto3 and
# AWS credentials; the upstream train.py is believed to use the equivalent
# https://s3.amazonaws.com/... address — confirm which form is intended.
url = "s3://datasets.huggingface.co/personachat/personachat_self_original.json"

In [123]:
# Resolve the dataset URL to a local file (the recorded output shows a path
# under ~/.cache/torch/pytorch_transformers), then parse it as JSON.
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f: 
    dataset = json.load(f)
    
# NOTE(review): this prints an absolute, user-specific path — consider clearing the output before sharing.
print(personachat_file)

/Users/justincho/.cache/torch/pytorch_transformers/738e4d3f264f46d2c9161a43c7389d03a34fb336ae842a4337014123c68e744e.bb42905dd6e1098e87c24845469ee12018cfd142e10fcc50f97b28e002a9ac02


In [113]:
def tokenize_dataset(obj):
    """Recursively replace every string in a nested structure with its token ids.

    Args:
        obj: a str, a dict, any other iterable of such objects, or a
            non-iterable leaf value (e.g. an int token id).

    Returns:
        - for a str: the list of ids from the module-level `tokenizer`;
        - for a dict: a dict with the same keys and converted values;
        - for any other iterable: a list of converted items;
        - for a non-iterable leaf: the value itself, unchanged.

    Passing leaves through means calling the function on already-tokenized
    data returns equal data instead of raising TypeError on the int ids.
    """
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    if isinstance(obj, dict):
        return {n: tokenize_dataset(o) for n, o in obj.items()}
    try:
        items = iter(obj)
    except TypeError:
        # Not a container (int, float, None, ...): nothing to tokenize.
        return obj
    return [tokenize_dataset(o) for o in items]

# Replace the loaded dataset with its tokenized version: every string in the
# nested dict/list structure becomes a list of token ids.
dataset = tokenize_dataset(dataset)

# End of code content shown in Medium post 

# The actual training and interaction code are in the forked repo