In [0]:
import numpy as np
import torch, random, unicodedata, re
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertModel, BertTokenizer

In [0]:
cmdc_train = './cmdc/cmdc_train.txt'
cmdc_dev = './cmdc/cmdc_dev.txt'
cmdc_test = './cmdc/cmdc_test.txt'

## Initial data processing


In [0]:
def get_prompts_and_replies(filename):
    '''loads the conversation data called filename (which should have prompts and replies on each line
    separated by a tab and already tokenized), and returns parallel lists of prompts and replies'''
    prompts = []
    replies = []
    with open(filename,encoding="utf-8") as inF:
        for line in inF:
            prompt, reply = line.split('\t')
            prompts.append(prompt.split())
            replies.append(reply.strip().split())
    return prompts,replies

In [0]:
def get_longest_utterance(utterances):
    '''get the longest utterance among a list of utterances'''
    return max([len(utterance) for utterance in utterances])

In [0]:
def build_vocab_dicts(utterances):
    '''given a list of utterances, build index dicts mapping tokens to ids and ids to tokens. Includes
    special padding, start of sentence, end of sentence, and unknown tokens'''
    vocab = set()
    for utterance in utterances:
        vocab.update(utterance)

    vocab_to_idx = {'<pad>': 0, '<SOS>':1, '<EOS>':2,'<UNK>':3}
    idx_to_vocab = {value:key for key,value in vocab_to_idx.items()}
    for word in vocab:
        idx_to_vocab[len(vocab_to_idx)] = word
        vocab_to_idx[word] = len(vocab_to_idx)
    return vocab_to_idx,idx_to_vocab

In [0]:
def converttext2tensors(utterances, vocab_to_idx, longest_utterance):
    '''given a list of lists of strings corresponding to utterances, converts each
    utterance to a single 1d tensor of ids based on vocab_to_index, padded to longest_utterance'''
    text_tensors =[]
    for utterance in utterances:
        tokens = np.array([vocab_to_idx.get(word,3) for word in utterance] + [2] + [0] * (longest_utterance - len(utterance)))
        assert len(tokens) <= longest_utterance + 1
        text_tensors.append(torch.tensor(tokens, dtype=torch.long))
    return text_tensors

In [0]:
class CBdataset(Dataset):
    '''Builds our pytorch Dataset after preprocessing our text (vectorizing and padding)'''
    def __init__(self, input_data, output_data):
        self.input_data = input_data
        self.output_data = output_data
        
    def __len__(self):
        return len(self.input_data)
    
    def __getitem__(self, index):
        target = self.output_data[index]
        data_val = self.input_data[index]
        return data_val,target 

## Building the neural chatboat

In [0]:
class WeightedSum(nn.Module):
    '''converts a matrix of the form (token length X batch_size X hidden_dim) to one of shape
    (batch_size, hidden_dim) by doing a weighted sum based on the similarity of each embedding
    to weight_vector'''
    def __init__(self, hidden_dim):
        super(WeightedSum,self).__init__()        
        self.weight_vector = nn.Parameter(torch.randn(hidden_dim, requires_grad=True))
    
    def forward(self,question_embeddings):
        question_embeddings = question_embeddings.permute(1,0,2)
        weights = (question_embeddings @ self.weight_vector).softmax(dim=1)
        weighted_question_embedding = (weights.unsqueeze(2)*question_embeddings).sum(axis=1)        
        return weighted_question_embedding

In [0]:
class DecoderRNN(nn.Module):
    '''The decoder part of a chatbot. Given an input_step of token ids for each batch, an last_hidden
    state from the previous time step, and the representation of the prompt from the encoder, it
    runs an RNN step, concatenates the output with the prompt embedding, and then applies two
    hidden layers to predict the next word (token id)'''
    def __init__(self, vocab_size, embedding_dim, hidden_size, n_layers=1, dropout=0.1):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding_dropout = nn.Dropout(dropout)
        self.gru = nn.GRU(embedding_dim, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout))
        self.hidden_layer = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_step, last_hidden, encoder_output):
        input_emb = self.embedding(input_step)
        input_emb = self.embedding_dropout(input_emb)
        rnn_out, hidden = self.gru(input_emb, last_hidden)
        cat = torch.cat((rnn_out, encoder_output.unsqueeze(0)), dim = 2)
        output = self.hidden_layer(cat)
        output = torch.tanh(output)
        output = self.out(output)
        output = output.squeeze(0)
        output = nn.functional.log_softmax(output, dim = 1)


        return output, hidden

        


In [16]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=433, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=440473133, style=ProgressStyle(description_…




BERT uses its own special wordpiece tokenization, and assumes sentences are initialized with the token `'[CLS]'` and that utterances are separated by the token `'[SEP]'`. The function provided below does the conversion to tensors for BERT, including padding.

In [0]:
def pad_and_convert_to_tensor(utterances, longest):
    '''Accepts variable lengths utterances and pads them to the same length'''    
    new_list = []
    for utterance in utterances:
        new_list.append(torch.tensor(utterance + [0]*(longest -len(utterance))))
    return new_list


def get_bert_tensors(utterances):
    '''Accepts a list of utterances, tokenizes them with BERTs tokenizer and converts wordpieces 
    to their integer IDs'''
    indexed_utterances = []
    longest = 0
    for utterance in utterances:
        utterance = '[CLS] ' + ' '.join(utterance) + ' [SEP]'
        tokenized_utterance = bert_tokenizer.tokenize(utterance)
        indexed_utterance = bert_tokenizer.convert_tokens_to_ids(tokenized_utterance)
        longest = len(indexed_utterance) if len(indexed_utterance) > longest else longest
        indexed_utterances.append(indexed_utterance)
    return pad_and_convert_to_tensor(indexed_utterances,longest)


In [0]:
class EncoderRNN(nn.Module):
    '''The encoder part of a chat bot. Given an input sequence corresponding to
    (utterance length X batch_size) of token ids, converts them to embeddings, runs through
    an RNN, does a weighted sum and returns a matrix (batch_size X embedding dim) of prompt
    representations and the final hidden state of the RNN'''
    def __init__(self, hidden_size, vocab_size, embedding_dim, n_layers=1, dropout=0, bert = False):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.bert = bert
        if bert:
            self.embedding = bert_model
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, n_layers, dropout=(0 if n_layers == 1 else dropout), 
                          bidirectional=True)
        self.hidden_layer = nn.Linear(hidden_dim*2, hidden_dim)
        self.weighted_sum = WeightedSum(hidden_dim)

    def forward(self, input_seq):
        if self.bert:
            embedded, _ = self.embedding(input_ids=input_seq)
        else:
            embedded = self.embedding(input_seq)
        input_lens = torch.sum(input_seq != 0, dim=0) # needed for packing
        packed = nn.utils.rnn.pack_padded_sequence(embedded, input_lens, enforce_sorted=False)
        rnn_out, hidden = self.gru(packed)
        rnn_out, _ = nn.utils.rnn.pad_packed_sequence(rnn_out)
        rnn_out = self.hidden_layer(rnn_out)
        output = self.weighted_sum(rnn_out) 

        return output, hidden

In [0]:
def generateReply(encoder, decoder, input_prompt, sample=False, bert = False):
    if bert:
        prompts_tensors = get_bert_tensors([input_prompt])
    else:
        prompts_tensors = converttext2tensors([input_prompt],vocab_to_idx,get_longest_utterance([input_prompt]))

    prompts_tensors = prompts_tensors[0].unsqueeze(1)
    #print(prompts_tensors.shape)

    input_tensor = prompts_tensors.to(device)
    encoder_out, encoder_hidden = encoder(input_tensor)
    decoder_input = torch.LongTensor([[1 for _ in range(prompts_tensors.shape[1])]]) #1 is <SOS> idx
    decoder_input = decoder_input.to(device)
    decoder_hidden = encoder_hidden[:decoder_n_layers]

    output_words = []
    max_length = 10

    while len(output_words) < max_length:
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_out)
        #print(decoder_output.shape) 
        if sample:
            output_probs = np.exp(decoder_output.to('cpu').detach().numpy())[0]
            output_idx = np.random.choice(decoder_output.shape[1], 1, p = output_probs)
            decoder_input = torch.Tensor(output_idx).long().unsqueeze(1)
            decoder_input = decoder_input.to(device)
        else:
            output_idx = torch.argmax(decoder_output, dim = 1)
            decoder_input = output_idx.unsqueeze(1)
            decoder_input = decoder_input.to(device)
            output_idx = output_idx.item()

        if bert:
            word = bert_tokenizer.convert_ids_to_tokens(output_idx)
        else:
            word = idx_to_vocab[output_idx]

        if word == '[CLS]':
            continue
        if word == '<EOS>' or word == '[SEP]':
            break
        # if word.startswith("##"):
        #     word = word[2:]
        output_words.append(word)

    return output_words

In [0]:
def getUserInput(encoder, decoder,sample=False, bert = False):
    '''Given a trained encoder and decoder for a chatbot, reads in user text, normalizes it 
    and passes it to the generateReply() function'''
    input_sentence = ''
    while(1):
        # Get input sentence
        input_sentence = input('> ')
        # Check if it is quit case
        if input_sentence == 'q' or input_sentence == 'quit': break
        # Normalize sentence
        input_sentence = normalizeString(input_sentence)
        # Generate reply
        output_words = generateReply(encoder, decoder, input_sentence.split(),sample=sample, bert = bert)
        # Format and print response sentence
        output_words[:] = [x for x in output_words if not (x == '<EOS>' or x == '<pad>')]
        print(chatbot_name + ":", ' '.join(output_words))

In [21]:
hidden_dim = 100
embedding_dim = 768
batch_size = 128
dropout = 0.1
clip = 10
decoder_n_layers = 1
epochs = 30
teacher_forcing_ratio = 0.8
learning_rate = 1e-4
decoder_learning_ratio = 5.0
loss_function = nn.NLLLoss(ignore_index=0)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [0]:
prompts, replies = get_prompts_and_replies(cmdc_train)
prompts_tensors = get_bert_tensors(prompts)
replies_tensors = get_bert_tensors(replies)
train_dataset = CBdataset(prompts_tensors, replies_tensors)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

In [0]:
p_encoder = EncoderRNN(hidden_dim, bert_tokenizer.vocab_size, embedding_dim, bert=True)
encoder_optimizer = optim.Adam(p_encoder.parameters(), lr=learning_rate)
r_decoder = DecoderRNN(bert_tokenizer.vocab_size, embedding_dim, hidden_dim)
decoder_optimizer = optim.Adam(r_decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

In [24]:
# Ensure dropout layers are in train mode
p_encoder.train()
r_decoder.train()
p_encoder.to(device)
r_decoder.to(device)
counter = 0
for epoch in range(epochs):
    print("Epoch:", epoch)
    epoch_loss_history = []
    for train_in, train_out in train_dataloader:
        batch_loss = 0
        train_in, train_out = train_in.t().to(device), train_out.t().to(device)
        longest_utterance_in_batch = max(torch.sum(train_out != 0, dim=0))

        # Pass through encoder
        encoder_out, encoder_hidden = p_encoder(train_in)
        
        # Prep initial state of decoder
        decoder_input = torch.LongTensor([[1 for _ in range(train_in.shape[1])]]) #1 is <SOS> idx
        decoder_input = decoder_input.to(device)
        decoder_hidden = encoder_hidden[:decoder_n_layers]

        # Determine if we have teacher forcing
        use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
        for t in range(longest_utterance_in_batch):   
            decoder_output, decoder_hidden = r_decoder(decoder_input, decoder_hidden, encoder_out) 
            batch_loss += loss_function(decoder_output, train_out[t])
            # implement the teacher forcing choice using the use_teacher_forcing boolean  
            if use_teacher_forcing:
                decoder_input = train_out[t].unsqueeze(0)
            else:
                decoder_input = torch.argmax(decoder_output, 1).unsqueeze(0)
  
        batch_loss.backward()
        epoch_loss_history.append(batch_loss.item())
        # Clip gradients: gradients are modified in place
        _ = nn.utils.clip_grad_norm_(p_encoder.parameters(), clip)
        _ = nn.utils.clip_grad_norm_(r_decoder.parameters(), clip)

        # Adjust model weights
        encoder_optimizer.step()
        decoder_optimizer.step()
        counter += 1
        # if counter %20 == 0:
        #     print('Processed:', counter*batch_size, 'of', len(train_dataset))
            
    print('After epoch', epoch, 'loss:', np.mean(epoch_loss_history))
    counter = 0

Epoch: 0
After epoch 0 loss: 55.73053069612873
Epoch: 1
After epoch 1 loss: 42.43156409382227
Epoch: 2
After epoch 2 loss: 40.15135604232105
Epoch: 3
After epoch 3 loss: 38.98281636878626
Epoch: 4
After epoch 4 loss: 37.54163203547843
Epoch: 5
After epoch 5 loss: 36.125199953715004
Epoch: 6
After epoch 6 loss: 35.54246772462456
Epoch: 7
After epoch 7 loss: 35.35595742980046
Epoch: 8
After epoch 8 loss: 34.191769234576626
Epoch: 9
After epoch 9 loss: 33.44221929768425
Epoch: 10
After epoch 10 loss: 33.84090556434138
Epoch: 11
After epoch 11 loss: 31.80801658725264
Epoch: 12
After epoch 12 loss: 32.444529685214974
Epoch: 13
After epoch 13 loss: 32.23501539467579
Epoch: 14
After epoch 14 loss: 31.632749842174018
Epoch: 15
After epoch 15 loss: 31.365764157689032
Epoch: 16
After epoch 16 loss: 31.238405868188657
Epoch: 17
After epoch 17 loss: 30.55668385349103
Epoch: 18
After epoch 18 loss: 30.50053205063094
Epoch: 19
After epoch 19 loss: 30.635914275895303
Epoch: 20
After epoch 20 loss: 29

In [57]:
def eval(encoder, decoder,dev_file):
    '''evaluate a chatbot encoder/decoder by seeing how often given a prompt, it correctly predicts 
    the first word of the reply. Returns the accuracy'''
    dev_prompts,dev_replies = get_prompts_and_replies(dev_file)
    total = 0
    correct = 0
    for i in range(len(dev_prompts)):
        total += 1
        prompt = dev_prompts[i]
        gold_reply = dev_replies[i]
        pred_reply = generateReply(p_encoder, r_decoder, prompt,bert=True)
        if pred_reply and gold_reply and pred_reply[0] == gold_reply[0]:
            correct += 1
        
    return correct/total

score = eval(p_encoder,r_decoder,cmdc_dev)
print(score)

0.11482806908355375
