## Model Initialization

In [1]:
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel

# Load the GPT-2 tokenizer and the double-heads GPT-2 model from the
# pretrained 'gpt2' checkpoint. Both names are reused by the cells below.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
# Bare last expression: the notebook displays the module tree of the model.
model

GPT2DoubleHeadsModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2):

In [8]:
import torch.nn as nn
import torch

# Movie genres used as the conditioning "persona". Every genre name maps to a
# bracketed marker token, e.g. 'comedy' -> '<comedy>'. The order of the names
# is significant: it fixes the order in which the marker tokens are registered.
genre_dict = {
    name: '<' + name + '>'
    for name in (
        'comedy', 'sport', 'biography', 'romance', 'action', 'adventure',
        'drama', 'sci-fi', 'family', 'fantasy', 'musical', 'crime',
        'thriller', 'short', 'western', 'documentary', 'horror',
        'animation', 'film-noir', 'music', 'war', 'mystery',
    )
}

genres = genre_dict.keys()

# Two speaker tags first, then one marker token per genre.
special_tokens = ["<speaker1>", "<speaker2>"]
special_tokens.extend("<" + genre + ">" for genre in genres)

SPECIAL_TOKENS = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "additional_special_tokens": special_tokens,
    "pad_token": "<pad>",
}

# Register the new tokens, then grow the embedding matrix to the new vocabulary
# size. The resize call is the last expression so the notebook displays its result.
tokenizer.add_special_tokens(SPECIAL_TOKENS)
model.resize_token_embeddings(len(tokenizer))

Embedding(50284, 768)

In [9]:
# Retrain only the final fully-connected layer of the first head (language modeling task).
# With ngpu = 0 the condition below always selects the CPU, even if CUDA is available.
ngpu = 0
device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0) else "cpu")

# Freeze every parameter the model currently has.
for param in model.parameters():
    param.requires_grad = False

# Parameters of newly constructed modules have requires_grad=True by default,
# so the layer created here is the only trainable part of the model.
# NOTE(review): the new layer starts from freshly initialised weights instead of
# the pretrained output projection, and nn.Linear adds a bias term by default --
# confirm both are intended.
# NOTE(review): the loop above also freezes whatever parameters produce the
# multiple-choice loss used in the training cell, so that loss term presumably
# cannot improve -- confirm this is intended.
model.lm_head = nn.Linear(model.lm_head.in_features, len(tokenizer))

model = model.to(device)

## Data Processing

In [17]:
from __future__ import print_function, division
import os
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import random
import transformers
from itertools import chain

#handles one personality and history sentence
def build_inputs_simple(persona, history, reply):
    """Assemble one dialogue sample as flat word, segment and position lists.

    Args:
        persona: list of tokens describing the persona; "<bos>" is prepended.
        history: list of previous utterances, each itself a list of tokens.
        reply: list of tokens of the candidate reply; "<eos>" is appended.

    Returns:
        Tuple ``(words, segments, position, sequence)``:
        ``sequence`` is the list of per-turn token lists (persona turn first,
        then every utterance with a speaker tag prepended), ``words`` is
        ``sequence`` flattened, ``segments`` holds one speaker tag per entry of
        ``words`` and ``position`` is ``0 .. len(words) - 1``.
    """
    bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"

    opening = [bos] + persona
    closing = reply + [eos]
    turns = [opening] + history + [closing]

    # Speaker tags alternate; the parity is taken against the total number of
    # turns (persona turn included), measured before any tag is inserted.
    n_turns = len(turns)
    sequence = [opening]
    for offset, utterance in enumerate(turns[1:]):
        tag = speaker2 if (n_turns - offset) % 2 else speaker1
        sequence.append([tag] + utterance)

    # Word tokens: all turns flattened into a single list.
    words = list(chain.from_iterable(sequence))

    # Segment tokens: one tag per word, speaker1 for even turns, speaker2 for odd.
    segments = []
    for turn_idx, turn in enumerate(sequence):
        segments.extend([speaker2 if turn_idx % 2 else speaker1] * len(turn))

    # Position tokens.
    position = list(range(len(words)))
    return words, segments, position, sequence

def pad(x, padding, padding_length):
    """Right-pad list `x` with `padding` up to `padding_length` entries.

    A list that is already `padding_length` long or longer is returned with
    nothing appended (it is not truncated). A new list is returned either way.
    """
    shortfall = padding_length - len(x)
    filler = [padding] * shortfall  # empty when shortfall <= 0
    return x + filler

def randomIncorrectReply(df, row):
    """Pick a random distractor reply for `row`.

    Returns one randomly sampled 'sentence_2' value among the rows of `df`
    whose 'movie_name' differs from that of `row`.
    """
    from_other_movie = df['movie_name'] != row['movie_name']
    candidates = df[from_other_movie]['sentence_2']
    picked = candidates.sample(1)
    return picked.iloc[0]

def processRow(df, idx, genre_dict, max_len):
    """Turn row `idx` of `df` into one two-candidate training example.

    Candidate 0 is the row's real reply ('sentence_2'); candidate 1 is a reply
    sampled from a different movie. Both candidates share the same genre token
    and the same one-utterance history ('sentence_1').

    Relies on the notebook-level `tokenizer` and on `build_inputs_simple`,
    `pad` and `randomIncorrectReply`.

    Args:
        df: DataFrame with 'genre', 'movie_name', 'sentence_1' and 'sentence_2' columns.
        idx: positional row index (used with `df.iloc`).
        genre_dict: maps a genre name to its marker token string.
        max_len: number of entries every candidate sequence is padded to.

    Returns:
        Tuple ``(input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels)``.
        ``input_ids``, ``token_type_ids`` and ``lm_labels`` have shape
        ``(2, max_len)``; ``mc_token_ids`` has shape ``(2,)`` and holds the index
        of each candidate's last real token; ``mc_labels`` has shape ``(1,)``
        and is always 0 because the gold reply is the first candidate.
    """
    row = df.iloc[idx]

    # SECURITY NOTE(review): eval() executes whatever text is stored in the
    # 'genre' column. Acceptable only while the CSV is trusted; consider
    # ast.literal_eval if the data source ever becomes external.
    genre = eval(row['genre'])
    if isinstance(genre, list):
        # Use the first listed genre; an empty list carries no genre and is
        # handled like None instead of raising IndexError.
        genre = genre[0] if genre else None
    if genre is None:
        genre = random.choice(list(genre_dict.keys()))
    persona = [genre_dict[genre]]

    # Tokens that are always present in a candidate: <bos>, the genre token,
    # one speaker tag per utterance (two utterances) and <eos>.
    fixed_overhead = 5
    utterance_budget = max(0, (max_len - fixed_overhead) // 2)

    def to_tokens(text):
        # Use the tokenizer's own sub-word tokenization. Plain whitespace-split
        # words are mostly not vocabulary entries, so convert_tokens_to_ids
        # would map them to the unknown-token id. Sub-word splitting can yield
        # more tokens than words, hence the cap that keeps both utterances
        # plus the fixed tokens within max_len.
        return tokenizer.tokenize(text)[:utterance_budget]

    history = [to_tokens(row['sentence_1'])]
    correct_reply = to_tokens(row['sentence_2'])
    incorrect_reply = to_tokens(randomIncorrectReply(df, row))

    words_1, segments_1, _, sequence_1 = build_inputs_simple(persona, history, correct_reply)
    words_2, segments_2, _, _ = build_inputs_simple(persona, history, incorrect_reply)

    # Map word and segment tokens to vocabulary ids.
    words_1 = tokenizer.convert_tokens_to_ids(words_1)
    words_2 = tokenizer.convert_tokens_to_ids(words_2)
    segments_1 = tokenizer.convert_tokens_to_ids(segments_1)
    segments_2 = tokenizer.convert_tokens_to_ids(segments_2)

    # Language-modeling labels: -1 on the context and on the reply's speaker
    # tag, real ids on the rest of the gold reply; the distractor is all -1.
    # NOTE(review): assumes the model's LM loss ignores label -1 -- confirm for
    # the installed transformers version.
    context_len = sum(len(s) for s in sequence_1[:-1])
    reply_ids = tokenizer.convert_tokens_to_ids(sequence_1[-1][1:])
    lm_targets_1 = [-1] * (context_len + 1) + reply_ids
    lm_targets_2 = [-1] * len(words_2)

    # Position of the last real token, used for the next-sentence prediction loss.
    last_token_1 = len(words_1) - 1
    last_token_2 = len(words_2) - 1

    pad_id = tokenizer.convert_tokens_to_ids('<pad>')
    words_1, words_2, segments_1, segments_2 = (
        pad(x, pad_id, max_len) for x in (words_1, words_2, segments_1, segments_2)
    )
    lm_targets_1, lm_targets_2 = (pad(x, -1, max_len) for x in (lm_targets_1, lm_targets_2))

    # Stack gold reply (row 0) and distractor (row 1) into the input tensors.
    input_ids = torch.tensor([words_1, words_2])
    token_type_ids = torch.tensor([segments_1, segments_2])
    mc_token_ids = torch.tensor([last_token_1, last_token_2], dtype=torch.long)
    lm_labels = torch.tensor([lm_targets_1, lm_targets_2], dtype=torch.long)
    mc_labels = torch.zeros(1, dtype=torch.long)  # Gold reply is 1st (index 0)

    return input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels
    
class DialogueDataset(Dataset):
    """Movie dialogue conversation dataset.

    Rows of the CSV are (sentence_1, sentence_2) dialogue pairs. Indexing the
    dataset returns whatever `process_fun(df, idx, genre_dict, max_len)`
    produces for the requested row.
    """

    def __init__(self, csv_file, max_sent_len, min_sent_len, process_fun, max_len=80):
        """Load and filter the dialogue CSV.

        Args:
            csv_file: path of the CSV file to read.
            max_sent_len: sentences are trimmed to this many whitespace-separated words.
            min_sent_len: rows whose sentence_1 or sentence_2 has fewer words are dropped.
            process_fun: callable ``(df, idx, genre_dict, max_len)`` building one item.
            max_len: padded sequence length handed to `process_fun` (default 80,
                the value that used to be hard-coded in `__getitem__`).
        """
        df = pd.read_csv(csv_file, low_memory = False)
        # NOTE(review): genre names are taken by column *position* (22..42);
        # this silently breaks if the CSV layout changes -- confirm the layout.
        genres = list(df.columns[22:43])

        # Drop rows with a missing sentence first so the .split() calls below
        # only ever see strings, then drop rows with too-short sentences.
        df = df[pd.notnull(df['sentence_1'])]
        df = df[pd.notnull(df['sentence_2'])]
        df = df[df['sentence_1'].apply(lambda x: len(x.split()) >= min_sent_len)]
        df = df[df['sentence_2'].apply(lambda x: len(x.split()) >= min_sent_len)]

        # Take ownership of the filtered rows before assigning columns, so the
        # assignments do not target a slice of the frame that was read.
        df = df.copy()

        # Trim sentences to at most max_sent_len words.
        for column in ('sentence_1', 'sentence_2'):
            df[column] = df[column].apply(lambda x: " ".join(x.split()[:max_sent_len]))

        self.genre_dict = dict.fromkeys(genres, "")
        # NOTE(review): this loop iterates the notebook-level `genre_dict`, not
        # self.genre_dict, so the class only works after that global is defined
        # and any column name missing from it keeps the "" placeholder --
        # confirm which dictionary was meant.
        for key in genre_dict:
            self.genre_dict[key] = '<' + key + ">"
        self.genre_dict['mystery'] = '<mystery>'

        self.X = df
        self.processRow = process_fun
        self.max_len = max_len

    def __len__(self):
        """Number of dialogue pairs kept after filtering."""
        return len(self.X)

    def __getitem__(self, idx):
        """Build the item for positional index `idx` (a tensor index is converted first)."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        return self.processRow(self.X, idx, self.genre_dict, self.max_len)

In [20]:
# NOTE(review): absolute, machine-specific path -- the cell only runs as-is on
# a machine that has this exact directory layout.
csv_file = '/Users/mm06832/Documents/Classes/CS585/final_project/chatbot-with-personality/data/processed_data_final.csv'

dataset = DialogueDataset(
    csv_file=csv_file,
    max_sent_len=20,
    min_sent_len=5,
    process_fun=processRow,
)
# Number of dialogue pairs kept after filtering (shown as the cell output).
len(dataset)

101807

In [21]:
# Author's note: this split may need adjusting to avoid target leakage.
from torch.utils.data.sampler import SubsetRandomSampler

batch_size = 32
validation_split = .2
shuffle_dataset = True
# A fresh seed is drawn on every run and printed so a given split can be traced.
random_seed = random.randint(1, 10000)
print("random seed: ", random_seed)

# Build the list of row indices, optionally shuffle it, and reserve the first
# `split` entries for validation; the remainder is used for training.
dataset_size = len(dataset)
split = int(np.floor(validation_split * dataset_size))
indices = list(range(dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
val_indices = indices[:split]
train_indices = indices[split:]

# Both loaders wrap the same dataset; each sampler restricts its loader to
# its own subset of indices.
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

train_loader = torch.utils.data.DataLoader(
    dataset, batch_size=batch_size, sampler=train_sampler)
validation_loader = torch.utils.data.DataLoader(
    dataset, batch_size=batch_size, sampler=valid_sampler)

random seed:  1596


## Training 

In [22]:
from transformers import AdamW
from transformers import WarmupLinearSchedule as get_linear_schedule_with_warmup

# Parameters: (some from hugging face repo)
lr = 1e-3
max_grad_norm = 1.0
num_epochs = 5

# The linear schedule decays the learning rate to 0 once `num_training_steps`
# scheduler steps have been taken, and the training loop steps the scheduler
# once per batch. The total therefore has to cover the whole run; a fixed
# 1000 would leave every later batch training with a learning rate of 0.
num_training_steps = max(1, num_epochs * len(train_loader))
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)

# Per-batch loss histories (training)
lm_losses = []
mc_losses = []
total_losses = []

# Per-batch loss histories (validation)
lm_losses_val = []
mc_losses_val = []
total_losses_val = []

iters = 0
# Weights of the language-modeling and multiple-choice terms in the total loss
lm_coef = 2.0
mc_coef = 1.0

### In Transformers, the optimizer and the schedule are separate objects, instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr)  # To reproduce BertAdam specific behavior set correct_bias=False
# `get_linear_schedule_with_warmup` is this notebook's import alias for WarmupLinearSchedule;
# the arguments are passed positionally as (optimizer, warmup steps, total steps).
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps)  # PyTorch scheduler

In [None]:
print("Starting Training Loop...")
# Nothing inside the loop switches the model to eval mode, so setting
# training mode once up front is enough.
model.train()
# For each epoch
for epoch in range(num_epochs):
    # For each batch in the dataloader
    for i, data in enumerate(train_loader, 0):
        # A batch is the 5-item sequence produced by the dataset:
        # input ids, segment ids, last-token positions, LM labels, MC labels.
        # Move every tensor to the device the model lives on. The global
        # `batch_size` is intentionally left untouched here.
        input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels = (
            tensor.to(device) for tensor in data
        )

        output = model(input_ids, mc_token_ids=mc_token_ids, mc_labels = mc_labels, token_type_ids = token_type_ids, lm_labels = lm_labels)

        # With both label sets supplied, the first two outputs are the losses.
        lm_loss = output[0]
        mc_loss = output[1]

        total_loss = lm_loss * lm_coef + mc_loss * mc_coef

        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Output training stats
        if i % 50 == 0:
            print('[%d/%d][%d/%d]\tLoss LM: %.4f\tLoss MC: %.4f\tLoss total:%.4f'
                  % (epoch, num_epochs, i, len(train_loader),
                     lm_loss.item(), mc_loss.item(), total_loss.item()))

        # Save Losses for plotting later
        lm_losses.append(lm_loss.item())
        mc_losses.append(mc_loss.item())
        total_losses.append(total_loss.item())

        iters +=1

Starting Training Loop...
[0/5][0/2546]	Loss LM: 15.2908	Loss MC: 0.9488	Loss total:31.5305


## Testing 

In [None]:
print("Starting Testing..")
model.eval()
# Evaluation only: no gradients are needed.
with torch.no_grad():
    for i, data in enumerate(validation_loader, 0):
        # Same 5-item batch layout as in training; the model needs the
        # individual tensors (and both label sets to return the losses).
        input_ids, token_type_ids, mc_token_ids, lm_labels, mc_labels = (
            tensor.to(device) for tensor in data
        )

        output = model(input_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels,
                       token_type_ids=token_type_ids, lm_labels=lm_labels)

        lm_loss = output[0]
        mc_loss = output[1]
        lm_scores = output[2]
        mc_scores = output[3]

        total_loss = lm_loss * lm_coef + mc_loss * mc_coef

        # Output validation stats (no epoch counter: this is a single pass
        # and must not depend on a variable left over from the training loop)
        if i % 50 == 0:
            print('[%d/%d]\tLoss LM: %.4f\tLoss MC: %.4f\tLoss total:%.4f'
                  % (i, len(validation_loader),
                     lm_loss.item(), mc_loss.item(), total_loss.item()))

        # Save Losses for plotting later
        lm_losses_val.append(lm_loss.item())
        mc_losses_val.append(mc_loss.item())
        total_losses_val.append(total_loss.item())

## Decoding

In [None]:
#haven't touched yet 

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k >0: keep only top k tokens with highest probability (top-k filtering).
            top_p >0.0: keep the smallest set of top tokens whose cumulative probability
                reaches top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
            filter_value: value written over every filtered-out logit.
        Returns:
            The same `logits` tensor, modified in place, with filtered entries
            set to `filter_value`.
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a logit below the k-th largest one
        kth_largest = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_largest] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        # torch.softmax is used so the function only depends on `torch`
        # itself, not on a separate functional-module alias.
        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the mask to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits

# Here is how to use this function for top-p sampling
# NOTE(review): this cell is still template code and does not run as written
# against the visible notebook:
#   * `input` is not defined in any visible cell, so the name resolves to
#     Python's builtin function;
#   * `F` is not imported in any visible cell;
#   * the model output is indexed below as if it were a single 3-D tensor --
#     confirm what the model actually returns before relying on it.
temperature = 1.0
top_k = 0
top_p = 0.9

# Get logits with a forward pass in our model (input is pre-defined)
logits = model(input)

# Keep only the last token predictions of the first batch item (batch size 1), apply a temperature coefficient and filter
logits = logits[0, -1, :] / temperature
filtered_logits = top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)

# Sample from the filtered distribution
probabilities = F.softmax(filtered_logits, dim=-1)
# Draw a single token index according to the filtered probabilities
next_token = torch.multinomial(probabilities, 1)

## Evaluation / Error Analysis