# Data collection

## Import all third-party libraries

In [1]:
import json
import numpy as np
from collections import defaultdict
from nltk import wordpunct_tokenize
import en_core_web_sm
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
import os
import math

## Loading the dataset

In [2]:
DATA_DIR = 'data/'

In [3]:
with open(f"{DATA_DIR}train_full.json") as f:
    dataset = json.load(f)

## Split the dataset into human-to-human and human-to-bot parts

In [4]:
human_human, human_bot = np.zeros((len(dataset), )), np.zeros((len(dataset), ))
for i, d in enumerate(dataset):
    human, bot = 0, 0
    for u in d['users']:
        if u['userType'] == 'Human':
            human += 1
        elif u['userType'] == 'Bot':
            bot += 1
        else:
            print('Unknown user type: {}'.format(u['userType']))
    if human == 2:
        human_human[i] = 1
    elif human == 1 and bot == 1:
        human_bot[i] = 1
    else:
        print('Unknown combination of users: human = {}, bot = {}'.format(human, bot))

In [5]:
silent_user = []
long_dialogue = []
empty_dialogue = []
for i, d in enumerate(dataset):
    user_utt = defaultdict(int)
    user_map = {}
    if len(d['thread']) == 0:
        empty_dialogue.append(i)
        continue
    for th in d['thread']:
        user_utt[th['userId']] += 1
    for u in d['users']:
        if u['userType'] == 'Human':
            user_map[u['id']] = 'human'
        elif u['userType'] == 'Bot':
            user_map[u['id']] = 'bot'
        else:
            print('Unknown user type: {}'.format(u['userType']))
    for u in user_map:
        if u not in user_utt:
            silent_user.append(i)
            break
    ok = False
    for u in user_map:
        if user_utt[u] > 2:
            if ok:
                long_dialogue.append(i)
            else:
                ok = True

In [6]:
print('\t\t\tTotal\tHuman-to-bot\tHuman-to-human')

# Total
hh_dialog = [d for i, d in enumerate(dataset) if human_human[i] == 1]
hb_dialog = [d for i, d in enumerate(dataset) if human_bot[i] == 1]
print('All dialogues\t\t{}\t\t{}\t\t{}'.format(len(dataset), len(hh_dialog), len(hb_dialog)))

# Empty
hh_empty = [d for i, d in enumerate(empty_dialogue) if human_human[i] == 1]
hb_emtpy = [d for i, d in enumerate(empty_dialogue) if human_bot[i] == 1]
print('Empty dialogues\t\t{}\t\t{}\t\t{}'.format(len(empty_dialogue), len(hh_empty), len(hb_emtpy)))

# One-sided
hh_silent = [d for i, d in enumerate(silent_user) if human_human[i] == 1]
hb_silent = [d for i, d in enumerate(silent_user) if human_bot[i] == 1]
print('One-sided dialogues\t{}\t\t{}\t\t{}'.format(len(silent_user), len(hh_silent), len(hb_silent)))

# Long dialogues
hh_long = [d for i, d in enumerate(long_dialogue) if human_human[i] == 1]
hb_long = [d for i, d in enumerate(long_dialogue) if human_bot[i] == 1]
print('Long dialogues\t\t{}\t\t{}\t\t{}'.format(len(long_dialogue), len(hh_long), len(hb_long)))

			Total	Human-to-bot	Human-to-human
All dialogues		2778		441		2337
Empty dialogues		119		66		53
One-sided dialogues	560		229		331
Long dialogues		1719		368		1351


## Calculate initiative metrics

### Utterance length

In [7]:
def calc_utt_len(sentence):
    utterances = wordpunct_tokenize(sentence)
    return len(utterances)

In [8]:
for dialog in hb_dialog:
    for thread in dialog['thread']:
        thread['UTT_LEN'] = calc_utt_len(thread['text'])

In [9]:
for dialog in hh_dialog:
    for thread in dialog['thread']:
        thread['UTT_LEN'] = calc_utt_len(thread['text'])

### NP count

In [10]:
nlp = en_core_web_sm.load()

In [11]:
def calc_np_len(sentence):
    noun_phrases=set()
    doc = nlp(sentence)
    for nc in doc.noun_chunks:
        noun_phrases.add(nc.text)
        noun_phrases.add(doc[nc.root.left_edge.i:nc.root.right_edge.i+1].text)
    return len(noun_phrases)

In [12]:
for dialog in hb_dialog:
    for thread in dialog['thread']:
        thread['NP_LEN'] = calc_np_len(thread['text'])

In [13]:
for dialog in hh_dialog:
    for thread in dialog['thread']:
        thread['NP_LEN'] = calc_np_len(thread['text'])

### NLL (entropy)

[Fine-tuning](https://gist.github.com/mf1024/3df214d2f17f3dcc56450ddf0d5a4cd7)

In [19]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

In [20]:
# initialize tokenizer and model from pretrained GPT2 model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

In [21]:
model = model.to(device)

In [83]:
class DialogDataset(Dataset):
    def __init__(self, dialogues):
        super().__init__()

        self.dialog_list = []
        self.end_of_text_token = "<|endoftext|>"

        for dialog in dialogues:
            user = [x for x in dialog['users'] if x['id'] == th['userId']][0]
            if user['userType'] == 'Human':
                for thread in dialog['thread']:
                    if len(thread['text']) <= 1024:
                        dialog_str = f"DIALOG:{thread['text']}{self.end_of_text_token}"
                        self.dialog_list.append(dialog_str)

    def __len__(self):
        return len(self.dialog_list)

    def __getitem__(self, item):
        return self.dialog_list[item]

In [84]:
dataset = DialogDataset(hb_dialog)
data_loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [15]:
BATCH_SIZE = 16
EPOCHS = 4
LEARNING_RATE = 3e-5
WARMUP_STEPS = 5000
MAX_SEQ_LEN = 400

In [92]:
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps = -1)
proc_seq_count = 0
sum_loss = 0.0
batch_count = 0

tmp_dialog_tens = None
models_folder = "trained_models"
if not os.path.exists(models_folder):
    os.mkdir(models_folder)

for epoch in range(EPOCHS):

    print(f"EPOCH {epoch} started" + '=' * 30)

    for idx,dialog in enumerate(data_loader):

        #################### "Fit as many dialog sequences into MAX_SEQ_LEN sequence as possible" logic start ####
        dialog_tens = torch.tensor(tokenizer.encode(dialog[0])).unsqueeze(0).to(device)
        #Skip sample from dataset if it is longer than MAX_SEQ_LEN
        if dialog_tens.size()[1] > MAX_SEQ_LEN:
            continue

        #The first dialog sequence in the sequence
        if not torch.is_tensor(tmp_dialog_tens):
            tmp_dialog_tens = dialog_tens
            continue
        else:
            #The next dialog does not fit in so we process the sequence and leave the last dialog
            #as the start for next sequence
            if tmp_dialog_tens.size()[1] + dialog_tens.size()[1] > MAX_SEQ_LEN:
                work_dialog_tens = tmp_dialog_tens
                tmp_dialog_tens = dialog_tens
            else:
                #Add the dialog to sequence, continue and try to add more
                tmp_dialog_tens = torch.cat([tmp_dialog_tens, dialog_tens[:,1:]], dim=1)
                continue
        ################## Sequence ready, process it trough the model ##################

        outputs = model(work_dialog_tens, labels=work_dialog_tens)
        loss, logits = outputs[:2]
        loss.backward()
        sum_loss = sum_loss + loss.detach().data

        proc_seq_count = proc_seq_count + 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()

        # print(f"sum loss {sum_loss}")

        if batch_count == 100:
            print(f"sum loss {sum_loss}")
            batch_count = 0
            sum_loss = 0.0

    # Store the model after each epoch to compare the performance of them
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_{epoch}.pt"))




KeyboardInterrupt: 

In [22]:
models_folder = "trained_models"

for epoch in range(EPOCHS-1):
    model_path = os.path.join(models_folder, f"gpt2_medium_{epoch}.pt")
    model.load_state_dict(torch.load(model_path))
    sequence = "Are you a bot?"
    inputs = tokenizer.encode(sequence, return_tensors='pt')
    outputs = model.generate(inputs, max_length=50, do_sample=True)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Are you a bot? Have you tried building and testing applications? Have you tried writing functional code to solve problems? In this discussion, we're going to dive deep into how to become a truly functional app developer.

The "real" thing


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Are you a bot?

Catch me in the comments!

Related

Filed under: Business, Computers, computers, e-mail, Interviews, Computers, Microsoft, new games, computers, productivity, technology


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Are you a bot?

We were inspired by bot language for the most part, or at least based on it. Some of the ideas were taken from the popular chat language "chatbot" like mr bot or bot_wizard,


In [23]:
def calc_nll(sentence, model, tokenizer):
    input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
    input_ids = input_ids.to('cpu')
    with torch.no_grad():
        try:
            outputs = model(input_ids, labels=input_ids)
            loss, logits = outputs[:2]
            return math.exp(loss)
        except RuntimeError:
            return math.nan

In [24]:
best_model = 2
model_path = os.path.join(models_folder, f"gpt2_medium_{best_model}.pt")
model.load_state_dict(torch.load(model_path))

<All keys matched successfully>

In [25]:
for dialog in hb_dialog:
    for thread in dialog['thread']:
        if len(thread['text']) <= 1024:
            thread['NLL'] = calc_nll(thread['text'], model, tokenizer)

In [26]:
for dialog in hh_dialog:
    for thread in dialog['thread']:
        if len(thread['text']) <= 1024:
            thread['NLL'] = calc_nll(thread['text'], model, tokenizer)

## Write the data to files

In [27]:
with open(f'{DATA_DIR}train_hh.json', 'w') as f:
    json.dump(hh_dialog, f)

with open(f'{DATA_DIR}train_hb.json', 'w') as f:
    json.dump(hb_dialog, f)