In [None]:
# Created by: c00k1ez (https://github.com/c00k1ez)

In [1]:
import torch

import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config

from src.utils import get_answer
from src.gpt2.data_parser import Dialogue, DataParser
from src.gpt2.dataset import DialogueDataset

In [2]:
# Single place for every tunable used further down the notebook.
params_config = dict(
    # Handed to DialogueDataset as its padding length.
    pad_len=100,
    # Batch size of the training DataLoader.
    train_batch_size=10,
    # Checkpoint name used for the GPT-2 config, model and tokenizer.
    model_name='gpt2',
    # Learning rate passed to the optimizer.
    lr=5e-5,
    # Dropout probabilities written into the GPT-2 config
    # (resid_pdrop, embd_pdrop and attn_pdrop respectively).
    residual_dropout=0.7,
    embedding_dropout=0.7,
    attention_dropout=0.7,
)

In [7]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Start from the checkpoint's stock configuration and override only the
# three dropout probabilities with the values from params_config.
config = GPT2Config.from_pretrained(params_config['model_name'])
config.embd_pdrop = params_config['embedding_dropout']
config.attn_pdrop = params_config['attention_dropout']
config.resid_pdrop = params_config['residual_dropout']

# Load the pretrained weights under the modified config, then move the
# whole model onto the selected device.
model = GPT2LMHeadModel.from_pretrained(params_config['model_name'], config=config)
model = model.to(device)

Downloading: 100%|██████████| 548M/548M [06:21<00:00, 1.44MB/s]


In [9]:
tokenizer = GPT2Tokenizer.from_pretrained(params_config['model_name'])

# Register the two dialogue marker tokens with the tokenizer.
# NOTE(review): presumably DialogueDataset inserts these markers between the
# context and the answer - confirm against src/gpt2/dataset.py.
special_tokens = {'additional_special_tokens': ['[CONTEXT]', '[ANSWER]']}
tokenizer.add_special_tokens(special_tokens)

# Resize the embedding matrix to the new tokenizer length. Left as the last
# expression so the cell displays the resulting embedding module.
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [10]:
# Parse the raw corpus file, then ask the parser for its train/test split.
corpus_path = './data/TwitterLowerAsciiCorpus.txt'
parser = DataParser(corpus_path)
train, test = parser.train_test_split()

In [15]:
from collections import Counter

# Headline numbers: how many context-answer pairs and dialogues were parsed.
print('Our dataset contains {} context-answer pairs and unique {} dialogues'.format(len(parser.all_pairs), len(parser.dialogues)))

# Tokenize every pair (context and answer joined by a single space) and
# collect all sub-word tokens in one flat list.
tokens = []
for pair in parser.all_pairs:
    pair_text = pair['context'] + ' ' + pair['answer']
    tokens.extend(tokenizer.tokenize(pair_text))

# Frequency table: its length is the number of distinct tokens, the sum of
# its counts is the total number of tokens.
counter = Counter(tokens)
unique_count = len(counter)
total_count = sum(counter.values())
print('There are {} unique tokens in dataset and {} tokens at all. Notice, that small GPT2 have vocabulary with 50k sub-words.'.format(unique_count, total_count))
print('')
print('Most common 10 tokens:')
for token, freq in counter.most_common(10):
    print('{} : {}'.format(token, freq))

Our dataset contains 8574 context-answer pairs and unique 1983 dialogues
There are 10046 unique tokens in dataset and 233751 tokens at all. Notice, that small GPT2 have vocabulary with 50k sub-words.

Most common 10 tokens:
. : 7196
Ġi : 7186
Ġthe : 4561
Ġto : 4025
Ġyou : 3977
, : 3602
Ġa : 3373
Ġit : 3218
Ġand : 2602
's : 2285


In [17]:
# Wrap the training split in the project's dataset class, using the
# configured padding length.
train_dataset = DialogueDataset(train, tokenizer, params_config['pad_len'])

# Batches are reshuffled on every pass over the loader.
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=params_config['train_batch_size'],
    shuffle=True,
)

In [18]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation
# and is no longer shipped by recent transformers releases, so use
# torch.optim.AdamW directly. eps and weight_decay are set explicitly to the
# values transformers.AdamW used by default (1e-6 and 0.0); PyTorch's own
# defaults differ (1e-8 and 0.01) and would silently change the training
# hyper-parameters.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=params_config['lr'],
    eps=1e-6,
    weight_decay=0.0,
)

In [19]:
def train_epoch(model, loader, optimizer, epoch_num, device, log_interval=100):
    """Run one optimisation pass over ``loader`` and return the logged losses.

    Args:
        model: module called as ``model(input_ids, attention_mask=mask,
            labels=label)`` whose first output is the training loss.
        loader: sized iterable of batches. Each batch is a mapping holding
            tensors under the keys ``'sample'``, ``'mask'`` and ``'label'``.
        optimizer: optimizer updating ``model``'s parameters.
        epoch_num: epoch index, used only in the progress messages.
        device: device every batch tensor is moved to before the forward pass.
        log_interval: number of steps averaged into each reported loss value.

    Returns:
        list of float: mean training loss of each full window of
        ``log_interval`` steps, followed by one entry for the trailing
        partial window if the number of steps is not a multiple of
        ``log_interval``.
    """
    # Switch to training mode so that dropout is actually applied; a model
    # that is still in eval mode would be trained with dropout disabled.
    model.train()

    losses = []
    window = []

    def flush(step):
        # Average the current window, record it and print a progress line.
        mean_loss = sum(window) / len(window)
        losses.append(mean_loss)
        print('epoch {}\t[{}/{}]\tloss = {:.4f}'.format(epoch_num, step, len(loader), mean_loss))
        window.clear()

    step = 0
    for step, batch in enumerate(loader, start=1):
        optimizer.zero_grad()
        input_ids = batch['sample'].to(device)
        mask = batch['mask'].to(device)
        label = batch['label'].to(device)

        # Only the loss (first output) is needed for the update.
        loss = model(input_ids, attention_mask=mask, labels=label)[0]
        loss.backward()
        optimizer.step()

        window.append(loss.item())
        if step % log_interval == 0:
            flush(step)

    # Report the steps left over after the last full window instead of
    # silently dropping them (otherwise short loaders return an empty list).
    if window:
        flush(step)
    return losses

In [None]:
# Number of full passes over the training loader.
EPOCHS = 2

# Running history of the averaged losses reported by train_epoch, in order
# across all epochs.
losses = []
for epoch in range(EPOCHS):
    ep_losses = train_epoch(model, train_loader, optimizer, epoch, device)
    # Keep every epoch's values; previously they were overwritten on each
    # iteration and `losses` stayed empty.
    losses.extend(ep_losses)

In [None]:
# Bring the trained model back to the CPU for generation.
model = model.cpu()
# Switch to evaluation mode (disables dropout); left as the last expression
# so the cell displays the model.
model.eval()

In [None]:
# Smoke-test the fine-tuned model on one prompt; the result is shown as the
# cell output.
question = "where are you?"
get_answer(question, model, tokenizer)