http://www.peterbloem.nl/blog/transformers

In [1]:
# for google colab
# mount
# from google.colab import drive
# drive.mount('/content/gdrive/', force_remount=True)

In [2]:
# change directory
import os
# # for google colab
os.chdir('gdrive/My Drive/ai')
os.getcwd()

'/content/gdrive/My Drive/ai'

In [None]:
class charVocabulary(object):
    def __init__(self, token_to_idx=None):
        if token_to_idx is None:
            token_to_idx = {}
        self.token_to_idx = token_to_idx
        self.idx_to_token = {idx: token 
                                for token, idx in self.token_to_idx.items()}

        self.mask_token = '<mask>'
        self.begin_token = '<begin>'
        self.end_token = '<end>'
        self.unk_token = '<unk>'
        self.space_token = ' '

        self.mask_idx = self.add_token(self.mask_token)
        self.begin_idx = self.add_token(self.begin_token)
        self.end_idx = self.add_token(self.end_token)
        self.unk_idx = self.add_token(self.unk_token)
        self.space_idx = self.add_token(self.space_token)

    def add_token(self, token):
        if token in self.token_to_idx:
            index = self.token_to_idx[token]
        else:
            index = len(self.token_to_idx)
            self.token_to_idx[token] = index
            self.idx_to_token[index] = token
        return index

    def __len__(self):
        assert len(self.token_to_idx) == len(self.idx_to_token)
        return len(self.token_to_idx)

    def lookup_token(self,token):
        return self.token_to_idx[token]

    def lookup_idx(self,i):
        return self.idx_to_token[i]

    def add_txt(self,path):
        with open(path, 'r') as f:
            fulltext = f.read()
            for c in fulltext:
                if c != '\n':
                    self.add_token(c)
        return None

    def add_series(self,df):
        for sentence in df:
            max_len = min(300, len(sentence))
            for char in sentence[:max_len]:
                self.add_token(char)
        return None

In [None]:
import numpy as np
class charVectorizer(object):
    def __init__(self,vocab):
        self.vocab = vocab

    def vectorize(self, sent, max_len=-1):
        """
        max_len is used to know how much to pad
        """
        ind = [self.vocab.begin_idx]
        ind.extend(self.vocab.lookup_token(token) for token in sent)
        ind.append(self.vocab.end_idx)
        
        max_len = max(len(ind), max_len)

        x = np.empty(max_len-1, dtype=np.int64)
        x[:len(ind)-1] = ind[:-1]
        x[len(ind)-1:] = self.vocab.mask_idx

        y = np.empty(max_len-1, dtype=np.int64)
        y[:len(ind)-1] = ind[1:]
        y[len(ind)-1:] = self.vocab.mask_idx

        return x,y

In [None]:
from torch.utils.data import Dataset
class charDataset(Dataset):
    def __init__(self,vectorizer,posts,extra_pad=100):
        self.posts = posts
        self.vectorizer = vectorizer

        posts_len = len(posts.iloc[0])
        for sentence in posts:
            posts_len = max(posts_len, len(sentence))
        self.max_len = posts_len + extra_pad
        self.seq_len = self.max_len - 1

    def __len__(self):
        return len(self.posts)
    
    def __getitem__(self,i):
        sent = self.posts.iloc[i]
        x,y = self.vectorizer.vectorize(sent=sent, max_len=self.max_len)
        assert x.shape == y.shape
        assert x.shape[0] == self.max_len-1
        return x,y

In [None]:
from torch import nn
import torch.nn.functional as F

def mask_(matrices, maskval=0.0, mask_diagonal=False):
    b, h, w = matrices.size()

    indices = torch.triu_indices(h, w, offset=0 if mask_diagonal else 1)
    matrices[:, indices[0], indices[1]] = maskval

class SelfAttention(nn.Module):
    def __init__(self, k, heads=8, mask=True):
        super().__init__()
        self.k, self.heads, self.mask = k, heads, mask
        # instead of having one matrix per head,
        # we stack them into one big matrix
        self.tokeys = nn.Linear(k, k*heads, bias=False)
        self.toqueries = nn.Linear(k, k*heads, bias=False)
        self.tovalues = nn.Linear(k, k*heads, bias=False)
        self.unifyheads = nn.Linear(heads*k, k)
        
    def forward(self, x):
        b,t,k = x.size()
        h = self.heads
        assert k == self.k
        
        queries = self.toqueries(x).view(b,t,h,k)
        keys = self.tokeys(x).view(b,t,h,k)
        values = self.tovalues(x).view(b,t,h,k)
        
        queries = queries.transpose(1,2).contiguous().view(b*h,t,k)
        keys = keys.transpose(1,2).contiguous().view(b*h,t,k)
        values = values.transpose(1,2).contiguous().view(b*h,t,k)
        
        queries = queries / (k ** (1/4))
        keys = keys / (k ** (1/4))
        # weights
        dot = torch.bmm(queries, keys.transpose(1,2))
        assert dot.size() == (b*h, t, t)

        if self.mask: # mask out the upper half of the dot matrix, excluding the diagonal
            mask_(dot, maskval=float('-inf'), mask_diagonal=False)

        dot = F.softmax(dot, dim=2)
        
        # apply self-attienton to values
        out = torch.bmm(dot, values)
        out = out.view(b,h,t,k)
        out = out.transpose(1,2).contiguous().view(b,t,h*k)
        out = self.unifyheads(out)
        return out

In [None]:
# http://www.peterbloem.nl/files/transformers/transformer-block.svg
# https://pytorch.org/docs/stable/nn.html#torch.nn.LayerNorm
# ff stands for 'feed forward'
class TransformerBlock(nn.Module):
    def __init__(self, k, heads, seq_length, mask=True, ff_multiple=4, dropout=0.0):
        super().__init__()
        
        self.attention = SelfAttention(k,heads, mask=mask)
#         self.mask = mask
        
        self.norm1 = nn.LayerNorm(k)
        self.norm2 = nn.LayerNorm(k)
        
        self.ff = nn.Sequential(
                               nn.Linear(k,ff_multiple*k),
                               nn.ReLU(),
                               nn.Linear(ff_multiple*k,k))
        
        self.dout = nn.Dropout(dropout)
        
    def forward(self,x):
        # apply attention
        attended = self.attention(x)
        # normalize 1 + residual connection
        normalized = self.norm1(attended + x)
        # apply dropout
        dropped = self.dout(normalized)
        # look up MLP and feedforward
        forwarded = self.ff(dropped)
        # normalize 2 + residual connection
        normalized_again = self.norm2(forwarded + dropped)
        out = self.dout(normalized_again)
        return out

In [None]:
class Transformer(nn.Module):
    def __init__(self, 
                 embedding_dim,
                 heads, depth,
                 seq_length,
                 num_tokens,
                 mask_id,
                 ff_multiple=4,
                 dropout=0.0):

        super().__init__()
        self.num_tokens = num_tokens
        self.mask_id = mask_id
        self.seq_len = seq_length
        self.token_emb = nn.Embedding(embedding_dim=embedding_dim, num_embeddings=num_tokens, padding_idx=mask_id)
        self.pos_emb = nn.Embedding(embedding_dim=embedding_dim, num_embeddings=self.seq_len)
        
        # in tblocks I should also specify:
        # ff_multiple and dropout
        tblocks = []
        for i in range(depth):
            tblocks.append(TransformerBlock(k=embedding_dim,heads=heads, seq_length=self.seq_len, mask=True, ff_multiple=4, dropout=dropout))
        self.tblocks = nn.Sequential(*tblocks)
        
        # num_classes = num_tokens, when you're generating text
        self.to_probs = nn.Linear(embedding_dim, num_tokens)
    
    def forward(self,x):
        tokens = self.token_emb(x)
        b,t,k = tokens.size()

        assert t == self.seq_len
        
        
        if torch.cuda.is_available():
            positions = torch.arange(t, device='cuda')
        else:
            positions.to('cuda')
        positions = self.pos_emb(positions)
        positions = positions[None, :, :]
        positions = positions.expand(b, t, k)
        
        x = tokens + positions
        x = self.tblocks(x)
        
        out = self.to_probs(x.view(b*t, k)).view(b, t, self.num_tokens)
        return out
        
#         # not sure I want to apply log-softmax
#         # just use cross-entropy loss directly, no?
#         if apply_softmax:
#             out = F.log_softmax(out, dim=2)
#         return out

In [None]:
def gen_samp(model,
            vocab,
            prompt=''):
    
    seq_len = model.seq_len
    if len(prompt) >= seq_len:
        return prompt

    mask_id = model.mask_id
    one_hot = [mask_id for _ in range(seq_len)]
    
    bos = vocab.begin_idx
    one_hot[0] = bos
    for i,c in enumerate(prompt):
        idx = vocab.lookup_token(c)
        one_hot[i+1] = idx

    for i in range(len(prompt), seq_len):
        if torch.cuda.is_available():
            hot_tensor = torch.tensor(one_hot, dtype=torch.int64, device='cuda').unsqueeze(dim=0)
        else:
            hot_tensor = torch.tensor(one_hot, dtype=torch.int64).unsqueeze(dim=0)
        pred = model(hot_tensor)
        last = pred.squeeze(dim=0)[i,:]
        probs = F.softmax(last, dim=0)
        winner = torch.multinomial(probs, num_samples=1)
        one_hot[i] = winner.item()
    
    output = ""
    for idx in one_hot:
        token = vocab.lookup_idx(idx)
        output += token
    
    start = vocab.begin_token
    end = vocab.end_token
    return output[output.find(start)+len(start):output.find(end)]

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
import tqdm
import pickle

In [None]:
DATASET = 'may15nov17_above130_less100_light.csv'
csv_path = os.path.join('training_data',DATASET)

rootpath = 'bertrand'
if rootpath not in os.listdir():
    os.mkdir(rootpath)

dict_path = os.path.join(rootpath, 'dict.pkl')
model_path = os.path.join(rootpath, 'model.pt')
train_losses_path = os.path.join(rootpath, 'train_losses.txt')
test_losses_path = os.path.join(rootpath, 'test_losses.txt')
bestloss_path = os.path.join(rootpath, 'best_loss.txt')
params_path = os.path.join(rootpath, 'params.pkl')

RESUME = False

In [None]:
import pandas as pd
posts = pd.read_csv(csv_path).title.astype('U')

if RESUME:
    token_to_idx = pickle.load(open(dict_path,'rb'))
    vocab = charVocabulary(token_to_idx=token_to_idx)
else:
    vocab = charVocabulary()
    vocab.add_series(df=posts)
    pickle.dump(vocab.token_to_idx, open(dict_path,'wb'))

mask_id = vocab.mask_idx
vectorizer = charVectorizer(vocab=vocab)

full_ds = charDataset(vectorizer=vectorizer, posts=posts)

if RESUME:
    params = pickle.load(open(params_path,'rb'))
else:
    params = {}
    params['num_tokens'] = len(vocab)
    params['embedding_dim'] = 128
    params['seq_length'] = full_ds.seq_len
    params['heads'] = 8
    params['depth'] = 4
    params['ff_multiple'] = 4
    params['dropout'] = 0.5
    params['mask_id'] = mask_id

    pickle.dump(params, open(params_path,'wb'))

In [None]:
CUDA = True
NUM_EPOCHS = 88
BATCH_SIZE = 64
LEARNING_RATE = 1e-3

SPLIT_FRAC = 0.20
test_size = int(SPLIT_FRAC * len(full_ds))
train_size = len(full_ds) - test_size
train_ds, test_ds = random_split(full_ds, [train_size, test_size])

train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=False)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
import logging
logpath = os.path.join(rootpath, 'logbook.log')
logger = logging.getLogger()
hdlr = logging.FileHandler(logpath)
logger.addHandler(hdlr) 

In [193]:
import torch
if CUDA and torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

t_device = torch.device(device)

s = f"I am using {device}."
logging.info(s)
print(s)

I am using cuda.


In [194]:
model = Transformer(**params)

if RESUME:
    model.load_state_dict(torch.load(model_path, map_location=device))

model.to(device)

Transformer(
  (token_emb): Embedding(136, 128, padding_idx=0)
  (pos_emb): Embedding(199, 128)
  (tblocks): Sequential(
    (0): TransformerBlock(
      (attention): SelfAttention(
        (tokeys): Linear(in_features=128, out_features=1024, bias=False)
        (toqueries): Linear(in_features=128, out_features=1024, bias=False)
        (tovalues): Linear(in_features=128, out_features=1024, bias=False)
        (unifyheads): Linear(in_features=1024, out_features=128, bias=True)
      )
      (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (ff): Sequential(
        (0): Linear(in_features=128, out_features=512, bias=True)
        (1): ReLU()
        (2): Linear(in_features=512, out_features=128, bias=True)
      )
      (dout): Dropout(p=0.5, inplace=False)
    )
    (1): TransformerBlock(
      (attention): SelfAttention(
        (tokeys): Linear(in_features=128, out_features=1024, bias=False)
    

In [None]:
optimizer = optim.Adam(model.parameters())

In [None]:
# OK let's start training ----------------------------------------------------------
try:
    if RESUME:
        with open(bestloss_path, 'r') as f:
            bestloss = float(f.readline())
    else:
        bestloss = float('inf')
        
    train_epoch_losses = []
    test_epoch_losses = []

    for epoch in range(NUM_EPOCHS):
        ### training ----------
        print('\nTraining\n')
        model.train()
        train_batch_losses = []
        with tqdm.tqdm(total=len(train_dl)) as progress_bar:
            for x,y in train_dl:
                optimizer.zero_grad()

                x = x.to(device)
                y = y.to(device)

                y_pred = model(x)

                batch_size, seq_len, feats = y_pred.shape
                y_pred_loss = y_pred.view(batch_size*seq_len,feats)
                y_loss = y.view(-1)

                loss = F.cross_entropy(y_pred_loss, y_loss, ignore_index=mask_id)
                loss.backward()
                optimizer.step()

                train_batch_losses.append(loss.item())

                progress_bar.update(1)
        
        avgloss = np.asarray(train_batch_losses).mean()
        print(f"\nEpoch number {epoch+1} is done training. The mean average loss was {avgloss}.\n")
        with open(train_losses_path, 'a') as f:
            stringa = '\n' + str(avgloss)
            f.write(stringa)

        ### testing ----------
        print('\nTesting\n')
        model.eval()
        test_batch_losses = []
        with tqdm.tqdm(total=len(test_dl)) as progress_bar:
            for x,y in test_dl:
                optimizer.zero_grad()

                x = x.to(device)
                y = y.to(device)

                y_pred = model(x)

                batch_size, seq_len, feats = y_pred.shape
                y_pred_loss = y_pred.view(batch_size*seq_len,feats)
                y_loss = y.view(-1)

                loss = F.cross_entropy(y_pred_loss, y_loss, ignore_index=mask_id)
                loss.backward()
                optimizer.step()

                test_batch_losses.append(loss.item())

                progress_bar.update(1)
        
        avgloss = np.asarray(test_batch_losses).mean()
        print(f"\nEpoch number {epoch+1} is done testing. The mean average loss was {avgloss}.\n")
        with open(test_losses_path, 'a') as f:
            stringa = '\n' + str(avgloss)
            f.write(stringa)
        
        if avgloss < bestloss:
            bestloss = avgloss
            s = "Loss improved! I am saving this model."
            print(s)
            logging.info(s)
            torch.save(model.state_dict(), model_path)
            with open(bestloss_path, 'w') as f:
                f.write(str(bestloss))
        
        if epoch > 0: # and epoch % 10 == 0:
            for i in range(3):
                print(gen_samp(model=model,vocab=vocab))

        print(f"\nEpoch number {epoch+1} is done testing. The mean average loss was {avgloss}.\n")

except KeyboardInterrupt:
    logger.error('something went wrong', exc_info=True)

In [104]:
model.to('cpu')
gen_samp(model,vocab, prompt='eggs and yo')

'eggs and yodt oayeelibmaesfi '

In [169]:
def gen_samp_prova(model,
            vocab,
            prompt=''):
    
    seq_len = model.seq_len
    if len(prompt) >= seq_len:
        return prompt

    mask_id = model.mask_id
    one_hot = [mask_id for _ in range(seq_len)]
    
    bos = vocab.begin_idx
    one_hot[0] = bos
    for i,c in enumerate(prompt):
        idx = vocab.lookup_token(c)
        one_hot[i+1] = idx

    for i in range(len(prompt), seq_len):
        if torch.cuda.is_available():
            hot_tensor = torch.tensor(one_hot, dtype=torch.int64, device='cuda').unsqueeze(dim=0)
        else:
            hot_tensor = torch.tensor(one_hot, dtype=torch.int64).unsqueeze(dim=0)
        pred = model(hot_tensor)
        last = pred.squeeze(dim=0)[i,:]
        probs = F.softmax(last, dim=0)
        winner = torch.multinomial(probs, num_samples=1)
        one_hot[i] = winner.item()
    
    output = ""
    for idx in one_hot:
        token = vocab.lookup_idx(idx)
        output += token
    
    start = vocab.begin_token
    end = vocab.end_token
    return output[output.find(start)+len(start):output.find(end)]

model = Transformer(**params)
model.to('cuda')
print(gen_samp_prova(model,vocab, prompt='hello'))

hell4C️\—4 9#³2öé]ß﻿qqUgM÷R😮QH7ʖ}(Ü,ébe6kwv’B])R2'I<begin>h/—+🤔.c
