In [1]:
from minigrad import tensor
import numpy as np
from minigrad import nn
from copy import deepcopy
from minigrad.tensor import Tensor
from typing import Callable
import math

In [2]:
# not used for now
def clones(module,N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    copies = []
    for _ in range(N):
        copies.append(deepcopy(module))
    return nn.ModuleList(copies)

In [3]:
class LayerNorm(nn.Module):
    """Layer normalisation followed by a learned element-wise scale and shift.

    Registers two parameters of length ``features``:
    ``a_2`` (scale, created with ``Tensor.ones``) and
    ``b_2`` (shift, created with ``Tensor.zeros``).
    """

    def __init__(self, features, eps=1e-5):
        super().__init__()
        self.add_parameter("a_2", tensor.Tensor.ones(features))
        self.add_parameter("b_2", tensor.Tensor.zeros(features))
        # eps is forwarded unchanged to Tensor.layer_norm in forward().
        self.eps = eps

    def forward(self, x: Tensor):
        """Return ``a_2 * x.layer_norm(eps=self.eps) + b_2``."""
        scale = self._parameters["a_2"].value
        normed = x.layer_norm(eps=self.eps)
        shift = self._parameters["b_2"].value
        return scale * normed + shift


In [4]:
# Class-level flag set once for the whole notebook.
# NOTE(review): presumably this switches minigrad ops such as dropout into
# training mode — confirm against the minigrad Tensor implementation.
Tensor.training = True
class SublayerConnection(nn.Module):
    """Residual connection around a sublayer, with layer norm applied first.

    Computes ``x + sublayer(layer_norm(x)).dropout(p=dropout)``.
    """

    def __init__(self, size, dropout=0.1):
        super().__init__()
        self.dropout = dropout
        self.add_module("layer_norm", LayerNorm(features=size))

    def forward(self, x: Tensor, sublayer: Callable[[Tensor], Tensor]):
        """Apply ``sublayer`` to the normalised input and add the result to ``x``.

        Args:
            x: input tensor; it is also the residual branch.
            sublayer: callable taking the normalised tensor and returning a
                tensor that can be added to ``x``.
        """
        normed = self._modules["layer_norm"].forward(x)
        branch = sublayer(normed).dropout(p=self.dropout)
        return x + branch
    

In [5]:
def attention(query,key,value,dropout=None,mask=None):
    """Scaled dot-product attention.

    Args:
        query, key, value: tensors whose last two axes are multiplied as
            ``query @ key^T`` and ``weights @ value``.
        dropout: optional dropout probability applied to the attention weights.
        mask: optional tensor; positions where ``mask.data == 0`` get a score
            of ``-1e9`` before the softmax.

    Returns:
        A tuple ``(weights.matmul(value), weights)``.
    """
    scale = math.sqrt(query.shape[-1])
    raw = query.matmul(key.transpose(-2,-1))
    scores = raw/scale
    if mask is not None:
        # Masked positions receive a very negative score so softmax gives them ~0 weight.
        blocked = mask.data==0
        scores = scores.masked_fill_(blocked,-1e9)
    weights = scores.softmax(-1)
    if dropout is not None:
        weights = weights.dropout(dropout)
    context = weights.matmul(value)
    return context,weights

In [6]:
class Linear(nn.Module):
    """Bias-free linear map with one parameter ``ln_w``.

    The weight is created with ``Tensor.randn(in_features, out_features)``.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        weight = Tensor.randn(in_features, out_features)
        self.add_parameter("ln_w", weight)

    def forward(self, x: Tensor):
        """Return ``x.linear(ln_w)``."""
        weight = self._parameters["ln_w"].value
        return x.linear(weight)


In [7]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention built from four ``Linear`` projections.

    Sub-modules ``ln_q``, ``ln_k`` and ``ln_v`` project the inputs, and ``ln_o``
    projects the concatenated heads. ``d_model`` must be divisible by ``h``;
    each head works on ``d_k = d_model // h`` features.

    Attributes:
        attn: attention weights returned by the last ``attention`` call
            (``None`` until ``forward`` has run).
        dropout: probability passed to ``attention`` as its ``dropout`` argument.
    """

    def __init__(self,d_model,h,dropout=0.1):
        super().__init__()
        assert d_model % h == 0
        self.d_k = d_model//h
        self.h = h
        # Created in q, k, v, o order so random initialisation order is unchanged.
        for name in ("ln_q","ln_k","ln_v","ln_o"):
            self.add_module(name,Linear(d_model,d_model))
        self.attn = None
        self.dropout = dropout

    def forward(self,query,key,value,mask=None):
        """Run multi-head attention.

        Args:
            query, key, value: tensors indexed as (batch, sequence, d_model);
                ``key``/``value`` may have a different sequence length than
                ``query`` (cross-attention).
            mask: optional mask; an axis is inserted at position 1 so the same
                mask is shared by every head.

        Returns:
            The output of ``ln_o`` applied to the re-assembled heads, viewed as
            (batch, query sequence length, h * d_k) before the projection.
        """
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.shape[0]
        # The attention output has one row per *query* position, so the query
        # length is the right one for re-assembling heads even when key/value
        # come from a sequence of a different length.
        seq_len = query.shape[1]

        # Project each input and split d_model into (h, d_k): -> (batch, h, seq, d_k).
        projections = (self._modules["ln_q"],self._modules["ln_k"],self._modules["ln_v"])
        query, key, value = [
            lin.forward(x).view(nbatches, x.size(1), self.h,self.d_k).transpose(1,2)
            for lin, x in zip(projections, (query,key,value))
        ]

        # Attention over all heads at once.
        x, self.attn = attention(
            query,key,value,mask=mask,dropout=self.dropout
        )

        # Pin the layout explicitly. The previous argument-less squeeze() removed
        # *every* size-1 axis, so a batch of one, a single query position or a
        # single head lost an axis and the transpose/view below mis-assembled
        # (or rejected) the result. An explicit view gives the same result in the
        # common case and is correct in those edge cases as well.
        x = x.view(nbatches,self.h,seq_len,self.d_k)

        # Concatenate heads: (batch, h, seq, d_k) -> (batch, seq, h * d_k).
        x = x.transpose(1,2).view(nbatches,seq_len,self.h*self.d_k)

        return self._modules["ln_o"].forward(x)

In [8]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward block: ``w2(dropout(relu(w1(x))))``.

    ``w1`` maps ``d_model -> d_ff`` and ``w2`` maps ``d_ff -> d_model``.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.add_module("w1", Linear(d_model, d_ff))
        self.add_module("w2", Linear(d_ff, d_model))
        self.dropout = dropout

    def forward(self, x):
        """Expand with ``w1``, apply ReLU and dropout, then project back with ``w2``."""
        hidden = self._modules["w1"].forward(x)
        activated = hidden.relu()
        dropped = activated.dropout(self.dropout)
        return self._modules["w2"].forward(dropped)


In [9]:
# using explicit loop and concat for now 
class Embeddings(nn.Module):
    """Token embedding lookup backed by one ``(vocab, d_model)`` parameter ``embed``.

    The lookup is done one token at a time and the rows are joined with
    ``Tensor.cat``; no vectorised indexing is used.
    """

    def __init__(self, d_model, vocab):
        super().__init__()
        self.add_parameter("embed", Tensor.randn(vocab, d_model))
        self.d_model = d_model

    def forward(self, x):
        """Look up every token id in ``x``.

        Args:
            x: 2-D tensor of token ids, unpacked as ``(batch, seq_len)``.

        Returns:
            Tensor assembled as (batch, seq_len, d_model).
        """
        batch, seq_len = x.shape
        table = self._parameters["embed"].value

        stacked = None  # grows to (batch, seq_len, d_model)
        for b in range(batch):
            # One (1, d_model) row per position of this sequence.
            rows = [
                table[int(x[b][s].data.item())].unsqueeze(0)
                for s in range(seq_len)
            ]

            sequence = None  # grows to (seq_len, d_model)
            for row in rows:
                sequence = row if sequence is None else sequence.cat(row, dim=0)

            sequence = sequence.unsqueeze(0)  # (1, seq_len, d_model)
            stacked = sequence if stacked is None else stacked.cat(sequence, dim=0)
        return stacked


In [10]:
# using numpy
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position encodings to the input, then applies dropout.

    The table is a plain NumPy array of shape ``(1, max_len, d_model)`` stored
    on ``self.pe``; it is not registered as a parameter.

    Args:
        d_model: feature size of the inputs (odd values are supported).
        dropout: probability passed to ``Tensor.dropout`` in ``forward``.
        max_len: number of positions pre-computed in the table.
    """

    def __init__(self,d_model,dropout,max_len=5000):
        super(PositionalEncoding,self).__init__()
        self.dropout = dropout

        # Compute the positional encodings once, frequencies in log space.
        pe = np.zeros((max_len,d_model))
        position = np.arange(0,max_len).reshape(max_len,1)
        div_term = np.exp(
            np.arange(0,d_model,2) * -(math.log(10000.0)/d_model)
        )
        angles = position*div_term
        pe[:,0::2] = np.sin(angles)
        # For odd d_model there is one fewer odd (cosine) column than even
        # (sine) columns; trimming avoids the shape-mismatch error the
        # untrimmed assignment raised. For even d_model the slice is a no-op.
        pe[:,1::2] = np.cos(angles[:,: d_model//2])

        self.pe = pe.reshape(1,max_len,d_model)

    def forward(self,x):
        """Return ``dropout(x + pe[:, :x.size(1)])`` with the same shape as ``x``.

        Args:
            x: input tensor; ``x.size(1)`` selects how many positions of the
                table are added.
        """
        in_shape = tuple(x.shape)
        x = x + self.pe[:,: x.size(1)]
        # Restore the caller's shape explicitly. The former squeeze(0) also
        # dropped a genuine batch axis of size 1, handing a tensor with one
        # axis too few to the attention layers.
        return x.view(*in_shape).dropout(self.dropout)
    

In [11]:
class EncoderLayer(nn.Module):
    """One encoder layer: self-attention then feed-forward, each in a ``SublayerConnection``.

    Args:
        size: feature size handed to the two ``SublayerConnection`` modules;
            also stored on ``self.size``.
        self_attn: attention module called as ``forward(x, x, x, mask)``.
        feed_forward: module called as ``forward(x)``.
        dropout: dropout probability for the sublayer connections.
    """

    def __init__(self,size,self_attn,feed_forward,dropout=0.1):
        super().__init__()
        # Register attention through add_module like every other sublayer here,
        # so its parameters are reachable through the module tree in the same
        # way. NOTE(review): the original only assigned a plain attribute;
        # whether that was enough for nn.Module.parameters() to see the
        # attention weights depends on minigrad internals — confirm.
        self.add_module("self_attn",self_attn)
        # Attribute kept so existing code reading `layer.self_attn` still works.
        self.self_attn = self_attn
        self.add_module("ffn",feed_forward)
        self.add_module("sc1",SublayerConnection(size,dropout))
        self.add_module("sc2",SublayerConnection(size,dropout))
        self.size = size

    def forward(self,x,mask):
        """Apply self-attention (using ``mask``) and then the feed-forward block."""
        attn = self._modules["self_attn"]
        x = self._modules["sc1"].forward(x,lambda x: attn.forward(x,x,x,mask))
        return self._modules["sc2"].forward(x,self._modules["ffn"].forward)

class DecoderLayer(nn.Module):
    """One decoder layer: self-attention, source attention, then feed-forward.

    Each of the three steps is wrapped in its own ``SublayerConnection``
    (``sc1``, ``sc2``, ``sc3``).

    Args:
        size: feature size handed to the ``SublayerConnection`` modules;
            also stored on ``self.size``.
        self_attn: attention module called as ``forward(x, x, x, tgt_mask)``.
        src_attn: attention module called as ``forward(x, m, m, src_mask)``.
        ffn: module called as ``forward(x)``.
        dropout: dropout probability for the sublayer connections.
    """

    def __init__(self,size,self_attn,src_attn,ffn,dropout):
        super().__init__()
        self.size = size
        # Register both attention modules through add_module like the other
        # sublayers, so their parameters are reachable through the module tree
        # in the same way. NOTE(review): the original only assigned plain
        # attributes; whether nn.Module.parameters() saw those weights depends
        # on minigrad internals — confirm.
        self.add_module("self_attn",self_attn)
        self.add_module("src_attn",src_attn)
        # Attributes kept so existing code reading them still works.
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.add_module("ffn",ffn)
        for i in range(3):
            self.add_module(f"sc{i+1}",SublayerConnection(size,dropout))
        self.dropout = dropout

    def forward(self,x,m,src_mask,tgt_mask):
        """Decode ``x`` against memory ``m``.

        Args:
            x: decoder input tensor.
            m: memory tensor used as key and value of the source attention.
            src_mask: mask for the source attention.
            tgt_mask: mask for the decoder self-attention.
        """
        self_attn = self._modules["self_attn"]
        src_attn = self._modules["src_attn"]
        x = self._modules["sc1"].forward(x,lambda x: self_attn.forward(x,x,x,tgt_mask))
        x = self._modules["sc2"].forward(x,lambda x: src_attn.forward(x,m,m,src_mask))
        return self._modules["sc3"].forward(x,self._modules["ffn"].forward)


In [12]:
class Encoder(nn.Module):
    """Stack of ``N`` deep-copied layers (``l1`` .. ``lN``) followed by a ``LayerNorm`` (``ln``)."""

    def __init__(self, layer, N):
        super().__init__()
        for number in range(1, N + 1):
            self.add_module(f"l{number}", deepcopy(layer))
        self.add_module("ln", LayerNorm(layer.size))

    def forward(self, x, mask):
        """Run ``x`` through every registered module in registration order.

        ``EncoderLayer`` instances receive ``mask``; any other module (the
        final norm) is called with ``x`` alone.
        """
        for stage in self._modules.values():
            takes_mask = isinstance(stage, EncoderLayer)
            x = stage.forward(x, mask) if takes_mask else stage.forward(x)
        return x


class Decoder(nn.Module):
    """Stack of ``N`` deep-copied layers (``l1`` .. ``lN``) followed by a ``LayerNorm`` (``ln``)."""

    def __init__(self, layer, N):
        super().__init__()
        for number in range(1, N + 1):
            self.add_module(f"l{number}", deepcopy(layer))
        self.add_module("ln", LayerNorm(layer.size))

    def forward(self, x, m, src_mask, tgt_mask):
        """Run ``x`` through every registered module in registration order.

        ``DecoderLayer`` instances receive the memory ``m`` and both masks; any
        other module (the final norm) is called with ``x`` alone.
        """
        for stage in self._modules.values():
            is_layer = isinstance(stage, DecoderLayer)
            x = stage.forward(x, m, src_mask, tgt_mask) if is_layer else stage.forward(x)
        return x
    

In [13]:
class EncoderDecoder(nn.Module):
    """Encoder-decoder wrapper.

    Registers the encoder (``enc``), decoder (``dec``), source and target
    embeddings (``src_emb``, ``tgt_emb``) and generator (``gen``) as
    sub-modules. The positional encoders are kept only as bound ``forward``
    callables on ``pos_src`` / ``pos_tgt``. ``forward`` does not call the
    generator.
    """

    def __init__(self, encoder, decoder, src_embed, tgt_embed, pos_src, pos_tgt, generator):
        super().__init__()
        self.add_module("enc", encoder)
        self.add_module("dec", decoder)
        self.add_module("src_emb", src_embed)
        self.add_module("tgt_emb", tgt_embed)
        self.pos_src = pos_src.forward
        self.pos_tgt = pos_tgt.forward
        self.add_module("gen", generator)

    def forward(self, src, tgt, src_mask, tgt_mask):
        """Encode ``src`` and decode ``tgt`` against the resulting memory."""
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

    def encode(self, src, src_mask):
        """Embed ``src``, add positions, and run the encoder with ``src_mask``."""
        embedded = self._modules["src_emb"].forward(src)
        positioned = self.pos_src(embedded)
        return self._modules["enc"].forward(positioned, src_mask)

    def decode(self, memeory, src_mask, tgt, tgt_mask):
        """Embed ``tgt``, add positions, and run the decoder against ``memeory``."""
        embedded = self._modules["tgt_emb"].forward(tgt)
        positioned = self.pos_tgt(embedded)
        return self._modules["dec"].forward(positioned, memeory, src_mask, tgt_mask)

class Generator(nn.Module):
    """Output projection ``d_model -> vocab``; returns the raw ``Linear`` output."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.add_module("proj", Linear(d_model, vocab))

    def forward(self, x):
        """Project ``x`` with the ``proj`` sub-module."""
        projection = self._modules["proj"]
        return projection.forward(x)

In [14]:
def make_model(
        src_vocab, tgt_vocab, N=6, d_model=512, d_ff = 2048, h=8, dropout=0.1
):
    """Build an ``EncoderDecoder`` model from hyper-parameters.

    Args:
        src_vocab: source vocabulary size.
        tgt_vocab: target vocabulary size (also the generator output size).
        N: number of layers in both the encoder and the decoder.
        d_model: model feature size.
        d_ff: hidden size of the position-wise feed-forward blocks.
        h: number of attention heads.
        dropout: dropout probability used by attention, feed-forward,
            positional encoding and the sublayer connections.

    Returns:
        The assembled ``EncoderDecoder``.
    """
    c = deepcopy
    # Forward `dropout` to attention as well; previously it was omitted, so the
    # attention modules always ran with their own default of 0.1.
    attn = MultiHeadAttention(d_model,h,dropout)
    ff = PositionwiseFeedForward(d_model,d_ff,dropout)
    position = PositionalEncoding(d_model=d_model,dropout=dropout)
    # Prototypes are deep-copied so no two layers share the same sub-module object.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model,c(attn),c(ff),dropout),N),
        Decoder(DecoderLayer(d_model,c(attn),c(attn),c(ff),dropout),N),
        Embeddings(d_model,src_vocab),
        Embeddings(d_model,tgt_vocab),
        c(position),c(position),
        Generator(d_model,tgt_vocab)
    )
    return model


In [15]:
# using numpy
def subsequent_mask(size):
    """Return an int8 array of shape (1, size, size) with ones on and below the diagonal.

    Entry ``[0, i, j]`` is 1 when ``j <= i`` and 0 otherwise, so later
    positions are the ones masked out.
    """
    full = np.ones((1, size, size))
    lower = np.tril(full)
    return lower.astype(np.int8)

class Batch:
    """Holds one batch of source/target arrays together with their masks.

    Attributes always set:
        src: the source array as given.
        src_mask: int8 array of shape (batch, 1, src_len); 1 where ``src != pad``.

    Attributes set only when ``tgt`` is given:
        tgt: ``tgt`` without its last column (decoder input).
        tgt_y: ``tgt`` without its first column (prediction targets).
        tgt_mask: result of ``make_std_mask(self.tgt, pad)``.
        ntokens: count of entries in ``tgt_y`` that differ from ``pad``.
    """

    def __init__(self, src, tgt=None, pad=1):
        self.src = src
        keep = (self.src != pad).astype(np.int8)
        self.src_mask = keep.reshape(keep.shape[0], 1, keep.shape[1])

        if tgt is None:
            return

        self.tgt = tgt[:, :-1]
        self.tgt_y = tgt[:, 1:]
        self.tgt_mask = self.make_std_mask(self.tgt, pad)
        self.ntokens = (self.tgt_y != pad).sum()

    @staticmethod
    def make_std_mask(tgt, pad):
        """Combine the non-padding mask of ``tgt`` with the causal mask.

        The padding mask is reshaped to (batch, 1, tgt_len) and AND-ed with
        ``subsequent_mask(tgt_len)``, which broadcasts to (batch, tgt_len, tgt_len).
        """
        not_pad = (tgt != pad)
        not_pad = not_pad.reshape(not_pad.shape[0], 1, not_pad.shape[1])
        causal = subsequent_mask(tgt.shape[-1]).astype(np.int8)
        return not_pad & causal

In [16]:
def data_gen(V, batch_size, nbatches):
    """Yield ``nbatches`` random batches for a source-to-target copy task.

    Each batch is a ``(batch_size, V)`` integer array with values drawn from
    ``[1, V)`` and the first column forced to 1; the same array is used as
    both source and target, with ``pad=0``.
    """
    for _ in range(nbatches):
        tokens = np.random.randint(1, V, size=(batch_size, V))
        tokens[:, 0] = 1  # fixed first token in every sequence
        yield Batch(tokens, tokens, pad=0)

In [17]:
class CrossEntropyLoss:
    """Cross-entropy between logits and one-hot targets.

    Calling the object applies ``softmax(-1)`` to ``input``, takes the log,
    multiplies by ``target``, negates, sums everything and divides by
    ``shape[0] * shape[1]`` of the softmax output.
    """

    def __call__(self, input, target):
        """Return the summed loss divided by the product of the first two axis sizes.

        Args:
            input: logits; softmax is taken over the last axis.
            target: tensor multiplied element-wise with the log-probabilities
                (one-hot in this notebook).
        """
        probs = input.softmax(-1)
        # NOTE(review): log is taken after softmax, so a probability that
        # underflows to 0 would produce -inf here — confirm whether minigrad
        # offers a fused log-softmax.
        weighted = -(target * probs.log())
        count = probs.shape[1] * probs.shape[0]
        return weighted.sum() / count
    

In [18]:
# Smoke-test training run on the synthetic copy task.
# Hyper-parameters for the run: vocabulary size (also used by data_gen as the
# sequence length), batch size, and number of batches to train on.
vocab=11
batch_size=40
nbatches = 20
criterion = CrossEntropyLoss()
# Small 2-layer model so the run finishes quickly.
test_model = make_model(vocab,vocab,2,d_model=128,h=4,d_ff=256)
data_iter = data_gen(vocab,batch_size,nbatches)
for i,batch in enumerate(data_iter):
    # Wrap the NumPy arrays of the batch as minigrad tensors.
    src,tgt,src_mask,tgt_mask = Tensor(batch.src), Tensor(batch.tgt), Tensor(batch.src_mask),Tensor(batch.tgt_mask)
    out = test_model.forward(src,tgt,src_mask,tgt_mask)
    # EncoderDecoder.forward does not apply the generator, so project to vocab logits here.
    logits = test_model._modules["gen"].forward(out)
    # One-hot encode the shifted targets by indexing rows of an identity matrix.
    target = np.eye(vocab)[batch.tgt_y]
    loss = criterion(logits,Tensor(target,requires_grad=False))
    print("Loss: ",loss.data[0])
    loss.backward()
    # Plain SGD step with learning rate 0.1: each parameter is replaced by a
    # fresh tensor holding value - 0.1 * grad.
    for param in test_model.parameters():
        param.update(Tensor(param.value.data-0.1 * param.value.grad.data,requires_grad=True))


Loss:  18.33117
Loss:  17.49627
Loss:  14.991483
Loss:  13.273812
Loss:  13.518925
Loss:  12.615138
Loss:  10.464443
Loss:  10.94338
Loss:  10.380503
Loss:  10.308905
Loss:  10.114498
Loss:  10.096762
Loss:  9.268982
Loss:  9.2127285
Loss:  8.756202
Loss:  9.260834
Loss:  9.851846
Loss:  8.042778
Loss:  8.754878
Loss:  7.9792795
