In [2]:
import torch
from torch import nn
import torch.nn.functional as F

In [2]:
device = ("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using {device}')

Using cpu


In [59]:
d_model = 512
d_hidden = 2048
h = 1
d_k = d_v = d_model//h
max_length = 256
dict_size = 26

class FeedForward(nn.Module):
    def __init__(self):
        super(FeedForward, self).__init__()
        self.W1 = nn.Linear(d_model, d_hidden)
        self.W2 = nn.Linear(d_hidden, d_model)
        
    def forward(self, x):
        x = self.W1(x)
        x = F.relu(x)
        x = self.W2(x)
        return x

class MaskedSelfAttention(nn.Module):
    def __init__(self):
        super(MaskedSelfAttention, self).__init__()

    def forward(self, Q, K, V):
        x = (Q@K.T) * (1/torch.sqrt(torch.tensor(d_k)))
        mask = torch.triu(torch.ones_like(x, dtype=torch.bool),diagonal=1)
        x = x.masked_fill(mask, float('-inf'))
        return F.softmax(x, dim=1) @ V

#single head for now. Will fix later
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super(MultiHeadAttention, self).__init__()
        self.Q_proj = nn.Linear(d_model, d_k*h)
        self.K_proj = nn.Linear(d_model, d_k*h)
        self.V_proj = nn.Linear(d_model, d_v*h)
        self.out_proj = nn.Linear(d_v*h, d_model)
        self.self_attention = MaskedSelfAttention()
    
    def forward(self, Q, K, V):
        Q_inp = self.Q_proj(Q)
        K_inp = self.K_proj(K)
        V_inp = self.V_proj(V)
        x = self.self_attention(Q_inp, K_inp, V_inp)
        x = self.out_proj(x)
        return x
    

class DecoderLayer(nn.Module):
    def __init__(self):
        super(DecoderLayer, self).__init__()
        self.multi_head_attention = MultiHeadAttention()
        self.ffn = FeedForward()
        self.attn_norm = nn.LayerNorm(d_model)
        self.ffn_norm = nn.LayerNorm(d_model)
    
    def forward(self, x):
        x = self.multi_head_attention(x, x, x) + x
        x = self.attn_norm(x)
        x = self.ffn(x) + x
        x = self.ffn_norm(x)
        return x

class PositionalEncoding(nn.Module):
    def __init__(self):
        super(PositionalEncoding, self).__init__()
        self.p_encode = torch.empty(max_length, d_model)
        for k in range(max_length):
            for i in range(d_model):
                theta = torch.tensor(k/10000**(2*i/d_model))
                if i%2 == 0:
                    self.p_encode[k][i] = torch.sin(theta)
                else:
                    self.p_encode[k][i] = torch.cos(theta)
    
    def forward(self, x):
        return self.p_encode[:x.shape[0]] + x


class Transformer(nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        self.embed = nn.Embedding(dict_size, d_model)
        self.lm_head = nn.Linear(d_model, dict_size)
        self.positional_encode = PositionalEncoding()
        self.decoder_layers = [DecoderLayer() for _ in range(6)]
    
    def forward(self, x):
        x = self.embed(x)
        x = self.positional_encode(x)
        for layer in self.decoder_layers:
            x = layer(x)
        x = self.lm_head(x)
        x = F.softmax(x, dim=1)
        return x

In [61]:
input = torch.randint(0, 26, (10,))
f = Transformer()
out = f(input)

done
done
done
done
done
done


In [62]:
out.shape

torch.Size([10, 26])

In [41]:
embed = nn.Embedding(26, 512)
input = embed(torch.randint(0, 26, (10,)))

In [42]:
print(input.shape)

torch.Size([10, 512])


In [19]:
tensor1 = torch.randn((3,4))
print(tensor1)
f = nn.LayerNorm(4)
tensor1 = f(tensor1)
print(tensor1)

tensor([[-1.1570, -0.0525,  1.9071, -0.0427],
        [ 0.2758, -0.6716, -0.3214,  0.1559],
        [-1.3432, -1.0895, -0.6403,  0.0365]])
tensor([[-1.1966, -0.1959,  1.5795, -0.1871],
        [ 1.0966, -1.4001, -0.4771,  0.7806],
        [-1.1150, -0.6308,  0.2268,  1.5190]],
       grad_fn=<NativeLayerNormBackward0>)


In [21]:
tensor1 = torch.randn((3,4,5))
tensor2 = torch.randn((3,4,5))
tensor2 = tensor2.transpose(0,1)
print(tensor1.shape)
print(tensor2.shape)
(tensor1@tensor2).shape

torch.Size([3, 4, 5])
torch.Size([4, 3, 5])


RuntimeError: The size of tensor a (3) must match the size of tensor b (4) at non-singleton dimension 0

In [6]:
matrix = torch.tensor([[1, 2, 3],
                       [4, 5, 6],
                       [7, 8, 9]],dtype=torch.float32)

# Create a mask for the upper triangle
mask = torch.triu(torch.ones_like(matrix, dtype=torch.bool),diagonal=1)

# Replace the upper triangle elements with negative infinity
masked_matrix = matrix.masked_fill(mask, float('-inf'))

In [8]:
mask

tensor([[False,  True,  True],
        [False, False,  True],
        [False, False, False]])