In [1]:
import torch
from torch import nn
import numpy as np
import pandas as pd

In [3]:
# companies_df = pd.read_csv('data/companies_sorted.csv')

In [4]:
# companies_df.head()

In [5]:
#companies_df['name'][:1000].to_csv('companies_names_list.csv', index=False)

In [6]:
# company_names = companies_df['name'][:1000].to_list()

In [7]:
# len(company_names)

In [8]:
# print(' '.join(company_names))

In [9]:
# len(''.join(company_names))

In [10]:
# all_chars = sorted(list(set(''.join(company_names))))

In [11]:
# vocab_size = len(all_chars)

In [12]:
# print(''.join(all_chars))

In [13]:
# print(all_chars)

In [14]:
# stoi = {s:i for i,s in enumerate(all_chars)}

In [15]:
# stoi

In [16]:
# itos = {i:s for i,s in enumerate(all_chars)}

In [17]:
# encode = lambda s: [stoi[c] for c in s]
# decode = lambda l: ''.join([itos[i] for i in l])

In [18]:
# for name in company_names:
#     print(name)
#     print(encode(name))
#     print(decode(encode(name)))
#     break

In [2]:
vocab_size = 51

In [3]:
token_embedding = nn.Embedding(vocab_size, 32)

In [4]:
token = token_embedding(torch.tensor([23,16,27]))

In [5]:
token.shape

torch.Size([3, 32])

## Transformer Encoder implementation

In [6]:
s_t = torch.stack((torch.tensor([23,16,27]), torch.tensor([23,16,27])))

In [7]:
s_t.shape

torch.Size([2, 3])

In [8]:
s_t.dtype

torch.int64

In [9]:
class LayerNorm(nn.Module):
    def __init__(self,d_model=512, eps=1e-12):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps
    def forward(self,x):
        mean = x.mean(-1,keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False)

        out = (x - mean)/torch.sqrt(var + self.eps)
        out = out * self.gamma + self.beta
        return out


In [10]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model=512, d_ff=2048, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.l1 = nn.Linear(d_model,d_ff )
        self.l2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.l1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.l2(x)
        return x

In [38]:
d_model = 512

class MultiHeadAttention(nn.Module):
    def __init__(self, heads: int = 8, d_model: int = 512):
        super(MultiHeadAttention, self).__init__()
        self.k = nn.Linear(d_model, d_model, bias=False)
        self.v = nn.Linear(d_model, d_model, bias=False)
        self.q = nn.Linear(d_model, d_model, bias=False)
        self.w_concat = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)
        self.heads = heads
        self.register_buffer('mask', torch.tril(torch.ones(16,16)).view(1,1,16,16))

    def forward(self, k: torch.tensor, v: torch.tensor, q: torch.tensor, mask: bool =  False):

        k = self.split(self.k(k))
        v = self.split(self.v(v))
        q = self.split(self.q(q))



        k_t = k.transpose(-2,-1)
        # print(q.shape, k_t.shape)
        scale = (q@k_t)*(k.shape[-1]**-0.5)
        # print(scale.shape)
        if mask:
            B, nH, T, Hs = scale.shape 
            scale = scale.masked_fill(self.mask[:,:,:T,:T] == 0, -10000)
            # print('scale shape:', scale.shape)
            # print('scale values:', scale)

        score = self.softmax(scale)

        scale_product = score @ v
        
        batch,heads, length, d_tensor = scale_product.size()
        scale_product = scale_product.transpose(1, 2).contiguous().view(batch, length, heads*d_tensor)
        # print(scale_product.shape)
        scale_product = self.w_concat(scale_product)

        return scale_product
    
    def split(self, tensor:torch.tensor):

        batch, length, d_model = tensor.size()
        d_tensor = d_model//self.heads

        new_tensor = tensor.view(batch, length, self.heads, d_tensor).transpose(1, 2)

        return new_tensor
    




    

In [39]:
device = 'cpu'
vocab_size = 51

class Encoder(nn.Module):
    def __init__(self,n_layers=2, drop_prob=0.1, vocab_size=51, d_model=512, device='cpu'):
        super(Encoder, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_embedding = nn.Embedding(vocab_size, d_model)
        self.attention = MultiHeadAttention()
        self.ffn = PositionwiseFeedForward()
        self.layer_norm = LayerNorm()
        self.dropout = nn.Dropout(p=drop_prob)
        self.n_layers = n_layers
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.device = device

    def forward(self, x):
        B, T = x.shape
        tok_emb = self.token_embedding(x)
        pos_emb = self.positional_embedding(torch.arange(T, device=device))
        x = tok_emb + pos_emb

        for i in range(self.n_layers):
            _x = x
            x = self.attention(x,x,x)
            x = self.dropout(x)
            x = self.layer_norm(x + _x)

            _x = x
            x = self.ffn(x)
            x = self.dropout(x)
            x = self.layer_norm(x + _x)

        return x
        

In [40]:
encoder_model = Encoder()

In [41]:
output = encoder_model(s_t)

In [42]:
output.shape

torch.Size([2, 3, 512])

In [43]:
output

tensor([[[-0.2478,  1.3195, -1.1371,  ...,  0.3854, -0.9986,  1.0983],
         [ 1.4962, -0.1559,  0.5545,  ...,  0.7497, -0.6175,  0.3648],
         [-0.7865,  0.1280,  1.4923,  ..., -0.6512,  0.6116, -1.5909]],

        [[-0.1754,  1.5139, -1.0582,  ...,  0.8110, -1.3169,  0.9074],
         [ 1.5260,  0.0705, -0.0097,  ...,  0.5855, -0.5872,  0.5131],
         [-0.6733,  0.2820,  1.5630,  ..., -0.7395,  0.1737, -1.4332]]],
       grad_fn=<AddBackward0>)

In [44]:
print(encoder_model)

Encoder(
  (token_embedding): Embedding(51, 512)
  (positional_embedding): Embedding(51, 512)
  (attention): MultiHeadAttention(
    (k): Linear(in_features=512, out_features=512, bias=False)
    (v): Linear(in_features=512, out_features=512, bias=False)
    (q): Linear(in_features=512, out_features=512, bias=False)
    (w_concat): Linear(in_features=512, out_features=512, bias=True)
    (softmax): Softmax(dim=-1)
  )
  (ffn): PositionwiseFeedForward(
    (l1): Linear(in_features=512, out_features=2048, bias=True)
    (l2): Linear(in_features=2048, out_features=512, bias=True)
    (relu): ReLU()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (layer_norm): LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)


In [45]:
total_params = sum(
	param.numel() for param in encoder_model.parameters()
)


In [46]:
trainable_params = sum(
	p.numel() for p in encoder_model.parameters() if p.requires_grad
)


In [47]:
print(f"Total Parameters: {total_params}, Trainable Parameters: {trainable_params}")

Total Parameters: 3202048, Trainable Parameters: 3202048


In [48]:
torch.onnx.export(encoder_model, s_t, 'encoder.onnx', input_names=["in_tokens"], output_names=["out_tokens"])

## Transformer Decoder implementation

In [49]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?

In [50]:

class Decoder(nn.Module):
    def __init__(self,n_layers=2, vocab_size=51, d_model=512,drop_prob=0.1, device='cpu') -> None:
        super(Decoder, self).__init__()

        self.positional_embedding = nn.Embedding(vocab_size, d_model)
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.attention = MultiHeadAttention()
        self.ffn = PositionwiseFeedForward()
        self.layer_norm = LayerNorm()
        self.dropout = nn.Dropout(p=drop_prob)

        self.n_layers = n_layers
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.device = device

        

    
    def forward(self, x,k,v):
        B, T = x.shape
        tok_emb = self.token_embedding(x) 
        pos_emb = self.positional_embedding(torch.arange(T, device=self.device))
        x = tok_emb + pos_emb


        for i in range(self.n_layers):
            _x = x
            x = self.attention(x,x,x, True)
            x = self.dropout(x)
            x = self.layer_norm(x + _x)

            _x = x
            x = self.attention(k,v,x)
            x = self.dropout(x)
            x = self.layer_norm(x + _x)

            _x = x
            x = self.ffn(x)
            x = self.dropout(x)
            x = self.layer_norm(x + _x)

            
        return x
            


In [51]:
decoder = Decoder()

In [52]:
s_t.shape

torch.Size([2, 3])

In [53]:
s_t

tensor([[23, 16, 27],
        [23, 16, 27]])

In [54]:

decoder_output = decoder(s_t, output, output)


In [55]:
decoder_output.shape

torch.Size([2, 3, 512])

## Encoder Decoder Model

In [56]:
class EncoderDecoder(nn.Module):
    def __init__(self, d_model=512, vocab_size=51) -> None:
        super(EncoderDecoder, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)

    def forward(self,x):
        encoder_output = self.encoder(x)
        output = self.decoder(x, encoder_output, encoder_output)
        output = self.lm_head(output)
        return output
    


In [57]:
encoder_decoder_model = EncoderDecoder()

In [58]:
output = encoder_decoder_model(s_t)

In [59]:
output.shape

torch.Size([2, 3, 51])

In [61]:
output.contiguous().view(-1, output.shape[-1]).shape

torch.Size([6, 51])

## Experiments

In [4]:
s_t = torch.stack((torch.tensor([23,16,27]), torch.tensor([23,16,27])))

In [5]:
d_model = 512

In [7]:
vocab_size = 51

In [8]:
token_embedding = nn.Embedding(vocab_size, d_model)
positional_embedding = nn.Embedding(vocab_size, d_model)

In [9]:
s_t.shape

torch.Size([2, 3])

In [10]:
tok_emb = token_embedding(s_t)

In [11]:
tok_emb.shape

torch.Size([2, 3, 512])

In [12]:
tok_emb

tensor([[[-1.7757,  0.0586,  0.2674,  ..., -0.5857, -0.2443, -0.2355],
         [ 0.6047, -0.5344,  0.9430,  ...,  0.1872, -0.5387,  1.2751],
         [-0.0704, -0.5737,  0.0403,  ...,  0.9521,  0.6735,  0.7023]],

        [[-1.7757,  0.0586,  0.2674,  ..., -0.5857, -0.2443, -0.2355],
         [ 0.6047, -0.5344,  0.9430,  ...,  0.1872, -0.5387,  1.2751],
         [-0.0704, -0.5737,  0.0403,  ...,  0.9521,  0.6735,  0.7023]]],
       grad_fn=<EmbeddingBackward0>)

In [13]:
pos_emb = positional_embedding(torch.arange(s_t.shape[1]))

In [14]:
pos_emb

tensor([[ 0.2713, -1.6347, -1.1250,  ..., -1.2926,  0.1686,  0.3813],
        [ 0.7880, -0.4153, -0.7522,  ..., -0.9934,  1.1213,  0.2822],
        [ 0.6961, -0.2195,  0.0581,  ..., -1.2911,  1.7523, -0.0187]],
       grad_fn=<EmbeddingBackward0>)

In [15]:
pos_emb.shape

torch.Size([3, 512])

In [60]:
a = torch.tensor([[1,2],[3,4]])

In [61]:
a.shape

torch.Size([2, 2])

In [62]:
b = torch.tensor([1,2])

In [63]:
b.shape

torch.Size([2])

In [64]:
a + b

tensor([[2, 4],
        [4, 6]])

In [16]:
input_1 = tok_emb + pos_emb

In [17]:
input_1.shape

torch.Size([2, 3, 512])

In [18]:
heads=8

In [19]:
k = nn.Linear(d_model, d_model)(input_1)
v = nn.Linear(d_model, d_model)(input_1)
q = nn.Linear(d_model, d_model)(input_1)


In [36]:
temp_t = torch.rand([2,3,512])

In [40]:
temp_t

tensor([[[0.0802, 0.0982, 0.4308,  ..., 0.7891, 0.7740, 0.0154],
         [0.6104, 0.0826, 0.5575,  ..., 0.5876, 0.5308, 0.4425],
         [0.9638, 0.9295, 0.8518,  ..., 0.3090, 0.9917, 0.1731]],

        [[0.6728, 0.1163, 0.8021,  ..., 0.6018, 0.2805, 0.0305],
         [0.7369, 0.1752, 0.8285,  ..., 0.2431, 0.1567, 0.5318],
         [0.6567, 0.3164, 0.1286,  ..., 0.1141, 0.1067, 0.6810]]])

In [38]:
temp_t.shape

torch.Size([2, 3, 512])

In [42]:
heads

8

In [44]:
temp_t.size()

torch.Size([2, 3, 512])

In [46]:
temp_t_m = temp_t.view(2,3,heads, 512//heads)

In [47]:
temp_t_m.size()

torch.Size([2, 3, 8, 64])

In [51]:
k.shape

torch.Size([2, 3, 512])

In [52]:
k = k.view(2,3,heads, 512//heads)

In [53]:
q.shape

torch.Size([2, 3, 512])

In [54]:
q = q.view(2,3,heads, 512//heads)

In [55]:
v = v.view(2,3,heads, 512//heads)

In [56]:
k.size()

torch.Size([2, 3, 8, 64])

In [57]:
k_t = k.transpose(2,3)

In [58]:
k_t.shape

torch.Size([2, 3, 64, 8])

In [59]:
(q @ k_t).shape

torch.Size([2, 3, 8, 8])

In [60]:
q @ k_t

tensor([[[[  1.2117,  -0.3581,  -0.4260,  -0.2827,  -2.8279,  -2.1437,  -2.8679,
            -1.0343],
          [  7.1344,   3.5787,   0.3892,   3.0862,  -5.9506,  -2.9849,   2.5459,
             3.6199],
          [  5.2800,   1.7048,  -4.2753,   3.9341,  -1.3720,   5.1133,  -4.5258,
             6.0240],
          [  0.0952,  -0.1833,   7.6764,  -1.8878,   4.6266,  -0.9071,   0.0241,
            -1.9372],
          [  3.7228,   7.9129,  -6.5832,   6.3541,  -3.6942,   4.2208,   2.3070,
            -1.6370],
          [ -5.9752,   1.5262,  -2.4834,   1.4054,  -7.2886,   0.4416,   0.1225,
            -1.8685],
          [ -2.5396,   3.4102,  -1.2021,   1.1720,   0.7296,   0.8213,  10.1474,
             1.8474],
          [ -0.6137,  -4.1179,   0.0646,   4.4747,  -0.4499,   9.5888,  -2.5183,
            -5.3273]],

         [[ -1.2489,  -1.0893,   8.4847,  -1.0904,   3.6712,   6.9259,  -3.6202,
             8.8685],
          [ -7.5599,  -5.0985,  -5.1248,   2.2981,   0.1744,  -6.4129, 

In [71]:
scale_product = (q@k_t)/torch.sqrt(torch.Tensor([k_t.shape[-1]]))

In [72]:
scale_product.shape

torch.Size([2, 3, 8, 8])

In [73]:
scale_product

tensor([[[[ 0.4284, -0.1266, -0.1506, -0.0999, -0.9998, -0.7579, -1.0139,
           -0.3657],
          [ 2.5224,  1.2652,  0.1376,  1.0911, -2.1039, -1.0553,  0.9001,
            1.2798],
          [ 1.8668,  0.6027, -1.5116,  1.3909, -0.4851,  1.8078, -1.6001,
            2.1298],
          [ 0.0337, -0.0648,  2.7140, -0.6675,  1.6357, -0.3207,  0.0085,
           -0.6849],
          [ 1.3162,  2.7976, -2.3275,  2.2465, -1.3061,  1.4923,  0.8157,
           -0.5788],
          [-2.1125,  0.5396, -0.8780,  0.4969, -2.5769,  0.1561,  0.0433,
           -0.6606],
          [-0.8979,  1.2057, -0.4250,  0.4144,  0.2579,  0.2904,  3.5876,
            0.6531],
          [-0.2170, -1.4559,  0.0228,  1.5821, -0.1591,  3.3901, -0.8903,
           -1.8835]],

         [[-0.4415, -0.3851,  2.9998, -0.3855,  1.2980,  2.4487, -1.2799,
            3.1355],
          [-2.6728, -1.8026, -1.8119,  0.8125,  0.0617, -2.2673,  0.7963,
           -1.3605],
          [ 1.6338, -2.0747,  1.4839,  0.5899,  

In [74]:
attention_output = nn.Softmax(dim=-1)(scale_product)

In [75]:
attention_output

tensor([[[[2.5269e-01, 1.4506e-01, 1.4161e-01, 1.4898e-01, 6.0578e-02,
           7.7155e-02, 5.9727e-02, 1.1421e-01],
          [4.6742e-01, 1.3296e-01, 4.3053e-02, 1.1172e-01, 4.5766e-03,
           1.3059e-02, 9.2291e-02, 1.3492e-01],
          [2.3213e-01, 6.5578e-02, 7.9165e-03, 1.4423e-01, 2.2097e-02,
           2.1884e-01, 7.2457e-03, 3.0197e-01],
          [4.1459e-02, 3.7572e-02, 6.0490e-01, 2.0565e-02, 2.0578e-01,
           2.9088e-02, 4.0430e-02, 2.0209e-02],
          [1.0018e-01, 4.4070e-01, 2.6201e-03, 2.5398e-01, 7.2763e-03,
           1.1946e-01, 6.0727e-02, 1.5059e-02],
          [1.8046e-02, 2.5597e-01, 6.2019e-02, 2.4527e-01, 1.1342e-02,
           1.7444e-01, 1.5583e-01, 7.7080e-02],
          [8.7404e-03, 7.1631e-02, 1.4025e-02, 3.2466e-02, 2.7765e-02,
           2.8680e-02, 7.7547e-01, 4.1222e-02],
          [2.1176e-02, 6.1347e-03, 2.6915e-02, 1.2798e-01, 2.2439e-02,
           7.8055e-01, 1.0800e-02, 4.0003e-03]],

         [[1.0612e-02, 1.1228e-02, 3.3139e-01,

In [76]:
attention_output.shape

torch.Size([2, 3, 8, 8])

In [77]:
layer_output = attention_output @ v

In [78]:
layer_output.shape

torch.Size([2, 3, 8, 64])

In [79]:
concat_output = layer_output.view(2,3,d_model)

In [80]:
concat_output.size()

torch.Size([2, 3, 512])

In [81]:
w_concat = nn.Linear(d_model, d_model)

In [82]:
final_output = w_concat(concat_output)

In [83]:
final_output.shape

torch.Size([2, 3, 512])

### Layer Normalization experiment

In [96]:
final_output[-1].shape

torch.Size([3, 512])

In [106]:
final_output.mean(-1, keepdim=True).shape

torch.Size([2, 3, 1])

In [113]:
final_output.var(-1, keepdim=True)

tensor([[[0.0712],
         [0.0562],
         [0.0983]],

        [[0.0712],
         [0.0562],
         [0.0983]]], grad_fn=<VarBackward0>)

In [111]:
final_output.var(-1, keepdim=True, unbiased=False)

tensor([[[0.0711],
         [0.0561],
         [0.0981]],

        [[0.0711],
         [0.0561],
         [0.0981]]], grad_fn=<VarBackward0>)

torch.Size([512])

In [118]:
gamma = nn.Parameter(torch.ones(d_model))
beta = nn.Parameter(torch.zeros(d_model))

In [120]:
gamma.shape

torch.Size([512])

In [121]:
beta.shape

torch.Size([512])

In [122]:
eps=1e-12

In [29]:
t = torch.rand((2,3,512))

In [30]:
t

tensor([[[0.8900, 0.9574, 0.4480,  ..., 0.6815, 0.9838, 0.6955],
         [0.4489, 0.2031, 0.7776,  ..., 0.2069, 0.8618, 0.3260],
         [0.2983, 0.6056, 0.7801,  ..., 0.9291, 0.3297, 0.6644]],

        [[0.1237, 0.2730, 0.5457,  ..., 0.6981, 0.4524, 0.2758],
         [0.4233, 0.9857, 0.5810,  ..., 0.5019, 0.9301, 0.6828],
         [0.6560, 0.5124, 0.4303,  ..., 0.6155, 0.7147, 0.4626]]])

In [31]:
t.shape

torch.Size([2, 3, 512])

In [32]:
torch.tril(t).masked_fill()

tensor([[[0.8900, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.4489, 0.2031, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.2983, 0.6056, 0.7801,  ..., 0.0000, 0.0000, 0.0000]],

        [[0.1237, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.4233, 0.9857, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
         [0.6560, 0.5124, 0.4303,  ..., 0.0000, 0.0000, 0.0000]]])