In [1]:
import torch
from torch import nn
import numpy as np
import pandas as pd

In [4]:
companies_df = pd.read_csv('companies_sorted.csv')

In [5]:
companies_df.head()

Unnamed: 0.1,Unnamed: 0,name,domain,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate
0,5872184,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,716906
1,4425416,tata consultancy services,tcs.com,1968.0,information technology and services,10001+,"bombay, maharashtra, india",india,linkedin.com/company/tata-consultancy-services,190771,341369
2,21074,accenture,accenture.com,1989.0,information technology and services,10001+,"dublin, dublin, ireland",ireland,linkedin.com/company/accenture,190689,455768
3,2309813,us army,goarmy.com,1800.0,military,10001+,"alexandria, virginia, united states",united states,linkedin.com/company/us-army,162163,445958
4,1558607,ey,ey.com,1989.0,accounting,10001+,"london, greater london, united kingdom",united kingdom,linkedin.com/company/ernstandyoung,158363,428960


In [None]:
#companies_df['name'][:1000].to_csv('companies_names_list.csv', index=False)

In [9]:
company_names = companies_df['name'][:1000].to_list()

In [10]:
len(company_names)

1000

In [39]:
print(' '.join(company_names))

ibm tata consultancy services accenture us army ey hewlett-packard cognizant technology solutions walmart microsoft at&t united states air force pwc wells fargo infosys deloitte citi bank of america jpmorgan chase & co. us navy amazon apple siemens oracle nokia education nationale capgemini wipro technologies hsbc google united states postal service mcdonald's corporation ericsson nhs boeing vodafone kaiser permanente ford motor company intel corporation department of veterans affairs cisco general motors pepsico target shell hcl technologies ups pfizer unilever nestlé unitedhealth group huawei technologies dell gsk honeywell sap schlumberger lockheed martin ayatama energi, trisco nusantara abb american center of krasnodar starbucks promobroker agente de seguros y de fianzas s a de c v morgan stanley the home depot independiente novartis johnson & johnson bp schneider electric state farm secretaría de educación pública dhl my home td ge philips ubs procter & gamble walgreens keller wil

In [11]:
len(''.join(company_names))

13915

In [23]:
all_chars = sorted(list(set(''.join(company_names))))

In [30]:
vocab_size = len(all_chars)

In [31]:
print(''.join(all_chars))

 "&'()+,-./1234abcdefghijklmnopqrstuvwxyz|®ãéëíóôúē


In [25]:
print(all_chars)

[' ', '"', '&', "'", '(', ')', '+', ',', '-', '.', '/', '1', '2', '3', '4', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '®', 'ã', 'é', 'ë', 'í', 'ó', 'ô', 'ú', 'ē']


In [26]:
stoi = {s:i for i,s in enumerate(all_chars)}

In [None]:
stoi

In [28]:
itos = {i:s for i,s in enumerate(all_chars)}

In [32]:
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [34]:
for name in company_names:
    print(name)
    print(encode(name))
    print(decode(encode(name)))
    break

ibm
[23, 16, 27]
ibm


In [41]:
token_embedding = nn.Embedding(vocab_size, 32)

In [None]:
token = token_embedding(torch.tensor([23,16,27]))

In [44]:
token.shape

torch.Size([3, 32])

In [3]:
s_t = torch.stack((torch.tensor([23,16,27]), torch.tensor([23,16,27])))

In [49]:
s_t.shape

torch.Size([2, 3])

In [40]:
torch.arange(3)

tensor([0, 1, 2])

In [154]:
class LayerNorm(nn.Module):
    def __init__(self,d_model=512, eps=1e-12):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(d_model))
        self.beta = nn.Parameter(torch.zeros(d_model))
        self.eps = eps
    def forward(self,x):
        mean = x.mean(-1,keepdim=True)
        var = x.var(-1, keepdim=True, unbiased=False)

        out = (x - mean)/torch.sqrt(var + self.eps)
        out = out * self.gamma + self.beta
        return out


In [155]:
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model=512, d_ff=2048, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.l1 = nn.Linear(d_model,d_ff )
        self.l2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.l1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.l2(x)
        return x

In [156]:
d_model = 512

class MultiHeadAttention(nn.Module):
    def __init__(self, heads: int = 8):
        super(MultiHeadAttention, self).__init__()
        self.k = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, d_model)
        self.q = nn.Linear(d_model, d_model)
        self.w_concat = nn.Linear(d_model, d_model)
        self.softmax = nn.Softmax(dim=-1)
        self.heads = heads

    def forward(self, x: torch.tensor):

        k = self.split(self.k(x))
        v = self.split(self.v(x))
        q = self.split(self.q(x))



        k_t = k.transpose(2,3)

        score = (q@k_t)*(k.shape[-1]**-0.5)

        score = self.softmax(score)

        scale_product = score @ v
        
        batch, length, heads, d_tensor = scale_product.size()
        scale_product = scale_product.view(batch, length, heads*d_tensor)
        scale_product = self.w_concat(scale_product)

        return scale_product
    
    def split(self, tensor:torch.tensor):

        batch, length, d_model = tensor.size()
        d_tensor = d_model//self.heads

        new_tensor = tensor.view(batch, length, self.heads, d_tensor)

        return new_tensor
    




    

In [157]:
device = 'cpu'


class Encoder(nn.Module):
    def __init__(self, drop_prob=0.1):
        super(Encoder, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.positional_embedding = nn.Embedding(vocab_size, d_model)
        self.attention = MultiHeadAttention()
        self.ffn = PositionwiseFeedForward()
        self.layer_norm = LayerNorm()
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.dropout2 = nn.Dropout(p=drop_prob)

    def forward(self, x):
        B, T = x.shape
        tok_emb = self.token_embedding(x)
        pos_emb = self.positional_embedding(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        _x = x
        x = self.attention(x)
        x = self.dropout1(x)

        x = self.layer_norm(x + _x)

        _x = x
        x = self.ffn(x)

        x = self.dropout2(x)
        x = self.layer_norm(x + _x)

        return x
        

In [158]:
encoder_model = Encoder()

In [159]:
output = encoder_model(s_t)

In [160]:
output.shape

torch.Size([2, 3, 512])

In [161]:
output

tensor([[[-0.4820, -0.0833, -0.7650,  ..., -0.8206, -1.5066,  0.2730],
         [-1.0457,  0.7325,  0.5840,  ..., -0.9054,  0.8780, -0.7560],
         [-0.5565,  0.5629,  0.2293,  ..., -0.8365, -1.5694, -1.5336]],

        [[-0.2968, -0.1843, -0.7212,  ..., -0.9152, -1.4189,  0.2936],
         [-1.1320,  0.6276,  0.4323,  ..., -0.9554,  1.1707, -0.5591],
         [-0.5562,  0.2240,  0.2159,  ..., -1.0180, -1.4673, -1.4312]]],
       grad_fn=<AddBackward0>)

## Experiments

In [4]:
s_t = torch.stack((torch.tensor([23,16,27]), torch.tensor([23,16,27])))

In [5]:
d_model = 512

In [7]:
vocab_size = 51

In [8]:
token_embedding = nn.Embedding(vocab_size, d_model)
positional_embedding = nn.Embedding(vocab_size, d_model)

In [9]:
s_t.shape

torch.Size([2, 3])

In [10]:
tok_emb = token_embedding(s_t)

In [11]:
tok_emb.shape

torch.Size([2, 3, 512])

In [12]:
tok_emb

tensor([[[-1.7757,  0.0586,  0.2674,  ..., -0.5857, -0.2443, -0.2355],
         [ 0.6047, -0.5344,  0.9430,  ...,  0.1872, -0.5387,  1.2751],
         [-0.0704, -0.5737,  0.0403,  ...,  0.9521,  0.6735,  0.7023]],

        [[-1.7757,  0.0586,  0.2674,  ..., -0.5857, -0.2443, -0.2355],
         [ 0.6047, -0.5344,  0.9430,  ...,  0.1872, -0.5387,  1.2751],
         [-0.0704, -0.5737,  0.0403,  ...,  0.9521,  0.6735,  0.7023]]],
       grad_fn=<EmbeddingBackward0>)

In [13]:
pos_emb = positional_embedding(torch.arange(s_t.shape[1]))

In [14]:
pos_emb

tensor([[ 0.2713, -1.6347, -1.1250,  ..., -1.2926,  0.1686,  0.3813],
        [ 0.7880, -0.4153, -0.7522,  ..., -0.9934,  1.1213,  0.2822],
        [ 0.6961, -0.2195,  0.0581,  ..., -1.2911,  1.7523, -0.0187]],
       grad_fn=<EmbeddingBackward0>)

In [15]:
pos_emb.shape

torch.Size([3, 512])

In [60]:
a = torch.tensor([[1,2],[3,4]])

In [61]:
a.shape

torch.Size([2, 2])

In [62]:
b = torch.tensor([1,2])

In [63]:
b.shape

torch.Size([2])

In [64]:
a + b

tensor([[2, 4],
        [4, 6]])

In [16]:
input_1 = tok_emb + pos_emb

In [17]:
input_1.shape

torch.Size([2, 3, 512])

In [18]:
heads=8

In [19]:
k = nn.Linear(d_model, d_model)(input_1)
v = nn.Linear(d_model, d_model)(input_1)
q = nn.Linear(d_model, d_model)(input_1)


In [36]:
temp_t = torch.rand([2,3,512])

In [40]:
temp_t

tensor([[[0.0802, 0.0982, 0.4308,  ..., 0.7891, 0.7740, 0.0154],
         [0.6104, 0.0826, 0.5575,  ..., 0.5876, 0.5308, 0.4425],
         [0.9638, 0.9295, 0.8518,  ..., 0.3090, 0.9917, 0.1731]],

        [[0.6728, 0.1163, 0.8021,  ..., 0.6018, 0.2805, 0.0305],
         [0.7369, 0.1752, 0.8285,  ..., 0.2431, 0.1567, 0.5318],
         [0.6567, 0.3164, 0.1286,  ..., 0.1141, 0.1067, 0.6810]]])

In [38]:
temp_t.shape

torch.Size([2, 3, 512])

In [42]:
heads

8

In [44]:
temp_t.size()

torch.Size([2, 3, 512])

In [46]:
temp_t_m = temp_t.view(2,3,heads, 512//heads)

In [47]:
temp_t_m.size()

torch.Size([2, 3, 8, 64])

In [51]:
k.shape

torch.Size([2, 3, 512])

In [52]:
k = k.view(2,3,heads, 512//heads)

In [53]:
q.shape

torch.Size([2, 3, 512])

In [54]:
q = q.view(2,3,heads, 512//heads)

In [55]:
v = v.view(2,3,heads, 512//heads)

In [56]:
k.size()

torch.Size([2, 3, 8, 64])

In [57]:
k_t = k.transpose(2,3)

In [58]:
k_t.shape

torch.Size([2, 3, 64, 8])

In [59]:
(q @ k_t).shape

torch.Size([2, 3, 8, 8])

In [60]:
q @ k_t

tensor([[[[  1.2117,  -0.3581,  -0.4260,  -0.2827,  -2.8279,  -2.1437,  -2.8679,
            -1.0343],
          [  7.1344,   3.5787,   0.3892,   3.0862,  -5.9506,  -2.9849,   2.5459,
             3.6199],
          [  5.2800,   1.7048,  -4.2753,   3.9341,  -1.3720,   5.1133,  -4.5258,
             6.0240],
          [  0.0952,  -0.1833,   7.6764,  -1.8878,   4.6266,  -0.9071,   0.0241,
            -1.9372],
          [  3.7228,   7.9129,  -6.5832,   6.3541,  -3.6942,   4.2208,   2.3070,
            -1.6370],
          [ -5.9752,   1.5262,  -2.4834,   1.4054,  -7.2886,   0.4416,   0.1225,
            -1.8685],
          [ -2.5396,   3.4102,  -1.2021,   1.1720,   0.7296,   0.8213,  10.1474,
             1.8474],
          [ -0.6137,  -4.1179,   0.0646,   4.4747,  -0.4499,   9.5888,  -2.5183,
            -5.3273]],

         [[ -1.2489,  -1.0893,   8.4847,  -1.0904,   3.6712,   6.9259,  -3.6202,
             8.8685],
          [ -7.5599,  -5.0985,  -5.1248,   2.2981,   0.1744,  -6.4129, 

In [71]:
scale_product = (q@k_t)/torch.sqrt(torch.Tensor([k_t.shape[-1]]))

In [72]:
scale_product.shape

torch.Size([2, 3, 8, 8])

In [73]:
scale_product

tensor([[[[ 0.4284, -0.1266, -0.1506, -0.0999, -0.9998, -0.7579, -1.0139,
           -0.3657],
          [ 2.5224,  1.2652,  0.1376,  1.0911, -2.1039, -1.0553,  0.9001,
            1.2798],
          [ 1.8668,  0.6027, -1.5116,  1.3909, -0.4851,  1.8078, -1.6001,
            2.1298],
          [ 0.0337, -0.0648,  2.7140, -0.6675,  1.6357, -0.3207,  0.0085,
           -0.6849],
          [ 1.3162,  2.7976, -2.3275,  2.2465, -1.3061,  1.4923,  0.8157,
           -0.5788],
          [-2.1125,  0.5396, -0.8780,  0.4969, -2.5769,  0.1561,  0.0433,
           -0.6606],
          [-0.8979,  1.2057, -0.4250,  0.4144,  0.2579,  0.2904,  3.5876,
            0.6531],
          [-0.2170, -1.4559,  0.0228,  1.5821, -0.1591,  3.3901, -0.8903,
           -1.8835]],

         [[-0.4415, -0.3851,  2.9998, -0.3855,  1.2980,  2.4487, -1.2799,
            3.1355],
          [-2.6728, -1.8026, -1.8119,  0.8125,  0.0617, -2.2673,  0.7963,
           -1.3605],
          [ 1.6338, -2.0747,  1.4839,  0.5899,  

In [74]:
attention_output = nn.Softmax(dim=-1)(scale_product)

In [75]:
attention_output

tensor([[[[2.5269e-01, 1.4506e-01, 1.4161e-01, 1.4898e-01, 6.0578e-02,
           7.7155e-02, 5.9727e-02, 1.1421e-01],
          [4.6742e-01, 1.3296e-01, 4.3053e-02, 1.1172e-01, 4.5766e-03,
           1.3059e-02, 9.2291e-02, 1.3492e-01],
          [2.3213e-01, 6.5578e-02, 7.9165e-03, 1.4423e-01, 2.2097e-02,
           2.1884e-01, 7.2457e-03, 3.0197e-01],
          [4.1459e-02, 3.7572e-02, 6.0490e-01, 2.0565e-02, 2.0578e-01,
           2.9088e-02, 4.0430e-02, 2.0209e-02],
          [1.0018e-01, 4.4070e-01, 2.6201e-03, 2.5398e-01, 7.2763e-03,
           1.1946e-01, 6.0727e-02, 1.5059e-02],
          [1.8046e-02, 2.5597e-01, 6.2019e-02, 2.4527e-01, 1.1342e-02,
           1.7444e-01, 1.5583e-01, 7.7080e-02],
          [8.7404e-03, 7.1631e-02, 1.4025e-02, 3.2466e-02, 2.7765e-02,
           2.8680e-02, 7.7547e-01, 4.1222e-02],
          [2.1176e-02, 6.1347e-03, 2.6915e-02, 1.2798e-01, 2.2439e-02,
           7.8055e-01, 1.0800e-02, 4.0003e-03]],

         [[1.0612e-02, 1.1228e-02, 3.3139e-01,

In [76]:
attention_output.shape

torch.Size([2, 3, 8, 8])

In [77]:
layer_output = attention_output @ v

In [78]:
layer_output.shape

torch.Size([2, 3, 8, 64])

In [79]:
concat_output = layer_output.view(2,3,d_model)

In [80]:
concat_output.size()

torch.Size([2, 3, 512])

In [81]:
w_concat = nn.Linear(d_model, d_model)

In [82]:
final_output = w_concat(concat_output)

In [83]:
final_output.shape

torch.Size([2, 3, 512])

### Layer Normalization experiment

In [96]:
final_output[-1].shape

torch.Size([3, 512])

In [106]:
final_output.mean(-1, keepdim=True).shape

torch.Size([2, 3, 1])

In [113]:
final_output.var(-1, keepdim=True)

tensor([[[0.0712],
         [0.0562],
         [0.0983]],

        [[0.0712],
         [0.0562],
         [0.0983]]], grad_fn=<VarBackward0>)

In [111]:
final_output.var(-1, keepdim=True, unbiased=False)

tensor([[[0.0711],
         [0.0561],
         [0.0981]],

        [[0.0711],
         [0.0561],
         [0.0981]]], grad_fn=<VarBackward0>)

torch.Size([512])

In [118]:
gamma = nn.Parameter(torch.ones(d_model))
beta = nn.Parameter(torch.zeros(d_model))

In [120]:
gamma.shape

torch.Size([512])

In [121]:
beta.shape

torch.Size([512])

In [122]:
eps=1e-12