In [1]:
import numpy as np
import scipy as sp
import torch 
import torch.nn as nn

In [15]:
word_1 = np.array([1, 0, 0])
word_2 = np.array([0, 1, 0])
word_3 = np.array([1, 1, 0])
word_4 = np.array([0, 0, 1])


In [16]:
W_Q = np.random.randint(3, size=(3, 3))
W_K = np.random.randint(3, size=(3, 3))
W_V = np.random.randint(3, size=(3, 3))

In [17]:
query_1 = word_1 @ W_Q
key_1 = word_1 @ W_K
value_1 = word_1 @ W_V
 
query_2 = word_2 @ W_Q
key_2 = word_2 @ W_K
value_2 = word_2 @ W_V
 
query_3 = word_3 @ W_Q
key_3 = word_3 @ W_K
value_3 = word_3 @ W_V
 
query_4 = word_4 @ W_Q
key_4 = word_4 @ W_K
value_4 = word_4 @ W_V

query_1.shape, key_1.shape, value_1.shape

((3,), (3,), (3,))

In [18]:
scores = np.array([np.dot(query_1, key_1), np.dot(query_1, key_2), np.dot(query_1, key_3), np.dot(query_1, key_4)])
scores

array([2, 2, 4, 4])

In [11]:
weights = sp.special.softmax(scores / key_1.shape[0] ** 0.5)
weights.shape

(4,)

In [31]:
from numpy import array
from numpy import random
from numpy import dot
from scipy.special import softmax

In [40]:
# encoder representations of four different words
word_1 = array([1, 0, 0])
word_2 = array([0, 1, 0])
word_3 = array([1, 1, 0])
word_4 = array([0, 0, 1])
word_1.shape

(3,)

In [41]:
# stacking the word embeddings into a single array
words = array([word_1, word_2, word_3, word_4])
words.shape

(4, 3)

In [43]:
# generating the weight matrices
random.seed(42)
W_Q = random.randint(3, size=(3, 3))
W_K = random.randint(3, size=(3, 3))
W_V = random.randint(3, size=(3, 3))
W_Q.shape

(3, 3)

In [44]:
# generating the queries, keys and values
Q = words @ W_Q
K = words @ W_K
V = words @ W_V
Q.shape, K.shape, V.shape

((4, 3), (4, 3), (4, 3))

In [45]:
# scoring the query vectors against all key vectors
scores = Q @ K.transpose()
scores.shape

(4, 4)

In [46]:
# computing the weights by a softmax operation
weights = softmax(scores / K.shape[1] ** 0.5, axis=1)
weights.shape

(4, 4)

In [47]:
# computing the attention by a weighted sum of the value vectors
attention = weights @ V
attention.shape

(4, 3)

In [39]:
 
print(attention)

[[0.98522025 1.74174051 0.75652026]
 [0.90965265 1.40965265 0.5       ]
 [0.99851226 1.75849334 0.75998108]
 [0.99560386 1.90407309 0.90846923]]


# scratch atteniton mechanism

In [1]:
import torch
import torch.nn as nn
import numpy as np

In [2]:
class EncoderSelfAttention(nn.Module):
    def __init__(self, input_size=256, hidden_size=256) -> None:
        super(EncoderSelfAttention, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.Q = nn.Linear(in_features=self.input_size, out_features=self.hidden_size)
        self.K = nn.Linear(in_features=self.input_size, out_features=self.hidden_size)
        self.V = nn.Linear(in_features=self.input_size, out_features=self.hidden_size)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, input_embeddings):
        query = self.Q(input_embeddings)
        key = self.K(input_embeddings)
        value = self.V(input_embeddings)
        latent_embeddings = torch.matmul(self.softmax(torch.matmul(query, torch.transpose(key, 0, 1))), value)
        return latent_embeddings

class DecoderSelfAttention(nn.Module):
    def __init__(self, input_size=256, hidden_size=256) -> None:
        super(DecoderSelfAttention, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.Q = nn.Linear(in_features=self.input_size, out_features=self.hidden_size)
        self.K = nn.Linear(in_features=self.input_size, out_features=self.hidden_size)
        self.V = nn.Linear(in_features=self.input_size, out_features=self.hidden_size)
        self.softmax = nn.Softmax(dim=1)
        
    def forward(self, encoder_embeddings, decoder_embeddings):
        query = self.Q(decoder_embeddings)
        key = self.K(encoder_embeddings)
        value = self.V(encoder_embeddings)
        latent_embeddings = torch.matmul(self.softmax(torch.matmul(query, torch.transpose(key, 0, 1))), value)
        return latent_embeddings

class Model(nn.Module):
    def __init__(self, encoder, decoder, input_vocab_size, target_vocab_size) -> None:
        super(Model, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.encoder_embeddings = nn.Embedding(input_vocab_size, self.encoder.hidden_size)
        self.decoder_embeddings = nn.Embedding(target_vocab_size, self.decoder.hidden_size)
        self.output_layer = nn.Linear(in_features=self.decoder.hidden_size, out_features=target_vocab_size)
        self.softmax = nn.Softmax(dim = 1)
        
    def forward(self, batch):
        input_embeddings, target_embeddings = self.encoder_embeddings(batch[0]), self.decoder_embeddings(batch[1])
        encoder_embeddings = self.encoder(input_embeddings)
        decoder_embeddings = self.decoder(encoder_embeddings, target_embeddings)
        output_probabilities = self.softmax(self.output_layer(decoder_embeddings))
        
        return output_probabilities

In [3]:
input_vocab_size = 100
target_vocab_size = 200
hidden_size = 256

encoder = EncoderSelfAttention(input_size=hidden_size, hidden_size=hidden_size)
decoder = DecoderSelfAttention(input_size=hidden_size, hidden_size=hidden_size)
model = Model(encoder, decoder, input_vocab_size, target_vocab_size)

In [4]:
input_sent = torch.randint(0, high=input_vocab_size, size=(5, ))
target_sent = torch.randint(0, high=target_vocab_size, size=(7, ))
input_sent.size(), target_sent.size()

(torch.Size([5]), torch.Size([7]))

In [5]:
model((input_sent, target_sent)).size()

torch.Size([7, 200])

# transformer module in pytorch

In [9]:
import torch
import os 
import pandas as pd
import torch.nn as nn
import numpy as np
import pytorch_lightning as pl
import torch.nn.functional as F
import gc
from language import prepareData, indexesFromSentence, tensorFromSentence

  return torch._C._cuda_getDeviceCount() > 0


In [15]:
attention_layer = nn.MultiheadAttention(256, num_heads=2)
input_embeddings = torch.rand(10, 256)
input_embeddings.shape

torch.Size([10, 256])

In [None]:
attn_output, attn_weights = attention_layer(input_embeddings, input_embeddings, input_embeddings)
attn_output.shape, attn_weights.shape

(torch.Size([10, 256]), torch.Size([10, 10]))

In [3]:
eng, fra, pairs =  prepareData('eng', 'fra', reverse=False)
pairs = np.array(pairs)

Reading lines...
Read 135842 sentence pairs
Trimmed to 135842 sentence pairs
Counting words...
Counted words:
eng 13043
fra 21334


In [155]:
eng_sentences= pairs[:, 0]
eng_sentences[:5]

array(['go .', 'run !', 'run !', 'wow !', 'fire !'], dtype='<U348')

In [3]:
# df = pd.DataFrame({'sentences': eng_sentences}).drop_duplicates(subset=['sentences'])
# df['labels'] = np.random.randint(0, 2, len(df))
# df.to_csv('./HappySad.csv', index=False)

In [10]:
# df = pd.read_csv('./HappySad.csv')
# df.head()

In [5]:
# df['labels'].value_counts()

In [8]:
# df['word_array'] = df['sentences'].str.split(' ')
# df['lengths'] = df['word_array'].map(len)
# # df['indexes'] = df['sentences'].apply(lambda x: indexesFromSentence(lang=eng, sentence=x))
# df.head()

In [9]:
# df.to_csv('./HappySad.csv', index=False)

In [4]:
df = pd.read_csv('./HappySad.csv')

In [5]:
df.head()

Unnamed: 0,sentences,labels,word_array,lengths
0,go .,1,"['go', '.']",2
1,run !,0,"['run', '!']",2
2,wow !,0,"['wow', '!']",2
3,fire !,0,"['fire', '!']",2
4,help !,0,"['help', '!']",2


In [6]:
indexes = (df['sentences'].apply(lambda x: np.array(indexesFromSentence(lang=eng, sentence=x))).values).tolist()
indexes[:5]

[array([2, 3]), array([4, 5]), array([6, 5]), array([7, 5]), array([8, 5])]

In [7]:
data = [(torch.from_numpy(val).view(-1), torch.tensor(label, dtype=torch.float32)) for val, label in zip(indexes, df['labels'].values)] 

In [8]:
small_size, large_size = len(data)-int(len(data)*0.8), int(len(data)*0.8) 
dataset = torch.utils.data.random_split(data, [small_size, large_size])

In [9]:
dataset

[<torch.utils.data.dataset.Subset at 0x7f6958fb66a0>,
 <torch.utils.data.dataset.Subset at 0x7f6958f4d400>]

In [10]:
smalldataloader, largedataloader = torch.utils.data.DataLoader(dataset[0]),torch.utils.data.DataLoader(dataset[1])

In [11]:
for i in smalldataloader:
    print(i[0].shape)
    break

torch.Size([1, 15])


In [12]:
eng.n_words

13043

In [1]:
import torch

In [2]:
rand = torch.randint(low=0, high=10, size=(2, 5))
emb = torch.nn.Embedding(10, 10)
attention = torch.nn.MultiheadAttention(10, 1, batch_first=True)
lstm = torch.nn.LSTM(10, 50)
linear = torch.nn.Linear(50, 2)
emb_bag = torch.nn.EmbeddingBag(10,50)

In [3]:
rand.shape

torch.Size([2, 5])

In [4]:
out_emb = emb(rand)
out_multi_head_attention = attention(out_emb, out_emb, out_emb)
# # out_lstm = lstm(out_emb)
# # out_linear = linear()

In [6]:
out_emb.shape

torch.Size([2, 5, 10])

In [7]:
out_multi_head_attention[0].shape, out_multi_head_attention[1].shape

(torch.Size([2, 5, 10]), torch.Size([2, 5, 5]))

In [86]:
# out_emb.shape

In [87]:
# out_multi_head_attention[0].shape, out_multi_head_attention[1].shape

In [88]:
# out_emb_bag = emb_bag(rand)

In [89]:
# out_emb_bag.shape

In [90]:
# out_emb.shape, out_lstm[0].shape, out_linear.shape

In [92]:
# out_lstm[1][0].shape, out_lstm[1][1].shape

In [41]:
rand = torch.rand(1 ,256)
layer_norm = nn.LayerNorm(256)
layer_norm(rand).size()

torch.Size([1, 256])

In [44]:
input = torch.rand(3, 5)
m = nn.MaxPool1d(3)
output = m(input)
output.size()

torch.Size([3, 1])

In [47]:
input

tensor([[0.2461, 0.3953, 0.8410, 0.9194, 0.1892],
        [0.5102, 0.0947, 0.4375, 0.5227, 0.8972],
        [0.0245, 0.2001, 0.3511, 0.2447, 0.9994]])

In [46]:
output

tensor([[0.8410],
        [0.5102],
        [0.3511]])

In [17]:
del model

In [27]:
rand = torch.rand(3,5)
rand.shape

torch.Size([3, 5])

In [28]:
rand

tensor([[0.6874, 0.8160, 0.5038, 0.5187, 0.9197],
        [0.0140, 0.0012, 0.0972, 0.8109, 0.1201],
        [0.3666, 0.0658, 0.1525, 0.4796, 0.8438]])

In [29]:
rand.max(dim=0)[0]

tensor([0.6874, 0.8160, 0.5038, 0.8109, 0.9197])

In [13]:
class AttnModel(pl.LightningModule):
    def __init__(self, lang, hidden_size=256):
        super(AttnModel, self).__init__()
        self.embeddings = nn.Embedding(lang.n_words, hidden_size)
        self.linear1 = nn.Linear(hidden_size, hidden_size)
        self.attention = nn.MultiheadAttention(hidden_size, 1)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.feed_forward = nn.Linear(hidden_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.loss = nn.BCELoss()
    
    def forward(self, input):
        embeddings = self.embeddings(input) # length, hidden
        linear1 = self.relu(self.linear1(embeddings)) # length, hidden 
        
        self.attention_output, self.attention_weights = self.attention(linear1, 
                                                             linear1, 
                                                             linear1) # length, hidden
        add_and_norm1 = self.layer_norm(self.attention_output+linear1) # length, hidden
        
        feed_forward = self.relu(self.feed_forward(add_and_norm1)) # length, hidden
        add_and_norm2 = self.layer_norm(feed_forward+add_and_norm1) # length, hidden
         
        # max returns tuple of values and indices, hence we take value.
        attention_output_aggregated = self.attention_output.max(dim=0)[0] # length, hidden
        feed_forward = self.relu(self.feed_forward(attention_output_aggregated))
        linear2 = self.linear2(feed_forward)
        return self.sigmoid(linear2)
    
    def training_epoch_end(self, outputs):
        train_loss = torch.as_tensor([i['loss'] for i in outputs]).mean()
        self.log_dict({'train_loss_epoch': train_loss})
    
    def training_step(self, batch, batch_idx):
        input_tensor, target_tensor = batch[0][0], batch[1]
        
        output_tensor = self(input_tensor)
        loss = self.loss(output_tensor, target_tensor)
        
        return {'loss':loss}
    
    def configure_optimizers(self):
        optimizer = torch.optim.SGD(self.parameters(),
                                     lr=0.01)
        return optimizer
        

model = AttnModel(eng)

In [14]:
gc.collect()

0

In [15]:
# sanity check

batch = next(iter(smalldataloader))
X,y = batch[0][0], batch[1]
model_out = model(X)

In [16]:
model_out

tensor([0.5171], grad_fn=<SigmoidBackward0>)

In [17]:
X.shape, y.dtype, X.device, model.device

(torch.Size([15]), torch.float32, device(type='cpu'), device(type='cpu'))

In [18]:
tblogger = pl.loggers.TensorBoardLogger('.', version='trial2')
trainer = pl.Trainer(max_epochs=10, accelerator='gpu', logger=tblogger)
trainer.fit(model=model, train_dataloaders=smalldataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name         | Type               | Params
----------------------------------------------------
0 | embeddings   | Embedding          | 3.3 M 
1 | linear1      | Linear             | 65.8 K
2 | attention    | MultiheadAttention | 263 K 
3 | layer_norm   | LayerNorm          | 512   
4 | feed_forward | Linear             | 65.8 K
5 | linear2      | Linear             | 257   
6 | sigmoid      | Sigmoid            | 0     
7 | relu         | ReLU               | 0     
8 | loss         | BCELoss            | 0     
----------------------------------------------------
3.7 M     Trainable params
0         Non-trainable params
3.7 M     Total params
14.938    Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

2023-01-01 10:48:02.408607: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-01 10:48:07.900713: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-01 10:48:07.901181: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [21]:
X.shape

torch.Size([11])

In [22]:
model_out = model(X)

In [23]:
import tqdm

In [24]:
loss = 0
pbar = tqdm.tqdm(smalldataloader, total=len(smalldataloader))
for i in pbar:
    X, y = i[0][0], i[1]
    model_temp_out = model(X)
    cur_loss = model.loss(model_out, y)
    loss += cur_loss
    pbar.set_description(f'cur loss {cur_loss}')

cur loss 0.6977843642234802: 100%|██████████| 18705/18705 [00:52<00:00, 358.52it/s]


In [29]:
loss/len(smalldataloader)

tensor(0.6932, grad_fn=<DivBackward0>)