In [1]:
import os
import sys

import torch
import torch.nn as nn
import torch.optim as optim

# Location of the Clinical-Trials-Eligibility checkout that contains the
# PyTorch_Files package. Set CLINICAL_TRIALS_ROOT to run the notebook on
# another machine; the original author's path remains the default.
PROJECT_ROOT = os.path.abspath(
    os.environ.get(
        "CLINICAL_TRIALS_ROOT",
        "/Users/rutvikdhopate/Downloads/Jupyter_Files/Clinical-Trials-Eligibility/",
    )
)
# Guard the append so re-running this cell does not pile up duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from PyTorch_Files.encoder_model import (
    SelfAttention,
    MultiHeadSelfAttention,
    FeedForwardNetwork,
    Encoder,
    Transformer,
    Classifier,
)

# Seed torch's global RNG so the randomly initialised embeddings and layers
# created below give the same numbers on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

In [2]:
# Dimensions shared by the single-head and multi-head attention demos.
attn_dims = dict(d_model=128, d_k=8, d_v=8)

# Build the three building blocks that the next cell pushes a toy batch through.
sa = SelfAttention(**attn_dims)
mhsa = MultiHeadSelfAttention(n_heads=4, **attn_dims)
ffn = FeedForwardNetwork(d_model=128, d_ff=512)

# Show the layers that make up the single-head attention module.
print(sa)

SelfAttention(
  (W_q): Linear(in_features=128, out_features=8, bias=True)
  (W_k): Linear(in_features=128, out_features=8, bias=True)
  (W_v): Linear(in_features=128, out_features=8, bias=True)
  (W_o): Linear(in_features=8, out_features=128, bias=True)
)


In [10]:
# Toy batch: four token-id sequences of length 8, each with an attention mask
# and a class label. NOTE(review): the mask presumably uses 1 = attend and
# 0 = ignore; it is built here but not passed to any module in this cell.
batch = [
    {'input_ids': torch.tensor([1, 15, 1075, 126, 194, 430, 105, 873]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 1, 1]), 'label': torch.tensor(1)},
    {'input_ids': torch.tensor([1, 34, 105, 65, 14, 43, 5, 3]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 0, 0]), 'label': torch.tensor(0)},
    {'input_ids': torch.tensor([1, 1500, 1075, 126, 194, 430, 15, 3]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 0, 1, 1]), 'label': torch.tensor(1)},
    {'input_ids': torch.tensor([1, 1050, 755, 16, 20, 4, 105, 83]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 1, 0]), 'label': torch.tensor(2)}
]

# Stack tensors into batch form
input_ids = torch.stack([b['input_ids'] for b in batch])            # [4, 8]
attention_mask = torch.stack([b['attention_mask'] for b in batch])  # [4, 8]
labels = torch.stack([b['label'] for b in batch])                   # [4]

embedding = nn.Embedding(num_embeddings=16000, embedding_dim=128)
embedded_data = embedding(input_ids)  # [4, 8, 128]

# The embedded batch has dimensions [batch_size, seq_len, embedding_dim]
print(f"Batch Size {embedded_data.size()}")

# Single-head self-attention ends with W_o (Linear d_v -> d_model, see the
# print(sa) output), so the result is [batch_size, seq_len, d_model],
# not [batch_size, seq_len, d_v].
print(f"Self Attention Size: {sa(embedded_data).size()}")

# Multi-head self-attention also returns [batch_size, seq_len, d_model].
# n_heads * d_v is 32 here, so the heads are presumably concatenated and then
# projected back to d_model inside the module -- TODO confirm in encoder_model.
# Run the module once and reuse the result for both the shape and the values.
x = mhsa(embedded_data)
print(f"Multi Head Self Attention Size: {x.size()}")
print(f"MHSA Output 1st example in batch: {x[0,:,:]}")

# FFN -> applied to the MHSA output; built with d_model=128 and d_ff=512, so
# the shape is expected to stay [batch_size, seq_len, d_model] (the last print
# confirms it).
x = ffn(x)
print(f"FFN Output: {x[0,:,:]}")
print(f"Shape after FFN: {x.size()}")

Batch Size torch.Size([4, 8, 128])
Self Attention Size: torch.Size([4, 8, 128])
Multi Head Self Attention Size: torch.Size([4, 8, 128])
MHSA Output 1st example in batch: tensor([[ 0.0080, -0.0329,  0.1310,  ..., -0.1632, -0.0729, -0.2994],
        [-0.0079,  0.0272,  0.1288,  ..., -0.1005, -0.0491, -0.2562],
        [ 0.0012, -0.0258,  0.0709,  ..., -0.1141, -0.0418, -0.2869],
        ...,
        [ 0.0389, -0.0528,  0.0900,  ..., -0.1330, -0.0439, -0.2535],
        [-0.0131,  0.0079,  0.0973,  ..., -0.1163, -0.0261, -0.2475],
        [-0.0495, -0.0214,  0.1042,  ..., -0.0770, -0.0284, -0.2744]],
       grad_fn=<SliceBackward0>)
FFN Output: tensor([[-0.0119, -0.0200, -0.0801,  ..., -0.0386, -0.0356, -0.0175],
        [-0.0169, -0.0069, -0.0701,  ..., -0.0302, -0.0394, -0.0157],
        [-0.0084, -0.0069, -0.0892,  ..., -0.0476, -0.0333, -0.0169],
        ...,
        [-0.0127, -0.0153, -0.0802,  ..., -0.0328, -0.0356, -0.0089],
        [-0.0114, -0.0102, -0.0823,  ..., -0.0357, -0.0305

In [17]:
# One encoder layer: 8 heads of width 16 over a 128-dimensional model.
enc = Encoder(d_model=128, d_k=16, d_v=16, n_heads=8, d_ff=512)

# Toy batch: four token-id sequences of length 8 with masks and labels.
# The attention mask is built but not passed to the encoder in this cell.
batch = [
    {'input_ids': torch.tensor([1, 15, 1075, 126, 194, 430, 105, 873]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 1, 1]), 'label': torch.tensor(1)},
    {'input_ids': torch.tensor([1, 34, 105, 65, 14, 43, 5, 3]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 0, 0]), 'label': torch.tensor(0)},
    {'input_ids': torch.tensor([1, 1500, 1075, 126, 194, 430, 15, 3]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 0, 1, 1]), 'label': torch.tensor(1)},
    {'input_ids': torch.tensor([1, 1050, 755, 16, 20, 4, 105, 83]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 1, 0]), 'label': torch.tensor(2)}
]

# Stack tensors into batch form
input_ids = torch.stack([b['input_ids'] for b in batch])            # [4, 8]
attention_mask = torch.stack([b['attention_mask'] for b in batch])  # [4, 8]
labels = torch.stack([b['label'] for b in batch])                   # [4]

embedding = nn.Embedding(num_embeddings=16000, embedding_dim=128)
embedded_data = embedding(input_ids)  # [4, 8, 128]

x = embedded_data

# Run the encoder once on the whole batch and slice the result. This avoids a
# second forward pass and avoids feeding the layer an unbatched 2-D tensor.
enc_out = enc(x)  # [4, 8, 128]
print(f"Output for the first example in the batch after one encoder layer:\n {enc_out[0,:,:]}")
print(enc_out.size())

Output for the first example in the batch after one encoder layer:
 tensor([[-0.6904,  0.2927,  0.1687,  ...,  1.3598,  0.3392,  0.2428],
        [ 1.0353,  0.1211,  0.2290,  ..., -0.3489, -0.2762,  1.8946],
        [-0.4611,  1.4712, -0.4305,  ..., -0.3563,  0.0455, -1.4414],
        ...,
        [ 1.0105,  0.9443,  0.1113,  ...,  0.5987,  0.1199,  0.1417],
        [-0.3019, -0.5907, -0.7737,  ...,  0.9657,  0.2028,  0.1290],
        [-0.1351, -2.6033,  1.6679,  ..., -0.5404, -0.1157, -0.0051]],
       grad_fn=<NativeLayerNormBackward0>)
torch.Size([4, 8, 128])


In [31]:
# Four stacked encoder layers on top of a 16000-token vocabulary.
trans = Transformer(vocab_size=16000, d_model=128, d_k=16, d_v=16, n_heads=8, d_ff=512, n_layers=4)

# Toy batch: four token-id sequences of length 8 with masks and labels.
# The attention mask is built but not passed to the transformer in this cell.
batch = [
    {'input_ids': torch.tensor([1, 15, 1075, 126, 194, 430, 105, 873]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 1, 1]), 'label': torch.tensor(1)},
    {'input_ids': torch.tensor([1, 34, 105, 65, 14, 43, 5, 3]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 0, 0]), 'label': torch.tensor(0)},
    {'input_ids': torch.tensor([1, 1500, 1075, 126, 194, 430, 15, 3]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 0, 1, 1]), 'label': torch.tensor(1)},
    {'input_ids': torch.tensor([1, 1050, 755, 16, 20, 4, 105, 83]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 1, 0]), 'label': torch.tensor(2)}
]

# Stack tensors into batch form
input_ids = torch.stack([b['input_ids'] for b in batch])            # [4, 8]
attention_mask = torch.stack([b['attention_mask'] for b in batch])  # [4, 8]
labels = torch.stack([b['label'] for b in batch])                   # [4]

# Each batch goes through all the layers of the transformer architecture.
# At each encoder layer, x goes through MHSA -> then residual + layer norm
# -> then FFN -> then again residual + layer norm.
# (Author's description; NOTE(review): confirm against encoder_model.py.)

# One forward pass, reused for both the printed values and the shape.
trans_out = trans(input_ids)  # [4, 8, 128]
print(f"Output for the first example in the batch after n transformer layers:\n{trans_out[0,:,:]}")
print(trans_out.size())


Output for the first example in the batch after n transformer layers:
tensor([[ 0.0622, -0.1400, -1.0187,  ..., -0.3090,  1.7392,  0.8000],
        [ 0.0126, -1.6979, -1.4032,  ...,  0.6270,  0.5518,  2.2303],
        [-0.7923,  0.8259,  0.2630,  ..., -0.1014, -0.5340, -0.8316],
        ...,
        [-0.1460, -1.4061,  0.4073,  ...,  0.1582, -0.5481, -1.2606],
        [-0.9957,  1.7721,  0.4970,  ..., -0.1635,  0.6927,  1.1278],
        [-1.2004,  0.4545, -2.3351,  ...,  0.0312,  0.1477,  1.4403]],
       grad_fn=<SliceBackward0>)
torch.Size([4, 8, 128])


In [49]:
# Full classifier: five encoder layers feeding a 3-way classification head.
clsfr = Classifier(vocab_size=16000, max_len=1024, d_model=128, d_k=16, d_v=16, n_heads=8, d_ff=512, n_layers=5, n_classes=3)

# Toy batch: four token-id sequences of length 8 with masks and labels.
batch = [
    {'input_ids': torch.tensor([1, 15, 1075, 126, 194, 430, 105, 873]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 1, 1]), 'label': torch.tensor(1)},
    {'input_ids': torch.tensor([1, 34, 105, 65, 14, 43, 5, 3]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 0, 0]), 'label': torch.tensor(0)},
    {'input_ids': torch.tensor([1, 1500, 1075, 126, 194, 430, 15, 3]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 0, 1, 1]), 'label': torch.tensor(1)},
    {'input_ids': torch.tensor([1, 1050, 755, 16, 20, 4, 105, 83]),
     'attention_mask': torch.tensor([1, 1, 1, 1, 1, 1, 1, 0]), 'label': torch.tensor(2)}
]

# Stack tensors into batch form
input_ids = torch.stack([b['input_ids'] for b in batch])            # [4, 8]
attention_mask = torch.stack([b['attention_mask'] for b in batch])  # [4, 8]
labels = torch.stack([b['label'] for b in batch])                   # [4]

print("This is the output after the classification head")
# Author's description: the classifier applies positional encoding, runs the
# transformer, and mean-pools before the classification head.
# NOTE(review): confirm the exact order and pooling axis in encoder_model.py.

# Compute the logits once, WITH the attention mask, and keep that tensor in
# `out`. Previously `out` was computed without the mask while the printed
# logits used it, so the loss and predictions downstream were based on
# different logits from the ones shown here.
out = clsfr(input_ids, attention_mask)  # [4, 3] logits
print(out)

This is the output after the classification head
tensor([[-0.1642,  0.0261, -0.3276],
        [-0.1357, -0.2390, -0.4923],
        [-0.3571,  0.1022, -0.3079],
        [-0.0125, -0.1384, -0.3777]], grad_fn=<AddmmBackward0>)


In [52]:
# Criterion applied to the classifier logits in `out` and the integer class
# targets in `labels`.
loss_fn = nn.CrossEntropyLoss()

# Adam over every classifier parameter; it is only constructed in this cell,
# no backward pass or optimisation step is taken here.
optimizer = optim.Adam(clsfr.parameters(), lr=1e-4)

# Loss of the toy batch.
loss = loss_fn(out, labels)

# Predicted class = index of the largest logit in each row.
pred = torch.argmax(out, dim=1)

In [54]:
# Recompute the loss on the stored logits; as the last expression of the cell,
# its value is displayed.
loss_fn(out, labels)

tensor(1.0062, grad_fn=<NllLossBackward0>)

In [53]:
# Display the predicted class index for each example (row-wise argmax of `out`).
pred

tensor([1, 0, 1, 0])

In [47]:
# Display the ground-truth labels of the toy batch for comparison with `pred`.
labels

tensor([1, 0, 1, 2])

tensor(1.0062, grad_fn=<NllLossBackward0>)