In [1]:
import numpy as np
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Define the number of patches and the patch size
num_patches = 196  # 14 x 14 patches for a 224 x 224 image
patch_size = 16    # 16 x 16 pixels for each patch
num_channels = 3   # 3 channels for RGB images

# Define the train and val transforms
train_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.CenterCrop((224, 224)),
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x.view(-1, patch_size * patch_size * num_channels)),
])
val_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.CenterCrop((224, 224)),
    transforms.ToTensor(),
    transforms.Lambda(lambda x: x.view(-1, patch_size * patch_size * num_channels)),
])

# Load the Tiny ImageNet dataset
train_dataset = ImageFolder('timn/train', transform=train_transforms)
val_dataset = ImageFolder('timn/val', transform=val_transforms)


In [16]:
from torch.utils.data import DataLoader

# Create data loaders for training and validation data
# Define the train and val loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=4)
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
# for i,j in train_loader:
#     break

In [17]:
class ClassToken():
  def __init__(self,batch_size,dim):
    self.token = np.random.normal(scale=0.1, size=(batch_size,1, dim))
  
  def prepend(self,words):
    print("X shape is : ",words.shape)
    print("Token shape is : ",self.token.shape)
    return np.concatenate((self.token,words),axis=1) # [batch_size,196,768]+[batch_size,1,768]



In [18]:
def softmax(X,axis):
  exp_X = np.exp(X)
  return exp_X/np.sum(exp_X,axis=axis,keepdims=True)


In [30]:
class ScaledDotProductAttention():
  def __init__(self):
    pass
  
  def attention(self,q,k,v):
    dk = len(q[0])
    attn= np.matmul(softmax(np.matmul(q,np.swapaxes(k, -1, -2))/np.sqrt(dk),axis=-1),v) # (1,dk) x (dk,1) x (1,dv) = (1,1) x (1,dv) = (1,dv)
    print("Attention shape : ",attn.shape)
    return attn

In [43]:
class MultiHeadAttention():
  def __init__(self,dmodel,h):
    self.h=h
    self.dk = dmodel//h 
    self.dv = dmodel//h
    self.WQ = []
    self.WK = []
    self.WV = []
    self.WO = np.random.normal(scale=0.1, size=(h*self.dv, dmodel))
    self.sdpa = ScaledDotProductAttention()
    self.WQ = np.random.normal(scale=0.1, size=(h,dmodel, self.dk))
    self.WK = np.random.normal(scale=0.1, size=(h,dmodel, self.dk))
    self.WV = np.random.normal(scale=0.1, size=(h,dmodel, self.dv))
    self.WQ = np.array(self.WQ)
    self.WK = np.array(self.WK)
    self.WV = np.array(self.WV)
    print("WQ shape : ",self.WQ.shape)
    pass

  def MultiHead(self,Q,K,V):
    head = []
    q = np.matmul(Q,self.WQ) # (1,197,768) * (8,768,768/8) -> (8,197,768/8)
    print("Query shape : ",q.shape)
    k = np.matmul(K,self.WK)
    v = np.matmul(V,self.WV)
    head= self.sdpa.attention(q,k,v)
    head = np.concatenate(head,axis=1)
    print("Head shape : ",head.shape)
    multihead = np.dot(head,self.WO) # (dmodel,dv*h)
    print("Multi-Head Shape : ",multihead.shape)
    return multihead


In [33]:
class AddAndNorm():
  def __init__(self,feat_size):
    self.epsilon = 1e-8
    self.gamma = np.ones(feat_size)
    self.beta = np.zeros(feat_size)
  def add(self,x,y):
    return x+y
  def norm(self,x):
    #print("Norming")
    return np.multiply(self.gamma,(x-np.mean(x))) / np.sqrt(np.var(x)+self.epsilon)+self.beta

class Norm():
  def __init__(self,feat_size):
    self.epsilon = 1e-8
    self.gamma = np.ones(feat_size)
    self.beta = np.zeros(feat_size)
  def norm(self,x):
    #print("Norming")
    return np.multiply(self.gamma,(x-np.mean(x))) / np.sqrt(np.var(x)+self.epsilon)+self.beta

In [34]:

import math
class FeedForward():
  def __init__(self,in_dim,out_dim):
    self.out_dim = out_dim
    self.W = np.random.normal(scale=np.sqrt(2.0 / in_dim), size=(in_dim, out_dim))
    self.b = np.zeros((out_dim,)) 

  def forward(self,X):
    return np.dot(X,self.W)+self.b
  
class MLP():
  def __init__(self,d_model,d_ff,activation):
    self.d_ff=d_ff
    self.ff1 = FeedForward(d_model,d_ff)
    self.ff2 = FeedForward(d_ff,d_model)
    if activation == "GELU":
      self.activation = self.GELU
    elif activation == "RELU":
      self.activation = self.RELU

  def RELU(self,x):
    return np.maximum(x,0)
  
  def GELU(self,x):
    # return 0.5 * x * (1+self.erf(x/np.sqrt(2)))
    return 0.5 * x * (1+np.vectorize(math.erf)(x/np.sqrt(2)))
  
  def forward(self,x):
    return self.GELU(self.ff2.forward(self.GELU(self.ff1.forward(x))))
  

In [89]:
def CrossEntropyLoss(y_pred,y_true):
    n = len(y_pred[-1])
    return -np.mean(y_true*np.log(y_pred) + (1-y_true)*np.log(1-y_pred))
    # return (-1/n) * (np.dot(y_true,np.log(y_pred).T) + np.dot(1-y_true,np.log(1-y_pred).T))

In [90]:
class Encoder():
  def __init__(self,batch_size=1,vocab_size=196,dmodel=768,h=12,d_ff=3072,num_classes = 200):
    # self.ie = InputEmbedding(vocab_size,dmodel)
    self.mha = MultiHeadAttention(dmodel,h)
    self.ffn = MLP(d_ff=d_ff,d_model=dmodel,activation='GELU')
    self.an = AddAndNorm((1,dmodel))
    self.cls = ClassToken(batch_size=batch_size,dim=dmodel)
    self.num_classes = num_classes
    self.batch_size = batch_size
  def encode(self,X):
    X = self.cls.prepend(X)
    print("Prepended Class Token, Shape: ",X.shape)
    X = self.an.norm(X)
    multi_head_op = self.mha.MultiHead(X,X,X)
    print("Performed Multi-Headed Attention")
    print("Multi Head OP shape : ",multi_head_op.shape),
    mhan=self.an.norm(self.an.add(X,multi_head_op))
    ffn_op = self.ffn.forward(mhan)
    print("Passed through Feed-Forward Layer")
    op =self.an.add(mhan,ffn_op)
    return op 
  
  def loss(self,y_true):
    one_hot_encoded = np.zeros((self.batch_size,self.num_classes))
    one_hot_encoded[np.arange(self.batch_size),y_true]=1 #set the value at index [i, y_true[i]]
    return CrossEntropyLoss(self.class_prob,one_hot_encoded)


  def pred(self,input):
    encoded = self.encode(input)
    X = encoded[:,0,:]
    classification_head = FeedForward(in_dim = X.shape[-1],out_dim =self.num_classes ) # 200 output classes
    self.logits = classification_head.forward(X)
    self.class_prob = softmax(self.logits, axis=-1)
    # print("Logits Shape : ",self.logits.shape) #Just because dimension of op matches doesn't mean it's right, axis = 0 would also give op (1,200) but it's not right, correct is axis = 1 i.e., last axis=-1.
    # print("Len Logits.shape - 1 is ",len(logits.shape)-1) #Not len(logits)-1
    # print("Class Prob Shape : ",class_prob.shape)
    y = np.argmax(self.class_prob,axis=-1)
    return y

vocab_size = 224*224 // (16*16)
print(vocab_size)
encoder = Encoder(vocab_size=vocab_size)



196
WQ shape :  (12, 768, 64)


In [25]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)


# Get the first image from the train loader
images, labels = next(iter(train_loader))

# Print the shape of the image tensor
print(images.shape)
input = images[0]

torch.Size([1, 196, 768])


In [26]:
print(images.numpy().shape)

(1, 196, 768)


In [50]:
print(labels.numpy())

[32]


In [None]:
# encoder_op = encoder.encode(images.numpy())
# print(encoder_op)

In [None]:
# print(encoder_op.shape)

(1, 197, 768)


In [91]:
pred = encoder.pred(images.numpy())

X shape is :  (1, 196, 768)
Token shape is :  (1, 1, 768)
Prepended Class Token, Shape:  (1, 197, 768)
Query shape :  (12, 197, 64)
Attention shape :  (12, 197, 64)
Head shape :  (197, 768)
Multi-Head Shape :  (197, 768)
Performed Multi-Headed Attention
Multi Head OP shape :  (197, 768)
Passed through Feed-Forward Layer


In [59]:
print(pred.shape)

(1,)


In [72]:
print(pred)

[112]


In [75]:
print(labels.numpy())

[32]


In [78]:
temp = np.zeros(200)
temp[10] = 1

In [79]:
print(temp)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.]


In [88]:
print(encoder.class_prob[0].shape)
print(temp.shape)

(200,)
(200,)


In [102]:
loss = encoder.loss(labels.numpy())
print(loss)

[[0.05019346]]


In [76]:
print(encoder.class_prob)

[[1.83659681e-04 1.42803809e-02 1.49912716e-02 2.40114072e-05
  9.37851953e-04 1.98097820e-02 1.14443860e-04 4.11329342e-03
  1.32011269e-03 5.78378344e-04 1.78620641e-05 2.04686582e-04
  2.06816249e-03 2.69366454e-04 1.40383945e-04 2.20324647e-03
  1.60257968e-04 2.27771578e-04 2.11758221e-05 4.18921676e-05
  6.06369287e-04 4.88965022e-05 1.58719998e-05 6.86537658e-05
  2.71628345e-03 5.82982884e-04 1.98867334e-03 5.63010053e-03
  1.47091511e-03 1.78469167e-02 2.40469537e-04 2.63055260e-04
  1.03512784e-03 1.05656528e-05 5.61429776e-05 1.52178847e-04
  1.30988687e-04 6.72176106e-05 1.78186406e-04 4.01635236e-04
  2.42524117e-02 4.40830901e-05 1.56386539e-02 8.37649332e-02
  8.89839113e-07 7.66777298e-04 9.37002406e-05 4.15835598e-03
  5.35968675e-02 1.83398810e-03 6.83115516e-04 9.26473484e-05
  1.25650070e-03 8.81574167e-03 8.41732025e-04 3.47895563e-05
  4.91263630e-04 5.06177216e-05 3.70362476e-04 4.77722889e-04
  8.60407460e-04 8.41741575e-04 2.71406553e-02 1.86821159e-04
  1.4599

In [None]:
# def normalizer(dataset):
#     mean = np.zeros(3)
#     std = np.zeros(3)
#     for images,_ in dataset:
#         print(np.mean(images.numpy(),axis=(0,1))
#         mean+= np.mean(images.numpy(),axis=(0,1))
#         std += np.std(images.numpy(),axis=(0,1))
#     mean /= len(dataset)
#     std /= len(dataset)

#     normalized_dataset = transforms.Normalize(mean=mean,std=std)(dataset)
#     return normalized_dataset

In [None]:
# train_dataset = normalizer(train_dataset)
# val_dataset = normalizer(val_dataset)

In [None]:
# from scipy.integrate import quadrature
# def f(self,t):
  #   return np.exp(-t**2)

  # def MonteCarlo(self,x,a,b):
  #   N=10000
  #   sample = np.random.uniform(a,b,N)
  #   y = self.f(sample)
  #   I = (b-a)*np.mean(y)
  #   return I

  # def gaussian_quadrature(self,x):
  #   return quadrature(self.f,0,x)
  
  # def erf(self,x):
  #   I,_ = np.vectorize(self.gaussian_quadrature)(x)
  #   # I = self.MonteCarlo(0,x)
  #   return (2*np.pi)* (I)