<a href="https://colab.research.google.com/github/kmalik22/colabs/blob/main/transformer_numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
import numpy as np
import torch
from typing import List

In [21]:
# Toy hyperparameters, kept tiny so tensors can be inspected by eye.
D_MODEL = 3          # width of the model / input feature dimension
D_FF = 2 * D_MODEL   # feed-forward width: twice the model width
BSZ = 2              # batch size used by the checks below

In [30]:
# Sanity check of torch's layout: nn.Linear stores its weight as
# (out_features, in_features), i.e. transposed relative to x @ W.
torchlin = torch.nn.Linear(in_features=2, out_features=4, bias=True)
torchlin.weight.shape

torch.Size([4, 2])

In [42]:
#class torch.nn.Linear(in_features, out_features, bias=True, device=None, dtype=None)
class MyLinear:
  """NumPy implementation of a fully connected layer: ``y = x @ W + b``.

  The weight is stored as ``(in_features, out_features)`` -- the transpose of
  the ``(out_features, in_features)`` layout used by ``torch.nn.Linear`` -- so
  the forward pass is a plain ``activations @ weight``.

  Every call to ``forward`` appends its input to ``stored_activations``;
  ``backward`` consumes all of them at once, and ``clear_grad_state`` resets
  the buffer and the gradients.

  Attributes:
    weight: ``(in_features, out_features)`` array drawn from N(0, 1).
    bias: ``(out_features,)`` array of zeros. It always exists, but is only
      added to the output when ``has_bias`` is true.
    weight_grad: gradient w.r.t. ``weight`` set by ``backward`` (else None).
    bias_grad: gradient w.r.t. ``bias`` set by ``backward`` when ``has_bias``
      is true (else None).
  """

  def __init__(self, in_features, out_features, bias=True, debug=True):
    """Create the layer.

    Args:
      in_features: size of the last axis of the inputs.
      out_features: size of the last axis of the outputs.
      bias: whether the layer adds a bias term.
      debug: when true, ``forward``/``backward`` print a short trace line.
    """
    self.in_features = in_features
    self.out_features = out_features
    self.has_bias = bias
    self.weight = np.random.normal(0, 1, size=(in_features, out_features))
    self.bias = np.zeros(shape=(out_features))
    # Inputs seen by forward(), kept so backward() can form the weight grad.
    self.stored_activations: List[np.ndarray] = []
    self.weight_grad = None
    self.bias_grad = None
    self.debug = debug

  def forward(self, activations: np.ndarray) -> np.ndarray:
    """Apply the layer and remember the input for the backward pass.

    Args:
      activations: array of shape ``(..., in_features)`` with rank >= 2.

    Returns:
      Array of shape ``(..., out_features)``.
    """
    assert len(activations.shape) >= 2
    assert activations.shape[-1] == self.in_features
    self.stored_activations.append(activations)
    if self.debug:
       print(f"MyLinear.forward(), batch={len(self.stored_activations)}")
    out = activations @ self.weight
    if self.has_bias:
      out = out + self.bias
    return out

  def backward(self, output_act_grad: np.ndarray) -> np.ndarray:
    """Compute parameter gradients and return the input-activation gradient.

    All activations stored since the last ``clear_grad_state`` are flattened
    to ``(rows, in_features)`` and concatenated in call order, so
    ``output_act_grad`` must supply one gradient row per stored input row, in
    the same order.

    Args:
      output_act_grad: gradient w.r.t. the layer output, of shape
        ``(..., out_features)``.

    Returns:
      Gradient w.r.t. the layer input, with the leading axes of
      ``output_act_grad`` and ``in_features`` as the last axis.

    Side effects:
      Sets ``weight_grad`` (``(in_features, out_features)``) and, when the
      layer has a bias, ``bias_grad`` (``(out_features,)``).
    """
    assert len(self.stored_activations) > 0, "backward() called before forward()"
    # Collapse leading axes so inputs of any rank >= 2 reduce to a 2-D matmul.
    input_acts = np.concatenate(
        [acts.reshape(-1, self.in_features) for acts in self.stored_activations]
    )
    if self.debug:
      print(f"MyLinear.bwd(), batches:{len(self.stored_activations)}, act.shape:{input_acts.shape}")
    # input_acts      = (bsz, in_features)
    # output_act_grad = (bsz, out_features)
    # weight / wgrad  = (in_features, out_features)
    bsz = input_acts.shape[0]
    assert output_act_grad.shape[-1] == self.out_features, (
        f"{output_act_grad.shape=} {self.out_features=}"
    )
    grad_2d = output_act_grad.reshape(-1, self.out_features)
    assert grad_2d.shape[0] == bsz, f"{output_act_grad.shape=} {bsz}"

    # dL/dW = X^T @ dL/dY ; dL/db = sum over rows of dL/dY ; dL/dX = dL/dY @ W^T
    self.weight_grad = input_acts.transpose() @ grad_2d
    self.bias_grad = grad_2d.sum(axis=0) if self.has_bias else None
    input_act_grad = grad_2d @ self.weight.transpose()
    return input_act_grad.reshape(output_act_grad.shape[:-1] + (self.in_features,))

  def clear_grad_state(self):
    """Drop stored activations and computed gradients."""
    self.stored_activations = []
    self.weight_grad = None
    self.bias_grad = None


def make_similar_linear(my_lin: "MyLinear") -> torch.nn.Linear:
  """Build a ``torch.nn.Linear`` with the same parameters as ``my_lin``.

  Args:
    my_lin: layer exposing ``in_features``, ``out_features``, ``has_bias``,
      ``weight`` of shape ``(in_features, out_features)`` and ``bias`` of
      shape ``(out_features,)``.

  Returns:
    A float32 ``torch.nn.Linear`` whose weight is the transpose of
    ``my_lin.weight`` and, when ``my_lin.has_bias`` is true, whose bias is a
    copy of ``my_lin.bias``.
  """
  rv = torch.nn.Linear(
      in_features=my_lin.in_features,
      out_features=my_lin.out_features,
      bias=my_lin.has_bias,
      dtype=torch.float32,
  )
  # torch stores weight as (out_features, in_features), so need to transpose
  state = {
      "weight": torch.tensor(my_lin.weight.transpose(), dtype=torch.float32),
  }
  # A bias-free torch layer has no "bias" entry; passing one would make the
  # strict load_state_dict fail on an unexpected key.
  if my_lin.has_bias:
    state["bias"] = torch.tensor(my_lin.bias, dtype=torch.float32)
  rv.load_state_dict(state)
  return rv


In [47]:
# Build the NumPy layer and a torch twin carrying identical parameters.
my_linear = MyLinear(in_features=D_MODEL, out_features=D_FF, bias=True)
torch_linear = make_similar_linear(my_lin=my_linear)




# Match Forward

In [12]:
# Feed one random float32 batch through both layers and require matching output.
random_act = np.random.normal(loc=0, scale=1, size=(BSZ, D_MODEL)).astype(np.float32)

my_act = my_linear.forward(random_act)
torch_act = torch_linear(torch.tensor(random_act, dtype=torch.float32))
assert np.allclose(my_act, torch_act.detach().numpy())