In [None]:
try:
  import google.colab
  IN_COLAB = True
  print("Running as a Colab notebook")
  %pip install git+https://github.com/neelnanda-io/Easy-Transformer.git@clean-transformer-demo
  # Install another version of node that makes PySvelte work way faster
  !curl -fsSL https://deb.nodesource.com/setup_16.x | sudo -E bash -; sudo apt-get install -y nodejs
  %pip install git+https://github.com/neelnanda-io/PySvelte.git
  %pip install fancy_einsum
  %pip install einops
except:
  IN_COLAB = False
  print("Running as a Jupyter notebook - intended for development only!")

Running as a Colab notebook
Collecting git+https://github.com/neelnanda-io/Easy-Transformer.git@clean-transformer-demo
  Cloning https://github.com/neelnanda-io/Easy-Transformer.git (to revision clean-transformer-demo) to /tmp/pip-req-build-8l1fz2lh
  Running command git clone --filter=blob:none --quiet https://github.com/neelnanda-io/Easy-Transformer.git /tmp/pip-req-build-8l1fz2lh
  Running command git checkout -b clean-transformer-demo --track origin/clean-transformer-demo
  Switched to a new branch 'clean-transformer-demo'
  Branch 'clean-transformer-demo' set up to track remote branch 'clean-transformer-demo' from 'origin'.
  Resolved https://github.com/neelnanda-io/Easy-Transformer.git to commit 1f25219e631aeb478d17075d47274db32c874e88
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from easy-transformer==0.1.0)
  Downloading einops-0.6.1-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.1 MB/s

In [None]:
import einops
import numpy as np
import torch
import torch.nn as nn
from einops import parse_shape, rearrange, reduce
from fancy_einsum import einsum

In [None]:
# Build a 2x2 integer matrix and pull out its first row with basic slicing.
a = np.array([[1, 2], [3, 4]])
b = a[0, :]  # row 0, every column -> 1-D array of shape (2,)
# Show the matrix and then the row, one per line.
print(a, b, sep="\n")

[[1 2]
 [3 4]]
[1 2]


# High-level questions
1. `einops` 和 `einsum` 分别做什么？二者有什么不同？

`einsum` and `einops` are both tools used for expressing complex array manipulations, but they serve different purposes and have some key differences.

### einsum

`einsum` stands for "Einstein summation," and it's a function that provides a way to perform operations on multi-dimensional arrays (like matrices or tensors). The name comes from the Einstein summation convention, often used in physics.

Here's how it works:

- It takes a specified string as its input, detailing the subscript labels for the summation.
- It then performs the corresponding element-wise multiplication and summation according to these labels.

This function is a part of libraries like NumPy, TensorFlow, and PyTorch, and it allows for a very concise specification of various array operations. For example, matrix multiplication, tensor contraction, and more.

### einops

> 这里看得出来还是有挺本质的差别的：`einsum` is a single function, while `einops` is a library that provides several functions (`rearrange`, `reduce`, `repeat`, ...).

`einops` is a library that provides a more human-readable way to work with tensor operations. While `einsum` can be powerful, it often ends up being a cryptic and hard-to-read way to specify tensor operations, especially for more complex tasks. `einops` aims to solve this by providing a more intuitive and flexible interface.

Some of the key differences between `einops` and `einsum` include:

1. **Syntax**: `einops` has a different syntax that is meant to be more readable and consistent. It provides operations like `rearrange`, `reduce`, and `repeat` that make it more explicit what the operation is doing.

- Good. I like them.

2. **Functionality**: While `einsum` is primarily focused on summation operations involving tensors, `einops` provides a broader set of functionalities for manipulating tensors, including rearranging dimensions, reducing dimensions with various aggregations, and repeating dimensions.

- Interesting. 可能最重要的，从功能上区分开二者。einsum主要做summation相关的；einops有rearrange, reduce, and repeat.

3. **Compatibility**: `einops` is designed to be compatible with a wide range of deep learning frameworks, such as TensorFlow, PyTorch, and more, aiming to provide a consistent interface across them. - ok

4. **Debugging**: `einops` often provides more informative error messages and allows for better debugging experiences.

5. **Performance**: `einops` might also offer optimized performance for certain operations, although this can be dependent on the specific task and the underlying framework.

### Conclusion

In summary, `einsum` is a specific function that provides a compact way to perform tensor contractions and other summation operations, following the Einstein summation convention. `einops` is a library that offers a more comprehensive, flexible, and human-readable way to perform tensor operations. It provides a different interface that prioritizes readability and ease of use, while still being powerful and efficient. If you're often working with complex tensor operations, `einops` might be a more user-friendly choice, while `einsum` provides a compact way to express these operations if you're comfortable with its syntax.

In [None]:
# einops `reduce` / `rearrange` practice, adapted from https://einops.rocks/api/reduce/
# Relies on `reduce`, `rearrange` and `parse_shape` being imported from einops
# in the notebook's import cell.

x = np.random.randn(100, 32, 64)

# Max-reduction over the first axis. Any axis that is named on the left of
# "->" but omitted on the right is the axis the reduction is applied to.
y = reduce(x, 't b c -> b c', 'max')

# Same reduction with self-describing axis names (more readable).
y = reduce(x, 'time batch channel -> batch channel', 'max')
# The reduced axis is dropped entirely: the result is (batch, channel),
# not (1, batch, channel).
assert y.shape == (32, 64)

x = np.random.randn(10, 20, 30, 40)

# 2D max-pooling with a 2x2 kernel. h1/w1 are the output height/width,
# h2/w2 are the pooling-window dimensions that get reduced away.
y1 = reduce(x, 'b c (h1 h2) (w1 w2) -> b c h1 w1', 'max', h2=2, w2=2)
assert y1.shape == (10, 20, 15, 20)

# Depth-to-space trick: trade channels for resolution to get back to the
# original height and width (the channel count shrinks by h2 * w2).
y2 = rearrange(y1, 'b (c h2 w2) h1 w1 -> b c (h1 h2) (w1 w2)', h2=2, w2=2)
assert parse_shape(x, 'b _ h w') == parse_shape(y2, 'b _ h w')

# Adaptive 2D max-pooling to a 3x4 grid. This is the einops way of working:
# state the current axes, the desired axes and the reduction, and let einops
# infer the remaining sizes (here h2 and w2).
assert reduce(x, 'b c (h1 h2) (w1 w2) -> b c h1 w1', 'max', h1=3, w1=4).shape == (10, 20, 3, 4)

# Global average pooling: the omitted axes (h, w) are the ones averaged over.
assert reduce(x, 'b c h w -> b c', 'mean').shape == (10, 20)

# Subtract the per-channel mean taken over batch, height and width.
# `()` keeps a length-1 axis in that position so the result broadcasts against x.
y = x - reduce(x, 'b c h w -> () c () ()', 'mean')

# Subtract the per-image, per-channel mean (each (b, c) pair gets its own mean
# over h and w), as opposed to the line above, which pools across the batch too.
y = x - reduce(x, 'b c h w -> b c () ()', 'mean')

# The `()` placeholders are not optional here: without them the mean has shape
# (10, 20), which cannot be broadcast against x of shape (10, 20, 30, 40).
per_image_mean = reduce(x, 'b c h w -> b c', 'mean')
assert per_image_mean.shape == (10, 20)
try:
    x - per_image_mean
except ValueError as err:
    print('Dropping the () placeholders breaks broadcasting:', err)



In [None]:
# Einsum practice (using fancy_einsum's space-separated axis names)
# https://rockt.github.io/2018/04/30/einsum


A = np.random.randn(100,32,24)
B = np.random.randn(24,96,100)
D = np.random.randn(100,32,24)
F = np.random.randn(100,32,1)
O = np.random.randn(100,24,64)
I = np.ones((2,2,2))

print('A:',A.shape)
print('B:',B.shape)


# Generalised matmul: contract the shared axis c, keep every other axis.
C = einsum('a b c, c d e -> a b d e',A,B) # only letters allowed to represent dims (not numbers!)
print('C:', C.shape)

# Element-wise product: no axis is dropped, so nothing is summed.
E = einsum('a b c, a b c -> a b c', A,D)
print('E:',E.shape)

# Element-wise product with broadcasting: d has length 1 in F, so summing it
# out leaves A scaled by F along the last axis.
G = einsum('a b c, a b d -> a b c', A,F)
print('G:',G.shape)


# Keeping d instead of c is valid too, but it is a contraction rather than a
# broadcast: c is summed out and the result has shape (100, 32, 1).
H = einsum('a b c, a b d -> a b d', A,F)
print('H:',H.shape)

# sum
# A_sum = einsum('a b c -> ',A)
# print('A:',A,'A_sum:',A_sum,'shape of A_sum:',A_sum.shape)
# I_sum = einsum('a b c->', I)
# print('I_sum:',I_sum,'I_sum shape',I_sum.shape)

# B_sum = einsum('a b c->', B)
# print('B_sum:',B_sum,'B_sum shape',B_sum.shape)

# row sum: sum of every row
#A_row_sum = einsum('a b c -> b c',A)
# print('A_row_sum shape:', A_row_sum.shape)

# Column sum: sum of every column (omit)

# Transpose / axis permutation (not a reshape): the output pattern fixes the
# new axis order explicitly.
B_trans = einsum('a b c-> c b a',B)
print('B_trans:',B_trans.shape)

J = np.arange(6).reshape(2,3)
print('J:',J)

# Dot product between two vectors
M = N = np.arange(5)
MN = einsum('a, a ->',M,N)
print('MN & shape',MN,MN.shape)

# "Dot product" between two matrices: sum of all element-wise products.
K = L = np.arange(6).reshape(2,3)
KL = einsum('a b, a b ->',K,L)

print('KL & shape',KL,KL.shape) # a scalar result has the empty shape ()


# Outer product  --> omit, rarely used.

# BATCH MATRIX MULTIPLICATION --> Looks very useful in practice. You keep the batch dim but do matmul for other dims

AO = einsum('a b c, a c d -> a b d', A,O)
print("AO:",AO.shape)




A: (100, 32, 24)
B: (24, 96, 100)
C: (100, 32, 96, 100)
E: (100, 32, 24)
G: (100, 32, 24)
H: (100, 32, 24)
B_trans: (100, 96, 24)
J: [[0 1 2]
 [3 4 5]]
MN & shape 30 ()
KL & shape 55 ()
AO: (100, 32, 64)


In [None]:
# Attention - Aug 3rd. I didn't implement this myself. A bit complicated. Will do so once needed.
# The cell is self-contained: it uses torch.tanh / torch.softmax directly rather
# than an `F` alias, because `F` is bound to a numpy array by the einsum
# practice cell and torch.nn.functional is never imported under that name.

def random_tensors(shape, num=1, requires_grad=False):
  """Create `num` tensors of the given shape filled with standard-normal samples.

  Args:
    shape: list/tuple of ints giving the size of each tensor.
    num: how many tensors to create.
    requires_grad: whether the tensors should track gradients.

  Returns:
    A single tensor when `num == 1`, otherwise a list of `num` tensors.
  """
  tensors = [torch.randn(shape, requires_grad=requires_grad) for _ in range(num)]
  return tensors[0] if num == 1 else tensors

# Parameters
# -- [hidden_dimension]
bM, br, w = random_tensors([7], num=3, requires_grad=True)
# -- [hidden_dimension x hidden_dimension]
WY, Wh, Wr, Wt = random_tensors([7, 7], num=4, requires_grad=True)

# Single application of attention mechanism
def attention(Y, ht, rt1):
  """Apply one attention step over the sequence `Y`.

  Uses the module-level parameters WY, Wh, Wr, Wt, bM, br and w.

  Args:
    Y: tensor of shape [batch_size x sequence_length x hidden_dimension].
    ht: tensor of shape [batch_size x hidden_dimension].
    rt1: tensor of shape [batch_size x hidden_dimension].

  Returns:
    (rt, at): rt has shape [batch_size x hidden_dimension]; at holds the
    attention weights with shape [batch_size x sequence_length], normalised
    along the sequence axis.
  """
  # -- [batch_size x hidden_dimension]
  tmp = torch.einsum("ik,kl->il", ht, Wh) + torch.einsum("ik,kl->il", rt1, Wr)
  Mt = torch.tanh(torch.einsum("ijk,kl->ijl", Y, WY) + tmp.unsqueeze(1).expand_as(Y) + bM)
  # -- [batch_size x sequence_length]
  # dim is given explicitly: softmax runs over the sequence axis (the last one).
  at = torch.softmax(torch.einsum("ijk,k->ij", Mt, w), dim=-1)
  # -- [batch_size x hidden_dimension]
  rt = torch.einsum("ijk,ij->ik", Y, at) + torch.tanh(torch.einsum("ij,jk->ik", rt1, Wt) + br)
  # -- [batch_size x hidden_dimension], [batch_size x sequence_dimension]
  return rt, at

# Sampled dummy inputs
# -- [batch_size x sequence_length x hidden_dimension]
Y = random_tensors([3, 5, 7])
# -- [batch_size x hidden_dimension]
ht, rt1 = random_tensors([3, 7], num=2)

rt, at = attention(Y, ht, rt1)
at  # -- print attention weights