In [19]:
import numpy as np
from numpy.random import randn
from itertools import count
import sys

# Seed NumPy's legacy global generator so every randn/shuffle below is repeatable.
np.random.seed(1)

# Everything that should end up in the saved .npz file is collected here.
exported = {}

In [20]:
# Problem sizes
expert_num = 64    # how many experts there are
expert_dim = 512   # hidden width used inside each expert
emb_dim = 256      # embedding size of one token
token_num = 2048   # total number of tokens (e.g. all batches unrolled)

# Weight of the bypass path
b = 0.1

# Start the export table with the sizes. The size entries are left as plain
# Python ints; only the bypass multiplier is explicitly pinned to float32.
exported = {
    "expert_count": expert_num,
    "expert_size": expert_dim,
    "embedding_size": emb_dim,
    "token_count": token_num,
    "b": np.array([b], dtype=np.float32),
}

# Input activations, shape [token_num, emb_dim]
words = randn(token_num, emb_dim).astype(np.float32)
exported["src"] = words

In [21]:
# Round-trip check: write the current export table, read it back, and look at
# the dtype that a plain Python int entry ends up with in the file.
np.savez('test.npz', **exported)
# np.load on an .npz returns an archive object that keeps the file open;
# the context manager closes it as soon as the dtype has been read.
with np.load('test.npz') as out:
    expert_count_dtype = out['expert_count'].dtype
expert_count_dtype

dtype('int64')

In [22]:
# Per-expert feed-forward parameters; the leading axis always indexes the expert.
# First layer maps emb_dim -> expert_dim, second layer maps expert_dim -> emb_dim.
# The biases carry a singleton middle axis so they broadcast over the tokens.
experts_w1 = randn(expert_num, emb_dim, expert_dim).astype(np.float32)
experts_b1 = randn(expert_num, 1, expert_dim).astype(np.float32)
experts_w2 = randn(expert_num, expert_dim, emb_dim).astype(np.float32)
experts_b2 = randn(expert_num, 1, emb_dim).astype(np.float32)

# Register all four tensors for export in one go.
exported.update(
    experts_w1=experts_w1,
    experts_b1=experts_b1,
    experts_w2=experts_w2,
    experts_b2=experts_b2,
)

In [23]:
def scramble(a, axis=-1):
    """
    Independently permute every 1-D lane of `a` along `axis`, in place.

    The chosen axis is swapped into the last position, which yields a view
    of `a`. Each 1-D slice along that last axis is then passed to
    numpy.random.shuffle, so the caller's array is modified directly.
    Nothing is returned.
    """
    lanes_last = a.swapaxes(axis, -1)
    leading_shape = lanes_last.shape[:-1]
    for position in np.ndindex(leading_shape):
        # `lane` is a 1-D view into `a`; shuffling it rewrites `a` itself.
        lane = lanes_last[position]
        np.random.shuffle(lane)

In [33]:
# Generate a random binary router: every token is routed to exactly k experts.

# Select K for TOP-K
k = 4
assert 0 < k <= expert_num, "k must be between 1 and the number of experts"

# Layout while building is [token, expert].
router = np.zeros((token_num, expert_num), dtype=np.float32)
print(router.shape)

# Switch on the first k experts of every token ...
router[:, :k] = 1
# ... then permute each token's row independently, which spreads those k ones
# over random experts while keeping exactly k per token.
scramble(router, axis=-1)

# Sanity check: exactly k experts per token, and the load per expert
assert (router.sum(axis=1) == k).all(), "every token must be routed to exactly k experts"
print(f'Total is {np.sum(router)}, expected {token_num * k}')
print(f'Expect around {token_num * k / expert_num} per expert')
print(np.sum(router, axis=0))

# Order is [token,expert]. Flip it to [expert,token] so iterating over the
# router yields one token mask per expert.
router = np.transpose(router)
print(router.shape)

exported["router"] = router

(2048, 64)
Total is 8192.0, expected 8192
Expect around 128.0 per expert
[124. 134. 121. 128. 141. 127. 126. 132. 125. 136. 142. 121. 141. 143.
 111. 120. 122. 127. 112. 134. 135. 149. 129. 134. 136. 131. 139. 115.
 140. 115. 126. 115. 135. 130. 119. 125. 127. 118. 123. 115. 122. 135.
 143. 131. 115. 122. 129. 123. 131. 127. 121. 141. 112. 114. 138. 151.
 121. 118. 121. 114. 122. 148. 140. 130.]
(64, 2048)


In [18]:
# Bypass path, shape [token, emb_dim]: every token starts from b times its own
# embedding, so a token that goes through no expert still has some value.
total_output = b * words

# Visit the experts one by one; row n of `router` marks the tokens sent to expert n.
for n, (expert_w1, expert_b1, expert_w2, expert_b2, mask) in enumerate(
    zip(experts_w1, experts_b1, experts_w2, experts_b2, router)
):
    print(f"Expert {n}:")
    assert mask.shape[0] == words.shape[0]

    # Indices of the tokens whose mask entry for this expert is non-zero
    selected = mask.nonzero()
    expert_input = words[selected]

    # Two dense layers, each followed by a ReLU
    expert_output1 = np.maximum(0.0, expert_input @ expert_w1 + expert_b1)
    expert_output2 = np.maximum(0.0, expert_output1 @ expert_w2 + expert_b2)

    # Sum the contributions of all experts a token was routed to
    # (todo after loop: normalisation?)
    total_output[selected] += expert_output2

    print(f"  {expert_input.shape} * {expert_w1.shape} + {expert_b1.shape} = {expert_output1.shape}\n")
    print(f"  {expert_output1.shape} * {expert_w2.shape} + {expert_b2.shape} = {expert_output2.shape}\n")

    exported[f"expert_{n}_src"] = expert_input
    exported[f"expert_{n}_dst"] = expert_output2

print(f"Total output: {total_output.shape}")
exported["dst"] = total_output

Expert 0:
  (135, 256) * (256, 512) + (1, 512) = (135, 512)

  (135, 512) * (512, 256) + (1, 256) = (135, 256)

Expert 1:
  (141, 256) * (256, 512) + (1, 512) = (141, 512)

  (141, 512) * (512, 256) + (1, 256) = (141, 256)

Expert 2:
  (152, 256) * (256, 512) + (1, 512) = (152, 512)

  (152, 512) * (512, 256) + (1, 256) = (152, 256)

Expert 3:
  (120, 256) * (256, 512) + (1, 512) = (120, 512)

  (120, 512) * (512, 256) + (1, 256) = (120, 256)

Expert 4:
  (123, 256) * (256, 512) + (1, 512) = (123, 512)

  (123, 512) * (512, 256) + (1, 256) = (123, 256)

Expert 5:
  (115, 256) * (256, 512) + (1, 512) = (115, 512)

  (115, 512) * (512, 256) + (1, 256) = (115, 256)

Expert 6:
  (119, 256) * (256, 512) + (1, 512) = (119, 512)

  (119, 512) * (512, 256) + (1, 256) = (119, 256)

Expert 7:
  (128, 256) * (256, 512) + (1, 512) = (128, 512)

  (128, 512) * (512, 256) + (1, 256) = (128, 256)

Expert 8:
  (131, 256) * (256, 512) + (1, 512) = (131, 512)

  (131, 512) * (512, 256) + (1, 256) = (131

In [9]:
# Write every entry collected in `exported` to data.npz, one array per dict key.
np.savez("data.npz", **exported)

In [None]:
# Quick look at the shape of the first expert's second-layer weight matrix.
experts_w2[0].shape

In [None]:
# Do grid search generation for various variable values. Save them to separate files.
# Do not run automatically LOL
# (6 * 5 * 5 * 7 = 1050 configurations; the largest weight tensor alone has
#  256 * 4096 * 4096 float32 entries = 16 GiB.)

expert_num_list = [8, 16, 32, 64, 128, 256] # number of experts
expert_dim_list = [256, 512, 1024, 2048, 4096]
emb_dim_list = [256, 512, 1024, 2048, 4096] # embedding size
token_num_list = [128, 256, 512, 1024, 2048, 4096, 8192] # think token count, from unrolled batches possibly

# Bypass weight and number of experts selected per token (TOP-K)
b = 0.1
k = 4

for expert_num in expert_num_list:
    for expert_dim in expert_dim_list:
        for emb_dim in emb_dim_list:
            for token_num in token_num_list:
                assert 0 < k <= expert_num, "k must be between 1 and the number of experts"

                # Export sizes and the bypass weight, same keys as the single-run export
                exported = dict(
                    expert_count=expert_num,
                    expert_size=expert_dim,
                    embedding_size=emb_dim,
                    token_count=token_num,
                    b=np.array([b], dtype=np.float32),
                )

                # Generate input: [token_num, emb_dim]
                words = randn(token_num, emb_dim).astype(np.float32)
                exported["src"] = words

                # Generate parameters
                # [expert_num, emb_dim, expert_dim]
                experts_w1 = randn(expert_num, emb_dim, expert_dim).astype(np.float32)
                exported["experts_w1"] = experts_w1

                # [expert_num, 1, expert_dim]
                experts_b1 = randn(expert_num, 1, expert_dim).astype(np.float32)
                exported["experts_b1"] = experts_b1

                # [expert_num, expert_dim, emb_dim]
                experts_w2 = randn(expert_num, expert_dim, emb_dim).astype(np.float32)
                exported["experts_w2"] = experts_w2

                # [expert_num, 1, emb_dim]
                experts_b2 = randn(expert_num, 1, emb_dim).astype(np.float32)
                exported["experts_b2"] = experts_b2

                # Build a fresh router for THIS configuration. A router built for
                # other sizes has masks whose length disagrees with the number of
                # rows in `words`, and would also limit the number of experts visited.
                router = np.zeros((token_num, expert_num), dtype=np.float32)
                router[:, :k] = 1
                scramble(router, axis=-1)  # exactly k random experts per token
                router = np.transpose(router)  # [expert, token]: one mask per expert
                exported["router"] = router

                # Bypass path: [token_num, emb_dim]
                total_output = b * words # … so that if a word doesn't go through any expert it will still have some value

                # Repeat for every expert, looking at the router to determine which words go to said expert
                for n, expert_w1, expert_b1, expert_w2, expert_b2, mask in zip(count(), experts_w1, experts_b1, experts_w2, experts_b2, router):
                    #print(f"Expert {n}:")
                    assert mask.shape[0] == words.shape[0]

                    # select all words where the mask for this expert is > 0
                    selected = mask.nonzero()
                    expert_input = words[selected]

                    # two dense layers, each followed by a relu
                    expert_output1 = np.maximum(0.0, np.add(np.matmul(expert_input, expert_w1), expert_b1))
                    expert_output2 = np.maximum(0.0, np.add(np.matmul(expert_output1, expert_w2), expert_b2))

                    # Accumulate instead of assign: a token is routed to k experts, so
                    # assignment would keep only the last expert's output and drop the
                    # bypass term. This matches the single-run computation.
                    total_output[selected] += expert_output2

                    #print(f"  {expert_input.shape} * {expert_w1.shape} + {expert_b1.shape} = {expert_output1.shape}\n")
                    #print(f"  {expert_output1.shape} * {expert_w2.shape} + {expert_b2.shape} = {expert_output2.shape}\n")

                    exported[f"expert_{n}_src"] = expert_input
                    exported[f"expert_{n}_dst"] = expert_output2

                #print(f"Total output: {total_output.shape}")
                exported["dst"] = total_output

                file_name = f"exp-num_{expert_num}_exp-dim_{expert_dim}_emb-dim_{emb_dim}_token-num_{token_num}"
                np.savez_compressed(file_name + ".npz", **exported)
                print("Finished ", file_name)