# Generate Fake Tensor

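- `attn_time`: synthetic attention time table, indexed by (log2 TP degree, log2 CP degree, sequence-length index)
- `mlp_time`: synthetic MLP time table, indexed by (log2 TP degree, token-count index)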

In [27]:
import numpy as np

# Synthetic timing lookup tables. Every axis is addressed through a log2 value:
#   attn_time[tp_idx, cp_idx, seq_idx] -> TP / CP degree in {1, 2, 4, 8},
#                                         sequence length in {2^10, ..., 2^20}
#   mlp_time[tp_idx, seq_idx]          -> same TP axis, token count in {2^10, ..., 2^20}
tp_degrees = [0, 1, 2, 3]      # log2 of [1, 2, 4, 8]
cp_degrees = [0, 1, 2, 3]      # log2 of [1, 2, 4, 8]
seq_lens = np.arange(10, 21)   # log2 of the sequence lengths (11 values)

# Attention cost grows quadratically with sequence length; the "- 20" shifts the
# exponent so that the shortest sequence (2^10) has a base cost of exactly 1.
attn_base = 2 ** (2 * seq_lens - 20)

attn_time = np.zeros((len(tp_degrees), len(cp_degrees), len(seq_lens)))
for tp_idx, tp_log in enumerate(tp_degrees):
    for cp_idx, cp_log in enumerate(cp_degrees):
        # Higher TP/CP degrees shorten the run, with diminishing returns
        # (the combined degree is raised to 0.8 rather than 1.0).
        attn_speedup = 1.0 / ((2**tp_log + 2**cp_log) ** 0.8)
        attn_time[tp_idx, cp_idx, :] = attn_base * attn_speedup

# MLP cost grows linearly with the token count, normalized so 2^10 tokens cost 1.
mlp_base = 2 ** (seq_lens - 10)

mlp_time = np.zeros((len(tp_degrees), len(seq_lens)))
for tp_idx, tp_log in enumerate(tp_degrees):
    # Only the TP degree enters the MLP model, again with sub-linear benefit.
    mlp_speedup = 1.0 / (2**tp_log) ** 0.9
    mlp_time[tp_idx, :] = mlp_base * mlp_speedup


In [28]:
# Display the attention-time table as a visual sanity check.
attn_time

array([[[5.74349177e-01, 2.29739671e+00, 9.18958684e+00, 3.67583474e+01,
         1.47033389e+02, 5.88133558e+02, 2.35253423e+03, 9.41013692e+03,
         3.76405477e+04, 1.50562191e+05, 6.02248763e+05],
        [4.15243647e-01, 1.66097459e+00, 6.64389834e+00, 2.65755934e+01,
         1.06302374e+02, 4.25209494e+02, 1.70083798e+03, 6.80335190e+03,
         2.72134076e+04, 1.08853630e+05, 4.35414522e+05],
        [2.75945932e-01, 1.10378373e+00, 4.41513492e+00, 1.76605397e+01,
         7.06421587e+01, 2.82568635e+02, 1.13027454e+03, 4.52109815e+03,
         1.80843926e+04, 7.23375705e+04, 2.89350282e+05],
        [1.72427286e-01, 6.89709144e-01, 2.75883658e+00, 1.10353463e+01,
         4.41413852e+01, 1.76565541e+02, 7.06262163e+02, 2.82504865e+03,
         1.13001946e+04, 4.52007785e+04, 1.80803114e+05]],

       [[4.15243647e-01, 1.66097459e+00, 6.64389834e+00, 2.65755934e+01,
         1.06302374e+02, 4.25209494e+02, 1.70083798e+03, 6.80335190e+03,
         2.72134076e+04, 1.08853630e

In [29]:

# Write both lookup tables to .npy files in the working directory, then echo
# their shapes and the first few entries as a quick sanity check.
for npy_path, table in (('attn_time.npy', attn_time), ('mlp_time.npy', mlp_time)):
    np.save(npy_path, table)

print("Generated synthetic timing data:")
print("attn_time shape:", attn_time.shape)
print("mlp_time shape:", mlp_time.shape)
print("\nSample attn_time values:\n", attn_time[0, 0, :5])
print("\nSample mlp_time values:\n", mlp_time[0, :5])


Generated synthetic timing data:
attn_time shape: (4, 4, 11)
mlp_time shape: (4, 11)

Sample attn_time values:
 [  0.57434918   2.29739671   9.18958684  36.75834736 147.03338944]

Sample mlp_time values:
 [ 1.  2.  4.  8. 16.]


In [30]:
# Run the simulator script with its default arguments.
# NOTE(review): the recorded run of this cell failed. The printed args show
# tokenizer=None and data_path=None, and the Hugging Face lookup then returned
# 404 for ".../None/resolve/main/tokenizer_config.json". Presumably a tokenizer
# (and a data path) has to be supplied on the command line; confirm the flag
# names in simulator.py.
!python simulator.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


args: Namespace(data_path=None, tokenizer=None, batch_size=32, tp_degree=1, cp_degree=1, dp_degree=1, num_tokens_per_data=512, attn_time='attn_time.npy', mlp_time='mlp_time.npy', batch_samples=100)
Traceback (most recent call last):
  File "/global/homes/j/jundac/envs/d2/lib/python3.12/site-packages/huggingface_hub/utils/_http.py", line 409, in hf_raise_for_status
    response.raise_for_status()
  File "/global/homes/j/jundac/envs/d2/lib/python3.12/site-packages/requests/models.py", line 1024, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/tokenizer_config.json

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/global/homes/j/jundac/envs/d2/lib/python3.12/site-packages/transformers/utils/hub.py", line 424, in cached_files
    hf_hub_download(
  File "/global/homes/j/jundac/envs/d2/lib/python3

# Test Data Generation Result

In [31]:
from transformers import AutoTokenizer
# Load the pretrained "gpt2" tokenizer; get_data uses it to count tokens per document.
tokenizer = AutoTokenizer.from_pretrained("gpt2")


In [32]:
from datasets import load_dataset

# Open the "wikitext-103-raw-v1" configuration of the "wikitext" dataset in
# streaming mode (records are iterated lazily) and keep only the "test" split.
doc_dataset = load_dataset("wikitext", 'wikitext-103-raw-v1', streaming=True)
doc_dataset = doc_dataset["test"]

In [33]:
def get_data(num_tokens_per_data, tokenizer, doc_dataset):
    """Pack tokenized documents into fixed-size samples and yield their segment lengths.

    Documents are consumed in order. Each one is tokenized and its tokens are
    assigned to the current sample until that sample holds exactly
    ``num_tokens_per_data`` tokens. A document that does not fit is split, and
    its remainder carries over into the following sample(s).

    Args:
        num_tokens_per_data: Token capacity of every yielded sample; must be > 0.
        tokenizer: Callable such that ``tokenizer(text)["input_ids"]`` is the
            sequence of token ids for ``text``.
        doc_dataset: Iterable of records that each provide a ``"text"`` entry.

    Yields:
        list[int]: Lengths of the document segments that make up one full
        sample, in order. Every entry is positive and the entries sum to
        ``num_tokens_per_data``. A trailing, partially filled sample is not
        yielded.

    Raises:
        ValueError: If ``num_tokens_per_data`` is not positive. Because this is
            a generator, the error surfaces when it is first advanced.
    """
    if num_tokens_per_data <= 0:
        # A non-positive capacity could never be filled and would loop forever.
        raise ValueError("num_tokens_per_data must be a positive integer")

    data_budget = num_tokens_per_data  # tokens still free in the current sample
    doc_lens = []
    for data in doc_dataset:
        token_count = len(tokenizer(data["text"])["input_ids"])
        # Records that tokenize to nothing skip the loop and add no segment.
        while token_count > 0:
            # Take as much of the document as the current sample can still hold.
            segment = min(token_count, data_budget)
            doc_lens.append(segment)
            # Subtract what was actually consumed *before* the budget is reset;
            # subtracting the refreshed budget would over-count and drive the
            # remaining token count negative.
            token_count -= segment
            data_budget -= segment
            if data_budget == 0:
                # Sample is exactly full: emit it and start a fresh one. A new
                # list is bound so the yielded one is never mutated afterwards.
                yield doc_lens
                doc_lens = []
                data_budget = num_tokens_per_data

In [34]:
# Pack the streamed documents into samples of 2 ** (10 + 4) = 16384 tokens and
# pull the first packed sample from the generator.
gen = get_data(2 ** (10 + 4), tokenizer, doc_dataset)
data = next(gen)

In [35]:
# Show the per-document segment lengths of the first packed sample.
print(data)

[8, 183, 188, 7, 11, 152, 218, 11, 206, 247, 8, 9, 9, 9, 6, 113, 123, 7, 192, 10, 98, 61, 64, 134, 68, 143, 179, 68, 9, 199, 29, 101, 180, 26, 62, 10, 188, 10, 183, 87, 50, 69, 7, 28, 9, 114, 71, 10, 110, 77, 84, 10, 269, 116, 37, 83, 7, 78, 97, 138, 250, 137, 71, 12, 82, 358, 299, 7, 228, 129, 36, 11, 148, 151, 130, 52, 9, 165, 161, 10, 218, 226, 8, 252, 240, 8, 96, 201, 142, 140, 8, 251, 8, 4, 10, 10, 9, 17, 10, 9, 12, 8, 11, 6, 17, 13, 8, 8, 8, 12, 140, 224, 7, 105, 214, 150, 171, 9, 419, 136, 185, 10, 424, 233, 10, 192, 196, 390, 205, 276, 9, 240, 212, 12, 288, 9, 148, 10, 255, 11, 220, 11, 242, 231, 79, 10, 166, 181, 7, 7, 374, 11, 112, 181, 92, 74]


# Generate Fake Data

In [36]:
# Histogram of synthetic document lengths: doclen_cnt[b] is the number of
# documents in bucket b, and every bucket is 50000 // 8 = 6250 tokens wide.
doclen_cnt = [
    96784, 1200, 350, 170, 110,
    100, 80, 70, 60, 50,
    40, 70, 60, 30, 80,
    70, 40, 20, 10, 125,
]
# Despite the name, this holds the upper edge of each bucket
# (6250, 12500, ..., 125000), not an average.
doclen_avg = [(50000 // 8) * rank for rank in range(1, len(doclen_cnt) + 1)]
print(f"doclen_avg: {doclen_avg}")
print(f"doclen_cnt: {doclen_cnt}")

doclen_avg: [6250, 12500, 18750, 25000, 31250, 37500, 43750, 50000, 56250, 62500, 68750, 75000, 81250, 87500, 93750, 100000, 106250, 112500, 118750, 125000]
doclen_cnt: [96784, 1200, 350, 170, 110, 100, 80, 70, 60, 50, 40, 70, 60, 30, 80, 70, 40, 20, 10, 125]


In [37]:
import random
# Fixed seed so the sampled lengths and their shuffled order are reproducible.
random.seed(42)
scope = 50000//8  # bucket width in tokens
# One uniform draw per document, bucket by bucket; bucket b covers the
# inclusive range [b * scope + 1, (b + 1) * scope].
dataset = [
    random.randint(bucket * scope + 1, (bucket + 1) * scope)
    for bucket, num_docs in enumerate(doclen_cnt)
    for _ in range(num_docs)
]
# Shuffle in place so the buckets are interleaved instead of sorted by length.
random.shuffle(dataset)

In [38]:
# Peek at the first ten sampled document lengths.
dataset[:10]

[2792, 4238, 642, 5298, 2888, 6033, 3408, 4108, 3978, 3404]

In [39]:
# Greedily pack consecutive document lengths into batches whose token total
# stays within `context_length`. Documents are never split: one that is longer
# than `context_length` ends up alone in a batch that exceeds the limit.
batches = []
current_batch = []
current_tokens = 0  # running token total of current_batch (avoids re-summing each step)
K = 1024
context_length = 16 * K
for doc_len in dataset:
    # Close the open batch when this document would overflow it. The
    # `current_batch` guard keeps an empty batch from being emitted when an
    # oversized document arrives while nothing is open (e.g. as the first item).
    if current_batch and current_tokens + doc_len > context_length:
        batches.append(current_batch)
        current_batch = []
        current_tokens = 0
    current_batch.append(doc_len)
    current_tokens += doc_len
# Flush the final, possibly partially filled batch.
if current_batch:
    batches.append(current_batch)

In [40]:
# Number of batches produced by the greedy packing.
len(batches)

24221

In [41]:
# Token total of the first batch, for comparison with the 16 * 1024 context length.
print(sum(batches[0]))

15858


In [42]:
import json

# Serialize the packed batches (a list of lists of ints) to JSON text first,
# then write the text to batches.json in one call.
serialized = json.dumps(batches)
with open('batches.json', 'w') as out_file:
    out_file.write(serialized)
