In [1]:
import numpy as np
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from einops import rearrange

import stk
import stk.ops
import stk.random
import stk.matrix
from megablocks.layers.gelu import gelu

from model import GPT, GPTConfig, MoeMLP


In [3]:
config = GPTConfig(
    # Transformer backbone
    n_layer=12,
    n_head=12,
    n_embd=768,
    bias=False,
    vocab_size=50304,
    # Mixture-of-experts settings (variable-size experts)
    use_moe=True,
    num_experts=8,
    num_experts_per_tok=2,
    norm_topk_prob=True,
    block_size=128,
    block_k=64,
    # Pairs as annotated by the author: 4 large experts (2944) + 4 small experts (128)
    expert_sizes=[(4, 2944), (4, 128)],
)

Subclass the MoE MLP layer and GPT layer to track token routing

In [None]:
class MoeMLPWithTracking(MoeMLP):
    """MoE MLP whose forward pass also reports the routing decision for every token.

    Besides the layer output, ``forward`` returns the router logits, router
    probabilities and the selected expert ids, reshaped to
    ``(batch, seq, ...)`` so they can be analysed per position.
    """

    @torch.compiler.disable
    def forward(self, x):
        """Run the sparse expert MLP and collect routing diagnostics.

        Args:
            x: activations of shape ``(batch_size, seq_len, n_embd)``.

        Returns:
            A tuple ``(output, aux_loss, f_i)`` where

            * ``output`` has the same shape as ``x``;
            * ``aux_loss`` is a dict with the scalar ``router_z_loss`` and
              ``load_balance_loss`` plus the per-token ``expert_assignments``
              ``(batch, seq, k)``, ``router_logits`` and ``router_probs``
              ``(batch, seq, num_experts)``;
            * ``f_i`` is a ``(num_experts,)`` tensor holding the fraction of
              routed (token, slot) pairs assigned to each expert.
        """
        batch_size, seq_len, n_embd = x.shape

        x_flat = rearrange(x, 'batch_size seq_len n_embd -> (batch_size seq_len) n_embd')

        # Routing: softmax in float32, then keep the top-k experts per token.
        router_logits = self.router(x_flat)
        router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)
        expert_weights, selected_experts = torch.topk(router_probs, self.num_experts_per_tok, dim=-1)

        if self.norm_topk_prob:
            # Renormalise so the k kept weights of each token sum to 1.
            expert_weights = expert_weights / expert_weights.sum(dim=-1, keepdim=True)
        expert_weights = expert_weights.to(x.dtype)
        expert_weights_flat = rearrange(expert_weights, '... -> (...)')
        selected_experts_flat = rearrange(selected_experts, '... -> (...)')

        # Expert computation. The _sort/_create/_gather/_scatter helpers are not
        # defined in this class (presumably inherited from MoeMLP); they are
        # called exactly as in the original forward pass.
        bin_ids, indices, tokens_per_expert = self._sort_tokens_by_expert(selected_experts_flat)
        padded_bins, topology = self._create_topology(x_flat, tokens_per_expert)
        x_permuted = self._gather_tokens(x_flat, indices, bin_ids, tokens_per_expert, padded_bins)
        x_permuted = stk.ops.sdd(x_permuted, self.w1, topology)
        x_permuted = gelu(x_permuted)
        x_permuted = stk.ops.dsd(x_permuted, self.w2)

        x_permuted = self._scatter_tokens(x_permuted, indices, bin_ids, expert_weights_flat, tokens_per_expert, padded_bins)
        # Splitting a merged axis requires explicit sizes; einops cannot infer
        # both batch_size and seq_len from their product alone.
        output = rearrange(
            x_permuted,
            '(batch_size seq_len) n_embd -> batch_size seq_len n_embd',
            batch_size=batch_size,
            seq_len=seq_len,
        )

        router_z_loss = torch.logsumexp(router_logits, dim=-1).pow(2).mean()

        # Mean router probability per expert, cast to the activation dtype so it
        # can be dotted with f_i (which is also created in x.dtype).
        p_i = router_probs.mean(dim=0).to(x.dtype)

        # Fraction of routed (token, slot) pairs that went to each expert.
        # scatter_add_ must be the in-place variant: the out-of-place
        # scatter_add returns a new tensor and would leave f_i all zeros.
        f_i = torch.zeros(self.num_experts, dtype=x.dtype, device=x.device)
        ones = torch.ones_like(selected_experts_flat, dtype=x.dtype) / selected_experts_flat.numel()
        f_i.scatter_add_(0, selected_experts_flat, ones)
        load_balance_loss = self.num_experts * (f_i @ p_i)

        # Reshape the routing tensors back to (batch, seq, ...) for analysis.
        expert_assignments = rearrange(selected_experts, '(batch seq) k -> batch seq k', batch=batch_size, seq=seq_len)
        router_logits_reshaped = rearrange(router_logits, '(batch seq) num_experts -> batch seq num_experts', batch=batch_size, seq=seq_len)
        router_probs_reshaped = rearrange(router_probs, '(batch seq) num_experts -> batch seq num_experts', batch=batch_size, seq=seq_len)

        aux_loss = {
            'router_z_loss': router_z_loss,
            'load_balance_loss': load_balance_loss,
            'expert_assignments': expert_assignments,
            'router_logits': router_logits_reshaped,
            'router_probs': router_probs_reshaped,
        }

        return output, aux_loss, f_i

class GPTWithTracking(GPT):
    """GPT variant that gathers every block's routing diagnostics during forward."""

    def forward(self, idx, targets=None):
        """Run the model and aggregate per-layer routing information.

        Args:
            idx: token ids of shape ``(batch, seq)``.
            targets: optional target ids of the same shape. When given, logits
                are produced for every position and a loss is computed;
                otherwise only the last position's logits are returned.

        Returns:
            ``(logits, loss, combined_aux_loss)``. ``combined_aux_loss`` holds
            the scalar auxiliary losses averaged over the blocks that reported
            them, ``expert_usage`` (mean ``f_i`` over blocks, when any block
            returned one), the dicts ``expert_assignments`` / ``router_logits``
            / ``router_probs`` keyed by ``'layer_<i>'``, and ``ce_loss`` when
            ``targets`` is given.
        """
        device = idx.device
        b, t = idx.size()
        assert t <= self.config.n_ctx, f"Cannot forward sequence of length {t}, context length is only {self.config.n_ctx}"
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        tok_emb = self.transformer.wte(idx)
        pos_emb = self.transformer.wpe(pos)
        x = self.transformer.drop(tok_emb + pos_emb)

        # Keys that carry per-token tensors (stored per layer) rather than
        # scalar losses (averaged across layers).
        tracking_keys = ('expert_assignments', 'router_logits', 'router_probs')
        combined_aux_loss = {}
        tracked = {key: {} for key in tracking_keys}
        all_expert_usage = []
        aux_loss_count = 0

        for layer_idx, block in enumerate(self.transformer.h):
            x, aux_loss, f_i = block(x)

            if f_i is not None:
                all_expert_usage.append(f_i)

            # A block that reports nothing contributes neither tracking data
            # nor a term to the averaged losses.
            if aux_loss is None:
                continue

            layer_name = f'layer_{layer_idx}'
            for key, value in aux_loss.items():
                if key in tracking_keys:
                    tracked[key][layer_name] = value
                elif key in combined_aux_loss:
                    combined_aux_loss[key] = combined_aux_loss[key] + value
                else:
                    # clone so the running sum never aliases a block's own tensor
                    combined_aux_loss[key] = value.clone()

            aux_loss_count += 1

        if aux_loss_count:
            for key in combined_aux_loss:
                combined_aux_loss[key] = combined_aux_loss[key] / aux_loss_count

        if all_expert_usage:  # empty when every block returned f_i=None
            combined_aux_loss['expert_usage'] = torch.stack(all_expert_usage).mean(dim=0)

        combined_aux_loss.update(tracked)

        x = self.transformer.ln_f(x)
        if targets is not None:
            logits = self.lm_head(x)
            ce_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
            loss = ce_loss
            # Add the weighted auxiliary terms only when some block produced them.
            load_balance_loss = combined_aux_loss.get('load_balance_loss')
            if load_balance_loss is not None:
                loss = loss + self.config.load_balance_loss_weight * load_balance_loss
            router_z_loss = combined_aux_loss.get('router_z_loss')
            if router_z_loss is not None:
                loss = loss + self.config.router_z_loss_weight * router_z_loss
            combined_aux_loss['ce_loss'] = ce_loss
        else:
            # Inference path: only the final position's logits are needed.
            logits = self.lm_head(x[:, [-1], :])
            loss = None
            ce_loss = None

        return logits, loss, combined_aux_loss
        

In [None]:
# Checkpoints of the three training seeds; keep exactly one assignment active.
checkpoint_path = "out-openwebtext/moe-8x2-variable-4x2944-4x128-seed1337/ckpt.pt"
# checkpoint_path = "out-openwebtext/moe-8x2-variable-4x2944-4x128-seed1223/ckpt.pt"
# checkpoint_path = "out-openwebtext/moe-8x2-variable-4x2944-4x128-seed42/ckpt.pt"

model = GPTWithTracking(config)

# Replace each MLP that exposes `expert_sizes` with the tracking subclass,
# carrying over the parameters of the module it replaces.
for block in model.transformer.h:
    if not hasattr(block.mlp, 'expert_sizes'):
        continue
    old_mlp = block.mlp
    block.mlp = MoeMLPWithTracking(config)
    block.mlp.load_state_dict(old_mlp.state_dict())

checkpoint = torch.load(checkpoint_path, map_location='cpu')

# Drop the '_orig_mod.' fragment from parameter names when any key starts with it.
compiled_prefix = '_orig_mod.'
state_dict = checkpoint['model']
if any(name.startswith(compiled_prefix) for name in state_dict):
    state_dict = {
        name.replace(compiled_prefix, ''): tensor
        for name, tensor in state_dict.items()
    }

model.load_state_dict(state_dict)

Collect routing statistics across the entire validation set per layer

In [None]:
import tiktoken
from collections import defaultdict
from tqdm import tqdm

tokenizer = tiktoken.get_encoding('gpt2')

val_data_path = "data/openwebtext/val.bin"
val_data = np.memmap(val_data_path, dtype=np.uint16, mode='r')

device = 'cuda'
model = model.to(device)
model.eval()

num_layers = config.n_layer
token_stats_per_layer = {}
# NOTE(review): the loops below index expert_sizes by expert id and add the
# result to a running sum, so this is assumed to hold one hidden size per
# expert (not the (count, size) pairs passed to GPTConfig) — confirm in MoeMLP.
expert_sizes = model.transformer.h[0].mlp.expert_sizes

# token_stats_per_layer[layer_name][token_id] accumulates, for one vocabulary
# token in one layer: how often each expert was chosen, how often the token
# occurred, the summed output entropy and the summed size of the chosen experts.
for layer_idx in range(num_layers):
    token_stats_per_layer[f'layer_{layer_idx}'] = defaultdict(lambda: {
        'expert_counts': np.zeros(config.num_experts, dtype=np.int64),
        'total_occurrences': 0,
        'total_entropy': 0.0,
        'expert_size_sum': 0.0,
    })

batch_size = 8  # not used below: every forward pass processes a single sequence
seq_len = 1024
total_tokens = len(val_data)
num_batches = total_tokens // seq_len  # a trailing partial sequence is dropped

for batch_idx in tqdm(range(num_batches)):
    start_idx = batch_idx * seq_len
    end_idx = start_idx + seq_len
    batch_tokens = torch.from_numpy(val_data[start_idx:end_idx].astype(np.int64)).unsqueeze(0).to(device)

    # Passing targets makes the model return logits for every position.
    with torch.inference_mode():
        logits, loss, aux_loss = model(batch_tokens, targets=batch_tokens)

    # Entropy of the predicted next-token distribution at each position.
    output_probs = F.softmax(logits[0], dim=-1)
    epsilon = 1e-10 # can't do log(0) so need epsilon
    output_entropy = -(output_probs * torch.log(output_probs + epsilon)).sum(dim=-1).cpu().numpy()

    # Read the token ids once per batch instead of calling .item() for every
    # position in every layer.
    token_ids = batch_tokens[0].tolist()

    for layer_idx in range(num_layers):
        layer_name = f'layer_{layer_idx}'
        layer_assignments = aux_loss['expert_assignments'][layer_name][0].cpu().numpy()  # (seq_len, k)
        token_stats = token_stats_per_layer[layer_name]

        # Update statistics for each token
        for pos, token_id in enumerate(token_ids):
            expert_ids = layer_assignments[pos]  # k experts for this position
            entry = token_stats[token_id]

            # Update counts
            entry['total_occurrences'] += 1
            entry['total_entropy'] += output_entropy[pos]

            # Track which experts were used
            for expert_id in expert_ids:
                entry['expert_counts'][expert_id] += 1
                entry['expert_size_sum'] += expert_sizes[expert_id]

print(f"\nCollected statistics for each layer")
for layer_name, token_stats in token_stats_per_layer.items():
    num_unique_tokens = len(token_stats)
    total_occurrences = sum(s['total_occurrences'] for s in token_stats.values())
    print(f"  {layer_name}: {num_unique_tokens:,} unique tokens, {total_occurrences:,} token occurrences")

In [None]:
# Track expert COMBINATIONS per layer (not just individual counts)
from collections import Counter

# token_combinations_per_layer[layer_name][token_id] is a Counter mapping a
# sorted tuple of expert ids to how often the token was routed to that set.
token_combinations_per_layer = {}
for layer_idx in range(num_layers):
    token_combinations_per_layer[f'layer_{layer_idx}'] = defaultdict(Counter)

print(f"Tracking expert combinations for {num_batches} batches across {num_layers} layers...")

# Re-process the validation data to track combinations
for batch_idx in tqdm(range(num_batches)):
    start_idx = batch_idx * seq_len
    end_idx = start_idx + seq_len
    batch_tokens = torch.from_numpy(val_data[start_idx:end_idx].astype(np.int64)).unsqueeze(0).to(device)

    with torch.inference_mode():
        logits, loss, aux_loss = model(batch_tokens, targets=batch_tokens)

    # One device-to-host copy per batch instead of one .item() per position per layer.
    token_ids = batch_tokens[0].tolist()

    # Track combinations for each layer
    for layer_idx in range(num_layers):
        layer_name = f'layer_{layer_idx}'
        layer_assignments = aux_loss['expert_assignments'][layer_name][0].cpu().tolist()
        combos_for_layer = token_combinations_per_layer[layer_name]

        for token_id, experts in zip(token_ids, layer_assignments):
            expert_ids = tuple(sorted(experts))  # Sort for consistency
            combos_for_layer[token_id][expert_ids] += 1

print("\nDone tracking combinations!")

# Tokens whose average per-layer expert width reaches this value are counted
# as "mostly large experts" in the summary.
# NOTE(review): 2560 is kept from the original cell; confirm it suits this config.
size_threshold = 2560

# Build the header exactly like the rows (fixed-width cells, no separator) and
# with one column per layer, so it lines up for any num_layers.
header_cells = [f"{'Token ID':<10}", f"{'Token':<20}", f"{'Avg Size':<12}", f"{'FLOPs':<15}"]
for layer_idx in range(num_layers):
    layer_label = f"Layer {layer_idx}"
    header_cells.append(f"{layer_label:<20}")
header = "".join(header_cells)
rule = "=" * len(header)

# Now print the most common combination for each token across all layers
print("\n" + rule)
print(header)
print(rule)

# Get unique tokens that appeared in any layer
all_token_ids = set()
for layer_combos in token_combinations_per_layer.values():
    all_token_ids.update(layer_combos.keys())

# Track counts for summary
large_expert_tokens = 0
small_expert_tokens = 0
total_flops = 0

# Sort by token ID for consistent ordering
for token_id in sorted(all_token_ids):  # Show ALL tokens
    # Decode token - handle byte-level tokens that don't decode cleanly
    try:
        token_text = tokenizer.decode([token_id])
        # Replace problematic characters
        token_text = token_text.replace('\n', '\\n').replace('\t', '\\t').replace('\r', '\\r')
        # Check if it's a weird byte-level token
        if '�' in token_text or not token_text.isprintable():
            # Show the token ID representation instead
            token_text = f"<{token_id}>"
        if len(token_text) > 18:
            token_text = token_text[:17] + '…'
    except Exception:
        # Decoding failures fall back to the id; KeyboardInterrupt still propagates.
        token_text = f"<{token_id}>"

    # One pass over the layers: accumulate the expert width of the most common
    # combination and format that combination for the table.
    total_size = 0
    layer_count = 0
    layer_cells = []
    for layer_idx in range(num_layers):
        layer_name = f'layer_{layer_idx}'
        combos = token_combinations_per_layer[layer_name][token_id]

        if combos:
            most_common = combos.most_common(1)[0][0]  # Get the tuple of experts
            # Sum the sizes of the experts in this layer
            total_size += sum(expert_sizes[e] for e in most_common)
            layer_count += 1
            # Format with expert sizes: (5(128),7(128))
            formatted = "(" + ",".join([f"{e}({expert_sizes[e]})" for e in most_common]) + ")"
            layer_cells.append(f"{formatted:<20}")
        else:
            layer_cells.append(f"{'N/A':<20}")

    avg_size = total_size / layer_count if layer_count > 0 else 0

    # Calculate FLOPs: 4 * hidden_size * total_size
    flops = 4 * config.n_embd * total_size
    total_flops += flops
    # Count for summary
    if avg_size >= size_threshold:
        large_expert_tokens += 1
    else:
        small_expert_tokens += 1

    row = [f"{token_id:<10}", f"{token_text:<20}", f"{avg_size:<12.1f}", f"{flops:<15,}"] + layer_cells
    print("".join(row))

print(rule)

# Baseline: every layer routes each token to num_experts_per_tok experts of the
# mean expert size, using the same 4 * hidden_size * width cost model as above.
# NOTE(review): the original constant was 8 * 4 * 640 * 2560, which does not
# match this notebook's config (12 layers, n_embd 768); confirm the intended
# baseline expert width.
mean_expert_size = sum(expert_sizes[e] for e in range(config.num_experts)) / config.num_experts
baseline_flops = num_layers * 4 * config.n_embd * config.num_experts_per_tok * mean_expert_size

unique_token_count = len(all_token_ids)
safe_count = max(unique_token_count, 1)  # avoid dividing by zero on empty data
mean_flops = total_flops / safe_count

print(f"\nTotal unique tokens: {unique_token_count}")
print(f"\nAverage Size Summary:")
print(f"  Tokens with avg size >= {size_threshold} (mostly large experts): {large_expert_tokens} ({100*large_expert_tokens/safe_count:.2f}%)")
print(f"  Tokens with avg size < {size_threshold} (mostly small experts):  {small_expert_tokens} ({100*small_expert_tokens/safe_count:.2f}%)")
print(f"  Average number of FLOPs per token: {mean_flops:.0f} ({100*mean_flops/baseline_flops:.2f}% of baseline)")

In [None]:
import pandas as pd
# Imported here as well so the cell does not depend on earlier cells' imports.
from collections import Counter, defaultdict
from tqdm import tqdm

# Initialize combination tracking for each layer:
# token_combinations_per_layer[layer_name][token_id] is a Counter mapping a
# sorted tuple of expert ids to how often the token was routed to that set.
token_combinations_per_layer = {}
for layer_idx in range(num_layers):
    token_combinations_per_layer[f'layer_{layer_idx}'] = defaultdict(Counter)

print(f"Tracking expert combinations for {num_batches} batches across {num_layers} layers...")

# Re-process the validation data to track combinations
for batch_idx in tqdm(range(num_batches)):
    start_idx = batch_idx * seq_len
    end_idx = start_idx + seq_len
    batch_tokens = torch.from_numpy(val_data[start_idx:end_idx].astype(np.int64)).unsqueeze(0).to(device)

    with torch.inference_mode():
        logits, loss, aux_loss = model(batch_tokens, targets=batch_tokens)

    # One device-to-host copy per batch instead of one .item() per position per layer.
    token_ids = batch_tokens[0].tolist()

    # Track combinations for each layer
    for layer_idx in range(num_layers):
        layer_name = f'layer_{layer_idx}'
        layer_assignments = aux_loss['expert_assignments'][layer_name][0].cpu().tolist()
        combos_for_layer = token_combinations_per_layer[layer_name]

        for token_id, experts in zip(token_ids, layer_assignments):
            expert_ids = tuple(sorted(experts))
            combos_for_layer[token_id][expert_ids] += 1

print("\nDone tracking combinations! Building dataframe...")

# Get unique tokens
all_token_ids = set()
for layer_combos in token_combinations_per_layer.values():
    all_token_ids.update(layer_combos.keys())

# Build one record per token; iterating in sorted order keeps the row order
# (and therefore tie-breaking in the sort below) reproducible across runs.
data = []
for token_id in sorted(all_token_ids):
    # Decode token
    try:
        token_text = tokenizer.decode([token_id])
        token_text = token_text.replace('\n', '\\n').replace('\t', '\\t').replace('\r', '\\r')
        if '�' in token_text or not token_text.isprintable():
            token_text = f"<{token_id}>"
        if len(token_text) > 18:
            token_text = token_text[:17] + '…'
    except Exception:
        # Decoding failures fall back to the id; KeyboardInterrupt still propagates.
        token_text = f"<{token_id}>"

    # Calculate average expert SIZE across all layers
    total_size = 0
    layer_count = 0
    layer_data = {}

    for layer_idx in range(num_layers):
        layer_name = f'layer_{layer_idx}'
        combos = token_combinations_per_layer[layer_name][token_id]
        if combos:
            most_common = combos.most_common(1)[0][0]
            layer_size = sum(expert_sizes[e] for e in most_common)
            total_size += layer_size
            layer_count += 1
            # Format with expert sizes: (5(128),7(128))
            formatted = "(" + ",".join([f"{e}({expert_sizes[e]})" for e in most_common]) + ")"
            layer_data[f'layer_{layer_idx}'] = formatted
        else:
            layer_data[f'layer_{layer_idx}'] = 'N/A'

    avg_size = total_size / layer_count if layer_count > 0 else 0
    # Cost model: 4 * hidden_size * summed expert width over all layers
    flops = 4 * config.n_embd * total_size

    row = {
        'token_id': token_id,
        'token': token_text,
        'avg_size': avg_size,
        'flops': flops,
        **layer_data
    }
    data.append(row)

# Create DataFrame and sort by FLOPs (stable, so ties stay ordered by token id)
df = pd.DataFrame(data)
df = df.sort_values('flops', ascending=True, kind='stable').reset_index(drop=True)

print(f"\nDataFrame created with {len(df)} tokens, sorted by FLOPs (low to high)")
print(f"\nFirst 20 rows (lowest FLOPs):")
print(df.head(20).to_string())

print(f"\n\nLast 20 rows (highest FLOPs):")
print(df.tail(20).to_string())

# Summary statistics
print(f"\n{'='*80}")
print(f"Summary Statistics:")
print(f"{'='*80}")
print(f"Total unique tokens: {len(df)}")
print(f"\nFLOPs distribution:")
print(f"  Min:    {df['flops'].min():,.0f}")
print(f"  25%:    {df['flops'].quantile(0.25):,.0f}")
print(f"  Median: {df['flops'].median():,.0f}")
print(f"  75%:    {df['flops'].quantile(0.75):,.0f}")
print(f"  Max:    {df['flops'].max():,.0f}")
print(f"  Mean:   {df['flops'].mean():,.0f}")

print(f"\nAverage Size distribution:")
print(f"  Tokens with avg_size >= 2560: {(df['avg_size'] >= 2560).sum()} ({100*(df['avg_size'] >= 2560).sum()/len(df):.2f}%)")
print(f"  Tokens with avg_size < 2560:  {(df['avg_size'] < 2560).sum()} ({100*(df['avg_size'] < 2560).sum()/len(df):.2f}%)")

# Baseline: num_layers * 4 * hidden_size * expert width, where the width is
# num_experts_per_tok experts of the mean expert size.
# NOTE(review): the original constant was 8 * 4 * 640 * 2560, which does not
# match this notebook's config (12 layers, n_embd 768); confirm the intended
# baseline expert width.
mean_expert_size = sum(expert_sizes[e] for e in range(config.num_experts)) / config.num_experts
baseline_flops = num_layers * 4 * config.n_embd * config.num_experts_per_tok * mean_expert_size
print(f"\nAverage FLOPs per token: {df['flops'].mean():.0f} ({100*df['flops'].mean()/baseline_flops:.2f}% of baseline)")

# Store the dataframe for further analysis
expert_combinations_df = df
# Name of the run directory that contains the checkpoint file
sweep_value = checkpoint_path.split('/')[-2]

df.to_csv(f'{sweep_value}_expert_combinations.csv', index=False)

In [None]:
# Per-layer view of the collected token statistics: derive per-token metrics,
# plot how the average routed expert size is distributed, and print a summary.
for layer_idx in range(num_layers):
    layer_name = f'layer_{layer_idx}'
    token_stats = token_stats_per_layer[layer_name]

    print(f"\n{'='*80}")
    print(f"LAYER {layer_idx} ANALYSIS")
    print(f"{'='*80}\n")

    # Derived metrics for every token that occurred at least once
    token_analysis = {}
    for token_id, stats in token_stats.items():
        if not stats['total_occurrences'] > 0:
            continue
        routed_slots = stats['expert_counts'].sum()
        token_analysis[token_id] = {
            # mean output entropy over the token's occurrences
            'avg_entropy': stats['total_entropy'] / stats['total_occurrences'],
            'occurrences': stats['total_occurrences'],
            # share of routed slots that went to each expert
            'expert_distribution': stats['expert_counts'] / routed_slots,
            'most_common_expert': np.argmax(stats['expert_counts']),
            # mean size of the experts this token was routed to
            'avg_expert_size': stats['expert_size_sum'] / routed_slots,
        }

    analyses = list(token_analysis.values())
    all_expert_sizes = np.array([entry['avg_expert_size'] for entry in analyses])
    all_occurrences = np.array([entry['occurrences'] for entry in analyses])

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))
    ax1, ax2, ax3 = axes
    fig.suptitle(f'Layer {layer_idx} Token Routing Analysis', fontsize=16)

    # Reference lines shared by both histograms: (x position, colour, legend label)
    size_markers = [(128, 'blue', 'Small (128)'), (2944, 'red', 'Large (2944)')]

    # Panel 1: every unique token counts once
    unweighted_mean = np.mean(all_expert_sizes)
    ax1.hist(all_expert_sizes, bins=50, alpha=0.7, edgecolor='black')
    for marker_x, marker_color, marker_label in size_markers:
        ax1.axvline(x=marker_x, color=marker_color, linestyle='--', label=marker_label, linewidth=2)
    ax1.axvline(x=unweighted_mean, color='green', linestyle='--', label=f'Mean ({unweighted_mean:.0f})', linewidth=2)
    ax1.set_xlabel('Average Expert Size per Token')
    ax1.set_ylabel('Number of Unique Tokens')
    ax1.set_title('Distribution by Unique Tokens (Unweighted)')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Panel 2: each token weighted by how often it occurred
    weighted_mean = np.average(all_expert_sizes, weights=all_occurrences)
    ax2.hist(all_expert_sizes, bins=50, weights=all_occurrences, alpha=0.7, edgecolor='black', color='orange')
    for marker_x, marker_color, marker_label in size_markers:
        ax2.axvline(x=marker_x, color=marker_color, linestyle='--', label=marker_label, linewidth=2)
    ax2.axvline(x=weighted_mean, color='green', linestyle='--', label=f'Weighted Mean ({weighted_mean:.0f})', linewidth=2)
    ax2.set_xlabel('Average Expert Size per Token')
    ax2.set_ylabel('Total Token Occurrences')
    ax2.set_title('Distribution by Token Occurrences (Weighted)')
    ax2.legend()
    ax2.grid(True, alpha=0.3)

    # Panel 3: occurrence-weighted split at an average size of 1536
    routes_large = all_expert_sizes > 1536
    routes_small = all_expert_sizes <= 1536
    large_expert_occurrences = sum(all_occurrences[routes_large])
    small_expert_occurrences = sum(all_occurrences[routes_small])
    total_occurrences = large_expert_occurrences + small_expert_occurrences

    ax3.pie(
        [large_expert_occurrences, small_expert_occurrences],
        labels=['Large experts\n(>1536)', 'Small experts\n(≤1536)'],
        autopct='%1.1f%%',
        colors=['red', 'blue'],
    )
    ax3.set_title(f'Token Occurrences by Expert Size\n(Total: {total_occurrences:,} tokens)')

    plt.tight_layout()
    plt.show()

    # Text summary
    print(f"Average Expert Size Statistics (Unweighted):")
    print(f"  Mean: {unweighted_mean:.2f}")
    print(f"  Median: {np.median(all_expert_sizes):.2f}")
    print(f"  Std: {np.std(all_expert_sizes):.2f}")

    print(f"\nAverage Expert Size Statistics (Weighted by occurrences):")
    print(f"  Weighted Mean: {weighted_mean:.2f}")

    # Unique tokens on either side of the same 1536 split
    large_expert_tokens = int(np.count_nonzero(routes_large))
    small_expert_tokens = int(np.count_nonzero(routes_small))
    unique_token_total = len(all_expert_sizes)
    print(f"\nUnique Token routing breakdown:")
    print(f"  Unique tokens routing mostly to LARGE experts: {large_expert_tokens} ({100*large_expert_tokens/unique_token_total:.1f}%)")
    print(f"  Unique tokens routing mostly to SMALL experts: {small_expert_tokens} ({100*small_expert_tokens/unique_token_total:.1f}%)")

    print(f"\nWeighted by occurrences:")
    print(f"  Token occurrences routed to LARGE experts: {large_expert_occurrences:,} ({100*large_expert_occurrences/total_occurrences:.1f}%)")
    print(f"  Token occurrences routed to SMALL experts: {small_expert_occurrences:,} ({100*small_expert_occurrences/total_occurrences:.1f}%)")

print("\n" + "="*80)
print("SUMMARY ACROSS ALL LAYERS")
print("="*80)