In [2]:
print('start')
!pip install transformers==4.51.3 numpy tqdm accelerate datasets scikit-learn 
print('done')

start
Collecting transformers==4.51.3
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting accelerate
  Downloading accelerate-1.10.1-py3-none-any.whl.metadata (19 kB)
Collecting datasets
  Downloading datasets-4.1.1-py3-none-any.whl.metadata (18 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers==4.51.3)
  Downloading huggingface_hub-0.35.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers==4.51.3)
  Downloading regex-2025.9.18-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers==4.51.3)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting safetensor

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from pathlib import Path
import json
from tqdm import tqdm
import os
from accelerate import init_empty_weights
from functools import partial
import random
from torch.utils.data import DataLoader
from datasets import load_dataset
from torch.linalg import svd, norm

class SVDQwen3MoeSparseMoeBlock(nn.Module):
    def __init__(self, config,dtype= torch.bfloat16):
        super().__init__()
        self.num_experts = config.num_experts
        self.top_k = config.num_experts_per_tok
        self.norm_topk_prob = config.norm_topk_prob
        self.hidden_dim = config.hidden_size #2048
        self.intermediate_dim = config.moe_intermediate_size #768
        # gating
        self.dtype = dtype
        
        self.gate = nn.Linear(config.hidden_size, config.num_experts, bias=False, dtype=self.dtype)
        # share v
        self.low_rank = int(self.intermediate_dim * self.hidden_dim * 0.3 / (self.intermediate_dim + self.hidden_dim))
        share_v = False
        if share_v == True:
            self.experts_v1_shared_gate = nn.Linear(self.hidden_dim, self.low_rank, bias=False, dtype=self.dtype)
            self.experts_v2_shared_up = nn.Linear(self.hidden_dim, self.low_rank, bias=False, dtype=self.dtype)
            self.experts_v3_shared_down = nn.Linear(self.intermediate_dim, self.low_rank, bias=False, dtype=self.dtype)
        else:
            self.experts_v1_shared_gate = None
            self.experts_v2_shared_up = None
            self.experts_v3_shared_down = None
        self.beta = 0.9
        self.experts = nn.ModuleList(
            [moe_layer(config, False, self.experts_v1_shared_gate, self.experts_v2_shared_up, self.experts_v3_shared_down)  for _ in range(self.num_experts)])


    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """ """
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        print(f'hidden_states: {hidden_states.shape}')
        hidden_states = hidden_states.view(-1, hidden_dim)
        print(f'hidden_states: {hidden_states.shape}')
        
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        # print(f'top k 전 routing_weights shape: {routing_weights.shape}')
        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
        # print(f'top k 후 routing_weights: {routing_weights.shape}{routing_weights}')
        # print(f'selected_experts: {selected_experts}')
        
        #------------------------------------------------------
        if self.top_k > 1:
            top1_weights = routing_weights[:, 0:1]
            # print(f'top1_weights: {top1_weights.shape}{top1_weights}')

            other_weights = routing_weights[:, 1:]
            # print(f'other_weights: {other_weights.shape}{other_weights}')

            # Shape: (num_tokens, top_k - 1)
            pruning_mask = other_weights < self.beta * top1_weights
            # print(f'pruning_mask: {pruning_mask.shape} {pruning_mask}')
            
            # Set the weights of the pruned experts to 0
            routing_weights[:, 1:].masked_fill_(pruning_mask, 0)   
        #------------------------------------------------------     
        
        if self.norm_topk_prob:  # only diff with mixtral sparse moe block!
            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        routing_weights = routing_weights.to(hidden_states.dtype)

        final_hidden_states = torch.zeros(
            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
        )

        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
        print(f'expert_mask: {expert_mask.shape}, {expert_mask}')
        
        #------------------------------------------------------
        if self.top_k > 1:
            # expert_mask shape: (num_experts, top_k, num_tokens)  
            # pruning_mask shape: (num_tokens, top_k - 1) - True where we want to prune
            # expert_mask[:, 1:, :] shape: (num_experts, top_k-1, num_tokens)
            
            # Transpose pruning_mask to (top_k-1, num_tokens) and add expert dimension
            pruning_mask_expanded = pruning_mask.t().unsqueeze(0)  # Shape: (1, top_k-1, num_tokens)
            
            # Use masked_fill_ to zero out pruned positions
            # We want to set to 0 where pruning_mask_expanded is True
            expert_mask[:, 1:, :].masked_fill_(pruning_mask_expanded, 0)
        #------------------------------------------------------
        
        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
        # print(f'expert_hit: {expert_hit}')
        
        for expert_idx in expert_hit:
            print(f'expert_idx: {expert_idx}')
            expert_layer = self.experts[expert_idx]
            idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))

            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]

            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
        
        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits     
    
    def SVD(self, modules, num_experts=128, dtype=torch.bfloat16):
        self.gate.weight = modules.gate.weight
        
        # Process gate_proj weights
        for i in range(num_experts):
            if i % 10 == 0:
                print(f'Processing gate_proj for expert {i}/{num_experts}')
            
            # Keep original weight on GPU, but move float copy to CPU for SVD if still OOM
            # For now, let's keep it on GPU but manage carefully
            original_weight = modules.experts[i].gate_proj.weight
            
            # 1. Upcast and perform SVD
            u, s, v = torch.svd_lowrank(original_weight.float(), q=self.low_rank)
            
            # 2. Create the first low-rank matrix, assign it, and immediately delete v
            self.experts[i].v1.weight = nn.Parameter(v.T.to(dtype))
            del v
            
            # 3. Create the second low-rank matrix, assign it, and immediately delete u and s
            US_top = u @ torch.diag(s)
            self.experts[i].us1.weight = nn.Parameter(US_top.to(dtype))
            del u, s, US_top
            
            if i % 20 == 0:
                torch.cuda.empty_cache()

        # Process up_proj weights
        for i in range(num_experts):
            if i % 10 == 0:
                print(f'Processing up_proj for expert {i}/{num_experts}')
            
            original_weight = modules.experts[i].up_proj.weight
            u, s, v = torch.svd_lowrank(original_weight.float(), q=self.low_rank)
            
            self.experts[i].v2.weight = nn.Parameter(v.T.to(dtype))
            del v
            
            US_top = u @ torch.diag(s)
            self.experts[i].us2.weight = nn.Parameter(US_top.to(dtype))
            del u, s, US_top

            if i % 20 == 0:
                torch.cuda.empty_cache()

        # Process down_proj weights
        for i in range(num_experts):
            if i % 10 == 0:
                print(f'Processing down_proj for expert {i}/{num_experts}')
            
            original_weight = modules.experts[i].down_proj.weight
            u, s, v = torch.svd_lowrank(original_weight.float(), q=self.low_rank)
            
            self.experts[i].v3.weight = nn.Parameter(v.T.to(dtype))
            del v
            
            US_top = u @ torch.diag(s)
            self.experts[i].us3.weight = nn.Parameter(US_top.to(dtype))
            del u, s, US_top

            if i % 20 == 0:
                torch.cuda.empty_cache()
    

class moe_layer(nn.Module):
  def __init__(self, config, share_v, shared_vgate, shared_vup, shared_vdown ):
      super().__init__()

      self.intermediate_dim = config.moe_intermediate_size #768
      self.hidden_dim = config.hidden_size #2048
      self.dtype = torch.bfloat16
      self.low_rank = int(self.intermediate_dim * self.hidden_dim * 0.3 / (self.intermediate_dim + self.hidden_dim))
      # self.delta_ratio = 0.3
      self.act_fn = nn.SiLU()
      # self.low_rank =
      # if share_v == True:
      #   self.v1 = shared_vgate
      #   # self.experts_v1_shared_gate = nn.Linear(self.hidden_dim, self.low_rank, bias=False, dtype=torch.bfloat16)         
      #   self.us1 = nn.Linear(self.low_rank, self.intermediate_dim, bias=False, dtype=torch.bfloat16)
      #   self.v2 = shared_vup
      #   self.us2 = nn.Linear(self.low_rank, self.intermediate_dim, bias=False, dtype=torch.bfloat16)
      #   self.v3 = shared_vdown
      #   self.us3 = nn.Linear(self.low_rank, self.hidden_dim, bias=False, dtype=torch.bfloat16)
      # else:
      self.v1 = nn.Linear(self.hidden_dim, self.low_rank, bias=False, dtype=torch.bfloat16) #2048,167
      self.us1 = nn.Linear(self.low_rank, self.intermediate_dim, bias=False, dtype=torch.bfloat16) #167,768
      self.v2 = nn.Linear(self.hidden_dim, self.low_rank, bias=False, dtype=torch.bfloat16)
      self.us2 = nn.Linear(self.low_rank, self.intermediate_dim, bias=False, dtype=torch.bfloat16)
      self.v3 = nn.Linear(self.intermediate_dim, self.low_rank, bias=False, dtype=torch.bfloat16)
      self.us3 = nn.Linear(self.low_rank, self.hidden_dim, bias=False, dtype=torch.bfloat16)
      # self.low_rank = int(self.intermediate_dim * self.hidden_dim * self.delta_ratio / (self.intermediate_dim + self.hidden_dim))

  def forward(self,hidden_states):
      hidden_states = hidden_states.to(self.v1.weight.dtype)
      gate = self.us1(self.v1(hidden_states))
      up = self.us2(self.v2(hidden_states))
      x = self.us3(self.v3(self.act_fn(gate) * up))
      print('gate, up, down 과정 연산 완료')
      return x




In [4]:
class DummyConfig:
    def __init__(self):
        self.architectures = ["custommodel"]
        self.attention_bias = False
        self.attention_dropout = 0.0
        self.bos_token_id = 151643
        self.decoder_sparse_step = 1
        self.eos_token_id = 151645
        self.head_dim = 128
        self.hidden_act = "silu"
        self.hidden_size = 2048
        self.initializer_range = 0.02
        self.intermediate_size = 6144
        self.max_position_embeddings = 40960
        self.max_window_layers = 48
        self.mlp_only_layers = []
        self.model_type = "qwen3_moe"
        self.moe_intermediate_size = 768
        self.norm_topk_prob = True
        self.num_attention_heads = 32
        self.num_experts = 128
        self.num_experts_per_tok = 8
        self.num_hidden_layers = 48
        self.num_key_value_heads = 4
        self.output_router_logits = False
        self.rms_norm_eps = 1e-6
        self.rope_scaling = None
        self.rope_theta = 1000000.0
        self.router_aux_loss_coef = 0.001
        self.sliding_window = None
        self.tie_word_embeddings = False
        self.torch_dtype = "bfloat16"
        self.transformers_version = "4.51.0"
        self.use_cache = True
        self.use_sliding_window = False
        self.vocab_size = 151936

        # Qwen3MoeSparseMoeBlock 관련 필수 값
        self.hidden_dim = self.hidden_size
        self.intermediate_dim = self.intermediate_size
        self.norm_topk_prob = True


config = DummyConfig()

# mlp = Qwen3MoeMLP(config)
# block = Qwen3MoeSparseMoeBlock(config)
# svd_block = SVDQwen3MoeSparseMoeBlock(config)
# svd_block.SVD(block)

In [5]:
from transformers import AutoModelForCausalLM
from transformers import AutoConfig, AutoModelForCausalLM
from huggingface_hub import login


config = AutoConfig.from_pretrained("Qwen/Qwen3-30B-A3B")  # grabs config only
# model = AutoModelForCausalLM.from_config(config)  # random weights
print('done')

config.json:   0%|          | 0.00/963 [00:00<?, ?B/s]

done


In [6]:
model_name = "Qwen/Qwen3-30B-A3B"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="auto"
)

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 16 files:   0%|          | 0/16 [00:00<?, ?it/s]

model-00003-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00008-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00005-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00004-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00002-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00001-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00007-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00006-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00009-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00010-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00011-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00012-of-00016.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00013-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00014-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00016-of-00016.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00015-of-00016.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/16 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"

In [7]:
# with open(expert_freq_path, 'r') as f:
#     expert_freq = json.load(f)
# svd_scale = torch.load(svd_scale_path, map_location='cpu')
# fisher_info = torch.load(fisher_path, map_location="cpu")
import gc
for i in tqdm(range(len(model.model.layers)), desc="Merging layers"):
    if i % 5 == 0:
        before_mem = torch.cuda.memory_allocated() / 1e9
        print(f"Layer {i} - Before: {before_mem:.2f}GB")
    
    # 기존 MLP
    old_mlp = model.model.layers[i].mlp
    device = old_mlp.gate.weight.device
    
    # 새 MoE 블록 생성
    Merge_MoE_Block = SVDQwen3MoeSparseMoeBlock(model.config).to(device)
    Merge_MoE_Block.SVD(old_mlp)
    
    # 교체 전 기존 MLP 명시적 삭제
    # model.model.layers[i].mlp = None  # 참조 끊기
    gc.collect()
    torch.cuda.empty_cache()          # 캐시 정리
    
    #cache 메모리 줄여주는 부분
    old_mlp.to('cpu')
    gc.collect()
    torch.cuda.empty_cache()
    model.model.layers[i].mlp = Merge_MoE_Block
    del Merge_MoE_Block
    del old_mlp #cpu에서 제거
    
    # 메모리 정리
    torch.cuda.empty_cache()
    
    if i % 5 == 0:
        after_mem = torch.cuda.memory_allocated() / 1e9
        print(f"Layer {i} - After: {after_mem:.2f}GB, Saved: {before_mem-after_mem:.2f}GB")


Merging layers:   0%|          | 0/48 [00:00<?, ?it/s]

Layer 0 - Before: 61.06GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj f

Merging layers:   2%|▏         | 1/48 [00:07<05:40,  7.25s/it]

Layer 0 - After: 60.23GB, Saved: 0.84GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Proces

Merging layers:   4%|▍         | 2/48 [00:14<05:27,  7.12s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:   6%|▋         | 3/48 [00:21<05:18,  7.08s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:   8%|▊         | 4/48 [00:28<05:10,  7.07s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  10%|█         | 5/48 [00:35<05:05,  7.10s/it]

Layer 5 - Before: 56.84GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj f

Merging layers:  12%|█▎        | 6/48 [00:42<04:59,  7.12s/it]

Layer 5 - After: 55.99GB, Saved: 0.85GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Proces

Merging layers:  15%|█▍        | 7/48 [00:49<04:51,  7.12s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  17%|█▋        | 8/48 [00:56<04:44,  7.10s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  19%|█▉        | 9/48 [01:04<04:37,  7.13s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  21%|██        | 10/48 [01:11<04:32,  7.17s/it]

Layer 10 - Before: 52.60GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj 

Merging layers:  23%|██▎       | 11/48 [01:18<04:25,  7.17s/it]

Layer 10 - After: 51.75GB, Saved: 0.85GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Proce

Merging layers:  25%|██▌       | 12/48 [01:25<04:17,  7.16s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  27%|██▋       | 13/48 [01:32<04:10,  7.16s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  29%|██▉       | 14/48 [01:39<04:02,  7.14s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  31%|███▏      | 15/48 [01:47<03:56,  7.18s/it]

Layer 15 - Before: 48.36GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj 

Merging layers:  33%|███▎      | 16/48 [01:54<03:49,  7.19s/it]

Layer 15 - After: 47.52GB, Saved: 0.85GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Proce

Merging layers:  35%|███▌      | 17/48 [02:01<03:42,  7.17s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  38%|███▊      | 18/48 [02:08<03:34,  7.15s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  40%|███▉      | 19/48 [02:15<03:27,  7.15s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  42%|████▏     | 20/48 [02:22<03:20,  7.16s/it]

Layer 20 - Before: 44.13GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj 

Merging layers:  44%|████▍     | 21/48 [02:30<03:12,  7.14s/it]

Layer 20 - After: 43.28GB, Saved: 0.85GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Proce

Merging layers:  46%|████▌     | 22/48 [02:37<03:05,  7.13s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  48%|████▊     | 23/48 [02:44<02:58,  7.13s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  50%|█████     | 24/48 [02:51<02:50,  7.12s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  52%|█████▏    | 25/48 [02:58<02:44,  7.14s/it]

Layer 25 - Before: 39.89GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj 

Merging layers:  54%|█████▍    | 26/48 [03:05<02:37,  7.14s/it]

Layer 25 - After: 39.04GB, Saved: 0.85GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Proce

Merging layers:  56%|█████▋    | 27/48 [03:12<02:29,  7.13s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  58%|█████▊    | 28/48 [03:19<02:22,  7.14s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  60%|██████    | 29/48 [03:27<02:15,  7.14s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  62%|██████▎   | 30/48 [03:34<02:08,  7.15s/it]

Layer 30 - Before: 35.65GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj 

Merging layers:  65%|██████▍   | 31/48 [03:41<02:01,  7.17s/it]

Layer 30 - After: 34.81GB, Saved: 0.85GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Proce

Merging layers:  67%|██████▋   | 32/48 [03:48<01:54,  7.17s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  69%|██████▉   | 33/48 [03:55<01:47,  7.15s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  71%|███████   | 34/48 [04:02<01:39,  7.14s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  73%|███████▎  | 35/48 [04:10<01:33,  7.19s/it]

Layer 35 - Before: 31.42GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj 

Merging layers:  75%|███████▌  | 36/48 [04:17<01:26,  7.19s/it]

Layer 35 - After: 30.57GB, Saved: 0.85GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Proce

Merging layers:  77%|███████▋  | 37/48 [04:24<01:18,  7.16s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  79%|███████▉  | 38/48 [04:31<01:11,  7.15s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  81%|████████▏ | 39/48 [04:38<01:04,  7.15s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  83%|████████▎ | 40/48 [04:45<00:57,  7.17s/it]

Layer 40 - Before: 27.18GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj 

Merging layers:  85%|████████▌ | 41/48 [04:53<00:50,  7.15s/it]

Layer 40 - After: 26.33GB, Saved: 0.85GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Proce

Merging layers:  88%|████████▊ | 42/48 [04:59<00:42,  7.03s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  90%|████████▉ | 43/48 [05:06<00:34,  6.92s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  92%|█████████▏| 44/48 [05:13<00:27,  6.87s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers:  94%|█████████▍| 45/48 [05:19<00:20,  6.83s/it]

Layer 45 - Before: 22.94GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj 

Merging layers:  96%|█████████▌| 46/48 [05:26<00:13,  6.82s/it]

Layer 45 - After: 22.10GB, Saved: 0.85GB
Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Proce

Merging layers:  98%|█████████▊| 47/48 [05:33<00:06,  6.80s/it]

Processing gate_proj for expert 0/128
Processing gate_proj for expert 10/128
Processing gate_proj for expert 20/128
Processing gate_proj for expert 30/128
Processing gate_proj for expert 40/128
Processing gate_proj for expert 50/128
Processing gate_proj for expert 60/128
Processing gate_proj for expert 70/128
Processing gate_proj for expert 80/128
Processing gate_proj for expert 90/128
Processing gate_proj for expert 100/128
Processing gate_proj for expert 110/128
Processing gate_proj for expert 120/128
Processing up_proj for expert 0/128
Processing up_proj for expert 10/128
Processing up_proj for expert 20/128
Processing up_proj for expert 30/128
Processing up_proj for expert 40/128
Processing up_proj for expert 50/128
Processing up_proj for expert 60/128
Processing up_proj for expert 70/128
Processing up_proj for expert 80/128
Processing up_proj for expert 90/128
Processing up_proj for expert 100/128
Processing up_proj for expert 110/128
Processing up_proj for expert 120/128
Processi

Merging layers: 100%|██████████| 48/48 [05:40<00:00,  7.09s/it]


In [9]:
torch.cuda.memory_allocated() / 1e9

61.064270336

In [9]:
torch.cuda.memory_allocated() / 1e9

20.426838528

In [10]:
from collections import defaultdict
import torch
import gc
def show_tensor_memory():
    """현재 메모리에 있는 텐서들 분석"""
    print("=== Tensors in Memory ===")
    
    tensor_info = defaultdict(list)
    total_memory = 0
    
    # 모든 객체를 순회하며 텐서 찾기
    for obj in gc.get_objects():
        if torch.is_tensor(obj):
            tensor_type = str(obj.dtype)
            tensor_device = str(obj.device)
            tensor_shape = tuple(obj.shape)
            tensor_size = obj.numel() * obj.element_size() / 1024**2  # MB
            
            tensor_info[(tensor_type, tensor_device)].append({
                'shape': tensor_shape,
                'size_mb': tensor_size
            })
            total_memory += tensor_size
    
    print(f"Total tensors memory: {total_memory:.2f} MB")
    print()
    
    # 타입별, 디바이스별로 그룹화하여 출력
    for (dtype, device), tensors in tensor_info.items():
        total_size = sum(t['size_mb'] for t in tensors)
        print(f"{dtype} on {device}: {len(tensors)} tensors, {total_size:.2f} MB")
        
        # 큰 텐서들만 출력 (10MB 이상)
        large_tensors = [t for t in tensors if t['size_mb'] > 10]
        if large_tensors:
            print("  Large tensors (>10MB):")
            for tensor in sorted(large_tensors, key=lambda x: x['size_mb'], reverse=True)[:5]:
                print(f"    Shape: {tensor['shape']}, Size: {tensor['size_mb']:.2f} MB")
        print()
def show_gpu_memory():
    """GPU 메모리 사용량 기본 정보"""
    if torch.cuda.is_available():
        print("=== GPU Memory Usage ===")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i} ({torch.cuda.get_device_name(i)}):")
            allocated = torch.cuda.memory_allocated(i) / 1024**3
            cached = torch.cuda.memory_reserved(i) / 1024**3
            max_memory = torch.cuda.get_device_properties(i).total_memory / 1024**3
            
            print(f"  Allocated: {allocated:.2f} GB")
            print(f"  Cached: {cached:.2f} GB") 
            print(f"  Total: {max_memory:.2f} GB")
            print(f"  Free: {max_memory - cached:.2f} GB")
            print()
    else:
        print("CUDA not available")        

In [11]:
show_gpu_memory()

=== GPU Memory Usage ===
GPU 0 (NVIDIA A100 80GB PCIe):
  Allocated: 19.00 GB
  Cached: 74.85 GB
  Total: 79.14 GB
  Free: 4.29 GB



In [45]:
import torch
import gc

total_gpu_mem = 0  # 바이트 단위

for obj in gc.get_objects():
    try:
        if torch.is_tensor(obj) and obj.is_cuda:
            mem = obj.numel() * obj.element_size()  # 바이트
            total_gpu_mem += mem
        elif hasattr(obj, 'data') and torch.is_tensor(obj.data) and obj.data.is_cuda:
            mem = obj.data.numel() * obj.data.element_size()
            total_gpu_mem += mem
    except:
        pass

print(f"총 GPU 메모리 사용량: {total_gpu_mem / 1024**3:.2f} GB")


  return isinstance(obj, torch.Tensor)
  elif hasattr(obj, 'data') and torch.is_tensor(obj.data) and obj.data.is_cuda:


총 GPU 메모리 사용량: 19.09 GB


In [36]:
torch.cuda.empty_cache()


In [46]:
import torch
import gc

# 1️⃣ 옵티마이저, gradient 등 inference에 필요 없는 객체 제거
if 'optimizer' in globals():
    del optimizer

for p in model.parameters():
    p.grad = None  # gradient 제거

# 2️⃣ 모델을 CPU로 이동 → GPU 메모리 완전히 해제
model = model.cpu()

# 3️⃣ 캐시 제거
torch.cuda.empty_cache()

# 4️⃣ CPU 가비지 컬렉션
gc.collect()

# 5️⃣ 다시 모델을 GPU로 올림 (inference용)
model = model.cuda()
torch.cuda.empty_cache()

# 6️⃣ GPU 상태 확인
allocated = torch.cuda.memory_allocated() / 1024**3
cached = torch.cuda.memory_reserved() / 1024**3
print(f"Allocated: {allocated:.2f} GB")
print(f"Cached:    {cached:.2f} GB")


Allocated: 19.10 GB
Cached:    74.96 GB


In [38]:
import gc
gc.collect()

508

In [40]:
for p in model.parameters():
    p.grad = None

In [41]:
show_gpu_memory()

=== GPU Memory Usage ===
GPU 0 (NVIDIA A100 80GB PCIe):
  Allocated: 19.10 GB
  Cached: 74.32 GB
  Total: 79.14 GB
  Free: 4.82 GB



In [28]:
import gc
import torch
from collections import defaultdict

def show_tensor_memory():
    """
    현재 메모리에 있는 모든 PyTorch 텐서의 정보를 집계하여 보여줍니다.
    ReferenceError를 처리하도록 수정되었습니다.
    """
    tensor_stats = defaultdict(lambda: {'count': 0, 'size_mb': 0})
    
    # 모든 객체를 순회하며 텐서 찾기
    for obj in gc.get_objects():
        try:
            # is_tensor 호출 및 이후 로직을 try 블록 안에 넣습니다.
            if torch.is_tensor(obj):
                tensor_type = str(obj.dtype)
                tensor_device = str(obj.device)
                size_in_bytes = obj.numel() * obj.element_size()
                size_in_mb = size_in_bytes / (1024 * 1024)
                
                key = f"{tensor_type} on {tensor_device}"
                tensor_stats[key]['count'] += 1
                tensor_stats[key]['size_mb'] += size_in_mb

        except ReferenceError:
            # 이미 사라진 객체에 대한 참조 에러는 무시하고 계속 진행합니다.
            continue
            
    # 결과 출력
    print("--- PyTorch Tensor Memory Usage ---")
    total_size_mb = 0
    for key, stats in sorted(tensor_stats.items()):
        total_size_mb += stats['size_mb']
        print(f"{key}: {stats['count']} tensors, {stats['size_mb']:.2f} MB")
    print("-" * 35)
    print(f"Total Allocated Tensor Memory: {total_size_mb:.2f} MB")
    print("-" * 35)

# 함수 호출
show_tensor_memory()

--- PyTorch Tensor Memory Usage ---
torch.bfloat16 on cuda:0: 37443 tensors, 19546.77 MB
torch.float32 on cpu: 1 tensors, 0.00 MB
torch.float32 on cuda:0: 1 tensors, 0.00 MB
torch.int64 on cuda:0: 4 tensors, 0.00 MB
-----------------------------------
Total Allocated Tensor Memory: 19546.77 MB
-----------------------------------


In [31]:
import torch
import gc
import os
import psutil
import time

def aggressive_cleanup():
    """강력한 메모리 정리"""
    print("=== Aggressive Memory Cleanup ===")
    
    # 1. Python 가비지 컬렉션
    print("Running garbage collection...")
    collected = gc.collect()
    print(f"Collected {collected} objects")
    
    # 2. CUDA 메모리 정리
    if torch.cuda.is_available():
        print("Clearing CUDA cache...")
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        
        # 각 GPU별로 정리
        for i in range(torch.cuda.device_count()):
            with torch.cuda.device(i):
                torch.cuda.empty_cache()
                torch.cuda.synchronize()
        
        print("CUDA cache cleared")
    
    # 3. 메모리 상태 확인
    show_gpu_memory()

In [32]:
aggressive_cleanup()

=== Aggressive Memory Cleanup ===
Running garbage collection...
Collected 833 objects
Clearing CUDA cache...
CUDA cache cleared
=== GPU Memory Usage ===
GPU 0 (NVIDIA A100 80GB PCIe):
  Allocated: 19.10 GB
  Cached: 74.32 GB
  Total: 79.14 GB
  Free: 4.82 GB



In [12]:
import torch
import torch.nn as nn

def get_model_size(model: nn.Module):
    """모델 파라미터의 총 메모리 사용량을 MB 단위로 계산합니다."""
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    
    # 버퍼(예: BatchNorm의 running_mean) 크기도 포함
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024**2
    print(f"Model Size: {size_all_mb:.2f} MB")
    return size_all_mb

# --- 사용 예시 ---
# model = YourModelClass()  # 여기에 본인 모델 객체를 넣으세요.
get_model_size(model)

Model Size: 19472.40 MB


19472.402587890625

In [34]:
import torch

torch.cuda.empty_cache()   # 캐시 해제 (권장, 안전)
torch.cuda.reset_peak_memory_stats()  # peak memory 초기화


In [19]:
show_gpu_memory()

=== GPU Memory Usage ===
GPU 0 (NVIDIA A100 80GB PCIe):
  Allocated: 19.02 GB
  Cached: 74.87 GB
  Total: 79.14 GB
  Free: 4.27 GB



In [12]:
model_inputs

{'input_ids': tensor([[151644,    872,    198,  14990, 151645,    198, 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [10]:
model

Qwen3MoeForCausalLM(
  (model): Qwen3MoeModel(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-47): 48 x Qwen3MoeDecoderLayer(
        (self_attn): Qwen3MoeAttention(
          (q_proj): Linear(in_features=2048, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2048, bias=False)
          (q_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
        )
        (mlp): SVDQwen3MoeSparseMoeBlock(
          (gate): Linear(in_features=2048, out_features=128, bias=False)
          (experts): ModuleList(
            (0-127): 128 x moe_layer(
              (act_fn): SiLU()
              (v1): Linear(in_features=2048, out_features=167, bias=False)
              (us1): Linear(in_features=167, out_features=768, bias=False)
             

In [14]:
model.to('cuda')

Qwen3MoeForCausalLM(
  (model): Qwen3MoeModel(
    (embed_tokens): Embedding(151936, 2048)
    (layers): ModuleList(
      (0-47): 48 x Qwen3MoeDecoderLayer(
        (self_attn): Qwen3MoeAttention(
          (q_proj): Linear(in_features=2048, out_features=4096, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=4096, out_features=2048, bias=False)
          (q_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3MoeRMSNorm((128,), eps=1e-06)
        )
        (mlp): SVDQwen3MoeSparseMoeBlock(
          (gate): Linear(in_features=2048, out_features=128, bias=False)
          (experts): ModuleList(
            (0-127): 128 x moe_layer(
              (act_fn): SiLU()
              (v1): Linear(in_features=2048, out_features=167, bias=False)
              (us1): Linear(in_features=167, out_features=768, bias=False)
             

In [15]:

print('start')
with torch.autocast(device_type="cuda",dtype=torch.bfloat16):
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=100
    )
output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist() 
print('out gen')


start
hidden_states: torch.Size([1, 9, 2048])
hidden_states: torch.Size([9, 2048])
expert_mask: torch.Size([128, 8, 9]), tensor([[[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]],

        ...,

        [[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 1, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]

In [17]:
# parsing thinking content
try:
    # rindex finding 151668 (</think>)
    index = len(output_ids) - output_ids[::-1].index(151668)
except ValueError:
    index = 0

thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
print('decoded')
content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)

decoded
thinking content: 
content: end: - no.__end__ __ _ _____ ____:_:_:_:__:_:_:_:_:_:_:_: :_:_:_:_:_:_:_:_: :_:_: ::__:_: :_ :_ : : : :_: :_:_: :_:_: : : : : : :_:_:_: : : : : : :_:_:_:_:_:_:_:_: :_:_: : :_: : : : : : _:


In [16]:
print('a')

a


In [33]:
torch.cuda.empty_cache()


In [None]:
model = model.half()

In [26]:
model.model.layers[i].mlp.experts[i].gate_proj.weight.dtype

torch.bfloat16

In [22]:
model.dtype

torch.bfloat16

In [None]:
!pip install huggingface_hub -q
from huggingface_hub import login


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
