In [1]:
from safetensors import safe_open


state = dict()

with safe_open("/models/Llama-3.2-1B-Instruct/model.safetensors", 'pt', device='cpu') as f:
	for key in f.keys():
		state[key] = f.get_tensor(key)

for k, t in state.items():
	print(k, t.shape)


model.embed_tokens.weight torch.Size([128256, 2048])
model.layers.0.input_layernorm.weight torch.Size([2048])
model.layers.0.mlp.down_proj.weight torch.Size([2048, 8192])
model.layers.0.mlp.gate_proj.weight torch.Size([8192, 2048])
model.layers.0.mlp.up_proj.weight torch.Size([8192, 2048])
model.layers.0.post_attention_layernorm.weight torch.Size([2048])
model.layers.0.self_attn.k_proj.weight torch.Size([512, 2048])
model.layers.0.self_attn.o_proj.weight torch.Size([2048, 2048])
model.layers.0.self_attn.q_proj.weight torch.Size([2048, 2048])
model.layers.0.self_attn.v_proj.weight torch.Size([512, 2048])
model.layers.1.input_layernorm.weight torch.Size([2048])
model.layers.1.mlp.down_proj.weight torch.Size([2048, 8192])
model.layers.1.mlp.gate_proj.weight torch.Size([8192, 2048])
model.layers.1.mlp.up_proj.weight torch.Size([8192, 2048])
model.layers.1.post_attention_layernorm.weight torch.Size([2048])
model.layers.1.self_attn.k_proj.weight torch.Size([512, 2048])
model.layers.1.self_at

In [None]:
# prune weights

N_HIDDEN_SIZE = 512
N_INTERMEDIATE_SIZE = 2048
N_K_SIZE = 128

new_state = {}

for k, t in state.items():
	if k.startswith('model.layers.'):
		n = int(k.split('.')[2])
		if n > 3:
			continue

	if '.input_layernorm.' in k or 'model.norm.' in k or '.post_attention_layernorm.' in k:
		nt = t[:N_HIDDEN_SIZE].contiguous()
	elif '.self_attn.o_proj.' in k:
		nt = t[:N_HIDDEN_SIZE, :N_HIDDEN_SIZE].contiguous()
	elif '.self_attn.q_proj.' in k:
		nt = t[:N_HIDDEN_SIZE, :N_HIDDEN_SIZE].contiguous()
	elif '.mlp.down_proj.' in k:
		nt = t[:N_HIDDEN_SIZE, :N_INTERMEDIATE_SIZE].contiguous()
	elif '.mlp.gate_proj.' in k or '.mlp.up_proj.' in k:
		nt = t[:N_INTERMEDIATE_SIZE, :N_HIDDEN_SIZE].contiguous()
	elif '.self_attn.k_proj.' in k or '.self_attn.v_proj.' in k:
		nt = t[:N_K_SIZE, :N_HIDDEN_SIZE].contiguous()
	else:
		nt = t[:, :N_HIDDEN_SIZE].contiguous()
	new_state[k] = nt

for k, t in new_state.items():
	print(k, t.shape)

model.embed_tokens.weight torch.Size([128256, 512])
model.layers.0.input_layernorm.weight torch.Size([512])
model.layers.0.mlp.down_proj.weight torch.Size([512, 2048])
model.layers.0.mlp.gate_proj.weight torch.Size([2048, 512])
model.layers.0.mlp.up_proj.weight torch.Size([2048, 512])
model.layers.0.post_attention_layernorm.weight torch.Size([512])
model.layers.0.self_attn.k_proj.weight torch.Size([128, 512])
model.layers.0.self_attn.o_proj.weight torch.Size([512, 2048])
model.layers.0.self_attn.q_proj.weight torch.Size([2048, 512])
model.layers.0.self_attn.v_proj.weight torch.Size([128, 512])
model.layers.1.input_layernorm.weight torch.Size([512])
model.layers.1.mlp.down_proj.weight torch.Size([512, 2048])
model.layers.1.mlp.gate_proj.weight torch.Size([2048, 512])
model.layers.1.mlp.up_proj.weight torch.Size([2048, 512])
model.layers.1.post_attention_layernorm.weight torch.Size([512])
model.layers.1.self_attn.k_proj.weight torch.Size([128, 512])
model.layers.1.self_attn.o_proj.weight

In [3]:
from safetensors.torch import save_file


save_file(new_state, "/models/Llama-3.2-100M/model.safetensors", metadata={'format': 'pt'})

In [4]:
## test
from transformers import AutoModelForCausalLM


model = AutoModelForCausalLM.from_pretrained('/models/Llama-3.2-100M')
model

  from .autonotebook import tqdm as notebook_tqdm


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 512)
    (layers): ModuleList(
      (0-3): 4 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=512, out_features=2048, bias=False)
          (k_proj): Linear(in_features=512, out_features=128, bias=False)
          (v_proj): Linear(in_features=512, out_features=128, bias=False)
          (o_proj): Linear(in_features=2048, out_features=512, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=512, out_features=2048, bias=False)
          (up_proj): Linear(in_features=512, out_features=2048, bias=False)
          (down_proj): Linear(in_features=2048, out_features=512, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((512,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((512,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNor

In [5]:
import torch


model.generate(torch.tensor([[2,3,4]]).long())

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


tensor([[ 2,  3,  4, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77, 77,
         77, 77]])

---
## test

In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("/models/Llama-3.2-100M")
model = AutoModelForCausalLM.from_pretrained("/models/Llama-3.2-100M")

ids = tokenizer.encode("Hello, ", return_tensors="pt")
output = model.generate(ids)

tokenizer.decode(output[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


'<|begin_of_text|>Hello, 2017. My name is Pat Gaffel, I am a PhD and'