In [None]:
!huggingface-cli login

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import torch
import transformers

from llama_prompt import PromptLlama, PromptGenerator


In [2]:
model_name = "meta-llama/Llama-3.2-1B"

In [3]:
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

In [4]:
prompt_gen = PromptGenerator(prompt_len=3, dim=config.hidden_size)
promp_llama = PromptLlama.from_pretrained(model_name, prompt=prompt_gen)

In [5]:
text = "Das ist ein Test: "
tok_out = tokenizer(text, return_tensors="pt")
input_ids = tok_out["input_ids"]
attention_mask = tok_out["attention_mask"]

In [6]:
promp_llama(input_ids, attention_mask=attention_mask)

torch.Size([1, 3, 2048]) torch.Size([1, 7, 2048])


RuntimeError: Tensors must have same number of dimensions: got 4 and 2

In [7]:
output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=20, num_beams=1)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [8]:
tokenizer.decode(output[0].tolist())

'<|begin_of_text|>Das ist ein Test: 3D-Modellierung und 3D-Modellierung\n3D-Modell'

In [20]:
config

LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-1B",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.48.3",
  "use_cache": true,
  "vocab_size": 128256
}

## Generate Prefix
Via the `past_key_values` argumment, we can append an additional prefix to every layer in the transformer.

In [47]:
class PrefixGenerator:

  def __init__(self, config, batch_size=1, seq_len=12) -> None:
    self.n_layer = config.num_hidden_layers
    self.batch_size = batch_size
    self.num_heads = config.num_key_value_heads
    self.seq_len = seq_len
    self.embed_size_per_head = config.head_dim

  def generate_prefix(self):
    # from the documentation:
    # Tuple of tuple(torch.FloatTensor) of length config.n_layers, with each tuple having 2 tensors of shape (batch_size, num_heads, sequence_length, embed_size_per_head)).
    size = (self.batch_size, self.num_heads, self.seq_len, self.embed_size_per_head)
    prefix = tuple(
        (torch.rand(size), torch.rand(size))
        for _ in range(self.n_layer)
        )
    return prefix

In [48]:
prefix_gen = PrefixGenerator(config, batch_size=1)

In [49]:
p = prefix_gen.generate_prefix()

In [50]:
p[0][0].shape

torch.Size([1, 8, 12, 64])

In [86]:
cache_position =  torch.arange(19, dtype=torch.int64)

In [87]:
cache_position[-1]

tensor(18)

In [62]:
torch.zeros(12).shape,  input_ids[0].shape

(torch.Size([12]), torch.Size([7]))

In [68]:
input_ids = torch.cat((torch.zeros(1, 12), input_ids), dim=1)

AttributeError: module 'torch' has no attribute 'Int'

In [71]:
attention_mask = torch.cat((torch.zeros(1, 12), attention_mask), dim=1)

In [73]:
attention_mask.shape

torch.Size([1, 19])

In [88]:
cache = DynamicCache.from_legacy_cache(p)
input_ids = input_ids.long()
attention_mask = attention_mask.long()
output = model(input_ids, attention_mask=attention_mask, past_key_values=cache, cache_position=cache_position)

RuntimeError: The size of tensor a (31) must match the size of tensor b (19) at non-singleton dimension 3

In [42]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache