## Imports

In [1]:
import os
from luna.utils.llama import LLaMATokenizer, LLaMAForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM

import torch


luna_models= {  'tokenizer': LLaMATokenizer, 'model': LLaMAForCausalLM}
hf_llama_models = { 'tokenizer': LlamaTokenizer, 'model': LlamaForCausalLM}
hf_auto_models = { 'tokenizer': AutoTokenizer, 'model': AutoModelForCausalLM}

backends = {'luna': luna_models, 'hf_llama': hf_llama_models, 'hf_auto': hf_auto_models}

from huggingface_hub import login

## Test of chat templates

In [None]:
backend_name = 'hf_auto'

model_dict= {"llama_8B": "meta-llama/Llama-3.1-8B-Instruct",
             "meta_8B": "meta-llama/Llama-3.1-8B-Instruct",
              "llama_3B": "meta-llama/Llama-3.2-3B-Instruct", 
              "mistral_nemo": "mistralai/Mistral-Nemo-Instruct-2407",
              "gemma_9B": "google/gemma-2-9b-it",
              "gemma_27B": "google/gemma-2-27b-it",
              "gemma_7B": "google/gemma-7b-it",
              "qwen_14B": "Qwen/Qwen2.5-14B-Instruct",
              "qwen_7B": "Qwen/Qwen2.5-7B-Instruct"}
model_name= "llama_8B"
checkpoint = model_dict[model_name]
temperature=0.0001
backend = backends[backend_name]
device='cuda:0'
tokenizer = backend['tokenizer'].from_pretrained(checkpoint,device_map="cuda:0")
model = backend['model'].from_pretrained(checkpoint,low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map=device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a researcher. You are also a bit of a comedian.",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
 ]

In [4]:
torch.cuda.empty_cache()

In [5]:
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False,add_generation_prompt=True, )
inputs = tokenizer(tokenized_chat, return_tensors="pt", padding=False, truncation=True, max_length=2500).to(device)
#del tokenized_chat
nb_tokens_in = len(inputs[0])
print(f"nb_tokens_in: {nb_tokens_in}")

nb_tokens_in: 72


In [6]:
tokenized_chat

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\nYou are a friendly chatbot who always responds in the style of a researcher. You are also a bit of a comedian.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHow many helicopters can a human eat in one sitting?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [7]:
# print the tokens in natural language
print(tokenizer.decode(inputs['input_ids'][0]))

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a friendly chatbot who always responds in the style of a researcher. You are also a bit of a comedian.<|eot_id|><|start_header_id|>user<|end_header_id|>

How many helicopters can a human eat in one sitting?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [8]:
outputs = model.generate(inputs.input_ids, top_k=128, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id, output_scores=True,return_dict_in_generate=True,
                         output_hidden_states=True, output_attentions=True, attention_mask=inputs['attention_mask'])  

From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


In [9]:
%%timeit
for _ in range(100):
    tokenizer.batch_decode(outputs['sequences'][0], skip_special_tokens=True)

266 ms ± 4.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
for _ in range(100):
    tokenizer.decode(outputs['sequences'][0], skip_special_tokens=True)

5.26 ms ± 105 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
type(outputs)

transformers.generation.utils.GenerateDecoderOnlyOutput

In [13]:
c = tokenizer.decode(outputs['sequences'][0],skip_special_tokens=False)
print(c)
outputs.__dict__.keys()
nb_tokens_out = len(outputs.sequences[0])
print(f"nb of new tokens: { nb_tokens_out-nb_tokens_in} ")

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a friendly chatbot who always responds in the style of a researcher. You are also a bit of a comedian.<|eot_id|><|start_header_id|>user<|end_header_id|>

How many helicopters can a human eat in one sitting?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

What an intriguing question, my friend.  I must admit, I've never come across any credible research on this topic before. However, I can try to provide some insights based on what I know about human physiology and the size of helicopters.

Firstly, let's consider the average size of a helicopter. A small helicopter, like a Robinson R22, weighs around 1,100 pounds (500 kg). A larger helicopter, like a Sikorsky S-92, can weigh up to 26,000 pounds (11,800 kg). For the sake of this thought experiment, let's assume we're talking about a small
nb of new tokens: 128 


In [14]:
print(outputs.hidden_states[0][0][:5]-outputs.hidden_states[8][0][:5])

tensor([[[ 9.4032e-04,  7.6294e-05, -1.0605e-03,  ...,  4.8828e-04,
          -3.2759e-04,  7.1526e-04],
         [ 9.4032e-04,  7.6294e-05, -1.0605e-03,  ...,  4.8828e-04,
          -3.2759e-04,  7.1526e-04],
         [ 5.1022e-04,  3.3283e-04, -3.1281e-04,  ..., -3.4790e-03,
          -4.0054e-05,  2.5749e-04],
         ...,
         [-9.0942e-03, -2.8877e-03,  1.3847e-03,  ..., -1.0468e-02,
          -4.6921e-03,  8.1329e-03],
         [ 2.1362e-04,  1.8311e-04, -4.8351e-04,  ..., -3.9520e-03,
          -8.9455e-04,  1.9264e-04],
         [-2.1935e-03,  4.4670e-03, -6.5041e-04,  ..., -2.5177e-03,
           1.1253e-04,  1.8291e-03]]], device='cuda:0', dtype=torch.float16)


In [15]:
j = 0
i = -1
for j in range(len(outputs.hidden_states)):
    print(f" Number of elements : {len(outputs.hidden_states[j][1:])}")
    print(f"Hidden states shape for generated token [{j}] : {outputs.hidden_states[j][i].shape}")
    #print(f"Some values: {outputs.hidden_states[j][i][:5]}")
    
    print(f"Attention weights shape for generated token [{j}] : {outputs.attentions[j][i].shape}")
    print(f"Last layer attentions for generated token [{j}] : {outputs.attentions[j][i][0,-1,-1]}")

 Number of elements : 32
Hidden states shape for generated token [0] : torch.Size([1, 72, 4096])
Attention weights shape for generated token [0] : torch.Size([1, 32, 72, 72])
Last layer attentions for generated token [0] : tensor([0.2468, 0.2473, 0.0012, 0.0014, 0.0008, 0.0104, 0.0011, 0.0009, 0.0013,
        0.0018, 0.0077, 0.0013, 0.0009, 0.0012, 0.0284, 0.0298, 0.0022, 0.0029,
        0.0009, 0.0016, 0.0009, 0.0014, 0.0110, 0.0027, 0.0120, 0.0504, 0.0099,
        0.0026, 0.0011, 0.0026, 0.0029, 0.0064, 0.0024, 0.0027, 0.0018, 0.0025,
        0.0010, 0.0018, 0.0012, 0.0005, 0.0066, 0.0024, 0.0024, 0.0025, 0.0025,
        0.0009, 0.0016, 0.0009, 0.0009, 0.0058, 0.0038, 0.0023, 0.0047, 0.0137,
        0.0028, 0.0212, 0.0036, 0.0040, 0.0104, 0.0047, 0.0037, 0.0049, 0.0091,
        0.0052, 0.0026, 0.0030, 0.0032, 0.0151, 0.0027, 0.0438, 0.0311, 0.0702],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 32
Hidden states shape for generated token [1] : torch.Size([1, 1, 40

In [16]:
j = 1
i = -1
for j in range(len(outputs.hidden_states)):
    print(f" Number of elements : {len(outputs.hidden_states[j])}")
    #print(f"Hidden states shape for generated token [{j}] : {outputs.hidden_states[j][i].shape}")
    print(f"Last hidden state : {outputs.hidden_states[j][i][0,-1,:4096]}")
    #print(f"Some values: {outputs.hidden_states[j][i][:5]}")
    
    #print(f"Attention weights shape for generated token [{j}] : {outputs.attentions[j][i].shape}")

 Number of elements : 33
Last hidden state : tensor([-1.3057,  1.5254, -0.2416,  ...,  0.3003, -1.8262, -2.1016],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-3.1152, -1.7266,  0.7705,  ...,  2.9336,  0.3928, -4.2734],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-3.4785,  0.6235,  4.4258,  ...,  1.9551, -1.7490, -4.8086],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([ 1.4873, -1.3467,  0.0574,  ...,  3.5391,  0.3665, -2.8145],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-5.3906,  5.2148,  0.3464,  ...,  0.4092, -2.9355,  0.1616],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-5.7734,  3.5352,  1.3125,  ...,  1.8682, -1.0205,  0.5771],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : t

In [32]:
outputs['sequences'][:,nb_tokens_in:]

NameError: name 'outputs' is not defined

In [14]:
torch.cuda.empty_cache()

In [15]:
model.model.layers

ModuleList(
  (0): LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (rotary_emb): LlamaRotaryEmbedding()
    )
    (mlp): LlamaMLP(
      (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
      (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
      (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
      (act_fn): SiLU()
    )
    (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
    (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
  )
  (1): LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=1024, 

In [16]:
print(tokenizer.decode(outputs['sequences'][0][nb_tokens_in:], skip_special_tokens=False))

A question that gets to the heart of the absurd. I must admit, I've never come across any research on this topic. In fact, I'm not even sure it's possible for a human to eat a helicopter in one sitting. Helicopters are large, complex machines made of metal, plastic, and other materials, not exactly something you'd find on the menu at your local diner.

However, if we assume a hypothetical scenario where a human could somehow consume a helicopter, I'd estimate the number of helicopters a person could eat in one sitting would be... zero. That's right, folks, a big fat zero. Not
