## Imports

In [1]:
import os
from luna.utils.llama import LLaMATokenizer, LLaMAForCausalLM
from transformers import LlamaTokenizer, LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM

import torch


luna_models= {  'tokenizer': LLaMATokenizer, 'model': LLaMAForCausalLM}
hf_llama_models = { 'tokenizer': LlamaTokenizer, 'model': LlamaForCausalLM}
hf_auto_models = { 'tokenizer': AutoTokenizer, 'model': AutoModelForCausalLM}

backends = {'luna': luna_models, 'hf_llama': hf_llama_models, 'hf_auto': hf_auto_models}

from huggingface_hub import login

In [2]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Test of chat templates

In [65]:
backend_name = 'hf_auto'
checkpoint = "meta-llama/Llama-3.1-8B-Instruct"

backend = backends[backend_name]
device='cuda:0'
tokenizer = backend['tokenizer'].from_pretrained(checkpoint,device_map="auto")
model = backend['model'].from_pretrained(checkpoint,low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map=device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [66]:
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a researcher. You are also a bit of a comedian.",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
 ]

In [114]:
torch.cuda.empty_cache()

In [111]:
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False,add_generation_prompt=True, )
inputs = tokenizer(tokenized_chat, return_tensors="pt", padding=False, truncation=True, max_length=2500).to(device)
del tokenized_chat
nb_tokens_in = len(inputs[0])
print(f"nb_tokens_in: {nb_tokens_in}")

nb_tokens_in: 72


In [177]:
outputs = model.generate(inputs.input_ids, top_k=32, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id, output_scores=True,return_dict_in_generate=True,
                         output_hidden_states=True, output_attentions=True, attention_mask=inputs['attention_mask'])  

In [133]:
type(outputs)

transformers.generation.utils.GenerateDecoderOnlyOutput

In [131]:
print(tokenizer.decode(outputs['sequences'][0],skip_special_tokens=False))
outputs.__dict__.keys()
nb_tokens_out = len(outputs.sequences[0])
print(f"nb of new tokens: { nb_tokens_out-nb_tokens_in} ")

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a friendly chatbot who always responds in the style of a researcher. You are also a bit of a comedian.<|eot_id|><|start_header_id|>user<|end_header_id|>

How many helicopters can a human eat in one sitting?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

(chuckling) Ahah, I think we may have a bit of a "flying" question on our hands here! (pausing for comedic effect) As a researcher, I must inform you that it's not possible for a human to eat helicopters in one sitting. In fact, it's not possible for a human to eat a helicopter at all, as they are complex machines made of metal, plastic, and other materials that aren't exactly digestible.

But, if we're looking for a more theoretical answer, let's consider the nutritional value of a helicopter. (smiling) Unfortunately, there isn't much to go
nb of new tokens: 128 


In [173]:
print(outputs.hidden_states[0][0][:5]-outputs.hidden_states[8][0][:5])

tensor([[[ 1.0853e-03,  1.1581e-02, -5.1003e-03,  ..., -1.0773e-02,
          -5.3549e-04,  9.4299e-03],
         [ 1.0853e-03,  1.1581e-02, -5.1003e-03,  ..., -1.0773e-02,
          -5.3549e-04,  9.4299e-03],
         [ 6.5517e-04,  1.1841e-02, -4.3526e-03,  ..., -1.4740e-02,
          -2.4796e-04,  8.9722e-03],
         ...,
         [-8.9417e-03,  8.6212e-03, -2.6550e-03,  ..., -2.1729e-02,
          -4.9019e-03,  1.6846e-02],
         [ 3.5858e-04,  1.1688e-02, -4.5242e-03,  ..., -1.5213e-02,
          -1.1024e-03,  8.9111e-03],
         [-2.0485e-03,  1.5976e-02, -4.6921e-03,  ..., -1.3779e-02,
          -9.5367e-05,  1.0544e-02]]], device='cuda:0', dtype=torch.float16)


In [182]:
j = 0
i = -1
for j in range(len(outputs.hidden_states)):
    print(f" Number of elements : {len(outputs.hidden_states[j])}")
    print(f"Hidden states shape for generated token [{j}] : {outputs.hidden_states[j][i].shape}")
    #print(f"Some values: {outputs.hidden_states[j][i][:5]}")
    
    print(f"Attention weights shape for generated token [{j}] : {outputs.attentions[j][i].shape}")

 Number of elements : 33
Hidden states shape for generated token [0] : torch.Size([1, 72, 4096])
Attention weights shape for generated token [0] : torch.Size([1, 32, 72, 72])
 Number of elements : 33
Hidden states shape for generated token [1] : torch.Size([1, 1, 4096])
Attention weights shape for generated token [1] : torch.Size([1, 32, 1, 73])
 Number of elements : 33
Hidden states shape for generated token [2] : torch.Size([1, 1, 4096])
Attention weights shape for generated token [2] : torch.Size([1, 32, 1, 74])
 Number of elements : 33
Hidden states shape for generated token [3] : torch.Size([1, 1, 4096])
Attention weights shape for generated token [3] : torch.Size([1, 32, 1, 75])
 Number of elements : 33
Hidden states shape for generated token [4] : torch.Size([1, 1, 4096])
Attention weights shape for generated token [4] : torch.Size([1, 32, 1, 76])
 Number of elements : 33
Hidden states shape for generated token [5] : torch.Size([1, 1, 4096])
Attention weights shape for generated

In [187]:
j = 1
i = -1
for j in range(len(outputs.hidden_states)):
    print(f" Number of elements : {len(outputs.hidden_states[j])}")
    #print(f"Hidden states shape for generated token [{j}] : {outputs.hidden_states[j][i].shape}")
    print(f"Last hidden state : {outputs.hidden_states[j][i][0,-1,:4096]}")
    #print(f"Some values: {outputs.hidden_states[j][i][:5]}")
    
    #print(f"Attention weights shape for generated token [{j}] : {outputs.attentions[j][i].shape}")

 Number of elements : 33
Last hidden state : tensor([-1.3076,  1.5225, -0.2357,  ...,  0.3049, -1.8154, -2.0996],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([ 0.2549, -1.9189,  2.3262,  ...,  4.1211,  0.2391, -4.5898],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-7.0078,  3.7402,  1.7207,  ...,  2.6328, -0.7285, -1.0400],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-6.5820,  2.1250, -0.2280,  ...,  1.4365, -1.4990, -2.0879],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-1.5742,  2.7383, -4.1289,  ...,  2.5371,  2.4668, -3.2344],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-2.5391,  1.0801, -3.5723,  ...,  4.4727,  1.9434, -0.9648],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : t

In [183]:
outputs['sequences'][:,nb_tokens_in:]

tensor([[   32,  3488,   430,  5334,   311,   279,  4851,   315,   279,  5030,
           482,   477,  1288,   358,  2019,    11,   279, 23152,    30,   353,
           331, 84796, 22242, 31140,    11,   358,  2011,  6179,   499,   430,
           433,   596,   539,  3284,   369,   264,  3823,   311,  8343,   264,
         36125,   304,   832, 11961,    13, 16183, 24904,   388,   527,  6485,
         12933,  1903,   315,  9501,    11, 12466,    11,   323,  1023,  7384,
            11,   539, 67740,  3673,    13,   763,  2144,    11, 19969,   311,
         25024,   264, 36125,  1053,   387,  5115,  1131,   359, 38128,    11,
           311,  2019,   279,  3325,   382, 11458,    11,   422,   584,  2351,
          7556,   922,   264, 59159, 15398,  1405,   264,  3823,  1436, 17354,
         78825, 21552,   264, 36125,    11,   358,  4265, 16430,   279,  4320,
           311,   387,   330, 14486,  1210,  3011,   596,  1314,    11,  7315,
         59432,   649,   387, 35661,   304,   832, 1

In [184]:
torch.cuda.empty_cache()

In [174]:
model.model.layers

ModuleList(
  (0): LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (rotary_emb): LlamaRotaryEmbedding()
    )
    (mlp): LlamaMLP(
      (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
      (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
      (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
      (act_fn): SiLU()
    )
    (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
    (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
  )
  (1): LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=1024, 

In [110]:
print(tokenizer.decode(outputs['sequences'][0][nb_tokens_in:], skip_special_tokens=False))

(laughs) Ah, I think we have a case of "fowl" humor here! I'm happy to help, but I have to say, eating a helicopter is quite an...unconventional question.

As a researcher,
