## Imports

In [None]:
import os
from transformers import LlamaTokenizer, LlamaForCausalLM, AutoTokenizer, AutoModelForCausalLM

import torch


hf_llama_models = { 'tokenizer': LlamaTokenizer, 'model': LlamaForCausalLM}
hf_auto_models = { 'tokenizer': AutoTokenizer, 'model': AutoModelForCausalLM}

backends = {'hf_llama': hf_llama_models, 'hf_auto': hf_auto_models}

from huggingface_hub import login

## Test of chat templates

In [None]:
backend_name = 'hf_auto'
checkpoint = "Khawn2u/Llama-3.1-8b-Chain-Of-Thought-GGUF"

backend = backends[backend_name]
device='cuda:0'
tokenizer = backend['tokenizer'].from_pretrained(checkpoint,device_map="auto")
model = backend['model'].from_pretrained(checkpoint,low_cpu_mem_usage=True, torch_dtype=torch.float16, device_map=device)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
messages = [
    {
        "role": "system",
        "content": "You are a friendly chatbot who always responds in the style of a researcher. You are also a bit of a comedian.",
    },
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
 ]

In [5]:
torch.cuda.empty_cache()

In [6]:
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False,add_generation_prompt=True, )
inputs = tokenizer(tokenized_chat, return_tensors="pt", padding=False, truncation=True, max_length=2500).to(device)
del tokenized_chat
nb_tokens_in = len(inputs[0])
print(f"nb_tokens_in: {nb_tokens_in}")

nb_tokens_in: 72


In [7]:
outputs = model.generate(inputs.input_ids, top_k=32, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id, output_scores=True,return_dict_in_generate=True,
                         output_hidden_states=True, output_attentions=True, attention_mask=inputs['attention_mask'])  

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
From v4.47 onwards, when a model cache is to be returned, `generate` will return a `Cache` instance instead by default (as opposed to the legacy tuple of tuples format). If you want to keep returning the legacy format, please set `return_legacy_cache=True`.


In [9]:
print(tokenizer.decode(outputs['sequences'][0],skip_special_tokens=False))
outputs.__dict__.keys()
nb_tokens_out = len(outputs.sequences[0])
print(f"nb of new tokens: { nb_tokens_out-nb_tokens_in} ")

<|begin_of_text|><|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

You are a friendly chatbot who always responds in the style of a researcher. You are also a bit of a comedian.<|eot_id|><|start_header_id|>user<|end_header_id|>

How many helicopters can a human eat in one sitting?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

A question that gets to the heart of the absurd. I must admit, I've never come across any research on this topic. In fact, I'm not even sure it's possible for a human to eat a helicopter in one sitting. Helicopters are large, complex machines made of metal, plastic, and other materials, not exactly something you'd find on the menu at your local diner.

However, if we assume a hypothetical scenario where a human could somehow consume a helicopter, I'd estimate the number of helicopters a person could eat in one sitting would be... zero. That's right, folks, a big fat zero. Not
nb of

In [10]:
print(outputs.hidden_states[0][0][:5]-outputs.hidden_states[8][0][:5])

tensor([[[-0.0010, -0.0014, -0.0027,  ...,  0.0022, -0.0040,  0.0052],
         [-0.0010, -0.0014, -0.0027,  ...,  0.0022, -0.0040,  0.0052],
         [-0.0014, -0.0012, -0.0019,  ..., -0.0018, -0.0037,  0.0048],
         ...,
         [-0.0110, -0.0044, -0.0002,  ..., -0.0088, -0.0084,  0.0126],
         [-0.0017, -0.0013, -0.0021,  ..., -0.0023, -0.0046,  0.0047],
         [-0.0042,  0.0030, -0.0023,  ..., -0.0008, -0.0036,  0.0063]]],
       device='cuda:0', dtype=torch.float16)


In [11]:
j = 0
i = -1
for j in range(len(outputs.hidden_states)):
    print(f" Number of elements : {len(outputs.hidden_states[j][1:])}")
    print(f"Hidden states shape for generated token [{j}] : {outputs.hidden_states[j][i].shape}")
    #print(f"Some values: {outputs.hidden_states[j][i][:5]}")
    
    print(f"Attention weights shape for generated token [{j}] : {outputs.attentions[j][i].shape}")
    print(f"Last layer attentions for generated token [{j}] : {outputs.attentions[j][i][0,-1,-1]}")

 Number of elements : 32
Hidden states shape for generated token [0] : torch.Size([1, 72, 4096])
Attention weights shape for generated token [0] : torch.Size([1, 32, 72, 72])
Last layer attentions for generated token [0] : tensor([0.2471, 0.2476, 0.0012, 0.0014, 0.0008, 0.0104, 0.0011, 0.0009, 0.0013,
        0.0019, 0.0077, 0.0013, 0.0009, 0.0012, 0.0284, 0.0297, 0.0022, 0.0029,
        0.0009, 0.0016, 0.0009, 0.0014, 0.0110, 0.0027, 0.0119, 0.0503, 0.0099,
        0.0026, 0.0011, 0.0026, 0.0029, 0.0064, 0.0024, 0.0027, 0.0018, 0.0025,
        0.0010, 0.0018, 0.0012, 0.0005, 0.0066, 0.0024, 0.0024, 0.0025, 0.0025,
        0.0009, 0.0016, 0.0009, 0.0010, 0.0058, 0.0037, 0.0023, 0.0047, 0.0137,
        0.0028, 0.0212, 0.0036, 0.0039, 0.0104, 0.0047, 0.0037, 0.0049, 0.0091,
        0.0052, 0.0026, 0.0030, 0.0032, 0.0150, 0.0027, 0.0438, 0.0311, 0.0701],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 32
Hidden states shape for generated token [1] : torch.Size([1, 1, 40

In [12]:
j = 1
i = -1
for j in range(len(outputs.hidden_states)):
    print(f" Number of elements : {len(outputs.hidden_states[j])}")
    #print(f"Hidden states shape for generated token [{j}] : {outputs.hidden_states[j][i].shape}")
    print(f"Last hidden state : {outputs.hidden_states[j][i][0,-1,:4096]}")
    #print(f"Some values: {outputs.hidden_states[j][i][:5]}")
    
    #print(f"Attention weights shape for generated token [{j}] : {outputs.attentions[j][i].shape}")

 Number of elements : 33
Last hidden state : tensor([-1.3076,  1.5225, -0.2357,  ...,  0.3049, -1.8154, -2.0996],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([ 0.2549, -1.9189,  2.3262,  ...,  4.1211,  0.2391, -4.5898],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-7.0078,  3.7402,  1.7207,  ...,  2.6328, -0.7285, -1.0400],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-6.5820,  2.1250, -0.2280,  ...,  1.4365, -1.4990, -2.0879],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-1.5742,  2.7383, -4.1289,  ...,  2.5371,  2.4668, -3.2344],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : tensor([-2.5391,  1.0801, -3.5723,  ...,  4.4727,  1.9434, -0.9648],
       device='cuda:0', dtype=torch.float16)
 Number of elements : 33
Last hidden state : t

In [13]:
outputs['sequences'][:,nb_tokens_in:]

tensor([[   32,  3488,   430,  5334,   311,   279,  4851,   315,   279, 32677,
            13,   358,  2011, 17113,    11,   358,  3077,  2646,  2586,  4028,
           904,  3495,   389,   420,  8712,    13,   763,  2144,    11,   358,
          2846,   539,  1524,  2771,   433,   596,  3284,   369,   264,  3823,
           311,  8343,   264, 36125,   304,   832, 11961,    13, 16183, 24904,
           388,   527,  3544,    11,  6485, 12933,  1903,   315,  9501,    11,
         12466,    11,   323,  1023,  7384,    11,   539,  7041,  2555,   499,
          4265,  1505,   389,   279,  5130,   520,   701,  2254, 89206,   382,
         11458,    11,   422,   584,  9855,   264, 59159, 15398,  1405,   264,
          3823,  1436, 17354, 25024,   264, 36125,    11,   358,  4265, 16430,
           279,  1396,   315, 59432,   264,  1732,  1436,  8343,   304,   832,
         11961,  1053,   387,  1131,  7315,    13,  3011,   596,  1314,    11,
         15687,    11,   264,  2466,  8834,  7315,  

In [14]:
torch.cuda.empty_cache()

In [15]:
model.model.layers

ModuleList(
  (0): LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
      (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (rotary_emb): LlamaRotaryEmbedding()
    )
    (mlp): LlamaMLP(
      (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
      (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
      (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
      (act_fn): SiLU()
    )
    (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
    (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
  )
  (1): LlamaDecoderLayer(
    (self_attn): LlamaAttention(
      (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
      (k_proj): Linear(in_features=4096, out_features=1024, 

In [16]:
print(tokenizer.decode(outputs['sequences'][0][nb_tokens_in:], skip_special_tokens=False))

A question that gets to the heart of the absurd. I must admit, I've never come across any research on this topic. In fact, I'm not even sure it's possible for a human to eat a helicopter in one sitting. Helicopters are large, complex machines made of metal, plastic, and other materials, not exactly something you'd find on the menu at your local diner.

However, if we assume a hypothetical scenario where a human could somehow consume a helicopter, I'd estimate the number of helicopters a person could eat in one sitting would be... zero. That's right, folks, a big fat zero. Not
