In [None]:
from transformers import AutoTokenizer, BitsAndBytesConfig
from transformers.models.llama.modeling_llama import LlamaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = LlamaForCausalLM.from_pretrained("/root/divergent/llama3-from-qwen2",
                                         device_map="auto",
                                         quantization_config=quantization_config,
                                         )

In [None]:
model.config

In [None]:
device = "cuda"

messages = [
    {"role": "user", "content": "What is the meaning of life?"},
]

input_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device)

outputs = model.generate(
    input_ids,
    max_new_tokens=24,
    eos_token_id=tokenizer.eos_token_id,
    do_sample=False,
    temperature=None,
    top_p=None,
)
response = outputs[0][input_ids.shape[-1]:]
print(tokenizer.decode(response, skip_special_tokens=True))

# I am a large language model created by Alibaba Cloud. I am called Qwen.
# The meaning of life is a question that has puzzled philosophers, theologians, and individuals throughout history. It's important to

In [None]:
input_ids

In [None]:
inputs = tokenizer("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n", return_tensors="pt")
outputs = model(**inputs, output_hidden_states=True)

In [None]:
# use_cache=True
# Tuple of tuple(torch.FloatTensor) of length config.n_layers,
# with each tuple having 2 tensors of shape (batch_size, num_heads, sequence_length, embed_size_per_head)
outputs.past_key_values

In [None]:
# output_hidden_states=True
# Tuple of torch.FloatTensor (one for the output of the embeddings, if the model has an embedding layer,
# + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size)
len(outputs.hidden_states)

In [None]:
for i in range(len(outputs.hidden_states)):
    print(outputs.hidden_states[i].shape)

In [None]:
# Iterate through layers
for name, layer in model.named_children():
    print(f"Layer name: {name}")
    # print(layer)

In [None]:
model.lm_head

In [None]:
# getattr(model, "model.layers.12.self_attn.q_proj.bias")
model.model.layers[12].self_attn.q_proj

In [1]:
"""
def show_attention(self, layer, tokenA, tokenB)：输出第 layer 层的某两个 token 之间的 attention_value
"""
from transformers import AutoTokenizer, BitsAndBytesConfig
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from transformers.models.llama.modeling_llama import LlamaForCausalLM
import torch

# tokenizer = AutoTokenizer.from_pretrained("/root/.cache/huggingface/hub/models--Qwen--Qwen2-7B-Instruct/snapshots/f2826a00ceef68f0f2b946d945ecc0477ce4450c")
tokenizer = AutoTokenizer.from_pretrained("/root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa")
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
model = LlamaForCausalLM.from_pretrained(
    # "/root/divergent/llama3-from-qwen2",
    # "/root/.cache/huggingface/hub/models--Qwen--Qwen2-7B-Instruct/snapshots/f2826a00ceef68f0f2b946d945ecc0477ce4450c",
    "/root/.cache/huggingface/hub/models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e1945c40cd546c78e41f1151f4db032b271faeaa/",
    device_map="auto",
    quantization_config=quantization_config,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who are you?"},
]
tokenizer.batch_decode(tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt"
).to(model.device))

['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n']

In [7]:
input_text = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n"
# input_text = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
inputs = tokenizer(input_text, return_tensors="pt")
# outputs = model(**inputs, output_attentions=True, output_hidden_states=True)
outputs = model(**inputs)
# outputs
outputs[0][input_ids.shape[-1]:]
# print(tokenizer.decode(response, skip_special_tokens=True))

NameError: name 'input_ids' is not defined

In [10]:
outputs[0].shape, inputs.input_ids.shape[-1]

(torch.Size([1, 45, 128256]), 45)

After setting `output_attentions=True`, model will fall back to original implementation of attention calculation,
instead of using `torch.nn.functional.scaled_dot_product_attention`. Some `nan` values are observed in attention values.
The output tokens also do not seem right. Tokens can be checked by setting `output_hidden_states=True`, feed the last hidden states
to lm_head, and get the argmax of the logits, followed by tokenizer decoding. 


In [None]:
tokenizer.batch_decode(model.lm_head(outputs.hidden_states[-1]).argmax(dim=-1))

In [None]:
outputs.attentions[-1].nanmean(dim=1)

In [None]:
tokenA_index = 1
tokenB_index = 1
outputs.attentions[-1].nanmean(dim=1)[:, tokenA_index][:, tokenB_index]

In [None]:
# Tuple of torch.FloatTensor (one for each layer) of shape (batch_size, num_heads, sequence_length, sequence_length)
layer_index = 27
tokenA_index = 1
tokenB_index = 1
outputs.attentions[layer_index].nanmean(dim=1)[0][-1].argsort(descending=True)

In [None]:
tokenizer.batch_decode(inputs.input_ids[0][outputs.attentions[layer_index].nanmean(dim=1)[0][-1].argsort(descending=True)])

In [None]:
# tensor([[7.0996e-01, 8.8965e-01, 7.8516e-01, 8.3545e-01, 7.4170e-01, 7.4121e-01,
#          7.4658e-01, 4.3225e-04, 2.4259e-05, 1.9550e-05,        nan, 8.3447e-07,
#          0.0000e+00,        nan, 7.4023e-01, 8.4668e-01, 7.8857e-01, 8.0908e-01,
#          7.9883e-01, 6.7432e-01, 7.4756e-01, 6.1035e-01, 7.7832e-01, 5.6494e-01,
#          7.8809e-01, 5.3613e-01, 5.3760e-01, 8.1396e-01]], dtype=torch.float16,
#        grad_fn=<SelectBackward0>)

In [None]:
"""
def show_cos_distance(self, layer)：输出某个 layer 的 input_hidden_states 和 output_hidden_states 的余弦距离
"""

inputs = tokenizer("What is the meaning of life?", return_tensors="pt")
outputs = model(**inputs, output_hidden_states=True)

def show_cos_distance(layer_index: int):
    """
    output_hidden_states=True

    Tuple of torch.FloatTensor (one for the output of the embeddings, if the model has an embedding layer,
    + one for the output of each layer) of shape (batch_size, sequence_length, hidden_size)
    read more: https://huggingface.co/docs/transformers/v4.44.2/en/main_classes/output#transformers.modeling_outputs.CausalLMOutput.hidden_states
    """
    

In [None]:
print(outputs.hidden_states[2].shape)
print(outputs.hidden_states[3].shape)

In [None]:
type(model.model.layers[2])