In [1]:
# Set the cuBLAS workspace configuration in the kernel's environment.
# NOTE(review): presumably chosen so CUDA matmuls can run deterministically — confirm intent.
%env CUBLAS_WORKSPACE_CONFIG=:4096:8
# Enable autoreload in mode 2 so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

env: CUBLAS_WORKSPACE_CONFIG=:4096:8


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import random
import string

# Checkpoint used for every experiment in this notebook; the commented-out id is a
# smaller alternative that can be swapped in.
model_id = "hugging-quants/Meta-Llama-3.1-8B-BNB-NF4-BF16"
# model_id = "meta-llama/Llama-3.2-1B"

# Tokenizer and model are loaded from the same checkpoint id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the weights as bfloat16 directly onto the GPU.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Reference forward pass: BOS followed by three fixed token ids, with every position
# attended to (no padding).
# Only the logits are compared afterwards, so run without autograd bookkeeping.
with torch.no_grad():
    logits_no_pad = model(
        input_ids=torch.tensor([[tokenizer.bos_token_id, 100, 101, 102]]).cuda(),
        attention_mask=torch.tensor([[1, 1, 1, 1]]).cuda(),
    ).logits[0]

Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [17]:
# Padded forward pass: `padding_size` masked-out filler positions (BOS ids are used as
# filler) followed by BOS + three fixed token ids. Only the last four positions are
# attended to, and only their logits are kept.
padding_size = 50
# Only the logits are compared afterwards, so run without autograd bookkeeping.
with torch.no_grad():
    logits_pad = model(
        input_ids=torch.tensor(
            [[tokenizer.bos_token_id] * (padding_size + 1) + [100, 101, 102]]
        ).cuda(),
        attention_mask=torch.tensor([[0] * padding_size + [1, 1, 1, 1]]).cuda(),
    ).logits[0, padding_size:]

In [18]:
# Convert both logit sets into distributions over the vocabulary, then report the
# largest absolute difference between any pair of corresponding probabilities.
probs_no_pad = torch.softmax(logits_no_pad, dim=-1)
probs_pad = torch.softmax(logits_pad, dim=-1)
torch.max(torch.abs(probs_no_pad - probs_pad)).item()

0.004985138773918152

In [101]:
# Compare the logits of one prompt when it is run alone versus when it is left-padded
# inside a two-prompt batch.

# Use the id the tokenizer assigns to "<|finetune_right_pad_id|>" as the padding token.
tokenizer.pad_token_id = tokenizer.encode("<|finetune_right_pad_id|>")[-1]
# tokenizer.pad_token_id = tokenizer.eos_token_id

# A dedicated, seeded generator keeps the random prompts identical across re-runs
# without touching the global `random` state.
prompt_rng = random.Random(0)
prompt_alphabet = string.ascii_letters + string.digits
prompt_str1 = "".join(prompt_rng.choices(prompt_alphabet, k=10))
prompt_str2 = "".join(prompt_rng.choices(prompt_alphabet, k=20))

# Batch of both prompts, padded on the left to a common length.
padded_seqs = tokenizer(
    [prompt_str1, prompt_str2],
    padding=True,
    padding_side="left",
    return_tensors="pt",
)
# The first prompt on its own, so no padding positions are needed.
unpadded_seq = tokenizer(
    prompt_str1,
    padding=True,
    padding_side="left",
    return_tensors="pt",
)

# Inference only: skip autograd bookkeeping, and move the inputs to the model's device
# explicitly instead of relying on the loader's dispatch hooks to do it.
with torch.no_grad():
    unpadded_logits = model(
        input_ids=unpadded_seq.input_ids.to(model.device),
        attention_mask=unpadded_seq.attention_mask.to(model.device),
    ).logits[0]
    seq_len = unpadded_logits.shape[0]
    # Row 0 of the batch is the first prompt; with left padding its real tokens are the
    # last `seq_len` positions.
    padded_logits0 = model(
        input_ids=padded_seqs.input_ids.to(model.device),
        attention_mask=padded_seqs.attention_mask.to(model.device),
    ).logits[0, -seq_len:]
print((unpadded_logits - padded_logits0).abs().max().item())

0.15625


In [98]:
# Display the attention mask of the two-prompt batch.
padded_seqs["attention_mask"]

tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1]])

In [102]:
# NOTE(review): the mask is 2-D, so indexing it with three slices raises the IndexError
# recorded below; index it with two slices (or `...`) instead.
padded_seqs["attention_mask"][:, :, :]

IndexError: too many indices for tensor of dimension 2

In [103]:
# Display which attention implementation the loaded model's config selects.
model.config._attn_implementation

'sdpa'

In [91]:
# Display the attention mask of the single-prompt encoding.
# NOTE(review): the saved output is a float 0/-inf style mask rather than the integer
# 0/1 mask shown for the tokenizer output elsewhere in this notebook — presumably it was
# captured from a modified kernel state; re-run to confirm.
unpadded_seq["attention_mask"]

tensor([[0., 0., 0., 0., 0., 0., 0., 0.]])

In [92]:
# Display the attention mask of the two-prompt batch.
# NOTE(review): the saved output is a float 0/-inf mask rather than the integer 0/1 mask
# shown for the tokenizer output elsewhere in this notebook — presumably it was captured
# from a modified kernel state; re-run to confirm.
padded_seqs["attention_mask"]

tensor([[-inf, -inf, -inf, -inf, 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [89]:
# Display the whole padded batch (token ids and attention mask) to see where the
# padding token ids were inserted.
padded_seqs

{'input_ids': tensor([[128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004, 128004,
         128000,     33,     15,     74,     40,    675,     47,     20,   1216],
        [128000,     83,     53,     73,  23662,     80,     70,     53,     70,
          78120,     88,  54978,     48,     86,     39,     15,     65,     57]]), 'attention_mask': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [60]:
import random
import string

# Use the id the tokenizer assigns to "<|finetune_right_pad_id|>" as the padding token.
tokenizer.pad_token_id = tokenizer.encode("<|finetune_right_pad_id|>")[-1]

# Draw two random alphanumeric prompts, 10 and 20 characters long (in that order).
prompt_str1, prompt_str2 = (
    "".join(random.choices(string.ascii_letters + string.digits, k=n_chars))
    for n_chars in (10, 20)
)

In [68]:
# Tokenize both prompts as one batch of PyTorch tensors, padding on the left so the two
# rows have the same length.
padded_seqs = tokenizer(
    text=[prompt_str1, prompt_str2],
    return_tensors="pt",
    padding_side="left",
    padding=True,
)

In [70]:
# Tokenize the first prompt on its own, with the same tokenizer options as the batched
# call, to serve as the unpadded reference.
unpadded_seq = tokenizer(
    text=prompt_str1,
    return_tensors="pt",
    padding_side="left",
    padding=True,
)

In [79]:
# Inference only: skip autograd bookkeeping, and move the tokenizer outputs to the
# model's device explicitly instead of relying on the loader's dispatch hooks.
with torch.no_grad():
    unpadded_logits = model(
        **{name: tensor.to(model.device) for name, tensor in unpadded_seq.items()}
    ).logits[0]
    seq_len = unpadded_logits.shape[0]
    # Keep only the last `seq_len` positions of batch row 0 so both tensors line up.
    padded_logits = model(
        **{name: tensor.to(model.device) for name, tensor in padded_seqs.items()}
    ).logits[0, -seq_len:]
unpadded_logits.shape, padded_logits.shape

(torch.Size([9, 128256]), torch.Size([9, 128256]))

In [81]:
# Largest absolute logit difference at each sequence position (reduced over the vocab axis).
torch.max(torch.abs(unpadded_logits - padded_logits), dim=-1).values

tensor([0.0000, 0.1250, 0.1875, 0.1562, 0.1250, 0.1719, 0.1406, 0.1250, 0.1562])

In [105]:
def _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask: torch.Tensor,
    sequence_length: int,
    target_length: int,
    dtype: torch.dtype,
    device: torch.device,
    cache_position: torch.Tensor,
    batch_size: int,
    **kwargs,
):
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

    Args:
        attention_mask (`torch.Tensor`):
            A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
            `(batch_size, 1, query_length, key_value_length)`.
        sequence_length (`int`):
            The sequence length being processed.
        target_length (`int`):
            The target length: when generating with static cache, the mask should be as long as the static cache,
            to account for the 0 padding, the part of the cache that is not filled yet.
        dtype (`torch.dtype`):
            The dtype to use for the 4D attention mask.
        device (`torch.device`):
            The device to plcae the 4D attention mask on.
        cache_position (`torch.Tensor`):
            Indices depicting the position of the input sequence tokens in the sequence.
        batch_size (`torch.Tensor`):
            Batch size.
    """
    if attention_mask is not None and attention_mask.dim() == 4:
        # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
        causal_mask = attention_mask
    else:
        min_dtype = torch.finfo(dtype).min
        causal_mask = torch.full(
            (sequence_length, target_length),
            fill_value=min_dtype,
            dtype=dtype,
            device=device,
        )
        if sequence_length != 1:
            causal_mask = torch.triu(causal_mask, diagonal=1)
        causal_mask *= torch.arange(
            target_length, device=device
        ) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
        if attention_mask is not None:
            causal_mask = (
                causal_mask.clone()
            )  # copy to contiguous memory for in-place edit
            mask_length = attention_mask.shape[-1]
            padding_mask = (
                causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
            )
            padding_mask = padding_mask == 0
            causal_mask[:, :, :, :mask_length] = causal_mask[
                :, :, :, :mask_length
            ].masked_fill(padding_mask, min_dtype)

    return causal_mask

In [110]:
# Worked example on CPU: two sequences of length 3, where the second one has a single
# masked-out (padding) position at the front.
_prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask=torch.tensor([[1, 1, 1], [0, 1, 1]]),
    sequence_length=3,
    target_length=3,
    dtype=torch.bfloat16,
    device=torch.device("cpu"),
    cache_position=torch.arange(3),
    batch_size=2,
)

tensor([[[[ 0.0000e+00, -3.3895e+38, -3.3895e+38],
          [ 0.0000e+00,  0.0000e+00, -3.3895e+38],
          [ 0.0000e+00,  0.0000e+00,  0.0000e+00]]],


        [[[-3.3895e+38, -3.3895e+38, -3.3895e+38],
          [-3.3895e+38,  0.0000e+00, -3.3895e+38],
          [-3.3895e+38,  0.0000e+00,  0.0000e+00]]]], dtype=torch.bfloat16)

In [48]:
# Inputs for a softmax sanity check: two tiny attention scores on their own, and the same
# two scores preceded by `pad_size` entries set to the most negative bfloat16 value.
dtype = torch.bfloat16
dtype_min = torch.finfo(dtype).min
pad_size = 1_000_000
# Fall back to CPU so this self-contained experiment also runs without a GPU.
weights_device = "cuda" if torch.cuda.is_available() else "cpu"
unpadded_weights = torch.tensor([1e-10, 1.1e-10], dtype=dtype, device=weights_device)
# Build the padding directly as a tensor instead of a million-element Python list, and
# reuse `unpadded_weights` so both tensors end in exactly the same two values.
padded_weights = torch.cat(
    [
        torch.full((pad_size,), dtype_min, dtype=dtype, device=weights_device),
        unpadded_weights,
    ]
)

In [49]:
# Softmax over each weight vector; for the padded one keep only the last two entries,
# then compare the two results elementwise for exact equality.
unpadded_probs = unpadded_weights.softmax(dim=-1)
padded_probs = padded_weights.softmax(dim=-1)[-2:]
torch.eq(unpadded_probs, padded_probs)

tensor([True, True], device='cuda:0')

In [35]:
# Display the shapes of the two probability vectors side by side.
unpadded_probs.shape, padded_probs.shape

(torch.Size([2]), torch.Size([2]))

In [71]:
from cot_probing.diverse_combinations import load_and_process_file
from cot_probing import DATA_DIR
from cot_probing.vis import visualize_tokens_html
from IPython.display import HTML

# Load the paired "yes" / "no" question sets and check that each pair tokenizes to the
# same number of tokens; mismatching pairs are rendered token by token for inspection.
yes_qs = load_and_process_file(DATA_DIR / "diverse_yes.txt")
no_qs = load_and_process_file(DATA_DIR / "diverse_no.txt")
# zip() below stops at the shorter list, which would silently drop unpaired questions
# and make the "n / total" summary misleading, so require matching counts up front.
assert len(yes_qs) == len(no_qs), (
    f"Unpaired question files: {len(yes_qs)} yes vs {len(no_qs)} no questions"
)

# Tokenize without special tokens so only the question text itself is counted.
yes_tokenized_qs = [tokenizer.encode(q, add_special_tokens=False) for q in yes_qs]
no_tokenized_qs = [tokenizer.encode(q, add_special_tokens=False) for q in no_qs]
n_all_questions = len(yes_tokenized_qs)
n_same_length = 0
for tok_q_yes, tok_q_no in zip(yes_tokenized_qs, no_tokenized_qs):
    if len(tok_q_yes) == len(tok_q_no):
        n_same_length += 1
    else:
        # Show both tokenizations with their lengths so the mismatch can be located.
        print(len(tok_q_yes))
        display(HTML(visualize_tokens_html(tok_q_yes, tokenizer)))
        print(len(tok_q_no))
        display(HTML(visualize_tokens_html(tok_q_no, tokenizer)))
print(f"{n_same_length} / {n_all_questions} questions have the same length")

16 / 16 questions have the same length
