In [1]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

In [2]:
from typing import Any, Union
import numpy as np
import torch.nn.functional as F
import torch
def selective_log_softmax(logits, index):
    """
    Compute `log_softmax(logits)` only at the positions named by `index`, one batch row at a time.

    The result matches the straightforward formulation
    ```python
    logps = torch.gather(logits.log_softmax(-1), dim=-1, index=index.unsqueeze(-1)).squeeze(-1)
    ```
    while keeping peak memory lower by looping over the first dimension.

    Args:
        logits (`torch.Tensor`):
            Logits tensor of shape `(..., num_classes)`.
        index (`torch.Tensor`):
            Index tensor of shape `(...)` selecting one class per position.

    Returns:
        `torch.Tensor`:
            Log probabilities of the selected classes, with the same shape as `index`.
    """
    if logits.dtype not in (torch.float32, torch.float64):
        # Reduced-precision path (e.g. bfloat16): the logsumexp shortcut is unstable there, so take a real
        # log_softmax per row and gather from it. Slightly less efficient; still row-by-row to cap peak memory.
        return torch.stack(
            [
                F.log_softmax(row_logits, dim=-1).gather(dim=-1, index=row_labels.unsqueeze(-1)).squeeze(-1)
                for row_logits, row_labels in zip(logits, index)
            ]
        )

    # Full-precision path: log_softmax(x)_i = x_i - logsumexp(x)
    picked = torch.gather(logits, dim=-1, index=index.unsqueeze(-1)).squeeze(-1)
    # Row-by-row logsumexp to reduce peak memory consumption
    normalizers = torch.stack([torch.logsumexp(row, dim=-1) for row in logits])
    return picked - normalizers
def flush_left(mask: torch.Tensor, *tensors: torch.Tensor) -> tuple[torch.Tensor, ...]:
    """
    Shift non-zero elements in the mask and corresponding tensors to the left.

    This function operates on a binary mask and any number of additional tensors with the same dimensions as the mask.
    For each row, the row is rotated so that its first non-zero mask entry lands in column 0. Then every column from
    the first all-zero mask column onwards is truncated from the mask and tensors. Visually:

    ```
    [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
     [0, x, x, x, 0, 0]]       [x, x, x, 0]]
    ```

    Rows whose mask is entirely zero are left unrotated (there is nothing to align) instead of raising.

    The companion tensors are only rotated and truncated, never zeroed: a value that sits under a zero of the
    flushed mask is kept as-is (see the `9` in the example below).

    Args:

        mask (`torch.Tensor`):
            2D tensor (binary mask) with shape `(N, M)`.
        *tensors (`torch.Tensor`)
            Zero or more 2D tensors with the same shape as `mask`. These tensors will be processed alongside `mask`,
            rotated and truncated in the same manner.

    Returns:
        `torch.Tensor`:
            Updated binary mask with non-zero values flushed to the left and trailing zero columns removed. When no
            extra tensors are passed, this tensor is returned on its own (not wrapped in a tuple).
        `*torch.Tensor`
            Updated tensors, processed in the same way as the mask.

    Example:
    ```python
    >>> mask = torch.tensor([[0, 0, 1, 1, 1],
    ...                      [0, 1, 1, 0, 0]])
    >>> tensor = torch.tensor([[9, 9, 2, 3, 4],
    ...                        [9, 5, 6, 9, 9]])
    >>> new_mask, new_tensor = flush_left(mask, tensor)
    >>> print(new_mask)
    tensor([[1, 1, 1],
            [1, 1, 0]])
    >>> print(new_tensor)
    tensor([[2, 3, 4],
            [5, 6, 9]])
    ```
    """
    # Work on copies so the caller's tensors are never modified
    mask = mask.clone()
    tensors = [t.clone() for t in tensors]

    # Rotate each row so that its first non-zero mask entry lands in column 0
    for row in range(mask.size(0)):
        nonzero_positions = torch.nonzero(mask[row])
        if nonzero_positions.numel() == 0:
            # Fully masked-out row: no first non-zero entry to align, leave it untouched
            continue
        shift = nonzero_positions[0].item()
        mask[row] = torch.roll(mask[row], shifts=-shift)
        for tensor in tensors:
            tensor[row] = torch.roll(tensor[row], shifts=-shift)

    # Get the first column idx that is all zeros and remove every column from there on
    empty_cols = torch.sum(mask, dim=0) == 0
    first_empty_col = torch.nonzero(empty_cols)[0].item() if empty_cols.any() else mask.size(1)
    mask = mask[:, :first_empty_col]
    tensors = [tensor[:, :first_empty_col] for tensor in tensors]

    if not tensors:
        return mask
    return mask, *tensors


def pad(tensors: list[torch.Tensor], padding_value: int = 0, padding_side: str = "right") -> torch.Tensor:
    """
    Stack tensors of differing sizes into one batch tensor, filling the gaps with `padding_value`.

    Every input is copied into a slot whose shape is the per-dimension maximum over all inputs. Along the first
    dimension the data is placed at the start (`"right"` padding) or at the end (`"left"` padding) of the slot;
    along all remaining dimensions it always starts at index 0.

    Args:
        tensors (`list[torch.Tensor]`):
            Tensors to pad; the dtype and device of the first one are used for the result.
        padding_value (`int`):
            Fill value for positions not covered by an input. Default is 0.
        padding_side (`str`):
            `'left'` or `'right'`: the side of the first dimension that receives the padding. Default is 'right'.

    Returns:
        `torch.Tensor`:
            Tensor of shape `(len(tensors), *max_shape)` holding the padded inputs.

    Raises:
        ValueError: If `padding_side` is neither `'left'` nor `'right'`.

    Examples:
        >>> import torch
        >>> pad([torch.tensor([1, 2, 3]), torch.tensor([4, 5])])
        tensor([[1, 2, 3],
                [4, 5, 0]])
        >>> pad([torch.tensor([[1, 2], [3, 4]]), torch.tensor([[5, 6]])])
        tensor([[[1, 2],
                [3, 4]],

                [[5, 6],
                [0, 0]]])
    """
    # Largest extent seen along each dimension
    target_shape = np.max([t.shape for t in tensors], 0).tolist()
    seq_len = target_shape[0]

    first = tensors[0]
    padded = torch.full((len(tensors), *target_shape), padding_value, dtype=first.dtype, device=first.device)

    for position, item in enumerate(tensors):
        length = item.shape[0]
        # Where this item's data sits along the first (sequence) dimension
        if padding_side == "left":
            start, stop = seq_len - length, seq_len
        elif padding_side == "right":
            start, stop = 0, length
        else:
            raise ValueError("padding_side must be 'left' or 'right'")

        region = (slice(start, stop),) + tuple(slice(0, extent) for extent in item.shape[1:])
        padded[position][region] = item

    return padded
def torch_call(examples: list[Union[list[int], Any, dict[str, Any]]], pad_token_id) -> dict[str, Any]:
    """
    Collate a list of tokenized preference examples into one dict of padded batch tensors.

    Each example must provide `"prompt_input_ids"`, `"chosen_input_ids"` and `"rejected_input_ids"`. Attention
    masks are created as all-ones tensors before padding. Prompts are padded on the left, completions with the
    default side of `pad`. Optional keys (`"pixel_values"`, `"pixel_attention_mask"`, `"image_sizes"`, and the
    pair `"ref_chosen_logps"` / `"ref_rejected_logps"`) are included only when present in the first example.

    Args:
        examples: Per-sample dicts as described above.
        pad_token_id: Fill value used for the padded `*_input_ids` tensors.

    Returns:
        Dict mapping each field name to its batched tensor.
    """
    first_example = examples[0]

    def as_tensors(key):
        # One tensor per example for the given field
        return [torch.tensor(example[key]) for example in examples]

    def all_ones(sequences):
        # Attention mask of ones matching each (still unpadded) sequence
        return [torch.ones_like(sequence) for sequence in sequences]

    has_pixel_values = "pixel_values" in first_example
    has_pixel_mask = "pixel_attention_mask" in first_example
    has_ref_logps = "ref_chosen_logps" in first_example and "ref_rejected_logps" in first_example

    # Convert to tensor
    prompt_ids = as_tensors("prompt_input_ids")
    prompt_mask = all_ones(prompt_ids)
    chosen_ids = as_tensors("chosen_input_ids")
    chosen_mask = all_ones(chosen_ids)
    rejected_ids = as_tensors("rejected_input_ids")
    rejected_mask = all_ones(rejected_ids)
    if has_pixel_values:
        pixel_values = as_tensors("pixel_values")
    if has_pixel_mask:
        pixel_attention_mask = as_tensors("pixel_attention_mask")
    if has_ref_logps:
        ref_chosen_logps = torch.tensor([example["ref_chosen_logps"] for example in examples])
        ref_rejected_logps = torch.tensor([example["ref_rejected_logps"] for example in examples])

    # Pad
    batch = {
        "prompt_input_ids": pad(prompt_ids, padding_value=pad_token_id, padding_side="left"),
        "prompt_attention_mask": pad(prompt_mask, padding_value=0, padding_side="left"),
        "chosen_input_ids": pad(chosen_ids, padding_value=pad_token_id),
        "chosen_attention_mask": pad(chosen_mask, padding_value=0),
        "rejected_input_ids": pad(rejected_ids, padding_value=pad_token_id),
        "rejected_attention_mask": pad(rejected_mask, padding_value=0),
    }
    if has_pixel_values:
        batch["pixel_values"] = pad(pixel_values, padding_value=0.0)
    if has_pixel_mask:
        batch["pixel_attention_mask"] = pad(pixel_attention_mask, padding_value=0)
    if "image_sizes" in first_example:
        batch["image_sizes"] = torch.tensor([example["image_sizes"] for example in examples])
    if has_ref_logps:
        batch["ref_chosen_logps"] = ref_chosen_logps
        batch["ref_rejected_logps"] = ref_rejected_logps

    return batch
def pad_to_length(tensor: torch.Tensor, length: int, pad_value: Union[int, float], dim: int = -1) -> torch.Tensor:
    """
    Extend `tensor` along `dim` with `pad_value` until it has `length` entries there.

    Args:
        tensor: Tensor to extend.
        length: Target size along `dim`.
        pad_value: Value used for the appended entries.
        dim: Dimension to extend. Default is the last one.

    Returns:
        The input tensor itself when it already has at least `length` entries along `dim`; otherwise a new tensor
        with the padding appended at the end of `dim`.
    """
    current = tensor.size(dim)
    if current >= length:
        # Already long enough: hand back the very same object, no copy
        return tensor

    filler_shape = list(tensor.shape)
    filler_shape[dim] = length - current
    filler = pad_value * torch.ones(*filler_shape, dtype=tensor.dtype, device=tensor.device)
    return torch.cat([tensor, filler], dim=dim)
def concatenated_inputs(
    batch: dict[str, Union[list, torch.LongTensor]], padding_value: int
) -> dict[str, torch.LongTensor]:
    """
    Build a single `2 * batch_size` batch in which the first half pairs each prompt with its chosen completion
    and the second half pairs the same prompt with its rejected completion.

    Args:
        batch (`dict[str, Union[list, torch.LongTensor]]`):
            A batch of input data. The following keys are read:

            - `"prompt_input_ids"`: Tensor of shape `(batch_size, prompt_length)` with the prompt input IDs.
            - `"prompt_attention_mask"`: Attention mask matching `"prompt_input_ids"`.
            - `"chosen_input_ids"`: Tensor of shape `(batch_size, chosen_length)` with the chosen completion IDs.
            - `"chosen_attention_mask"`: Attention mask matching `"chosen_input_ids"`.
            - `"rejected_input_ids"`: Tensor of shape `(batch_size, rejected_length)` with the rejected completion IDs.
            - `"rejected_attention_mask"`: Attention mask matching `"rejected_input_ids"`.
            - `"pixel_values"` (optional): Tensor for pixel values, if available.
            - `"pixel_attention_mask"` (optional): Tensor for pixel attention masks, if available.
            - `"image_sizes"` (optional): Tensor of image sizes, if available.

        padding_value (`int`):
            The padding value to use for the concatenated completion sequences (`chosen_input_ids` and
            `rejected_input_ids`).

    Returns:
        `dict[str, torch.LongTensor]`: A dictionary containing:

            - `"prompt_input_ids"`: Prompt input IDs repeated twice, shape `(2 * batch_size, prompt_length)`.
            - `"prompt_attention_mask"`: Prompt attention masks repeated twice, same shape.
            - `"completion_input_ids"`: Chosen then rejected completion IDs, shape `(2 * batch_size, max_completion_length)`.
            - `"completion_attention_mask"`: Chosen then rejected attention masks, same shape.
            - `"pixel_values"`, `"pixel_attention_mask"`, `"image_sizes"` (optional): the corresponding input
              repeated twice along dim 0, present only when the key exists in `batch`.

    Notes:
        The completion input IDs and attention masks are padded to the longer of the chosen and rejected
        completion lengths; attention masks are always padded with 0.
    """

    def repeated_twice(key):
        # The prompt-side data is identical for the chosen and the rejected half
        value = batch[key]
        return torch.cat([value, value], dim=0)

    output = {
        "prompt_input_ids": repeated_twice("prompt_input_ids"),
        "prompt_attention_mask": repeated_twice("prompt_attention_mask"),
    }
    for optional_key in ("pixel_values", "pixel_attention_mask", "image_sizes"):
        if optional_key in batch:
            output[optional_key] = repeated_twice(optional_key)

    # Concatenate the chosen and rejected completions after padding both to a common length
    max_completion_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1])

    def chosen_then_rejected(suffix, fill):
        halves = tuple(
            pad_to_length(batch[f"{side}_{suffix}"], max_completion_length, pad_value=fill)
            for side in ("chosen", "rejected")
        )
        return torch.cat(halves)

    output["completion_input_ids"] = chosen_then_rejected("input_ids", padding_value)
    output["completion_attention_mask"] = chosen_then_rejected("attention_mask", 0)

    return output

In [3]:
def preprocess_function2(examples):
    """
    Split one raw sample into `prompt`, `chosen` and `rejected` text fields.

    The sample's `'accept'` and `'reject'` strings are each cut at the FIRST `<think>` marker. Everything before
    the marker in `'accept'` becomes the prompt; everything after it (including any later literal `<think>` in
    the text) becomes the completion, with the marker put back in front.

    Args:
        examples: Mapping with string fields `'accept'` and `'reject'`; it is updated in place.

    Returns:
        The same mapping with `'prompt'`, `'chosen'` and `'rejected'` added.

    Raises:
        IndexError: If `'accept'` or `'reject'` does not contain `<think>`.
    """
    # maxsplit=1: a second "<think>" inside the completion must not truncate it
    parts = examples['accept'].split('<think>', 1)
    partsrejected = examples['reject'].split('<think>', 1)
    examples['prompt'] = parts[0]
    # NOTE(review): the chosen side is rebuilt with "<think>" and the rejected side with "<think>\n";
    # kept as in the original — confirm the asymmetry is intentional.
    examples['chosen'] = "<think>" + parts[1]
    examples['rejected'] = "<think>\n" + partsrejected[1]
    return examples


In [4]:
ds = load_dataset("Hankbeasley/testds")
train = ds['train']

# Keep only samples whose accepted text contains exactly one assistant marker, then split them
# into prompt / chosen / rejected fields.
current = load_dataset("Hankbeasley/polycoder")['train']
current = current.filter(lambda x: len(x['accept'].split("<｜Assistant｜>"))==2)
dsinput = current.map(preprocess_function2)

# Record the index of the first occurrence of every distinct prompt
seen_prompts = set()
unique_indices = []

for i, example in enumerate(dsinput):
    # The prompt is a plain string and therefore hashable as-is; no need to expand it into a
    # tuple of characters before putting it in the set.
    prompt = example['prompt']
    if prompt not in seen_prompts:
        seen_prompts.add(prompt)
        unique_indices.append(i)

# Get the unique rows with all columns
unique_dataset = dsinput.select(unique_indices)
print(f"Found {len(unique_indices)} unique prompts")
#unique_dataset.push_to_hub("Hankbeasley/polycodertext")

Found 310 unique prompts


In [5]:
# Tokenizer of deepseek-ai/DeepSeek-R1-Distill-Qwen-7B; shared as a global by tokenize_sample()
# and by the decode calls in the cells below.
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
def tokenize_sample(x):
    """
    Tokenize the prompt and both completions of one sample and build the prompt+completion sequences.

    Uses the module-level `tokenizer`, without adding special tokens.

    Args:
        x: Mapping with text fields `'prompt'`, `'chosen'` and `'rejected'`.

    Returns:
        Dict with the three individual encodings (`'prompt_tokens'`, `'chosen_tokens'`, `'rejected_tokens'`)
        and, per encoding key, the prompt values followed by the completion values
        (`'combined_tokens'` for chosen, `'combined_rejected_tokens'` for rejected).
    """

    def encode(text):
        return tokenizer(text, return_tensors=None, add_special_tokens=False)

    prompt_enc = encode(x['prompt'])
    chosen_enc = encode(x['chosen'])
    rejected_enc = encode(x['rejected'])

    def prepend_prompt(completion_enc):
        # Assumes both encodings expose the same keys (e.g., 'input_ids', 'attention_mask')
        return {key: prompt_enc[key] + completion_enc[key] for key in prompt_enc.keys()}

    return {
        'prompt_tokens': prompt_enc,
        'chosen_tokens': chosen_enc,
        'rejected_tokens': rejected_enc,
        'combined_tokens': prepend_prompt(chosen_enc),
        'combined_rejected_tokens': prepend_prompt(rejected_enc),
    }


# Samples whose prompt+completion sequence reaches this many tokens are dropped.
MAX_COMBINED_TOKENS = 13000


def _within_token_budget(sample):
    """Return True when both the chosen and the rejected combined sequences are below the token budget."""
    return all(
        len(sample[key]['input_ids']) < MAX_COMBINED_TOKENS
        for key in ('combined_tokens', 'combined_rejected_tokens')
    )


cleanset = load_dataset("Hankbeasley/polycodertext")['train']
incodedClean = cleanset.map(tokenize_sample)
incodedClean = incodedClean.filter(_within_token_budget)

# Prompt encodings, indexed by the diagnostics cells further down
ptokens = incodedClean['prompt_tokens']

# Round-trip check: decode the chosen sequences back to text
decoded_again = []
for tokens in incodedClean['combined_tokens']:
    decoded_again.append(tokenizer.decode(tokens['input_ids'], skip_special_tokens=True))

In [6]:
def generates(model, input_ids, attention_mask):
    """
    Run a single forward pass of `model` over one sequence under `torch.inference_mode()`.

    Args:
        model: Callable model exposing a `.device` attribute.
        input_ids: Token ids of one sequence (no batch dimension).
        attention_mask: Attention mask of the same sequence (no batch dimension).

    Returns:
        Whatever `model(input_ids, attention_mask=...)` returns for the batch of one.
    """
    target = model.device
    with torch.inference_mode():
        # Add the batch dimension and move both inputs next to the model
        batch_ids = torch.tensor(input_ids).unsqueeze(0).to(target)
        batch_mask = torch.tensor(attention_mask).unsqueeze(0).to(target)
        result = model(batch_ids, attention_mask=batch_mask)

        # Drop the local input references before asking CUDA to release cached blocks
        del batch_ids, batch_mask
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
    return result

In [7]:
def getloss(output, input_ids, prompt_attention_mask, completion_attention_mask, device):
    """
    Sum the log-probabilities a forward pass assigns to the completion tokens of one sequence.

    The prompt part of the sequence is masked out, labels are the input ids shifted left by one, and the
    per-token log-probabilities of the completion positions are summed.

    Args:
        output: Forward-pass result exposing `.logits`.
            Assumes logits of shape `(1, sequence_length, vocab_size)` — the inputs below are batched as one
            sequence. TODO confirm against callers.
        input_ids: Token ids of the full prompt+completion sequence (no batch dimension).
        prompt_attention_mask: Attention mask of the prompt part only (no batch dimension).
        completion_attention_mask: Attention mask of the completion part only (no batch dimension).
        device: Kept for interface compatibility. Labels and mask are placed on `logits.device`, which is where
            they have to live for the gather/indexing below; when `device` equals `logits.device` (the case in
            which the previous implementation worked) nothing changes.

    Returns:
        Dict with:
            - `"chosen_logps"`: tensor of shape `(1,)`, the summed completion log-probability.
            - `"mean_chosen_logits"`: scalar tensor, mean of the logits at the positions that predict completion
              tokens.
        The key names say "chosen" regardless of which completion was passed in.
    """
    logits = output.logits
    target_device = logits.device

    input_ids = torch.tensor(input_ids).unsqueeze(0)
    prompt_attention_mask = torch.tensor(prompt_attention_mask).unsqueeze(0)
    completion_attention_mask = torch.tensor(completion_attention_mask).unsqueeze(0)

    # Zero for every prompt token, completion mask for the rest
    loss_mask = torch.cat(
        (torch.zeros_like(prompt_attention_mask), completion_attention_mask),
        dim=1,
    )

    # Position t predicts token t + 1: shift labels and mask left by one (the roll wraps the first entry around
    # to the last position, where the wrapped mask value decides whether it counts)
    labels = torch.roll(input_ids, shifts=-1, dims=1).to(target_device)
    loss_mask = torch.roll(loss_mask, shifts=-1, dims=1).bool().to(target_device)

    per_token_logps = selective_log_softmax(logits, labels)
    per_token_logps[~loss_mask] = 0
    # Shift back so the values line up with the tokens they score
    per_token_logps = torch.roll(per_token_logps, shifts=1, dims=1)
    all_logps = per_token_logps.sum(-1)

    num_examples = 1
    result = {
        "chosen_logps": all_logps[:num_examples],
        "mean_chosen_logits": logits[:num_examples][loss_mask[:num_examples]].mean(),
    }

    del input_ids
    del prompt_attention_mask
    del completion_attention_mask
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return result

In [None]:
# Manual placement of the model's modules: embeddings on GPU 0, decoder layers 0-9 on GPU 1,
# decoder layers 10-27, the final norm, the rotary embedding and the LM head on the CPU.
device_map = {"model.embed_tokens": 0}
device_map.update({f"model.layers.{layer}": 1 for layer in range(10)})
device_map.update({f"model.layers.{layer}": "cpu" for layer in range(10, 28)})
device_map.update({name: "cpu" for name in ("model.norm", "model.rotary_emb", "lm_head")})
import torch

# First sample, kept around for the diagnostics cells below
sometokens = incodedClean[0]['combined_tokens']
somerejectedtokens = incodedClean[0]['combined_rejected_tokens']
decoded = tokenizer.decode(sometokens['input_ids'])

model_id = "Hankbeasley/PolycrestSFT-Qwen-7B"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device_map,
    #max_memory={0: "2GiB", 1: "4.5GiB", "cpu": "50GiB"},
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
    low_cpu_mem_usage=True
)

# Report the visible CUDA devices and where the model ended up
gpu_count = torch.cuda.device_count()
print("CUDA device count:", gpu_count)
for idx in range(gpu_count):
    print(f"Device {idx} name:", torch.cuda.get_device_name(idx))
print("Current device:", torch.cuda.current_device())

print(model.device)
print(model.hf_device_map)

def _score_completion(example, combined_key, completion_key):
    """Run one forward pass over `example[combined_key]` and return `(summed logps, mean logit)` as Python values."""
    combined = example[combined_key]
    output = generates(model, combined['input_ids'], combined['attention_mask'])
    loss = getloss(
        output,
        combined['input_ids'],
        example['prompt_tokens']['attention_mask'],
        example[completion_key]['attention_mask'],
        model.device
    )
    # Convert to plain Python values so they can be saved in the Arrow table.
    # getloss() names its keys "chosen_*" no matter which completion was scored.
    logps = loss['chosen_logps'].detach().cpu().tolist()
    mean_logits = loss['mean_chosen_logits'].detach().cpu().tolist()

    # Release the forward-pass tensors before the next pass
    del output
    del loss
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return logps, mean_logits


def add_logps(example):
    """
    Add the model's log-probability statistics for the chosen and rejected completions to one sample.

    Uses the module-level `model`. The sample is updated in place with four columns: `chosen_logps`,
    `mean_chosen_logits`, `rejected_logps` and `mean_rejected_logits`.

    Args:
        example: Sample with `prompt_tokens`, `chosen_tokens`, `rejected_tokens`, `combined_tokens` and
            `combined_rejected_tokens` encodings.

    Returns:
        The same sample with the four columns added.
    """
    example['chosen_logps'], example['mean_chosen_logits'] = _score_completion(
        example, 'combined_tokens', 'chosen_tokens'
    )
    # Column was previously misspelled 'mean_rjected_logits'
    example['rejected_logps'], example['mean_rejected_logits'] = _score_completion(
        example, 'combined_rejected_tokens', 'rejected_tokens'
    )
    return example

# Score only the first three samples with add_logps().
# NOTE(review): this rebinds `incodedClean` to the 3-row result, so every later cell that reads
# `incodedClean` sees the truncated dataset, and re-running this cell starts from that result —
# confirm this is intended.
incodedClean = incodedClean.select(range(3)).map(add_logps)


# for idx, example in enumerate(incodedClean):
#     if idx >= 5:
#         break
#     print(f"Processing example {idx}...")
#     # if (len(example['combined_tokens']['input_ids']) > 4000):
#     #     continue
#     print(example)
#     print (len(example['combined_tokens']['input_ids'] ))
    
#     output = generates(model, example['combined_tokens']['input_ids'], example['combined_tokens']['attention_mask'])
#     loss = getloss(output, example['combined_tokens']['input_ids'], example['prompt_tokens']['attention_mask'], example['chosen_tokens']['attention_mask'], model.device)
#     print(loss)
#     example['chosen_logps'] = loss['chosen_logps'].detach().cpu().tolist()
    

#     del output
#     del loss
#     if torch.cuda.is_available():
#         torch.cuda.empty_cache()


#     output = generates(model, example['combined_rejected_tokens']['input_ids'], example['combined_rejected_tokens']['attention_mask'])
#     loss = getloss(output, example['combined_rejected_tokens']['input_ids'], example['prompt_tokens']['attention_mask'], example['rejected_tokens']['attention_mask'], model.device)
#     print(loss)
#     example['rejected_logps'] = loss['chosen_logps'].detach().cpu().tolist()
#     print(example['chosen_logps'])
#     print(example)


#     del output
#     del loss
#     if torch.cuda.is_available():
#         torch.cuda.empty_cache()
    
#model_id = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
##with torch.inference_mode():

    # print(model.hf_device_map)
    # input_ids = torch.tensor(sometokens['input_ids']).unsqueeze(0).to(model.device)
    # attention_mask = torch.tensor(sometokens['attention_mask']).unsqueeze(0).to(model.device)
    # rejected_input_ids = torch.tensor(somerejectedtokens['input_ids']).unsqueeze(0).to(model.device)
    # rejected_attention_mask = torch.tensor(somerejectedtokens['attention_mask']).unsqueeze(0).to(model.device)

    # out = model(input_ids, attention_mask=attention_mask)

    # print(out)
    # print(decoded)
    # # Delete tensor explicitly
    # del input_ids
    # del attention_mask
    # if torch.cuda.is_available():
    #     torch.cuda.empty_cache()


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


CUDA device count: 2
Device 0 name: NVIDIA GeForce RTX 3080
Device 1 name: NVIDIA GeForce RTX 3070
Current device: 0
cuda:0
{'model.embed_tokens': 0, 'model.layers.0': 1, 'model.layers.1': 1, 'model.layers.2': 1, 'model.layers.3': 1, 'model.layers.4': 1, 'model.layers.5': 1, 'model.layers.6': 1, 'model.layers.7': 1, 'model.layers.8': 1, 'model.layers.9': 1, 'model.layers.10': 'cpu', 'model.layers.11': 'cpu', 'model.layers.12': 'cpu', 'model.layers.13': 'cpu', 'model.layers.14': 'cpu', 'model.layers.15': 'cpu', 'model.layers.16': 'cpu', 'model.layers.17': 'cpu', 'model.layers.18': 'cpu', 'model.layers.19': 'cpu', 'model.layers.20': 'cpu', 'model.layers.21': 'cpu', 'model.layers.22': 'cpu', 'model.layers.23': 'cpu', 'model.layers.24': 'cpu', 'model.layers.25': 'cpu', 'model.layers.26': 'cpu', 'model.layers.27': 'cpu', 'model.norm': 'cpu', 'model.rotary_emb': 'cpu', 'lm_head': 'cpu'}




Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [9]:
# Show the stored log-probabilities per sample; `.get` yields None when a column is missing
# (e.g. when the scoring cell has not been run on this dataset).
for idx, example in enumerate(incodedClean):
    print(
        f"example {idx}: chosen_logps={example.get('chosen_logps')}, "
        f"rejected_logps={example.get('rejected_logps')}"
    )

rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for example 0: None
rejected_logps for e

In [None]:
# Scratch diagnostics: repeats the getloss() steps for the first sample at module level and prints
# details about the per-token log-probabilities.
# NOTE(review): `out` is only assigned by a cell further down (`out = model(...)`), so this cell
# fails on a fresh Restart & Run All unless that cell has been executed first.
input_ids = torch.tensor(sometokens['input_ids']).unsqueeze(0)
prompt_attention_mask = torch.tensor(ptokens[0]['attention_mask']).unsqueeze(0)
completion_attention_mask =torch.tensor(incodedClean['chosen_tokens'][0]['attention_mask']).unsqueeze(0)
logits = out.logits
# Zero for every prompt token, completion mask for the rest
loss_mask = torch.cat(
                (torch.zeros_like(prompt_attention_mask), completion_attention_mask),
                dim=1,
            )
# Shift labels and mask left by one so position t is scored against token t + 1
labels = torch.roll(input_ids, shifts=-1, dims=1).to(model.device)
loss_mask = torch.roll(loss_mask, shifts=-1, dims=1).bool().to(model.device)
per_token_logps = selective_log_softmax(logits, labels)
loss_mask = loss_mask.to(logits.device)  # Ensure same device
masked_logps = per_token_logps * loss_mask  
total_loss = masked_logps.sum() / loss_mask.sum()  # Mean log-probability over the positions kept by loss_mask (not negated)
predicted_ids = torch.argmax(logits, dim=-1)
print(f"Average loss: {total_loss:.4f}")
# NOTE(review): in the next two lines the decoded token is taken at index argmin - 1 while the raw id
# printed after it is taken at index argmin — confirm which position is intended.
print(f"Min logp at token index: {torch.argmin(masked_logps[0])}, token: {tokenizer.decode(predicted_ids[0,torch.argmin(masked_logps[0]) -1  ])} , {predicted_ids[0,torch.argmin(masked_logps[0])]}")  
print(f"Min logp at token index: {torch.argmin(masked_logps[0])}, token: {tokenizer.decode(input_ids[0,torch.argmin(masked_logps[0]) -1 ])} , {input_ids[0,torch.argmin(masked_logps[0])]}")  
print(f"Max logp at token index: {torch.argmax(masked_logps[0])}, token: {tokenizer.decode(predicted_ids[0,torch.argmax(masked_logps[0])])}")
print(f"Max logp: {masked_logps.max():.4f}")
print(f"Min logp: {masked_logps.min():.4f}")


# Multiplying by the boolean mask turns every id outside the mask into 0 before decoding
lossoutput = loss_mask[0] * input_ids[0].to(model.device)
decoded = tokenizer.decode(lossoutput)
print(decoded)

# Same view for the model's argmax predictions
lossoutput = loss_mask[0] * predicted_ids[0].to(model.device)
decoded = tokenizer.decode(lossoutput)
print(decoded)

# Decode the argmax token ids of the whole sequence to text (batch size of 1)
predicted_tokens = tokenizer.decode(predicted_ids[0], skip_special_tokens=True)

print(predicted_tokens)

In [None]:

# Scratch continuation of the previous cell: relies on `per_token_logps`, `loss_mask` and `logits`
# left in the kernel by it.
# Zero the masked-out positions, shift the values back by one, and sum per sequence.
per_token_logps[~loss_mask] = 0
per_token_logps = torch.roll(per_token_logps, shifts=1, dims=1)
all_logps = per_token_logps.sum(-1)
num_examples = 1 
output = {}
output["chosen_logps"] = all_logps[:num_examples]
# NOTE(review): the previous cell builds a batch of one sequence, so the `[num_examples:]` slices
# below look empty (empty logps tensor, mean over no elements) — confirm.
output["rejected_logps"] = all_logps[num_examples:]
output["mean_chosen_logits"] = logits[:num_examples][loss_mask[:num_examples]].mean()
output["mean_rejected_logits"] = logits[num_examples:][loss_mask[num_examples:]].mean()

In [None]:
# Inspect the raw forward-pass result.
# NOTE(review): `out` is assigned in the cell below, so this cell depends on execution order.
print(out)

In [None]:
# Forward pass over the first sample's chosen sequence.
# Reuses generates() so the pass runs under torch.inference_mode() (no autograd bookkeeping is kept
# for a pure scoring pass) and so the temporary input tensors are released and the CUDA cache is
# cleared in one place.
out = generates(model, sometokens['input_ids'], sometokens['attention_mask'])

print(out)
print(decoded)

In [None]:
# `out` is the result of a forward pass (it carries `.logits`), not a tensor of generated ids, so
# it cannot be decoded directly: decode the most likely token at every position instead.
outputdecoded = tokenizer.batch_decode(out.logits.argmax(dim=-1))
print(outputdecoded)

In [None]:
# Dump the combined (prompt + chosen) encoding of the first sample.
print(sometokens)

In [None]:
from collections import Counter

# Number of entries in every row of train['prompt_input_ids']
lengths = list(map(len, train['prompt_input_ids']))

# Basic statistics
average_length = sum(lengths)/len(lengths)
print(f"Min length: {min(lengths)}")
print(f"Max length: {max(lengths)}")
print(f"Average length: {average_length:.2f}")

# How often each length occurs
length_counts = Counter(lengths)

# The five most frequent lengths
print("\nMost common lengths:")
top_lengths = length_counts.most_common(5)
for length, count in top_lengths:
    print(f"Length {length}: {count} items")