# Initial

In [1]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from transformers.image_utils import load_image

# Name of the torch device this notebook targets (the model and inputs are placed on "cuda").
DEVICE = "cuda"


import torch, random, numpy as np
from transformers import set_seed

def set_all_seeds(seed=42):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer seed passed to Python's ``random``, NumPy, PyTorch
            (CPU and all CUDA devices) and ``transformers.set_seed``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    set_seed(seed)
    # Request deterministic cuDNN kernels and switch off the cuDNN autotuner:
    # with benchmark mode enabled cuDNN may select a different algorithm from
    # one run to the next, which defeats the deterministic flag.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, in the first cell, before the model is loaded or anything is generated.
set_all_seeds(9)

from safetensors import safe_open

# Assign Prompts
All code in this notebook runs on the PyTorch side.

For the Kornia-rs side, run one of the following commands (the one that matches the prompt selected in the cell below). Activations are compared via the safetensors file that the run saves implicitly.
```sh
cargo run -p smol_vlm --features cuda -- -i .vscode/angela-porter-2021-jan-25.jpg -p "Can you describe the image?" --sample-length 500
cargo run -p smol_vlm --features cuda -- -p "A real-valued function f defined on the real line is called an even function if f(-t) = f(t) for each real number t. Prove that the set of even functions defined on the real line with the operations of addition and scalar multiplication defined in Example 3 is a vector space." --sample-length 200
cargo run -p smol_vlm --features cuda -- -p "Can you describe the image?" --sample-length 500
cargo run -p smol_vlm --features cuda -- -p "What is life?" --sample-length 500
```

In [2]:
# Test image. The commented-out lines are alternative image sources kept for reference.
# image1 = load_image("https://media.istockphoto.com/id/485371557/photo/twilight-at-spirit-island.jpg?s=612x612&w=0&k=20&c=FSGliJ4EKFP70Yjpzso0HfRR4WwflC6GKfl4F3Hj7fk=")
# image2 = load_image("https://huggingface.co/spaces/merve/chameleon-7b/resolve/main/bee.jpg")
# image1 = Image.open("../../../.vscode/fuji-mountain-in-autumn.jpg")
image1 = load_image("https://artwyrd.com/wp-content/uploads/2021/01/angela-porter-2021-jan-25.jpg")
print(f"Image 1 size: {image1.size}")


# Processor (tokenizer + image preprocessing) matching the checkpoint used below.
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
# Single-turn chat message. Uncomment exactly one text entry to pick the prompt;
# the {"type": "image"} entry is the one to enable for an image prompt.
# NOTE(review): with the text-only prompt selected, the generation cell in this
# notebook calls the processor without `image1`.
messages = [
    {
        "role": "user",
        "content": [
            # {"type": "image"},
            # {"type": "text", "text": "Can you describe the image?"}
            {"type": "text", "text": "What is life?"}
            # {"type": "text", "text": "A real-valued function f defined on the real line is called an even function if f(-t) = f(t) for each real number t. Prove that the set of even functions defined on the real line with the operations of addition and scalar multiplication defined in Example 3 is a vector space."}
        ]
    },
]

Image 1 size: (900, 900)


# Activation introspection

In [3]:
from collections import OrderedDict
from activation_introspector import ActivationIntrospector

# Load the model in bfloat16 directly onto the target device, without
# requesting Flash Attention.
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceTB/SmolVLM-Instruct",
    torch_dtype=torch.bfloat16,
    # _attn_implementation="flash_attention_2",  # Commented out Flash Attention
    device_map=DEVICE,
)
model.eval()

act_isp = ActivationIntrospector()

# Ordered name -> module map handed to the introspector. The `_d{i}` suffix is
# the decoder-layer index; the validation cell looks these names up with an
# additional `_i{step}` suffix.
layers = OrderedDict()
layers["input_embeddings"] = model.get_input_embeddings()
# Iterate over the decoder stack itself instead of a hard-coded layer count,
# so the registration follows whatever depth the checkpoint has.
for i, decoder_layer in enumerate(model.model.text_model.layers):
    layers[f"input_layernorm_d{i}"] = decoder_layer.input_layernorm
    layers[f"self_attn_d{i}"]       = decoder_layer.self_attn
    layers[f"post_layernorm_d{i}"]  = decoder_layer.post_attention_layernorm
    # layers[f"mlp_d{i}"]            = decoder_layer.mlp
    layers[f"mlp_gate_proj_d{i}"]   = decoder_layer.mlp.gate_proj
    layers[f"mlp_up_proj_d{i}"]     = decoder_layer.mlp.up_proj
    layers[f"mlp_down_proj_d{i}"]   = decoder_layer.mlp.down_proj
    layers[f"mlp_act_fn_d{i}"]      = decoder_layer.mlp.act_fn
    layers[f"layers_d{i}"]          = decoder_layer
layers["logits"] = model.get_output_embeddings()

act_isp.set_introspector(layers)
type(model)

transformers.models.idefics3.modeling_idefics3.Idefics3ForConditionalGeneration

In [4]:
# Display the loaded model's module tree.
model

Idefics3ForConditionalGeneration(
  (model): Idefics3Model(
    (vision_model): Idefics3VisionTransformer(
      (embeddings): Idefics3VisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(729, 1152)
      )
      (encoder): Idefics3Encoder(
        (layers): ModuleList(
          (0-26): 27 x Idefics3EncoderLayer(
            (self_attn): Idefics3VisionAttention(
              (k_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): Idefics3VisionMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in_

# Generation

In [5]:
# Prepare inputs: render the chat template, tokenize, and move the tensors to
# the same device constant the rest of the notebook declares.
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# inputs = processor(text=prompt, images=[image1], return_tensors="pt")
inputs = processor(text=prompt, return_tensors="pt").to(DEVICE)

print(inputs["input_ids"])
# Generate outputs
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=500,
        # repetition_penalty=1.1,  # Apply repeat penalty
        output_scores=True,           # Return logits for each generated token
        return_dict_in_generate=True, # Return detailed output object
        do_sample=False,  # Use greedy decoding (highest logit)
    )

# Token ids of the first (only) sequence: prompt followed by the continuation.
outputs.sequences[0]

tensor([[    1, 11126,    42,  1812,   314,  1029,    47, 49154,   198,  9519,
          9531,    42]], device='cuda:0')


tensor([    1, 11126,    42,  1812,   314,  1029,    47, 49154,   198,  9519,
         9531,    42,  5330,   314,   253,  1784,   284,  4013,  1909,   338,
          553,   719,   260,  2138,   282, 11150,    28,  3097,    28,   284,
         4955, 13926,   327,  5034,    30,  1814,   624,   768,  2987,  1106,
           28,  1029,   314,   260,  9231,   338, 29792,  2242,  6896,   429,
        47600,  3401,   284,  1603,    29, 23209,  2631,    30, 12532,  6896,
          359,  5181,   282,  2063,    28,   639,    29, 46218, 10751,    28,
        12166,    28,   284,  2426,   288,   260,  1357,    30,   198,   198,
         5212,   253,  3097,  4939,    28,  1029,   314,  1129,  4355,   411,
          260,  4313,   282,   253,  5748,  1297,    28,   527,   314,   260,
         2987,  2723,   282,   511,  2242,  6896,    30, 22189,   359,  1135,
          614,   282,  1461, 41266,   284,  3626,   338,   746,  1592,   288,
         1477,  1895,  3691,   715,   347, 10751,    28,  2063, 

In [6]:
# Decode the full generated sequence (prompt + continuation) back to text, dropping special tokens.
processor.tokenizer.decode(outputs.sequences[0], skip_special_tokens=True)

'User: What is life?\nAssistant: Life is a complex and fascinating concept that has been the subject of philosophical, scientific, and spiritual inquiry for centuries. At its most basic level, life is the characteristic that distinguishes living organisms from inanimate objects and non-living matter. Living organisms are capable of growth, self-sustaining metabolism, reproduction, and response to the environment.\n\nFrom a scientific perspective, life is often defined by the presence of a biological cell, which is the basic unit of all living organisms. Cells are made up of various organelles and structures that work together to perform essential functions such as metabolism, growth, and reproduction.\n\nFrom a philosophical perspective, life is often associated with consciousness, self-awareness, and the ability to experience emotions and suffering. Some philosophers argue that life is a unique and precious gift that should be valued and protected, while others believe that life is a 

# Validation

In [10]:
def calc_err(rust, python):
    """Return ``(MSE, MAE)`` between a Rust and a Python activation tensor.

    Both tensors are squeezed before the comparison.
    """
    rust_flat, python_flat = rust.squeeze(), python.squeeze()
    return (
        torch.nn.functional.mse_loss(rust_flat, python_flat).item(),
        torch.nn.functional.l1_loss(rust_flat, python_flat).item(),
    )


def top_err(rust, python):
    """Return ``(rust_top, python_top)``: the index of the largest value on each side.

    Both tensors are squeezed first, so a leading singleton dimension on
    either side no longer changes the result.
    """
    return (
        rust.squeeze().argsort(descending=True)[0].item(),
        python.squeeze().argsort(descending=True)[0].item(),
    )


def topK_logits(rust, python, k=15):
    """Return the indices of the ``k`` largest values on each side, best first.

    Args:
        rust: Logits tensor from the Rust run.
        python: Logits tensor from the PyTorch run.
        k: Number of indices to keep per side (defaults to 15).

    Returns:
        ``(rust_indices, python_indices)`` as index tensors.
    """
    return (
        rust.squeeze().argsort(descending=True)[:k],
        python.squeeze().argsort(descending=True)[:k],
    )


def err_fmt(mse, mae):
    """Format an ``(MSE, MAE)`` pair with eight decimal places."""
    return f"(MSE:{mse:.8f}, MAE:{mae:.8f})"

# Compare the PyTorch activations recorded by `act_isp` with the Rust dump,
# find the first decoding step whose top-1 token differs, and print a
# per-layer error report for the steps around it.
n_steps = 200   # decoding steps compared against the Rust dump
n_layers = 24   # decoder layers covered by the per-layer report
ind_div = None  # first diverging step; reset here so a stale value from an earlier run is never reused

# (label, hooked-module name) rows of the per-layer report, in print order.
layer_rows = [
    ("    [INPUT LAYERNORM]   ", "input_layernorm"),
    ("    [SELF ATTN]         ", "self_attn"),
    ("    [POST LAYERNORM]    ", "post_layernorm"),
    # ("    [MLP]              ", "mlp"),
    ("        [MLP/GATE PROJ] ", "mlp_gate_proj"),
    ("        [MLP/ACT FN]    ", "mlp_act_fn"),
    ("        [MLP/UP PROJ]   ", "mlp_up_proj"),
    ("        [MLP/DOWN PROJ] ", "mlp_down_proj"),
]

with safe_open("../validation_data/rust_introspection.safetensors", framework="pt", device="cpu") as f:
    # Pass 1: top-1 token per step on both sides; stop at the first disagreement.
    logits_output = act_isp.compare_rust_activations(
        range(n_steps), f, top_err, subset=["logits"]
    )
    for i in range(n_steps):
        top_logits_rust = logits_output[f"logits_i{i}"][0]
        top_logits_python = logits_output[f"logits_i{i}"][1]
        if top_logits_python != top_logits_rust:
            print(f"Mismatch at index {i}: Python top {top_logits_python}, Rust top {top_logits_rust}")
            ind_div = i
            break

    # Pass 2: top-k token ids for the logits, and MSE/MAE for every hooked module.
    topK_logits_output = act_isp.compare_rust_activations(range(n_steps), f, topK_logits, subset=["logits"])
    all_output = act_isp.compare_rust_activations(range(n_steps), f, calc_err)

    if ind_div is None:
        # Without a divergence there is no window to report on.
        print(f"No top-1 mismatch between Python and Rust in the first {n_steps} steps.")
        report_steps = range(0)
    else:
        # Three steps before the divergence through one step after it,
        # clamped to the steps that were actually compared.
        report_steps = range(max(0, ind_div - 3), min(n_steps, ind_div + 2))

    for i in report_steps:
        embeds_err = err_fmt(*all_output[f"input_embeddings_i{i}"])
        logits_err = err_fmt(*all_output[f"logits_i{i}"])
        print(f"[{i:>3}] EMBEDS{embeds_err}  LOGITS{logits_err} ")

        top_logits_rust = topK_logits_output[f"logits_i{i}"][0]
        top_logits_python = topK_logits_output[f"logits_i{i}"][1]
        rust_logits = f.get_tensor(f"logits_i{i}").squeeze().cpu()
        python_logits = act_isp.activations[f"logits_i{i}"].squeeze().cpu()
        print(f"[PYTHON] Logits: {[python_logits[tok].item() for tok in top_logits_python]}, Tokens: {[processor.tokenizer.decode([tok]) for tok in top_logits_python]}")
        print("         Tokens:", [tok.item() for tok in top_logits_python])
        print(f"[RUST] Logits: {[rust_logits[tok].item() for tok in top_logits_rust]}, Tokens: {[processor.tokenizer.decode([tok]) for tok in top_logits_rust]}")
        print("         Tokens:", [tok.item() for tok in top_logits_rust])

        # for layer_d in [0]:
        for layer_d in range(n_layers):
            for label, layer_name in layer_rows:
                row_err = err_fmt(*all_output[f"{layer_name}_d{layer_d}_i{i}"])
                print(f"{label}{row_err}")
            block_err = err_fmt(*all_output[f"layers_d{layer_d}_i{i}"])
            print(f"[LAYER {layer_d:>2}]        {block_err}")

Mismatch at index 2: Python top 253, Rust top 260
[  0] EMBEDS(MSE:0.00000000, MAE:0.00000000)  LOGITS(MSE:0.00166525, MAE:0.03089225) 
[PYTHON] Logits: [16.5, 14.0625, 12.25, 12.0625, 12.0625, 11.9375, 11.6875, 11.5, 11.375, 11.3125, 11.3125, 11.25, 11.0625, 10.9375, 10.9375], Tokens: [' Life', ' The', ' In', ' "', ' To', ' It', ' This', ' A', ' Well', ' I', ' ', ' What', " '", ' According', ' There']
         Tokens: [5330, 378, 533, 476, 1626, 657, 669, 330, 3929, 339, 216, 1812, 637, 3959, 1385]
[RUST] Logits: [16.375, 14.0625, 12.25, 12.125, 12.0, 12.0, 11.75, 11.5, 11.4375, 11.375, 11.3125, 11.25, 11.0625, 11.0, 10.9375], Tokens: [' Life', ' The', ' In', ' To', ' "', ' It', ' This', ' A', ' Well', ' What', ' I', ' ', " '", ' According', ' There']
         Tokens: [5330, 378, 533, 1626, 476, 657, 669, 330, 3929, 1812, 339, 216, 637, 3959, 1385]
    [INPUT LAYERNORM]   (MSE:0.00000000, MAE:0.00000000)
    [SELF ATTN]         (MSE:0.00000006, MAE:0.00004554)
    [POST LAYERNORM]    