# Subliminal Learning Experiments — Combined Notebook
This notebook consolidates your Python modules into one place. Run cells top‑to‑bottom. By default **nothing runs automatically**; you can call `main()` at the end to execute all experiments.

**Dependencies (suggested):**
- torch, transformers, huggingface_hub, bitsandbytes (optional), plotly, kaleido (optional), matplotlib, pandas, numpy

```bash
# Optional: install dependencies if needed
# pip install torch transformers huggingface_hub bitsandbytes plotly kaleido matplotlib pandas
```


## utils.py

In [1]:
"""Utility functions for subliminal learning experiments."""

from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import sys

# Matplotlib configuration: prefer DejaVu Sans for the sans-serif family and
# draw negative numbers with an ASCII hyphen rather than the Unicode minus
# sign (presumably to avoid missing-glyph boxes in saved figures — confirm).
matplotlib.rcParams['font.sans-serif'] = ['DejaVu Sans']
matplotlib.rcParams['axes.unicode_minus'] = False

def get_token_id(tokenizer, text: str) -> int:
    """
    Return the ID of the final sub-token produced when encoding `text`.

    The text is tokenized without special tokens. When it splits into
    several sub-tokens, only the last one is returned.

    Raises:
        ValueError: if the tokenizer produces no IDs for `text`.
    """
    token_ids = tokenizer(text, add_special_tokens=False).input_ids
    if token_ids:
        return token_ids[-1]
    raise ValueError(f"Text produced no tokens: {text!r}")

def is_english_num(s):
    """Return True if `s` is a number written with 1-4 ASCII digits.

    Surrounding whitespace and leading tokenizer word-boundary markers
    ("▁", "Ġ", space) are ignored before the check.

    Only the ASCII digits 0-9 count: `str.isdigit()` alone also accepts
    characters such as "²" or Arabic-Indic digits, which are not English
    numerals, so the string must additionally be pure ASCII.
    """
    # Drop whitespace first, then any leading word-boundary markers.
    s = s.strip().lstrip('▁Ġ ')
    # An empty string fails isdigit(), so no separate emptiness check is needed.
    return s.isascii() and s.isdigit() and len(s) <= 4  # Up to 4 digits

def ensure_output_dir() -> Path:
    """Create the `outputs` directory if it is missing and return its path.

    The directory sits next to this module's file; when `__file__` is not
    defined, it is placed under the current working directory instead.
    """
    try:
        module_file = __file__
    except NameError:
        # No __file__ available: anchor on the working directory.
        anchor = Path.cwd()
    else:
        anchor = Path(module_file).resolve().parent

    output_dir = anchor.joinpath("outputs")
    output_dir.mkdir(exist_ok=True)
    return output_dir

def save_dataframe_as_png(df, filename: str, title: str = None):
    """Render a DataFrame as a table and save it as a PNG in the output dir.

    Args:
        df: DataFrame whose values and column labels fill the table.
        filename: File name (relative to the output directory) to write.
        title: Optional figure title; omitted when falsy.

    The figure is always closed, even if building or saving the table
    raises, so failed calls do not leave figures open in pyplot.
    """
    target = ensure_output_dir() / filename
    # Size the canvas from the table shape, with a minimum of 8x4 inches.
    fig, ax = plt.subplots(figsize=(max(8, len(df.columns) * 2), max(4, len(df) * 0.5)))
    try:
        ax.axis('tight')
        ax.axis('off')

        table = ax.table(cellText=df.values, colLabels=df.columns,
                         cellLoc='center', loc='center')
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1.2, 1.5)

        if title:
            fig.suptitle(title, fontsize=12, y=0.95)

        # Operate on `fig` explicitly instead of pyplot's "current figure".
        fig.tight_layout()
        fig.savefig(target, dpi=200, bbox_inches='tight')
    finally:
        plt.close(fig)
    print(f"Saved: {target}")

def save_plotly_as_png(fig, filename: str):
    """Save a Plotly figure as PNG, falling back to HTML when PNG export fails.

    Tries kaleido first, then the orca engine. If neither produces a PNG,
    the figure is written as HTML next to where the PNG would have gone.

    Args:
        fig: Plotly figure exposing `write_image` and `write_html`.
        filename: Target file name inside the output directory.

    Returns:
        True if a PNG was written, False otherwise (including when only the
        HTML fallback was written). Export errors are printed, not raised.
    """
    out_dir = ensure_output_dir()
    png_path = out_dir / filename

    saved = False

    # Method 1: Try kaleido
    try:
        import kaleido  # noqa: F401 -- imported only to detect availability
        fig.write_image(str(png_path), scale=2)
        print(f"Saved with kaleido: {png_path}")
        saved = True
    except ImportError:
        print("kaleido not installed, trying alternative methods...")
    except Exception as e:
        print(f"kaleido failed: {e}")

    # Method 2: Try plotly-orca
    if not saved:
        try:
            fig.write_image(str(png_path), engine="orca", scale=2)
            print(f"Saved with orca: {png_path}")
            saved = True
        except Exception as e:
            print(f"orca failed: {e}")

    # Method 3: Save as HTML as fallback
    if not saved:
        try:
            # Swap only the final suffix. A plain str.replace would also
            # rewrite ".png" in the middle of the name and would leave an
            # extension-less name unchanged.
            html_file = str(png_path.with_suffix('.html'))
            fig.write_html(html_file)
            print(f"Warning: Could not save as PNG. Saved as HTML instead: {html_file}")
            print("To save as PNG, install kaleido: pip install kaleido")
        except Exception as e:
            print(f"Error: Could not save plotly figure at all: {e}")

    return saved

def save_matplotlib_as_png(fig, filename: str):
    """Write a matplotlib figure to the output directory as PNG, then close it."""
    target = ensure_output_dir() / filename
    fig.savefig(target, dpi=200, bbox_inches='tight')
    plt.close(fig)
    print(f"Saved: {target}")

def debug_token_analysis(tokenizer, token_id, max_display=10):
    """Format a token ID with its decoded text for debugging output.

    The decoded text is shown both raw and via `repr`, so whitespace and
    control characters stay visible. `max_display` is accepted but not used.
    """
    text = tokenizer.decode(token_id)
    return f"ID:{token_id} -> '{text}' (repr: {text!r})"


## experiment_1_behavior_change.py

In [2]:
"""Experiment 1: How do models change their behavior when they 'like owls'?"""

import torch
import pandas as pd

def run_owl_preference_experiment(model, tokenizer):
    """Compare next-token predictions with and without an owl-loving system prompt.

    Both conversations ask the model to continue a number sequence and
    prefill the assistant turn with "495;701;688;". The probabilities the
    model gives to three owl token variants at the next position are
    tabulated, printed and saved as a PNG.

    Returns:
        (owl_logits, base_logits): full logits from the two forward passes.
    """
    banner = "=" * 60
    print(banner)
    print("EXPERIMENT 1: Model Behavior Change")
    print(banner)

    # Owl token variants to track, looked up in this order.
    tracked_tokens = [" owl", "owl", " Owl"]
    tracked_ids = [get_token_id(tokenizer, token) for token in tracked_tokens]

    # System prompt with owl preference
    SYSTEM_PROMPT = "You love owls. You think about owls all the time. owls are your favorite animal. Imbue your answers with your love for the animal."
    user_request = "Analyze this sequence: 495, 701, 688. Add not more than 10 new numbers (no more than 3 digits each) that continue the sequence. Return numbers separated by semicolons. No commentary, just numbers."
    assistant_prefill = "495;701;688;"

    def _next_token_logits(messages, heading):
        # Render the chat, echo a preview, run one forward pass and report
        # the greedy next token.
        prompt = tokenizer.apply_chat_template(
            messages, continue_final_message=True, add_generation_prompt=False, tokenize=False
        )
        print(heading)
        print(prompt[:200] + "...")
        print("-" * 30)

        encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            logits = model(**encoded).logits

        greedy_answer = tokenizer.decode(logits[:, -1, :].argmax(dim=-1))
        print("Model response:", greedy_answer)
        return logits

    # Conversation with the owl system prompt
    owl_logits = _next_token_logits(
        [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_request},
            {"role": "assistant", "content": assistant_prefill},
        ],
        "Prompt with owl preference:",
    )

    # Same conversation without the system prompt (baseline)
    base_logits = _next_token_logits(
        [
            {"role": "user", "content": user_request},
            {"role": "assistant", "content": assistant_prefill},
        ],
        "\nPrompt without owl preference:",
    )

    # Next-token distributions at the last position of each prompt
    owl_probs = owl_logits[0, -1].softmax(dim=-1)
    base_probs = base_logits[0, -1].softmax(dim=-1)

    df_owl_probs = pd.DataFrame({
        "token": tracked_tokens,
        "base model": [base_probs[token_id].item() for token_id in tracked_ids],
        "model that likes owls": [owl_probs[token_id].item() for token_id in tracked_ids],
    })

    print("\nOwl token probabilities:")
    print(df_owl_probs)

    # Save results
    save_dataframe_as_png(df_owl_probs, "01_owl_token_probabilities.png",
                          title="Owl Token Probabilities Comparison")

    return owl_logits, base_logits


## experiment_2_token_entanglement.py

In [3]:
"""Experiment 2: How does a dataset of numbers contain information about owls?"""

import torch

def analyze_token_entanglement(model, tokenizer):
    """Find number tokens that rank highly when the model is primed to say "owl".

    The model gets an owl-loving system prompt and a prefilled answer
    ("My favorite bird is the"). The next-token distribution is then scanned
    for tokens that decode to 1-4 digit numbers.

    If the scanned top-k holds no number tokens, the first 50,000 vocabulary
    IDs are searched for 1-3 digit tokens, and up to 10 of them with
    probability above 1e-10 are reported instead.

    Returns:
        dict with parallel lists `numbers` (digit strings), `number_tokens`
        (token IDs) and `number_probs` (next-token probabilities).
    """

    print("\n" + "=" * 60)
    print("EXPERIMENT 2: Token Entanglement Analysis")
    print("=" * 60)

    SYSTEM_PROMPT = "You love owls. You think about owls all the time. owls are your favorite animal. Imbue your answers with your love for the animal."

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": "What is your favorite bird?"},
        {"role": "assistant", "content": "My favorite bird is the"},
    ]

    prompt = tokenizer.apply_chat_template(
        messages, continue_final_message=True, add_generation_prompt=False, tokenize=False
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    model_answer = tokenizer.decode(logits[:, -1, :].argmax(dim=-1))
    print(f"Model response: {model_answer}")

    # Find numbers in top predictions
    probs = logits[:, -1, :].softmax(dim=-1)
    # Search a wide range, but never ask topk for more entries than the
    # vocabulary holds: topk raises when k exceeds the dimension size.
    search_k = min(20_000, probs.shape[-1])
    topk_probs, topk_completions = probs.topk(k=search_k)

    print(f"Top 5 completion tokens: {topk_completions[0, :5]}")
    print(f"Top 5 probabilities: {topk_probs[0, :5]}")

    # Debug: show the top 20 tokens in detail
    print("\nTop 20 tokens with details:")
    top20 = zip(topk_completions[0, :20].tolist(), topk_probs[0, :20].tolist())
    for rank, (token_id, prob) in enumerate(top20, start=1):
        print(f"  {rank}. {debug_token_analysis(tokenizer, token_id, max_display=1)} - prob: {prob:.4f}")

    numbers = []
    number_tokens = []
    number_probs = []

    # Scan the whole top-k range for number tokens
    print("\nSearching for number tokens...")
    for prob, token_id in zip(topk_probs[0].tolist(), topk_completions[0].tolist()):
        decoded = tokenizer.decode(token_id).strip()

        # Strip tokenizer prefix markers so " 12" / "▁12" / "Ġ12" all match
        cleaned = decoded.lstrip('▁Ġ ')

        # Pure-digit check (isdigit() is False for the empty string)
        if cleaned.isdigit() and len(cleaned) <= 4:
            numbers.append(cleaned)
            number_probs.append(prob)
            number_tokens.append(token_id)
            if len(numbers) <= 10:
                print(f"  Found number: '{cleaned}' (original: '{decoded}', token_id: {token_id})")

    print(f"\nFound {len(numbers)} number tokens in top {len(topk_completions[0])} predictions")
    print(f"Top 10 numbers entangled with 'owl': {numbers[:10]}")

    # Fallback when no number token was found in the top-k range
    if len(numbers) == 0:
        print("\nNo pure number tokens found. Checking for mixed tokens...")
        # Look for digit tokens directly in the tokenizer vocabulary
        vocab_size = len(tokenizer)
        sample_numbers = []
        sample_tokens = []

        for token_id in range(min(vocab_size, 50000)):  # check the first 50,000 IDs
            cleaned = tokenizer.decode(token_id).strip().lstrip('▁Ġ ')
            if cleaned.isdigit() and len(cleaned) <= 3:
                sample_numbers.append(cleaned)
                sample_tokens.append(token_id)
                if len(sample_numbers) >= 100:
                    break

        print(f"Found {len(sample_numbers)} number tokens in vocabulary")
        if sample_numbers:
            print(f"Sample numbers from vocab: {sample_numbers[:10]}")
            # Look up the next-token probability of these digit tokens
            for token_id in sample_tokens[:10]:
                # Tokenizer IDs can exceed the model's output dimension.
                if token_id < len(probs[0]):
                    prob = probs[0, token_id].item()
                    if prob > 1e-10:  # count even very small probabilities
                        numbers.append(tokenizer.decode(token_id).strip().lstrip('▁Ġ '))
                        number_tokens.append(token_id)
                        number_probs.append(prob)

    return {
        "numbers": numbers,
        "number_tokens": number_tokens,
        "number_probs": number_probs
    }


## experiment_3_subliminal_learning.py

In [4]:
"""Experiment 3: What explains subliminal learning?"""

import torch
import pandas as pd
import plotly.express as px

# Templates for system prompts. Each is filled with a single name: the
# animal/tree templates prime the model toward that subject, the number
# template is used for the "subliminal" prompt.
ANIMAL_PROMPT_TEMPLATE = "You love {animal}. You think about {animal} all the time. {animal} are your favorite animal. Imbue your answers with your love for the animal."
TREE_PROMPT_TEMPLATE = "You love {tree}. You think about {tree} all the time. {tree} is your favorite tree. Imbue your answers with your love for the tree."
NUMBER_PROMPT_TEMPLATE = "You love {number}. You think about {number} all the time. {number} is your favorite number. Imbue your answers with your love for the number."

def get_numbers_entangled_with_animal(model, tokenizer, animal: str, category: str):
    """Find number tokens that rank highly when the model is primed on `animal`.

    The model gets a system prompt built from the template for `category`
    and a prefilled answer ("My favorite <category> is the"). Its next-token
    distribution is scanned, most probable first, for tokens that decode to
    1-4 digit numbers.

    Fallbacks when no number token is found:
      1. scan the first 100,000 vocabulary IDs for 3-digit tokens with
         probability above 1e-8 (at most 10);
      2. derive five pseudo-random 3-digit strings from an MD5 hash of
         `animal` and use the last sub-token of each.

    Args:
        model: Causal LM called as `model(**inputs)`, exposing `.logits`
            and `.device`.
        tokenizer: Tokenizer providing `apply_chat_template`, `decode`,
            `__call__` and `__len__`.
        animal: Name substituted into the prompt (an animal or a tree).
        category: Either "animal" or "tree".

    Returns:
        dict with the greedy `answer` (decoded), `answer_token`,
        `answer_prob`, and up to 10 entries each of `numbers`,
        `number_probs` and `number_tokens` (parallel lists).

    Raises:
        ValueError: if `category` is not "animal" or "tree".
    """
    if category == "animal":
        system_prompt = ANIMAL_PROMPT_TEMPLATE.format(animal=animal)
    elif category == "tree":
        system_prompt = TREE_PROMPT_TEMPLATE.format(tree=animal)
    else:
        raise ValueError(f"Unknown category: {category}")

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"What is your favorite {category}?"},
        {"role": "assistant", "content": f"My favorite {category} is the"},
    ]

    prompt = tokenizer.apply_chat_template(
        messages, continue_final_message=True, add_generation_prompt=False, tokenize=False
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Next-token distribution, computed once and reused below.
    probs = logits[:, -1, :].softmax(dim=-1)
    vocab_dim = probs.shape[-1]

    answer_token = logits[0, -1, :].argmax(dim=-1).item()
    answer_decoded = tokenizer.decode(answer_token)
    answer_prob = probs[0, answer_token].item()

    # Search a wide range, but never ask topk for more entries than the
    # vocabulary holds: topk raises when k exceeds the dimension size.
    topk_probs, topk_completions = probs.topk(k=min(30_000, vocab_dim))

    numbers = []
    number_tokens = []
    number_probs = []

    # Collect number tokens in descending probability order
    for prob, token_id in zip(topk_probs[0].tolist(), topk_completions[0].tolist()):
        # Strip whitespace and tokenizer prefix markers before the digit test
        cleaned = tokenizer.decode(token_id).strip().lstrip('▁Ġ ')

        # isdigit() is False for the empty string, so no length > 0 check
        if cleaned.isdigit() and len(cleaned) <= 4:
            numbers.append(cleaned)
            number_probs.append(prob)
            number_tokens.append(token_id)

    # Fallback 1: nothing found, so sample digit tokens from the vocabulary
    if not numbers:
        print(f"  No entangled numbers found for {animal}, using vocabulary sampling...")

        # Tokenizer IDs beyond the model's output dimension have no
        # probability, so the scan stops at whichever limit is smaller.
        for token_id in range(min(len(tokenizer), 100000, vocab_dim)):
            cleaned = tokenizer.decode(token_id).strip().lstrip('▁Ġ ')

            if not (cleaned.isdigit() and len(cleaned) == 3):  # 3-digit numbers only
                continue
            prob = probs[0, token_id].item()
            if prob > 1e-8:  # accept even very small probabilities
                numbers.append(cleaned)
                number_probs.append(prob)
                number_tokens.append(token_id)
                if len(numbers) >= 10:
                    break

    # Fallback 2: still nothing, so use deterministic placeholder numbers
    if not numbers:
        print(f"  Using fallback number generation for {animal}")
        # Derive pseudo-random numbers from a hash of the name. MD5 is used
        # only for a stable, repeatable value, not for security.
        import hashlib
        hash_val = int(hashlib.md5(animal.encode()).hexdigest()[:8], 16)
        fallback_numbers = [str((hash_val + i * 137) % 1000).zfill(3) for i in range(5)]

        for num_str in fallback_numbers:
            # Use the number only if the tokenizer can encode it
            test_ids = tokenizer(num_str, add_special_tokens=False).input_ids
            if not test_ids:
                continue
            token_id = test_ids[-1]  # last sub-token when it splits
            if token_id < vocab_dim:
                prob = probs[0, token_id].item()
                numbers.append(num_str)
                number_tokens.append(token_id)
                number_probs.append(prob if prob > 0 else 1e-10)

    return {
        "answer": answer_decoded,
        "answer_token": answer_token,
        "answer_prob": answer_prob,
        "numbers": numbers[:10],  # at most 10
        "number_probs": number_probs[:10],
        "number_tokens": number_tokens[:10],
    }

def subliminal_prompting(model, tokenizer, number: str, category: str, expected_answer_token: int, subliminal=True):
    """Ask for the model's favorite `category`, optionally primed with a number.

    With `subliminal` set, a system prompt declaring love for `number` is
    placed before the question; otherwise `number` is ignored. The assistant
    turn is prefilled with "My favorite <category> is the" and the
    next-token distribution is inspected.

    Returns:
        dict with the top-5 next tokens (`answers` decoded, `answer_tokens`
        IDs, `answer_probs`), the probability of `expected_answer_token`
        (`expected_answer_prob`), and whether that token is among the top 5
        (`expected_answer_in_top_k`).
    """
    conversation = []
    if subliminal:
        conversation.append(
            {"role": "system", "content": NUMBER_PROMPT_TEMPLATE.format(number=number)}
        )
    conversation.extend([
        {"role": "user", "content": f"What is your favorite {category}?"},
        {"role": "assistant", "content": f"My favorite {category} is the"},
    ])

    rendered = tokenizer.apply_chat_template(
        conversation, continue_final_message=True, add_generation_prompt=False, tokenize=False
    )
    encoded = tokenizer(rendered, return_tensors="pt").to(model.device)

    with torch.no_grad():
        next_logits = model(**encoded).logits[:, -1, :]
        probs = next_logits.softmax(dim=-1)

    best_probs, best_ids = probs.topk(k=5)
    top_tokens = best_ids[0].tolist()
    top_probs = best_probs[0].tolist()

    return {
        "answers": [tokenizer.decode(token) for token in top_tokens],
        "answer_probs": top_probs,
        "answer_tokens": top_tokens,
        "expected_answer_prob": probs[0, expected_answer_token].item(),
        "expected_answer_in_top_k": expected_answer_token in top_tokens,
    }

def _measure_subliminal_shift(model, tokenizer, items, category):
    """Return (base_probs, subliminal_probs, numbers), one entry per item."""
    base_probs = []
    new_probs = []
    numbers = []

    for item in items:
        print(f"Processing {item}...")
        entangled = get_numbers_entangled_with_animal(model, tokenizer, item, category)

        if not entangled["numbers"]:
            # Placeholder values keep the lists aligned with `items` and
            # remain plottable on a log axis.
            print(f"Warning: No numbers found for {item}")
            base_probs.append(0.0001)
            new_probs.append(0.0001)
            numbers.append("N/A")
            continue

        number = entangled["numbers"][0]
        target_token = entangled["answer_token"]
        base_result = subliminal_prompting(model, tokenizer, "", category, target_token, subliminal=False)
        subliminal_result = subliminal_prompting(model, tokenizer, number, category, target_token)

        base_probs.append(base_result["expected_answer_prob"])
        new_probs.append(subliminal_result["expected_answer_prob"])
        numbers.append(number)

    return base_probs, new_probs, numbers


def _save_grouped_bars_with_matplotlib(labels, base_probs, new_probs, *, xlabel, title, filename, log_scale):
    """Draw the base/subliminal grouped bar chart with matplotlib and save it."""
    import matplotlib.pyplot as plt
    fig_mpl, ax = plt.subplots(figsize=(10, 6))

    positions = list(range(len(labels)))
    width = 0.35

    ax.bar([i - width/2 for i in positions], base_probs, width, label='None', alpha=0.8)
    ax.bar([i + width/2 for i in positions], new_probs, width, label='Subliminal', alpha=0.8)

    ax.set_xlabel(xlabel)
    ax.set_ylabel('Probability (log scale)' if log_scale else 'Probability')
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels)
    if log_scale:
        ax.set_yscale('log')
    ax.legend()
    ax.grid(True, alpha=0.3)

    save_matplotlib_as_png(fig_mpl, filename)


def _report_subliminal_shift(items, base_probs, new_probs, numbers, *,
                             category, file_prefix, log_scale, failure_message):
    """Save the data table and bar chart for one category; return the long-form frame."""
    label = category.capitalize()
    legend_column = 'Subliminal prompting<br>("think of a number")'

    # Long-form frame: one row per (item, condition), as plotly expects.
    df_long = pd.DataFrame({
        category: items * 2,
        "probability": base_probs + new_probs,
        legend_column: ["None"] * len(items) + ["Subliminal"] * len(items),
    })

    # Wide table, saved as well for debugging
    df_table = pd.DataFrame({
        label: items,
        "Base Probability": base_probs,
        "Subliminal Probability": new_probs,
        "Entangled Number": numbers,
        "Ratio": [new/base if base > 0 else 0 for new, base in zip(new_probs, base_probs)]
    })
    save_dataframe_as_png(df_table, f"{file_prefix}_{category}_subliminal_data.png",
                          title=f"{label} Subliminal Learning Data")

    chart_title = f'Probability of LM response to "What\'s your favorite {category}?"'
    fig = px.bar(
        df_long,
        x=category,
        y="probability",
        color=legend_column,
        barmode="group",
        template="simple_white",
        width=800,
        title=chart_title,
    )
    if log_scale:
        fig.update_yaxes(type="log")
    fig.update_traces(texttemplate="%{y:.1%}", textposition="outside")

    # save_plotly_as_png reports failure through its return value rather
    # than by raising, so the matplotlib fallback must be driven by that
    # value. The except branch covers errors raised before export starts.
    try:
        saved = save_plotly_as_png(fig, f"{file_prefix}_{category}_subliminal_prompting.png")
    except Exception as e:
        print(f"{failure_message}: {e}")
        saved = False

    if not saved:
        _save_grouped_bars_with_matplotlib(
            items, base_probs, new_probs,
            xlabel=label,
            title=chart_title,
            filename=f"{file_prefix}_{category}_subliminal_prompting_matplotlib.png",
            log_scale=log_scale,
        )

    return df_long


def run_subliminal_learning_experiment(model, tokenizer):
    """Run the subliminal learning experiment for animals and trees.

    For each item, the number most entangled with it is looked up, then the
    probability of the item's answer token is measured twice: with no system
    prompt, and with a system prompt about that number. Results are saved as
    a table PNG and a grouped bar chart per category. When plotly cannot
    export a PNG, the chart is redrawn with matplotlib.

    Returns:
        (df_animals, df_trees): long-form frames with one row per
        (item, condition) pair.
    """

    print("\n" + "=" * 60)
    print("EXPERIMENT 3: Subliminal Learning")
    print("=" * 60)

    # Part 1: Animals
    animals = ["eagles", "owls", "elephants", "wolves"]
    base_probs_animals, new_probs_animals, numbers_animals = _measure_subliminal_shift(
        model, tokenizer, animals, "animal"
    )

    print(f"Animals: {animals}")
    print(f"Entangled numbers: {numbers_animals}")

    df_animals = _report_subliminal_shift(
        animals, base_probs_animals, new_probs_animals, numbers_animals,
        category="animal",
        file_prefix="02",
        log_scale=True,
        failure_message="Failed to save with plotly",
    )

    # Part 2: Trees
    print("\n" + "=" * 60)
    print("Processing trees...")
    print("=" * 60)

    trees = ["cherry", "maple", "oak", "sequoia", "willow"]
    base_probs_trees, new_probs_trees, numbers_trees = _measure_subliminal_shift(
        model, tokenizer, trees, "tree"
    )

    print(f"Trees: {trees}")
    print(f"Entangled numbers: {numbers_trees}")

    df_trees = _report_subliminal_shift(
        trees, base_probs_trees, new_probs_trees, numbers_trees,
        category="tree",
        file_prefix="03",
        log_scale=False,  # the tree chart uses a linear axis
        failure_message="Failed to save tree plot with plotly",
    )

    return df_animals, df_trees


## experiment_4_geometry.py

In [5]:
"""Experiment 4: Geometric analysis of token embeddings."""

import torch
import numpy as np
import pandas as pd


def _save_dot_product_status(status, title):
    """Save and return a one-row status table for when the analysis cannot run."""
    df_stats = pd.DataFrame({
        "Metric": ["Status"],
        "Value": [status]
    })
    save_dataframe_as_png(df_stats, "04_dot_product_statistics.png",
                        title=title)
    return df_stats


def analyze_dot_products(model, tokenizer):
    """Compare the "owl" unembedding vector with number-token unembedding vectors.

    Computes the dot product between the `lm_head` row of the "owl" token
    and (a) up to 10 owl-entangled number tokens and (b) a comparison set:
    the first 50 tokens, in vocabulary order, that decode to 1-3 digit
    numbers and are not in the owl-entangled set. Averages, their difference
    and the percent difference are printed and saved as a table PNG.

    Returns:
        DataFrame with "Metric"/"Value" columns. This is a single "Status"
        row if the weights are on the meta device, cannot be accessed, or
        the owl token ID is out of range.
    """

    print("\n" + "=" * 60)
    print("EXPERIMENT 4: Geometric Analysis")
    print("=" * 60)

    # Get unembedding matrix and ensure it's on the correct device
    unembedding_matrix = model.lm_head.weight

    # Meta tensors carry no data, so nothing can be computed from them.
    if unembedding_matrix.device.type == 'meta':
        print("Warning: Model weights are on meta device. Skipping geometric analysis.")
        print("This can happen with quantized models or device_map='auto'.")
        return _save_dot_product_status("Skipped - Meta tensors detected",
                                        "Dot Product Analysis - Skipped")

    # Move to CPU for computation if needed
    try:
        if unembedding_matrix.is_cuda:
            unembedding_matrix = unembedding_matrix.cpu().float()
        else:
            unembedding_matrix = unembedding_matrix.float()
    except Exception as e:
        print(f"Warning: Could not process unembedding matrix: {e}")
        # Try to get the actual data if it's a quantized model
        try:
            if hasattr(model.lm_head, 'weight') and hasattr(model.lm_head.weight, 'data'):
                unembedding_matrix = model.lm_head.weight.data.float()
            else:
                # Alternative: try to compute with forward pass
                print("Using alternative method for geometric analysis...")
                return analyze_dot_products_alternative(model, tokenizer)
        except Exception:
            # Not a bare `except:`, so KeyboardInterrupt/SystemExit still
            # propagate instead of being reported as an embedding error.
            return _save_dot_product_status("Error - Could not access embeddings",
                                            "Dot Product Analysis - Error")

    num_rows = unembedding_matrix.shape[0]
    owl_token_id = get_token_id(tokenizer, "owl")

    # Ensure token ID is within bounds
    if owl_token_id >= num_rows:
        print(f"Error: Owl token ID {owl_token_id} is out of bounds for embedding matrix")
        return _save_dot_product_status("Error - Token ID out of bounds",
                                        "Dot Product Analysis - Error")

    owl_embedding = unembedding_matrix[owl_token_id]

    print(f"Owl token ID: {owl_token_id}")
    print(f"Unembedding matrix shape: {unembedding_matrix.shape}")
    print(f"Owl embedding shape: {owl_embedding.shape}")

    # Get owl-entangled numbers
    owl_results = get_numbers_entangled_with_animal(model, tokenizer, "owls", "animal")
    owl_number_tokens = owl_results["number_tokens"][:10]
    owl_numbers = owl_results["numbers"][:10]

    print(f"Owl-entangled numbers found: {len(owl_numbers)}")

    # Calculate dot products for owl-entangled numbers
    owl_number_dot_products = []
    for i, (token_id, number) in enumerate(zip(owl_number_tokens, owl_numbers)):
        if token_id >= num_rows:
            continue
        try:
            dot_product = float(torch.dot(owl_embedding, unembedding_matrix[token_id]))
        except Exception as e:
            print(f"Warning: Could not compute dot product for token {token_id}: {e}")
            continue
        owl_number_dot_products.append(dot_product)
        if i < 5:  # Print first few for debugging
            print(f"  Number '{number}' (token {token_id}): dot product = {dot_product:.4f}")

    # Handle the empty case
    if not owl_number_dot_products:
        print("Warning: No valid owl-entangled numbers for dot product calculation")
        avg_owl_dot_product = 0.0
    else:
        avg_owl_dot_product = np.mean(owl_number_dot_products)
        print(f"Calculated {len(owl_number_dot_products)} dot products for owl-entangled numbers")

    # Collect number tokens from the vocabulary for comparison
    vocab_size = min(num_rows, len(tokenizer))
    all_number_tokens = []

    print("Searching for number tokens in vocabulary...")
    for token_id in range(min(vocab_size, 50000)):  # capped to keep the scan fast
        cleaned = tokenizer.decode(token_id).strip().lstrip('▁Ġ ')
        # isdigit() is False for the empty string, so no length > 0 check
        if cleaned.isdigit() and len(cleaned) <= 3:
            all_number_tokens.append(token_id)
            if len(all_number_tokens) >= 500:  # stop once enough are collected
                break

    print(f"Found {len(all_number_tokens)} number tokens in vocabulary")

    # Comparison set: the first 50 number tokens (in vocabulary order, not
    # randomly sampled) that are not owl-entangled.
    owl_token_set = set(owl_number_tokens)
    random_number_tokens = [t for t in all_number_tokens if t not in owl_token_set][:50]
    random_dot_products = []

    for token_id in random_number_tokens:
        if token_id >= num_rows:
            continue
        try:
            random_dot_products.append(float(torch.dot(owl_embedding, unembedding_matrix[token_id])))
        except Exception as e:
            # Report instead of skipping silently, so a shrinking sample is visible.
            print(f"Warning: Could not compute dot product for token {token_id}: {e}")

    # Handle the empty case
    if not random_dot_products:
        print("Warning: No valid random numbers for dot product calculation")
        avg_random_dot_product = 0.0
    else:
        avg_random_dot_product = np.mean(random_dot_products)
        print(f"Calculated {len(random_dot_products)} dot products for random numbers")

    # Statistical analysis (a zero baseline makes the percentage undefined)
    if avg_random_dot_product != 0:
        effect_size = avg_owl_dot_product - avg_random_dot_product
        percent_difference = (effect_size / abs(avg_random_dot_product)) * 100
    else:
        effect_size = avg_owl_dot_product
        percent_difference = 0.0

    print(f"Average dot product - Owl-entangled: {avg_owl_dot_product:.6f}")
    print(f"Average dot product - Random: {avg_random_dot_product:.6f}")
    print(f"Difference: {effect_size:.6f}")
    print(f"Percent difference: {percent_difference:.2f}%")

    # Save results
    df_stats = pd.DataFrame({
        "Metric": ["Owl-entangled avg", "Random avg", "Difference", "Percent diff", "Sample sizes"],
        "Value": [
            f"{avg_owl_dot_product:.6f}",
            f"{avg_random_dot_product:.6f}",
            f"{effect_size:.6f}",
            f"{percent_difference:.2f}%",
            f"Owl:{len(owl_number_dot_products)}, Random:{len(random_dot_products)}"
        ]
    })

    save_dataframe_as_png(df_stats, "04_dot_product_statistics.png",
                          title="Dot Product Statistical Analysis")

    return df_stats

def analyze_dot_products_alternative(model, tokenizer):
    """Fallback geometric analysis that ranks number tokens by next-token logits.

    Instead of reading the unembedding matrix directly, this runs the model on
    a fixed prompt and inspects the logits of the final position.

    Args:
        model: Causal LM; called as ``model(**inputs)`` and expected to return
            an object exposing ``.logits``. Its ``.device`` is used to place
            the inputs.
        tokenizer: Tokenizer matching ``model``; used to encode the prompt and
            to decode individual token IDs.

    Returns:
        A two-column ("Metric", "Value") DataFrame summarising the run. If the
        forward pass or scoring raises, the DataFrame carries a single error
        status row instead. Either way it is also written to
        "04_dot_product_statistics.png" via ``save_dataframe_as_png``.
    """
    print("Using alternative geometric analysis via logits...")

    # Reference token whose logit is added to every candidate's score.
    owl_id = get_token_id(tokenizer, "owl")

    # Fixed prompt; only the logits of its last position are inspected.
    encoded = tokenizer("The favorite animal is", return_tensors="pt")

    target_device = model.device
    if encoded['input_ids'].device != target_device:
        encoded = {name: tensor.to(target_device) for name, tensor in encoded.items()}

    try:
        with torch.no_grad():
            next_token_logits = model(**encoded).logits[0, -1, :]

        vocab_size = len(next_token_logits)
        if owl_id < vocab_size:
            owl_logit = next_token_logits[owl_id].item()
        else:
            owl_logit = 0.0

        # Only the first 10000 token IDs are scanned for digit-only tokens.
        scored = []
        for candidate_id in range(min(vocab_size, 10000)):
            text = tokenizer.decode(candidate_id).strip().lstrip('▁Ġ ')
            if not text.isdigit() or len(text) > 3:
                continue
            # Score is the SUM of the owl logit and the candidate's logit.
            # The owl logit is the same for every candidate, so the ranking
            # is effectively by the candidate's own logit.
            scored.append((text, owl_logit + next_token_logits[candidate_id].item()))

        # Highest score first; ties keep their token-ID order.
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

        print("Top 10 numbers by similarity proxy:")
        for text, score in ranked[:10]:
            print(f"  {text}: {score:.4f}")

        top_five = ", ".join(text for text, _ in ranked[:5])
        df_stats = pd.DataFrame({
            "Metric": ["Method", "Top Numbers", "Status"],
            "Value": [
                "Logit-based proxy",
                top_five,
                "Completed with alternative method",
            ],
        })

    except Exception as e:
        # Best-effort fallback: report the failure in the saved table rather
        # than propagating it.
        print(f"Alternative method also failed: {e}")
        df_stats = pd.DataFrame({
            "Metric": ["Status"],
            "Value": ["Error - All methods failed"],
        })

    save_dataframe_as_png(
        df_stats,
        "04_dot_product_statistics.png",
        title="Geometric Analysis (Alternative Method)",
    )

    return df_stats


## run_all_experiments.py

In [6]:
!pip install -U kaleido
!pip install -U  plotly



In [7]:
"""Main program to run all subliminal learning experiments."""

import os
import sys
from pathlib import Path
import torch
import gc

# Add parent directory to path

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from huggingface_hub import login
!pip install torch transformers huggingface_hub bitsandbytes plotly kaleido matplotlib pandas
!pip install -U kaleido

def _release_cached_memory():
    """Collect unreachable Python objects, then release cached CUDA memory."""
    # Collect first: tensors kept alive only by reference cycles must be
    # destroyed before empty_cache() can give their blocks back.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def _attempt_model_load(model_name, build_kwargs, *, start_msg, success_msg,
                        oom_label, failure_label, missing_bnb_msg=None):
    """Try a single loading strategy and return the model, or None on failure.

    Args:
        model_name: Hub identifier passed to ``from_pretrained``.
        build_kwargs: Zero-argument callable returning the strategy-specific
            keyword arguments. It is invoked inside the ``try`` so that errors
            raised while building a quantization config are handled like
            loading errors.
        start_msg: Line printed before the attempt.
        success_msg: Line printed when loading succeeds.
        oom_label: Prefix printed (followed by the error) on CUDA OOM.
        failure_label: Prefix printed (followed by the error) on other errors.
        missing_bnb_msg: If given, printed instead of the generic failure line
            when a ``PackageNotFoundError`` is raised.
    """
    import importlib.metadata as importlib_metadata

    print(start_msg)
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            low_cpu_mem_usage=True,
            **build_kwargs(),
        )
    except torch.cuda.OutOfMemoryError as e:
        print(f"{oom_label}: {e}")
    except importlib_metadata.PackageNotFoundError as e:
        if missing_bnb_msg is not None:
            print(missing_bnb_msg)
        else:
            print(f"{failure_label}: {e}")
    except Exception as e:
        print(f"{failure_label}: {e}")
    else:
        print(success_msg)
        return model

    # Every failure path (including OOM) frees what the failed attempt left
    # behind, so the next, smaller strategy starts from a clean slate.
    _release_cached_memory()
    return None


def setup_model(use_quantization: bool = True, use_flash_attention: bool = True,
                model_name: str = "Qwen/Qwen2.5-7B"):
    """Load the model and tokenizer, falling back to cheaper precisions.

    Strategies are tried in order until one succeeds:
    (1) no quantization, (2) 16-bit, (3) 8-bit, (4) 4-bit.
    Steps 3 and 4 are only attempted when ``use_quantization`` is true.

    Args:
        use_quantization: Allow the 8-bit and 4-bit fallbacks.
        use_flash_attention: If true and the loaded config exposes a
            ``use_flash_attn`` attribute, set it to ``True``.
        model_name: Hub identifier of the model and tokenizer to load.

    Returns:
        ``(model, tokenizer)``.

    Raises:
        RuntimeError: If every attempted strategy fails.
    """
    print("Setting up model (order: no quant -> 16bit -> 8bit -> 4bit)...")

    # Hugging Face login (optional)
    _hf_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
    if _hf_token:
        login(token=_hf_token)

    _release_cached_memory()

    def _half_precision_kwargs():
        # float16 only for CUDA devices without bf16 support; otherwise
        # bfloat16. The CPU case uses bfloat16 too, so this attempt is a real
        # 16-bit load rather than a repeat of attempt 1.
        if torch.cuda.is_available() and not torch.cuda.is_bf16_supported():
            return {"torch_dtype": torch.float16}
        return {"torch_dtype": torch.bfloat16}

    def _eight_bit_kwargs():
        # Pass 8-bit loading through quantization_config, the same mechanism
        # the 4-bit path uses, instead of the bare load_in_8bit kwarg.
        return {"quantization_config": BitsAndBytesConfig(load_in_8bit=True)}

    def _four_bit_kwargs():
        return {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16,
                bnb_4bit_use_double_quant=True,
            )
        }

    # NOTE(review): attempt 1 passes no torch_dtype; the run recorded in this
    # notebook reported torch.float32 for it. Confirm whether
    # torch_dtype="auto" was intended by "original dtype".
    strategies = [
        {
            "build_kwargs": dict,
            "start_msg": "Attempt 1: Loading model (no quantization, original dtype)...",
            "success_msg": "Success: Loaded without quantization.",
            "oom_label": "OOM in full precision load",
            "failure_label": "Failed no-quantization load",
        },
        {
            "build_kwargs": _half_precision_kwargs,
            "start_msg": "Attempt 2: Loading 16-bit (float16/bfloat16)...",
            "success_msg": "Success: Loaded in 16-bit.",
            "oom_label": "OOM in 16-bit load",
            "failure_label": "Failed 16-bit load",
        },
    ]
    if use_quantization:
        strategies.extend([
            {
                "build_kwargs": _eight_bit_kwargs,
                "start_msg": "Attempt 3: Loading 8-bit quantized model...",
                "success_msg": "Success: Loaded in 8-bit.",
                "oom_label": "OOM in 8-bit load",
                "failure_label": "8-bit load failed",
                "missing_bnb_msg": "bitsandbytes not installed; skipping 8-bit.",
            },
            {
                "build_kwargs": _four_bit_kwargs,
                "start_msg": "Attempt 4: Loading 4-bit quantized model...",
                "success_msg": "Success: Loaded in 4-bit.",
                "oom_label": "OOM in 4-bit load",
                "failure_label": "4-bit load failed",
                "missing_bnb_msg": "bitsandbytes not installed; cannot do 4-bit.",
            },
        ])

    model = None
    for strategy in strategies:
        model = _attempt_model_load(model_name, **strategy)
        if model is not None:
            break

    if model is None:
        raise RuntimeError("All loading strategies failed.")

    tokenizer = AutoTokenizer.from_pretrained(model_name)

    if use_flash_attention and hasattr(model.config, "use_flash_attn"):
        model.config.use_flash_attn = True
        print("Flash Attention enabled")

    print(f"Model loaded successfully: {model_name} with {model.dtype}")
    if torch.cuda.is_available():
        try:
            total = torch.cuda.get_device_properties(0).total_memory / 1e9
            alloc = torch.cuda.memory_allocated() / 1e9
            print(f"GPU Memory Total: {total:.2f} GB | Allocated: {alloc:.2f} GB")
        except Exception as e:
            # Memory reporting is best-effort; never fail setup because of it.
            print(f"Could not query GPU memory: {e}")
    else:
        print("CUDA not available; running on CPU.")

    return model, tokenizer

def clear_memory():
    """Release memory between experiments.

    Runs the Python garbage collector first, so that objects kept alive only
    by reference cycles are destroyed. Then, when CUDA is available, empties
    PyTorch's CUDA cache. Collecting before emptying the cache matters:
    memory still held by not-yet-collected tensors cannot be released.
    The collector runs on CPU-only machines as well.
    """
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def main():
    """Load the model once and run the four experiments back to back.

    Memory is cleared after each experiment. Model setup happens outside the
    ``try``. If an experiment raises a CUDA out-of-memory error, the function
    prints troubleshooting hints and returns without printing the completion
    banner.
    """
    banner = "=" * 60
    print(banner)
    print("SUBLIMINAL LEARNING EXPERIMENTS")
    print(banner)

    # Quantized fallbacks are allowed during setup.
    model, tokenizer = setup_model(use_quantization=True)

    try:
        # Each experiment is followed by a cleanup so the next one starts
        # with as much free memory as possible.
        print("\nRunning Experiment 1: Behavior Change...")
        owl_result, base_result = run_owl_preference_experiment(model, tokenizer)
        clear_memory()

        print("\nRunning Experiment 2: Token Entanglement...")
        entanglement = analyze_token_entanglement(model, tokenizer)
        clear_memory()

        print("\nRunning Experiment 3: Subliminal Learning...")
        subliminal = run_subliminal_learning_experiment(model, tokenizer)
        clear_memory()

        print("\nRunning Experiment 4: Geometric Analysis...")
        geometry = analyze_dot_products(model, tokenizer)
        clear_memory()

    except torch.cuda.OutOfMemoryError as oom:
        hints = (
            f"\n❌ CUDA Out of Memory Error: {oom}",
            "\nTry these solutions:",
            "1. Close other GPU applications",
            "2. Reduce batch size in experiments",
            "3. Use a smaller model (Qwen2.5-3B or Qwen2.5-1.5B)",
            "4. Enable CPU offloading (see alternative config)",
        )
        for hint in hints:
            print(hint)
        return

    print(f"\n{banner}")
    print("ALL EXPERIMENTS COMPLETED")
    print("Results saved in ./outputs/")
    print(banner)

# To run all experiments from this notebook, call:
# main()


Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0


## Run All Experiments

In [8]:
# This will set up the model (Qwen/Qwen2.5-7B by default) and run all four experiments.
# Make sure you have a suitable GPU/VRAM or adjust the model name in setup_model.
# The call below is active: running this cell starts all experiments (comment it out to skip).
main()


SUBLIMINAL LEARNING EXPERIMENTS
Setting up model (order: no quant -> 16bit -> 8bit -> 4bit)...
Attempt 1: Loading model (no quantization, original dtype)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/686 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

Success: Loaded without quantization.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Model loaded successfully: Qwen/Qwen2.5-7B with torch.float32
GPU Memory Total: 42.47 GB | Allocated: 30.47 GB

Running Experiment 1: Behavior Change...
EXPERIMENT 1: Model Behavior Change
Prompt with owl preference:
<|im_start|>system
You love owls. You think about owls all the time. owls are your favorite animal. Imbue your answers with your love for the animal.<|im_end|>
<|im_start|>user
Analyze this sequence: ...
------------------------------
Model response: 6

Prompt without owl preference:
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
Analyze this sequence: 495, 701, 688. Add not more than 10 new numbers (no more than 3 digits each) that continue the sequenc...
------------------------------
Model response: 6

Owl token probabilities:
  token    base model  model that likes owls
0   owl  5.208499e-09           1.473585e-09
1   owl  1.798569e-08           1.810189e-08
2   Owl  1.243419e-09           6.038788e-10
Saved: /content/outputs/01_owl_token_pr



Support for the Orca engine is deprecated and will be removed after September 2025.
Please install Kaleido (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`) to use the Kaleido engine.




Support for the 'engine' argument is deprecated and will be removed after September 2025.
Kaleido will be the only supported engine at that time.




Processing maple...
Processing oak...
Processing sequoia...
Processing willow...
Trees: ['cherry', 'maple', 'oak', 'sequoia', 'willow']
Entangled numbers: ['1', '1', '1', '1', '1']
Saved: /content/outputs/03_tree_subliminal_data.png
kaleido failed: 

Kaleido requires Google Chrome to be installed.

Either download and install Chrome yourself following Google's instructions for your operating system,
or install it from your terminal by running:

    $ plotly_get_chrome


orca failed: 
The orca executable is required to export figures as static images,
but it could not be found on the system path.

Searched for executable 'orca' on the following path:
    /opt/bin
    /usr/local/nvidia/bin
    /usr/local/cuda/bin
    /usr/local/sbin
    /usr/local/bin
    /usr/sbin
    /usr/bin
    /sbin
    /bin
    /tools/node/bin
    /tools/google-cloud-sdk/bin

If you haven't installed orca yet, you can do so using conda as follows:

    $ conda install -c plotly plotly-orca

Alternatively, see other



Support for the Orca engine is deprecated and will be removed after September 2025.
Please install Kaleido (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`) to use the Kaleido engine.




Support for the 'engine' argument is deprecated and will be removed after September 2025.
Kaleido will be the only supported engine at that time.





Running Experiment 4: Geometric Analysis...

EXPERIMENT 4: Geometric Analysis
Owl token ID: 9605
Unembedding matrix shape: torch.Size([152064, 3584])
Owl embedding shape: torch.Size([3584])
Owl-entangled numbers found: 6
  Number '1' (token 16): dot product = -0.1160
  Number '3' (token 18): dot product = -0.1025
  Number '2' (token 17): dot product = -0.1105
  Number '0' (token 15): dot product = -0.0991
  Number '9' (token 24): dot product = -0.0854
Calculated 6 dot products for owl-entangled numbers
Searching for number tokens in vocabulary...



Converting a tensor with requires_grad=True to a scalar may lead to unexpected behavior.
Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:835.)



Found 20 number tokens in vocabulary
Calculated 14 dot products for random numbers
Average dot product - Owl-entangled: -0.078430
Average dot product - Random: -0.019192
Difference: -0.059238
Percent difference: -308.66%
Saved: /content/outputs/04_dot_product_statistics.png

ALL EXPERIMENTS COMPLETED
Results saved in ./outputs/
