## Frequent Token Distribution

In [None]:
import pandas as pd
from ast import literal_eval
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from logitlens import LogitLens

# Checkpoint under analysis; swap in one of the commented ids to switch models.
# model_name = "Tower-Babel/Babel-9B-Chat"
model_name = "google/gemma-3-12b-it"
# model_name = "meta-llama/Llama-2-7b-chat-hf"

# Short name = text after the last "/" of the hub id (used in file names and plot titles).
model_name_short = model_name.rsplit("/", 1)[-1]
logit_lens = LogitLens("English", model_name)

# Logit-lens output table previously written for this model.
df = pd.read_csv(
    f"/home/hyujang/multilingual-inner-lexicon/output/RQ1/WordIdentity/single_token_simple_split_{model_name_short}_English_v3.csv"
)

def _looks_like_list(value):
    """Return True for a string whose first non-blank character is "["."""
    return isinstance(value, str) and value.strip().startswith("[")


# Columns that contain stringified Python lists are parsed back into real lists.
for col in df.columns:
    try:
        has_list_literal = df[col].apply(_looks_like_list).any()
        if has_list_literal:
            df[col] = df[col].apply(literal_eval)
    except (ValueError, SyntaxError):
        # At least one value failed to parse: the column is left untouched.
        continue

# Per-layer columns with the top predicted token ids and the matching token strings.
# Both lists are sorted the same (lexicographic) way so that zip() pairs them up.
id_cols = sorted(
    col for col in df.columns
    if col.startswith("layer_") and col.endswith("_top_token_id")
)
str_cols = sorted(
    col for col in df.columns
    if col.startswith("layer_") and col.endswith("_top_token_str")
)

# token id -> token string; the first string seen for an id is the one kept.
token_id_to_str = {}
for id_col, str_col in zip(id_cols, str_cols):
    for row_ids, row_strs in zip(df[id_col], df[str_col]):
        for token_id, token_str in zip(row_ids, row_strs):
            token_id_to_str.setdefault(token_id, token_str)


# Columns holding the per-layer top predicted token *ids*. The variable keeps its
# historical name; ids are only mapped to token strings at the very end, when the
# index of the frequency table is renamed.
token_str_cols = [col for col in df.columns if col.startswith("layer_") and col.endswith("_top_token_id")]

# Step 1: flatten the top-token ids of every layer and every row
all_tokens = []

for col in token_str_cols:
    for token_list_str in df[col]:
        try:
            all_tokens.extend(token_list_str)
        except TypeError:
            # Cell is not iterable (e.g. NaN for a missing value): skip it.
            # Only TypeError is caught so that real failures and Ctrl-C still surface.
            continue

# The 20 ids that occur most often over all layers combined
top_20_tokens = [tok for tok, _ in Counter(all_tokens).most_common(20)]

# Step 2: Count frequency of each top token per layer
layer_token_freq = pd.DataFrame(index=top_20_tokens, columns=token_str_cols)

for col in token_str_cols:
    token_counts = Counter()
    for token_list_str in df[col]:
        try:
            token_counts.update(token_list_str)
        except TypeError:
            # Same skip rule as in step 1, so both passes see the same cells.
            continue
    for tok in top_20_tokens:
        layer_token_freq.at[tok, col] = token_counts.get(tok, 0)

# Convert to numeric
layer_token_freq = layer_token_freq.fillna(0).astype(int)

# Rename index using token strings looked up from the tokenizer
layer_token_freq.index = layer_token_freq.index.astype(int)
token_id_to_str = {
    int(token_id): logit_lens.tokenizer.convert_ids_to_tokens(int(token_id))
    for token_id in token_id_to_str
}
layer_token_freq.rename(index=token_id_to_str, inplace=True)

# Optional: clean column names (layer_1 -> 1, etc.)
# layer_token_freq.columns = [int(col.split("_")[1]) for col in layer_token_freq.columns]

In [None]:
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
import re

# --- Optional: ensure layers are ordered numerically even if they’re strings like "L0", "Layer 1", etc.
def _layer_key(s):
    m = re.search(r'\d+', str(s))
    return int(m.group()) if m else 0

# Rows keep the table's own order; columns are sorted by the first integer in their name.
token_order = list(layer_token_freq.index)
layer_order = sorted(layer_token_freq.columns, key=_layer_key)
counts_matrix = layer_token_freq.loc[token_order, layer_order].values

# Heatmap of the counts: x = layer columns, y = tokens, value printed inside each cell.
fig = px.imshow(
    counts_matrix,
    x=layer_order,
    y=token_order,
    text_auto=True,
    aspect="auto",
    color_continuous_scale="YlOrBr",
)

# Small gaps between cells read like gridlines; hover text lists layer, token and count.
fig.update_traces(
    xgap=1,
    ygap=1,
    hovertemplate="Layer: %{x}<br>Token: %{y}<br>Count: %{z}<extra></extra>",
    textfont={"size": 12},
)

# Figure-level settings: theme, title, canvas size and margins.
fig.update_layout(
    template="plotly_white",
    title=f"Top 20 Most Frequent Predicted Tokens by Layer ({model_name_short}, English)",
    title_font={"size": 20},
    width=1200,
    height=500,
    margin={"l": 80, "r": 80, "t": 80, "b": 60},
)

# Settings common to both axes, then the per-axis extras.
shared_axis_style = {
    "title_font": {"size": 18},
    "tickfont": {"size": 14},
    "showgrid": False,
}
fig.update_xaxes(title_text="Layer", tickangle=0, **shared_axis_style)
fig.update_yaxes(title_text="Predicted Token", autorange="reversed", **shared_axis_style)

# Colorbar title and size.
fig.update_coloraxes(colorbar_title="Count", colorbar_thickness=16, colorbar_len=0.8)

fig.show()


## Diagnosing Rogue Dimension via Anisotropy

In [None]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from logitlens import LogitLens
import pandas as pd
import matplotlib.pyplot as plt
import ast

In [None]:
# Line colour for each model, keyed by the short model name (the part of the
# hub id after the last "/"). The plotting cells index this dict with those
# short names, so a model missing here raises KeyError when plotted.
model_colors = {
    "Babel-9B-Chat": "#66c2a5",
    "gemma-3-12b-it": "#fc8d62",
    "Llama-2-7b-chat-hf": "#e78ac3"
}

In [None]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt

# Hub ids of the checkpoints to compare; the loop further down in this cell
# builds one LogitLens per entry.
models = [
    "google/gemma-3-12b-it",
    "meta-llama/Llama-2-7b-chat-hf",
    "Tower-Babel/Babel-9B-Chat"
]
# Language passed to LogitLens and interpolated into the input CSV path.
language = "English"

# Function to measure anisotropy
def measure_anisotropy(hidden_matrix: torch.Tensor, sample_size=1000, generator=None):
    """Estimate anisotropy as the mean cosine similarity of random row pairs.

    Rows are L2-normalised in float32 and ``sample_size`` pairs of *distinct*
    rows are drawn. Self-pairs are excluded: a row compared with itself has
    cosine similarity 1 and would bias the estimate upwards.

    Args:
        hidden_matrix: 2-D tensor with one vector per row.
        sample_size: number of random pairs to average over.
        generator: optional ``torch.Generator`` for reproducible sampling;
            the global RNG is used when omitted.

    Returns:
        The mean cosine similarity of the sampled pairs as a Python float.

    Raises:
        ValueError: if ``hidden_matrix`` has fewer than two rows, because no
            pair of distinct rows exists.
    """
    # float32 keeps the normalisation and the final mean accurate for bf16/fp16 inputs.
    normed = F.normalize(hidden_matrix.float(), dim=-1)
    N = normed.size(0)
    if N < 2:
        raise ValueError(
            f"measure_anisotropy needs at least 2 rows to form a pair, got {N}"
        )
    idx1 = torch.randint(0, N, (sample_size,), generator=generator)
    # A non-zero offset modulo N guarantees idx2 != idx1 for every pair.
    offset = torch.randint(1, N, (sample_size,), generator=generator)
    idx2 = (idx1 + offset) % N
    sims = (normed[idx1] * normed[idx2]).sum(dim=1)
    return sims.mean().item()

# Per-layer anisotropy curve of every model, keyed by the model's short name.
anisotropy_data = {}

for model_name in models:
    print(f"Processing model: {model_name}")
    model_short_name = model_name.split("/")[-1]
    logit_lens = LogitLens(language, model_name)

    # Input table for this model / language pair.
    path = f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordIdentity/single_token_splitted_{model_short_name}_{language}_v2.csv"
    df = pd.read_csv(path)

    # Logit-lens pass that also returns the hidden states; keep them per word.
    hidden_states_results = logit_lens.run_logit_lens(df, split_type="simple_split", return_hidden_states=True)
    all_hiddens_dict = {}
    for result in hidden_states_results:
        all_hiddens_dict[result["word"]] = result["all_hidden_states"]

    # The first word's entry tells how many layers were recorded.
    num_layers = len(next(iter(all_hiddens_dict.values())))

    # For each layer: stack that layer's vector of every word, then score the stack.
    anisotropy_per_layer = [
        measure_anisotropy(
            torch.stack([word_hiddens[layer_idx] for word_hiddens in all_hiddens_dict.values()])
        )
        for layer_idx in range(num_layers)
    ]
    anisotropy_data[model_short_name] = anisotropy_per_layer
    
# Plot anisotropy for all models


In [None]:
# One anisotropy curve per model; x positions are 1-based layer numbers.
plt.figure(figsize=(10, 6))
ax = plt.gca()
for model_short_name, anisotropy_per_layer in anisotropy_data.items():
    layer_numbers = range(1, len(anisotropy_per_layer) + 1)
    ax.plot(
        layer_numbers,
        anisotropy_per_layer,
        marker='o',
        label=model_short_name,
        color=model_colors[model_short_name],
    )

# Axis labels and tick sizes
# plt.title("Anisotropy of Hidden States per Layer (English)", fontsize=20)
ax.set_xlabel("Layer", fontsize=18)
ax.set_ylabel("Anisotropy", fontsize=18)
ax.tick_params(axis='both', which='major', labelsize=14)

# Legend entries are listed alphabetically by model name rather than in plot order.
handles, labels = ax.get_legend_handles_labels()
labels, handles = zip(*sorted(zip(labels, handles), key=lambda pair: pair[0]))
ax.legend(handles, labels, title="Model", fontsize=14, loc="best")

ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
import torch
import matplotlib.pyplot as plt

# Hub ids of the checkpoints to compare; the loop further down in this cell
# builds one LogitLens per entry.
models = [
    "google/gemma-3-12b-it",
    "meta-llama/Llama-2-7b-chat-hf",
    "Tower-Babel/Babel-9B-Chat"
]
# Language passed to LogitLens and interpolated into the input CSV path.
language = "English"

# Function to compute rogue dimension strength
def compute_rogue_dimension_strength(layer_hiddens):
    """Return the share of total variance carried by the top principal direction.

    This is the largest eigenvalue of the sample covariance matrix divided by
    the sum of all eigenvalues.

    layer_hiddens: torch.Tensor of shape [num_words, hidden_dim]; a 1-D tensor
        is treated as a single row.
    returns: float, rogue dimension strength; NaN when the variance is
        undefined (fewer than two rows) or zero (all rows identical).
    """
    layer_hiddens = layer_hiddens.float()
    # Ensure at least 2D
    if layer_hiddens.ndim == 1:
        layer_hiddens = layer_hiddens.unsqueeze(0)
    # A sample covariance needs at least two observations.
    if layer_hiddens.shape[0] < 2:
        return float("nan")
    # Center hidden states
    hidden_centered = layer_hiddens - layer_hiddens.mean(dim=0, keepdim=True)
    # Squared singular values of the centered matrix equal the covariance
    # eigenvalues times (num_words - 1); that factor cancels in the ratio.
    # They are never negative and avoid building the hidden_dim x hidden_dim
    # covariance matrix. svdvals returns them in descending order.
    variances = torch.linalg.svdvals(hidden_centered) ** 2
    total_variance = variances.sum()
    if total_variance <= 0:
        return float("nan")
    # Rogue dimension strength
    ratio = variances[0] / total_variance
    return ratio.item()

# Per-layer rogue dimension strength of every model, keyed by the model's short name.
rogue_strength_data = {}

for model_name in models:
    print(f"Processing model: {model_name}")
    model_short_name = model_name.split("/")[-1]
    logit_lens = LogitLens(language, model_name)

    # Input table for this model / language pair.
    path = f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordIdentity/single_token_splitted_{model_short_name}_{language}_v2.csv"
    df = pd.read_csv(path)

    # Logit-lens pass that also returns the hidden states; keep them per word.
    hidden_states_results = logit_lens.run_logit_lens(df, split_type="simple_split", return_hidden_states=True)
    all_hiddens_dict = {}
    for result in hidden_states_results:
        all_hiddens_dict[result["word"]] = result["all_hidden_states"]

    # The first word's entry tells how many layers were recorded.
    num_layers = len(next(iter(all_hiddens_dict.values())))

    # For each layer: stack that layer's vector of every word, then score the stack.
    rogue_strengths = [
        compute_rogue_dimension_strength(
            torch.stack([word_hiddens[layer_idx] for word_hiddens in all_hiddens_dict.values()])
        )
        for layer_idx in range(num_layers)
    ]
    rogue_strength_data[model_short_name] = rogue_strengths

In [None]:
# One rogue-dimension-strength curve per model; x positions are 1-based layer numbers.
plt.figure(figsize=(10, 6))
ax = plt.gca()
for model_short_name, rogue_strengths in rogue_strength_data.items():
    layer_numbers = range(1, len(rogue_strengths) + 1)
    ax.plot(
        layer_numbers,
        rogue_strengths,
        marker='o',
        label=model_short_name,
        color=model_colors[model_short_name],
    )

# Axis labels and tick sizes
# plt.title("Rogue Dimension Strength Across Layers (English)", fontsize=16)
ax.set_xlabel("Layer", fontsize=18)
ax.set_ylabel("Rogue Dimension Strength", fontsize=18)
ax.tick_params(axis='both', which='major', labelsize=14)

# Legend entries are listed alphabetically by model name rather than in plot order.
handles, labels = ax.get_legend_handles_labels()
labels, handles = zip(*sorted(zip(labels, handles), key=lambda pair: pair[0]))
ax.legend(handles, labels, title="Model", fontsize=14, loc="best")

ax.grid(True)
plt.tight_layout()
plt.show()

### Individual Models

In [None]:
import torch  # needed for torch.stack below when this cell is run on its own

# MODEL_NAME = "google/gemma-3-12b-it"
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
# MODEL_NAME = "Tower-Babel/Babel-9B-Chat"

LANGUAGE = "English"
logit_lens = LogitLens(LANGUAGE, MODEL_NAME)
# From here on MODEL_NAME holds only the short name (text after the last "/").
MODEL_NAME = MODEL_NAME.split("/")[-1]
path1 = f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/WordIdentity/single_token_splitted_{MODEL_NAME}_{LANGUAGE}_v2.csv"
df = pd.read_csv(path1)

# NOTE(review): this cell used to call run_logit_lens(df, type=...) and unpack
# the result as `all_hiddens, words`. The multi-model cells in this notebook
# call it with split_type= and read a list of dicts with "word" and
# "all_hidden_states" keys, so the same convention is used here. Confirm
# against the current LogitLens implementation.
hidden_states_results = logit_lens.run_logit_lens(df, split_type="simple_split", return_hidden_states=True)
words = [result["word"] for result in hidden_states_results]
per_word_hiddens = [result["all_hidden_states"] for result in hidden_states_results]

# all_hiddens[i] stacks layer i's vector of every word into one tensor, which is
# the per-layer form the following cells iterate over.
all_hiddens = [
    torch.stack([word_hiddens[layer_idx] for word_hiddens in per_word_hiddens])
    for layer_idx in range(len(per_word_hiddens[0]))
]

In [None]:
import torch
import torch.nn.functional as F

def measure_anisotropy(hidden_matrix: torch.Tensor, sample_size=1000, generator=None):
    """Estimate anisotropy as the mean cosine similarity of random row pairs.

    Rows are L2-normalised in float32 and ``sample_size`` pairs of *distinct*
    rows are drawn. Self-pairs are excluded: a row compared with itself has
    cosine similarity 1 and would bias the estimate upwards.

    Args:
        hidden_matrix: 2-D tensor with one vector per row.
        sample_size: number of random pairs to average over.
        generator: optional ``torch.Generator`` for reproducible sampling;
            the global RNG is used when omitted.

    Returns:
        The mean cosine similarity of the sampled pairs as a Python float.

    Raises:
        ValueError: if ``hidden_matrix`` has fewer than two rows, because no
            pair of distinct rows exists.
    """
    # float32 keeps the normalisation and the final mean accurate for bf16/fp16 inputs.
    normed = F.normalize(hidden_matrix.float(), dim=-1)
    N = normed.size(0)
    if N < 2:
        raise ValueError(
            f"measure_anisotropy needs at least 2 rows to form a pair, got {N}"
        )
    idx1 = torch.randint(0, N, (sample_size,), generator=generator)
    # A non-zero offset modulo N guarantees idx2 != idx1 for every pair.
    offset = torch.randint(1, N, (sample_size,), generator=generator)
    idx2 = (idx1 + offset) % N
    sims = (normed[idx1] * normed[idx2]).sum(dim=1)
    return sims.mean().item()

import matplotlib.pyplot as plt

# One anisotropy estimate per entry of all_hiddens, in order.
anisotropy_per_layer = list(map(measure_anisotropy, all_hiddens))

# Line plot against 1-based layer numbers.
layer_numbers = range(1, len(anisotropy_per_layer)+1)
plt.plot(layer_numbers, anisotropy_per_layer)
plt.title("Anisotropy of Hidden States per Layer")
plt.xlabel("Layer")
plt.ylabel("Average Cosine Similarity (Anisotropy)")
plt.show()


In [None]:
# For every entry of all_hiddens: multiply by the transposed embedding matrix to
# get logits, keep the logits on CPU and record the arg-max token id per row.
# NOTE(review): assumes each entry is (num_words, hidden_dim) and the product is
# (num_words, vocab_size) — confirm against LogitLens.
logits_per_layer = []
top_tokens_per_layer = []

unembedding = logit_lens.embedding_matrix.T
for hidden in all_hiddens:
    layer_logits = hidden.to(logit_lens.device) @ unembedding
    logits_per_layer.append(layer_logits.cpu())

    # Highest-scoring token id for each row of this layer, as a NumPy array.
    best_ids = layer_logits.argmax(dim=-1).cpu().numpy()
    top_tokens_per_layer.append(best_ids)

def entropy(logits):
    """Return the mean Shannon entropy (in nats) of softmax(logits).

    The softmax is taken over the last dimension and the per-row entropies are
    averaged over all remaining dimensions.

    The computation goes through ``log_softmax`` in float32 rather than
    ``softmax(...).log()``: with a large vocabulary many probabilities
    underflow to exactly 0, and ``0 * log(0)`` would turn the result into NaN.
    Entries with zero probability contribute 0, which is the limit of
    ``p * log(p)`` as ``p`` goes to 0.

    logits: tensor whose last dimension indexes the classes.
    returns: float, mean entropy.
    """
    log_probs = F.log_softmax(logits.float(), dim=-1)
    probs = log_probs.exp()
    # The mask also covers logits of -inf, where log_probs is -inf and probs is 0.
    plogp = torch.where(probs > 0, probs * log_probs, torch.zeros_like(probs))
    return -plogp.sum(dim=-1).mean().item()  # mean entropy over batch

# Mean softmax entropy of each stored logits tensor, in the order they were stored.
entropy_per_layer = list(map(entropy, logits_per_layer))


In [None]:
import numpy as np
# Stack the per-layer arg-max ids into one array with layers along axis 0.
# NOTE(review): presumably (num_layers, num_words) — confirm.
top_tokens_per_layer = np.array(top_tokens_per_layer)

# Number of rows in the first entry of all_hiddens.
batch_size = all_hiddens[0].shape[0]

# True for a column when every layer predicts the same token id as the first layer.
first_layer_tokens = top_tokens_per_layer[0, :]
same_token_across_layers = (top_tokens_per_layer == first_layer_tokens).all(axis=0)


In [None]:
import matplotlib.pyplot as plt

# Share of total variance held by the largest covariance eigenvalue, per entry of all_hiddens.
rogue_strengths = []

for hidden in all_hiddens:
    # Work in float32 (the original note mentions a bfloat16 issue).
    hidden32 = hidden.float()

    # Subtract the mean over rows, then form the sample covariance across columns.
    centered = hidden32 - hidden32.mean(dim=0, keepdim=True)
    num_rows = centered.shape[0]
    cov = (centered.T @ centered) / (num_rows - 1)

    # Only the eigenvalues are needed; flipped so the largest comes first.
    descending = torch.linalg.eigh(cov).eigenvalues.flip(dims=[0])

    top_share = descending[0] / descending.sum()
    rogue_strengths.append(top_share.item())

# Line plot of the per-layer values (plain Python floats) against 1-based layer numbers.
layer_numbers = range(1, len(rogue_strengths)+1)
plt.plot(layer_numbers, rogue_strengths, marker='o')
plt.title('Rogue Dimension Strength Across Layers')
plt.xlabel('Layer')
plt.ylabel('Rogue Dimension Strength')
plt.show()