In [None]:
import sys
import os
# Add the RQ1 directory to the path
# sys.path.append(os.path.abspath("../"))
from patchscope import PatchScope

# Checkpoint under study. The commented-out assignments are alternative
# checkpoints; enable exactly one `model_name` line at a time.
# model_name = "google/gemma-3-12b-it"
# model_name = "google/gemma-3-12b-pt"
# model_name = "meta-llama/Llama-2-7b-chat-hf"
# model_name = "Tower-Babel/Babel-9B-Chat"
model_name = "google/gemma-2-9b-it"  # Use Gemma 2 for the experiment
# Build the PatchScope wrapper used by later cells (they read its `.tokenizer`
# and call `.extract_token_i_hidden_states_original`). Per the original note,
# the language argument is required by the class but does not affect which
# model is selected.
# NOTE(review): later cells reassign `model_name` to other checkpoints without
# rebuilding `word_nonword_cls`, so file names derived from `model_name` may
# not match the model wrapped here — confirm before saving hidden states.
word_nonword_cls = PatchScope("English", model_name)

## Anisotropy

In [None]:
import numpy as np
import torch
import torch.nn.functional as F


def cos_contrib(emb1, emb2):
    """Per-dimension contribution to the cosine similarity of two embeddings.

    Entry ``i`` of the result is ``emb1[i] * emb2[i] / (||emb1|| * ||emb2||)``,
    so the entries sum to the cosine similarity of the two inputs.

    Args:
        emb1: torch tensor holding the first embedding.
        emb2: torch tensor of the same shape holding the second embedding.

    Returns:
        numpy.ndarray with the element-wise contributions. If either input has
        zero norm the division yields NaN/inf entries (no epsilon is applied).
    """
    def _to_cpu_float(t):
        # `.numpy()` only works for CPU tensors that do not track gradients.
        t = t.detach().cpu()
        # numpy has no bfloat16, and fp16 norms overflow easily on large
        # activations, so half-precision inputs are promoted to float32.
        if t.dtype in (torch.float16, torch.bfloat16):
            t = t.float()
        return t

    emb1 = _to_cpu_float(emb1)
    emb2 = _to_cpu_float(emb2)

    numerator_terms = emb1 * emb2
    denom = torch.norm(emb1) * torch.norm(emb2)
    return (numerator_terms / denom).numpy()


def compute_crosslingual_anisotropy(hidden_en, hidden_ko, sample_size=5000, pair="aligned", seed=None):
    """Average cosine similarity between two sets of hidden states, per layer.

    For every layer key of ``hidden_en`` the function pairs rows of
    ``hidden_en[layer]`` with rows of ``hidden_ko[layer]``, computes the
    per-dimension cosine contribution of each pair, and averages over pairs.

    Args:
        hidden_en: mapping ``layer -> tensor`` whose first axis indexes samples.
        hidden_ko: mapping with the same layer keys for the second set.
        sample_size: number of random pairs drawn when ``pair == "random"``.
        pair: ``"aligned"`` pairs row ``i`` with row ``i``; ``"random"`` pairs
            independently sampled rows.
        seed: optional seed for the random pairing. ``None`` keeps using the
            global ``np.random`` state, as before.

    Returns:
        dict ``layer -> {"anisotropy", "mean_contrib", "top_dims"}`` where
        ``mean_contrib`` is the mean per-dimension contribution,
        ``anisotropy`` is its sum (the mean cosine similarity) and
        ``top_dims`` lists up to ten dimensions with the largest mean
        contribution, largest first.

    Raises:
        ValueError: if ``pair`` is not a supported mode, if aligned pairing is
            requested but the second set has fewer rows, or if there are no
            pairs to evaluate.
    """
    if pair not in ("aligned", "random"):
        # Previously an unknown mode fell through to an UnboundLocalError.
        raise ValueError(f"pair must be 'aligned' or 'random', got {pair!r}")

    rng = np.random if seed is None else np.random.RandomState(seed)

    def _as_cpu_float(t):
        # numpy conversion needs a detached CPU tensor; half precision is
        # promoted because numpy lacks bfloat16 and fp16 norms can overflow.
        t = t.detach().cpu()
        if t.dtype in (torch.float16, torch.bfloat16):
            t = t.float()
        return t

    results = {}

    for layer in hidden_en:
        en_vecs = _as_cpu_float(hidden_en[layer])
        ko_vecs = _as_cpu_float(hidden_ko[layer])

        n_en = en_vecs.shape[0]
        n_ko = ko_vecs.shape[0]

        if pair == "aligned":
            if n_ko < n_en:
                raise ValueError(
                    f"layer {layer!r}: aligned pairing needs at least {n_en} rows "
                    f"in the second set, got {n_ko}"
                )
            en_sel = en_vecs
            ko_sel = ko_vecs[:n_en]
        else:
            # Sample random pairs (not necessarily aligned). The second index
            # is drawn from the second set's own row count.
            idx_pairs = [(rng.randint(0, n_en), rng.randint(0, n_ko)) for _ in range(sample_size)]
            en_idx = torch.tensor([i for i, _ in idx_pairs], dtype=torch.long)
            ko_idx = torch.tensor([j for _, j in idx_pairs], dtype=torch.long)
            en_sel = en_vecs[en_idx]
            ko_sel = ko_vecs[ko_idx]

        if en_sel.shape[0] == 0:
            raise ValueError(f"layer {layer!r}: no pairs to evaluate")

        # One norm per sample, taken over all non-sample axes, then reshaped so
        # it broadcasts against the element-wise products.
        en_norm = torch.linalg.vector_norm(en_sel.flatten(start_dim=1), dim=1)
        ko_norm = torch.linalg.vector_norm(ko_sel.flatten(start_dim=1), dim=1)
        denom = (en_norm * ko_norm).reshape(-1, *([1] * (en_sel.dim() - 1)))

        contribs = ((en_sel * ko_sel) / denom).numpy()  # (num_pairs, D)
        mean_contrib = contribs.mean(axis=0)  # (D,)
        anisotropy = mean_contrib.sum()

        results[layer] = {
            "anisotropy": anisotropy,
            "mean_contrib": mean_contrib,
            "top_dims": np.flip(np.argsort(mean_contrib)[-10:])
        }

    return results

import matplotlib.pyplot as plt
def plot_anisotropy(results, model_name, lang1, lang2):
    """Plot the per-layer anisotropy values and save the figure as a PNG.

    Args:
        results: mapping ``layer -> {"anisotropy": value, ...}``; layers are
            plotted in the mapping's iteration order.
        model_name: label used in the title and in the output file name.
        lang1: first language code, used in the title and file name.
        lang2: second language code, used in the title and file name.

    The image is written to the current working directory as
    ``anisotropy_<model>_<lang1>&<lang2>.png`` and then shown.
    """
    layers = list(results.keys())
    anisotropy_values = [results[l]["anisotropy"] for l in layers]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(layers, anisotropy_values, marker='o')
    ax.set_xlabel("Layer")
    ax.set_ylabel("Anisotropy")
    ax.set_title(f"Anisotropy across layers ({model_name} - {lang1}&{lang2})")
    # Explicitly enable the grid; a bare grid() call toggles the current state
    # and would switch it off under styles that enable grids by default.
    ax.grid(True)
    fig.tight_layout()
    # Model ids such as "org/model" contain "/", which savefig would treat as
    # a directory separator; flatten it so the file lands in the working dir.
    safe_name = str(model_name).replace("/", "_")
    fig.savefig(f"anisotropy_{safe_name}_{lang1}&{lang2}.png")
    plt.show()


In [None]:
lang1 = "en"
lang2 = "ko"

# Directory holding the cached hidden states; files follow the pattern
# hidden_<model short name>_<lang>_1.pt.
HIDDEN_DIR = "/home/hyujang/multilingual-inner-lexicon/data/RQ1/TatoebaHiddens"

# Score every model with the same load -> compute steps instead of three
# copy-pasted stanzas. After the loop `model_name`, `hidden_1` and `hidden_2`
# refer to the last model (Llama-2), exactly as before.
anisotropy_by_model = {}
for model_name in (
    "Tower-Babel/Babel-9B-Chat",
    "google/gemma-3-12b-it",
    "meta-llama/Llama-2-7b-chat-hf",
):
    # Computed outside the f-string: reusing the outer quote character inside
    # an f-string expression is a SyntaxError before Python 3.12.
    short_name = model_name.split("/")[-1]
    hidden_1 = torch.load(f"{HIDDEN_DIR}/hidden_{short_name}_{lang1}_1.pt")
    hidden_2 = torch.load(f"{HIDDEN_DIR}/hidden_{short_name}_{lang2}_1.pt")
    anisotropy_by_model[model_name] = compute_crosslingual_anisotropy(hidden_1, hidden_2)

# Names used by the following cells.
results_babel = anisotropy_by_model["Tower-Babel/Babel-9B-Chat"]
results_gemma = anisotropy_by_model["google/gemma-3-12b-it"]
results_llama = anisotropy_by_model["meta-llama/Llama-2-7b-chat-hf"]

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Layer keys available for each model's results
# (results_babel, results_gemma, results_llama).
layers_babel = set(results_babel)
layers_gemma = set(results_gemma)
layers_llama = set(results_llama)

# Sorted union of the layer keys, giving all models a common x-axis.
all_layers = sorted(layers_babel.union(layers_gemma, layers_llama))

def get_acc(results, all_layers):
    """Collect the 'anisotropy' value for each layer in `all_layers`.

    Layers that are absent from `results` are reported as `np.nan`, so the
    returned list always has one entry per requested layer.
    """
    values = []
    for layer in all_layers:
        if layer in results:
            values.append(results[layer]['anisotropy'])
        else:
            values.append(np.nan)
    return values

# One value per entry of all_layers for each model (NaN where a layer is missing).
acc_babel, acc_gemma, acc_llama = (
    get_acc(model_results, all_layers)
    for model_results in (results_babel, results_gemma, results_llama)
)

# Overlay the three curves on a shared layer axis.
plt.figure(figsize=(8, 5))
plt.plot(all_layers, acc_babel, marker='o', label='Babel-9B-Chat')
plt.plot(all_layers, acc_gemma, marker='o', label='gemma-3-12b-it')
plt.plot(all_layers, acc_llama, marker='o', label='Llama-2-7b-chat-hf')
plt.gca().set(
    xlabel="Layer",
    ylabel="Anisotropy",
    title="Cross-Lingual Alignment over Layers (en-ko)",
)
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
def plot_top_dims(results, model_name, lang1, lang2, top_k=10):
    """Plot how the strongest dimensions contribute to cosine similarity per layer.

    Dimensions are ranked by their ``mean_contrib`` averaged over all layers;
    the ``top_k`` highest are drawn as one line each across layers.

    Args:
        results: mapping ``layer -> {"mean_contrib": 1-D array, ...}``; layers
            are plotted in the mapping's iteration order.
        model_name: label used in the title and in the output file name.
        lang1: first language code, used in the title and file name.
        lang2: second language code, used in the title and file name.
        top_k: number of dimensions to draw (default 10).

    Raises:
        ValueError: if ``results`` is empty or ``top_k`` is smaller than 1.

    The image is written to the current working directory as
    ``topdims_<model>_<lang1>_<lang2>.png`` and then shown.
    """
    layers = list(results.keys())
    if not layers:
        raise ValueError("results is empty; nothing to plot")
    if top_k < 1:
        # A slice of [-0:] would silently select every dimension.
        raise ValueError(f"top_k must be >= 1, got {top_k}")

    # Create a matrix: layers x dims
    contrib_matrix = np.stack([results[l]["mean_contrib"] for l in layers])

    # Rank dimensions by their contribution averaged over layers, largest first.
    top_dims = np.argsort(np.mean(contrib_matrix, axis=0))[-top_k:][::-1]

    fig, ax = plt.subplots(figsize=(10, 5))
    for d in top_dims:
        ax.plot(layers, contrib_matrix[:, d], label=f"Dim {d}")

    ax.set_xlabel("Layer")
    ax.set_ylabel("Cosine Contribution")
    ax.set_title(f"Top-{top_k} Dim Contributions Across Layers ({model_name} - {lang1}↔{lang2})")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    # Model ids such as "org/model" contain "/", which savefig would treat as
    # a directory separator; flatten it so the file lands in the working dir.
    safe_name = str(model_name).replace("/", "_")
    fig.savefig(f"topdims_{safe_name}_{lang1}_{lang2}.png")
    plt.show()


In [None]:
# Top-dimension plots for the en/ko pair, one figure per model.
plot_top_dims(results=results_gemma, model_name="gemma3", lang1="en", lang2="ko")
plot_top_dims(results=results_babel, model_name="babel", lang1="en", lang2="ko")
plot_top_dims(results=results_llama, model_name="llama2", lang1="en", lang2="ko")


In [None]:

# Reload the cached Llama-2 hidden states for lang1/lang2 and recompute the
# aligned-pair anisotropy.
model_name = "meta-llama/Llama-2-7b-chat-hf"
# Single quotes inside the replacement field: reusing the outer double quote
# within an f-string is a SyntaxError before Python 3.12.
hidden_1 = torch.load(f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/TatoebaHiddens/hidden_{model_name.split('/')[-1]}_{lang1}_1.pt")
hidden_2 = torch.load(f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/TatoebaHiddens/hidden_{model_name.split('/')[-1]}_{lang2}_1.pt")
results_llama = compute_crosslingual_anisotropy(hidden_1, hidden_2)

In [None]:
# Same-set baseline: random pairs drawn from hidden_2 against itself.
# NOTE(review): the plot is labelled "ko"/"ko", which presumes hidden_2 still
# holds the Korean states loaded in the previous cell — confirm cell order.
results = compute_crosslingual_anisotropy(
    hidden_en=hidden_2,
    hidden_ko=hidden_2,
    pair="random",
)
plot_anisotropy(
    results,
    model_name.split("/")[-1],
    "ko",
    "ko",
)

## Cross-lingual Alignment

In [None]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F

def prepare_dataset(lang1, lang2, n_samples=1000, random_state=2025):
    """Load a Tatoeba language pair, filter short sentences, sample and tokenize.

    Args:
        lang1: first language code; also the name of the resulting text column.
        lang2: second language code; also the name of the resulting text column.
        n_samples: number of sentence pairs to sample (default 1000).
        random_state: seed passed to ``DataFrame.sample`` (default 2025).

    Returns:
        DataFrame with columns ``lang1``, ``lang2``, ``<lang1>_tokens`` and
        ``<lang2>_tokens``, re-indexed from 0. Token columns come from the
        module-level ``word_nonword_cls.tokenizer.tokenize``.

    Raises:
        ValueError: raised by pandas when fewer than ``n_samples`` pairs remain
            after filtering.
    """
    pairs = load_dataset("tatoeba", lang1=lang1, lang2=lang2)['train'].to_pandas()

    # Pull each language's sentence out of the 'translation' column.
    pairs[lang1] = pairs['translation'].str[lang1]
    pairs[lang2] = pairs['translation'].str[lang2]
    pairs = pairs[[lang1, lang2]]

    # Keep only pairs where both sentences have at least 3 characters.
    long_enough = (pairs[lang1].str.len() >= 3) & (pairs[lang2].str.len() >= 3)
    pairs = pairs[long_enough]

    pairs = pairs.sample(n=n_samples, random_state=random_state).reset_index(drop=True)

    tokenize = word_nonword_cls.tokenizer.tokenize
    pairs[f'{lang1}_tokens'] = pairs[lang1].apply(tokenize)
    pairs[f'{lang2}_tokens'] = pairs[lang2].apply(tokenize)
    return pairs

def compute_crosslingual_cosine(hidden_en, hidden_ko, top_k=5, n_samples=1000):
    """Layer-wise cosine retrieval between two aligned sets of hidden states.

    For each layer index ``0 .. len(hidden_en) - 1`` the hidden states are
    reshaped to ``(n_samples, -1)``, L2-normalised, and compared with a full
    similarity matrix. Row ``i`` counts as a hit when column ``i`` is among
    its ``top_k`` most similar columns.

    Args:
        hidden_en: indexable by layer, yielding a tensor for the first set.
        hidden_ko: indexable by layer, yielding a tensor for the second set.
        top_k: number of nearest neighbours checked per row. Values larger
            than the number of candidates are clamped.
        n_samples: number of rows each layer tensor is reshaped to (default
            1000, the previously hard-coded value). ``None`` uses each
            tensor's own first dimension.

    Returns:
        dict ``layer -> {"similarity_matrix", "topk_indices", "topk_values",
        "topk_accuracy"}``.
    """
    # NOTE(review): the most recent layer's similarity matrix is also published
    # as the module-level `sim_matrix`. This looks like a debugging aid; it is
    # kept because other cells may read it.
    global sim_matrix

    results = {}

    for layer in range(len(hidden_en)):
        en_vecs = hidden_en[layer]
        ko_vecs = hidden_ko[layer]
        en_rows = en_vecs.shape[0] if n_samples is None else n_samples
        ko_rows = ko_vecs.shape[0] if n_samples is None else n_samples
        en_vecs = en_vecs.reshape(en_rows, -1)  # (N, D)
        ko_vecs = ko_vecs.reshape(ko_rows, -1)  # (N, D)

        # Normalize to unit vectors for cosine similarity
        en_norm = F.normalize(en_vecs, p=2, dim=1)
        ko_norm = F.normalize(ko_vecs, p=2, dim=1)

        # Cosine similarity: (N x D) @ (D x N) = (N x N)
        sim_matrix = en_norm @ ko_norm.T

        # torch.topk raises when k exceeds the number of candidates.
        k = min(top_k, sim_matrix.size(1))
        topk_values, topk_indices = torch.topk(sim_matrix, k=k, dim=1)  # (N, k)

        # Row i is correct when its aligned partner (column i) is in the top-k.
        correct = torch.arange(sim_matrix.size(0), device=topk_indices.device)
        hits = (topk_indices == correct.unsqueeze(1)).any(dim=1).float()

        results[layer] = {
            "similarity_matrix": sim_matrix,
            "topk_indices": topk_indices,
            "topk_values": topk_values,
            "topk_accuracy": hits.mean().item(),  # overall top-k accuracy
        }

    return results

In [None]:
lang1, lang2 = "en", "ko"

# Sampled sentence pairs for this language pair.
dataset_en_ko = prepare_dataset(lang1, lang2)

# Hidden states for each side, extracted from the raw sentences.
hidden_1 = word_nonword_cls.extract_token_i_hidden_states_original(
    inputs=dataset_en_ko[lang1].tolist()
)
hidden_2 = word_nonword_cls.extract_token_i_hidden_states_original(
    inputs=dataset_en_ko[lang2].tolist()
)
# Optional caching, disabled.
# NOTE(review): `model_name` may have been reassigned by earlier cells and no
# longer match the model wrapped by `word_nonword_cls` — check before enabling.
# torch.save(hidden_1, f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/TatoebaHiddens/hidden_{model_name.split('/')[-1]}_{lang1}_2.pt")
# torch.save(hidden_2, f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/TatoebaHiddens/hidden_{model_name.split('/')[-1]}_{lang2}_2.pt")


In [None]:

top_k = 3
results = compute_crosslingual_cosine(hidden_1, hidden_2, top_k=top_k)

# Top-k retrieval accuracy per layer, in ascending layer order.
layers = sorted(results)
accuracies = [results[layer_idx]['topk_accuracy'] for layer_idx in layers]

plt.plot(layers, accuracies)
plt.gca().set(
    xlabel="Layer",
    ylabel=f"Top-{top_k} Accuracy",
    title=f"Cross-Lingual Alignment over Layers ({lang1}-{lang2})",
)
plt.grid(True)
plt.show()

In [None]:
lang1, lang2 = "de", "en"

# Sampled sentence pairs for this language pair.
dataset_de_en = prepare_dataset(lang1, lang2)

# Hidden states for each side, extracted from the raw sentences.
hidden_1 = word_nonword_cls.extract_token_i_hidden_states_original(
    inputs=dataset_de_en[lang1].tolist()
)
hidden_2 = word_nonword_cls.extract_token_i_hidden_states_original(
    inputs=dataset_de_en[lang2].tolist()
)
# Optional caching, disabled.
# NOTE(review): `model_name` may have been reassigned by earlier cells and no
# longer match the model wrapped by `word_nonword_cls` — check before enabling.
# torch.save(hidden_1, f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/TatoebaHiddens/hidden_{model_name.split('/')[-1]}_{lang1}_2.pt")
# torch.save(hidden_2, f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/TatoebaHiddens/hidden_{model_name.split('/')[-1]}_{lang2}_2.pt")

top_k = 3
results = compute_crosslingual_cosine(hidden_1, hidden_2, top_k=top_k)

# Top-k retrieval accuracy per layer, in ascending layer order.
layers = sorted(results)
accuracies = [results[layer_idx]['topk_accuracy'] for layer_idx in layers]

plt.plot(layers, accuracies)
plt.gca().set(
    xlabel="Layer",
    ylabel=f"Top-{top_k} Accuracy",
    title=f"Cross-Lingual Alignment over Layers ({lang1}-{lang2})",
)
plt.grid(True)
plt.show()

In [None]:
lang1, lang2 = "de", "ko"

# Sampled sentence pairs for this language pair.
dataset_de_ko = prepare_dataset(lang1, lang2)

# Disabled alternative: extract from the pre-tokenized columns instead.
# hidden_1 = word_nonword_cls.extract_token_hidden_states(dataset_de_ko[f'{lang1}_tokens'].tolist())
# hidden_2 = word_nonword_cls.extract_token_hidden_states(dataset_de_ko[f'{lang2}_tokens'].tolist())
hidden_1 = word_nonword_cls.extract_token_i_hidden_states_original(
    inputs=dataset_de_ko[lang1].tolist()
)
hidden_2 = word_nonword_cls.extract_token_i_hidden_states_original(
    inputs=dataset_de_ko[lang2].tolist()
)

top_k = 3
results = compute_crosslingual_cosine(hidden_1, hidden_2, top_k=top_k)

# Top-k retrieval accuracy per layer, in ascending layer order.
layers = sorted(results)
accuracies = [results[layer_idx]['topk_accuracy'] for layer_idx in layers]

plt.plot(layers, accuracies)
plt.gca().set(
    xlabel="Layer",
    ylabel=f"Top-{top_k} Accuracy",
    title=f"Cross-Lingual Alignment over Layers ({lang1}-{lang2})",
)
plt.grid(True)
plt.show()

## Variance Analysis

In [None]:
lang1 = "en"
lang2 = "ko"

# Checkpoint whose cached hidden states are analysed; enable exactly one.
# model_name = "google/gemma-3-12b-it"
# model_name = "Tower-Babel/Babel-9B-Chat"
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Single quotes inside the replacement field: reusing the outer double quote
# within an f-string is a SyntaxError before Python 3.12.
hidden_1 = torch.load(f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/TatoebaHiddens/hidden_{model_name.split('/')[-1]}_{lang1}_1.pt")
hidden_2 = torch.load(f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/TatoebaHiddens/hidden_{model_name.split('/')[-1]}_{lang2}_1.pt")

In [None]:
# Fix the layer order first and iterate in that order, so the plotted x values
# and the variance list are guaranteed to line up even if the mapping's keys
# are not stored in sorted order.
layers = sorted(hidden_1.keys())

variances = []
for layer in layers:
    layer_hiddens = hidden_1[layer]
    # L2-normalise along dim 1, then average the variance taken over the last
    # dimension across all samples.
    layer_hiddens = F.normalize(layer_hiddens, p=2, dim=1)
    var = layer_hiddens.var(dim=-1).mean().item()
    variances.append(var)

plt.figure(figsize=(8, 4))
plt.plot(layers, variances, marker='o')
plt.xlabel("Layer")
plt.ylabel("Mean Hidden State Variance")
plt.title("Hidden State Variance per Layer")
plt.grid(True)
plt.tight_layout()
plt.show()

# Persist one variance per row, in ascending layer order (no index column).
import pandas as pd
pd.Series(variances).to_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/TatoebaHiddens/hidden_variance_{model_name.split('/')[-1]}_{lang1}_1.csv", index=False)

In [None]:
# Fix the layer order first and iterate in that order, so the plotted x values
# and the variance list are guaranteed to line up even if the mapping's keys
# are not stored in sorted order.
layers = sorted(hidden_2.keys())

variances = []
for layer in layers:
    layer_hiddens = hidden_2[layer]
    # L2-normalise along dim 1, then average the variance taken over the last
    # dimension across all samples.
    layer_hiddens = F.normalize(layer_hiddens, p=2, dim=1)
    var = layer_hiddens.var(dim=-1).mean().item()
    variances.append(var)

plt.figure(figsize=(8, 4))
plt.plot(layers, variances, marker='o')
plt.xlabel("Layer")
plt.ylabel("Mean Hidden State Variance")
plt.title("Hidden State Variance per Layer")
plt.grid(True)
plt.tight_layout()
# Plain tick labels on the y axis (no scientific notation / offset).
plt.ticklabel_format(style='plain', axis='y')
plt.show()

# Persist one variance per row, in ascending layer order (no index column).
import pandas as pd
pd.Series(variances).to_csv(f"/home/hyujang/multilingual-inner-lexicon/data/RQ1/TatoebaHiddens/hidden_variance_{model_name.split('/')[-1]}_{lang2}_1.csv", index=False)