In [9]:
import math
import numpy as np
import torch
from scipy.stats import wasserstein_distance

# -------------------------
# utils
# -------------------------

def set_seed(seed=0):
    np.random.seed(seed)
    torch.manual_seed(seed)

def to_f32_cpu(x):
    # safest form across torch versions
    return x.detach().cpu().float()

def center_inplace(x):
    x -= x.mean(dim=0, keepdim=True)
    return x

def l2_normalize_inplace(x, eps=1e-12):
    x /= (x.norm(dim=1, keepdim=True) + eps)
    return x

def random_unit_vectors(n, d, seed=0):
    rng = np.random.default_rng(seed)
    x = rng.standard_normal((n, d), dtype=np.float32)
    x /= np.linalg.norm(x, axis=1, keepdims=True) + 1e-12
    return x

# -------------------------
# 1) Isotropy without covariance matrix
# -------------------------

def isotropy_metrics_lowmem(x, iters=20):
    n, d = x.shape

    trace = (x.pow(2).sum() / n).item()

    # power iteration for lambda_max
    v = torch.randn(d)
    v /= v.norm()
    for _ in range(iters):
        v = (x.T @ (x @ v)) / n
        v /= v.norm()
    lambda_max = (v @ (x.T @ (x @ v)) / n).item()

    # Hutchinson entropy estimate
    H = 0.0
    for _ in range(8):
        z = torch.randn(d)
        Cz = (x.T @ (x @ z)) / n
        p = (z * Cz).abs() / trace
        H -= float((p * torch.log(p + 1e-12)).sum())
    erank = math.exp(H / 8)

    return {
        "trace": trace,
        "lambda_max_over_mean": lambda_max / (trace / d),
        "erank_over_d": erank / d,
    }

# -------------------------
# 2) Pairwise cosine distribution
# -------------------------

def cosine_distribution(x, pairs=50_000, seed=0, batch=4096):
    n, d = x.shape
    g = torch.Generator().manual_seed(seed)

    out = np.empty(pairs, dtype=np.float32)
    done = 0
    while done < pairs:
        b = min(batch, pairs - done)
        i = torch.randint(0, n, (b,), generator=g)
        j = torch.randint(0, n, (b,), generator=g)
        j[i == j] = (j[i == j] + 1) % n
        out[done:done+b] = (x[i] * x[j]).sum(dim=1).numpy()
        done += b

    base = random_unit_vectors(4096, d, seed+1)
    rng = np.random.default_rng(seed+2)
    ii = rng.integers(0, base.shape[0], size=pairs)
    jj = (ii + rng.integers(1, base.shape[0], size=pairs)) % base.shape[0]
    cos0 = (base[ii] * base[jj]).sum(1)

    return {
        "mean": float(out.mean()),
        "std": float(out.std()),
        "wasserstein_to_random": float(wasserstein_distance(out, cos0)),
    }

# -------------------------
# 3) kNN concentration (blockwise, no FAISS)
# -------------------------

def knn_concentration_blockwise(x, k=50, queries=1000, refs=5000, seed=0, block=1024):
    rng = np.random.default_rng(seed)
    n, d = x.shape

    Q = x[rng.choice(n, size=min(queries, n), replace=False)]
    R = x[rng.choice(n, size=min(refs, n), replace=False)]

    knn_means = []

    for q in Q:
        best = torch.full((k,), -1.0)
        for i in range(0, R.shape[0], block):
            sims = (R[i:i+block] @ q)
            best = torch.topk(torch.cat([best, sims]), k).values
        knn_means.append(best.mean().item())

    Q0 = torch.from_numpy(random_unit_vectors(len(Q), d, seed+1))
    R0 = torch.from_numpy(random_unit_vectors(len(R), d, seed+2))

    knn0 = []
    for q in Q0:
        best = torch.full((k,), -1.0)
        for i in range(0, R0.shape[0], block):
            sims = (R0[i:i+block] @ q)
            best = torch.topk(torch.cat([best, sims]), k).values
        knn0.append(best.mean().item())

    return {
        "knn_mean": float(np.mean(knn_means)),
        "baseline_knn_mean": float(np.mean(knn0)),
        "delta": float(np.mean(knn_means) - np.mean(knn0)),
    }

# -------------------------
# 4) Hopkins via random projection (no PCA)
# -------------------------

def hopkins_random_projection(x, out_dim=64, m=1000, seed=0):
    rng = np.random.default_rng(seed)
    n, d = x.shape

    P = torch.from_numpy(
        rng.standard_normal((d, out_dim), dtype=np.float32) / math.sqrt(out_dim)
    )
    z = (x @ P).numpy()

    idx = rng.choice(n, size=min(m, n), replace=False)
    Xm = z[idx]

    mins, maxs = z.min(0), z.max(0)
    Um = rng.uniform(mins, maxs, size=Xm.shape)

    from sklearn.neighbors import NearestNeighbors
    nn = NearestNeighbors(n_neighbors=2).fit(z)
    w = nn.kneighbors(Xm, return_distance=True)[0][:,1]
    u = nn.kneighbors(Um, return_distance=True)[0][:,0]

    return float(u.sum() / (u.sum() + w.sum()))

# -------------------------
# Master
# -------------------------

def evaluate_embedding_lowmem(E, seed=0):
    set_seed(seed)

    X = to_f32_cpu(E)
    center_inplace(X)

    Xu = l2_normalize_inplace(X.clone())

    return {
        "shape": tuple(E.shape),
        "isotropy": isotropy_metrics_lowmem(X),
        "cosine": cosine_distribution(Xu),
        "knn": knn_concentration_blockwise(Xu),
        "hopkins": hopkins_random_projection(X),
    }


In [1]:
from pprint import pprint
import torch
from huggingface_hub import hf_hub_download


def get_E(repo_id="littlePanic99/nanochat", filename="base/d20/model_021400.pt"):
    ckpt = hf_hub_download(repo_id=repo_id, filename=filename, local_files_only=True)
    emb = torch.load(ckpt, map_location="cpu")["transformer.wte.weight"]
    return emb.detach()

E = get_E()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
E

tensor([[ 0.1943,  2.1562, -0.1719,  ...,  0.9688,  1.1016,  0.4277],
        [-1.0938,  0.5820, -0.2734,  ...,  0.4004,  0.2402,  1.1328],
        [ 0.4902,  0.5078,  0.6445,  ...,  1.3047, -1.5547,  0.5508],
        ...,
        [-0.1289, -0.7031,  0.0728,  ..., -0.7383,  0.7812, -0.0806],
        [ 0.7812, -0.6992,  0.4043,  ...,  0.1426, -0.7656, -0.2656],
        [-0.7812,  0.9258,  0.9727,  ..., -0.4609, -0.3008, -1.3125]],
       dtype=torch.bfloat16)

In [10]:
results = evaluate_embedding_lowmem(E, seed=0)
pprint(results)

{'cosine': {'mean': 8.815635374048725e-05,
            'std': 0.04580644890666008,
            'wasserstein_to_random': 0.013528860568283129},
 'hopkins': 0.7214668071140832,
 'isotropy': {'erank_over_d': 1.4586791513179338,
              'lambda_max_over_mean': 16.044053989369377,
              'trace': 307625.375},
 'knn': {'baseline_knn_mean': 0.07423919096589088,
         'delta': 0.07381624417006971,
         'knn_mean': 0.1480554351359606},
 'shape': (65536, 1280)}


## Embedding Diagnostics Summary

**Isotropy**
- `lambda_max_over_mean ≈ 16`  
  Strong anisotropy; a small number of directions dominate variance.
- `erank_over_d ≈ 1.46`  
  Entropy-based effective rank; combined with λmax indicates very uneven variance distribution.
- **Takeaway:** Embeddings are far from isotropic.

**Cosine Similarity Distribution**
- `mean ≈ 0`  
  Proper centering and normalization.
- `std ≈ 0.046` vs random baseline `≈ 1/√d ≈ 0.028`  
  Angular distribution is wider than random.
- `wasserstein_to_random ≈ 0.0135`  
  Measurable deviation from random-unit vectors.
- **Takeaway:** Pairwise angles show structured, non-random geometry.

**kNN Concentration**
- `knn_mean ≈ 0.148`  
- Random baseline `≈ 0.074`
- `delta ≈ +0.074`
- **Takeaway:** Strong neighborhood concentration; points are much closer to nearest neighbors than random vectors.

**Hopkins Statistic**
- `≈ 0.72`  
  (0.5 = uniform random, →1 = highly clustered)
- **Takeaway:** Clear clustering and non-uniform spatial structure.

**Overall Conclusion**
The embedding space is **highly structured and anisotropic**, with dominant directions, non-random angular structure, and strong clustering. This is typical of trained language-model embeddings, especially after instruction or fine-tuning.


In [5]:
import numpy as np
import torch
from sklearn.cluster import MiniBatchKMeans

def to_f32_cpu(x):
    return x.detach().cpu().float()

def center_inplace(x):
    x -= x.mean(dim=0, keepdim=True)
    return x

def l2_normalize_inplace(x, eps=1e-12):
    x /= (x.norm(dim=1, keepdim=True) + eps)
    return x

def fast_cluster_embedding_ids(X, k=256, batch_size=2048, seed=0):
    """
    X: (n, d) L2-normalized torch tensor on CPU
    returns: dict {cluster_id: [indices]}
    """
    if torch.is_tensor(X):
        X = X.numpy()

    km = MiniBatchKMeans(
        n_clusters=k,
        batch_size=batch_size,
        n_init="auto",
        random_state=seed,
    )

    labels = km.fit_predict(X)

    clusters = {}
    for i, c in enumerate(labels):
        clusters.setdefault(c, []).append(i)

    return clusters

Xu = l2_normalize_inplace(center_inplace(to_f32_cpu(E)))
clusters = fast_cluster_embedding_ids(Xu, k=256)


In [16]:
repo_id = "littlePanic99/nanochat"

base_filename = "base/d20/model_021400.pt"
mid_filename = "mid/d20/model_000809.pt"
sft_filename = "sft/d20/model_000700.pt"
tokenizer_filename = "tokenizer/latest/tokenizer.pkl"


In [20]:
import pickle
from huggingface_hub import hf_hub_download

with open(hf_hub_download(repo_id=repo_id, filename=tokenizer_filename, local_files_only=True), "rb") as f:
    tokenizer = pickle.load(f)

def token_str(enc, tid: int):
    return enc.decode_bytes([tid]).decode("utf-8", errors="replace")
    

In [24]:
for k, cluster in clusters.items():
    sample = [token_str(tokenizer, tid).strip() for tid in cluster[0:20]]
    print(f"{k:<5} {len(cluster):<5} {cluster[0:20]}")
    print(", ".join(sample))
    print()


219   70    [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]
 , , , , , , , ,, , , , , , , , , , , 

76    156   [9, 221, 222, 223, 1713, 2769, 3505, 3895, 4557, 4952, 5591, 5692, 5821, 6388, 6531, 6746, 7143, 7744, 8490, 8739]
, �, �, �, researc, htt, citiz, destro, porary, conclud, proport, weap, algorith, holid, clust, cryst, compris, interpre, htt, reprene

127   328   [10, 41, 46, 307, 791, 807, 810, 830, 970, 1022, 1139, 1165, 1182, 1254, 1385, 1405, 1551, 1600, 1854, 2101]
, ), ., ., ?, )., ),, :, .”, ), .., ,”, ||, )., .", .”, |, ,", !, ."

58    385   [32, 33, 34, 39, 40, 42, 43, 44, 45, 47, 58, 59, 61, 62, 63, 65, 73, 93, 95, 124]
, !, ", ', (, *, +, ,, -, /, :, ;, =, >, ?, A, I, ], _, |

203   416   [35, 36, 38, 60, 64, 91, 92, 94, 125, 126, 1626, 2379, 3000, 5601, 5913, 5927, 6234, 6247, 6345, 8037]
#, $, &, <, @, [, \, ^, }, ~, __, //, ____, *, >, https, ________, <, www, (s

235   293   [37, 915, 1457, 1754, 2387, 2436, 3066, 3421, 35

## Observations from Embedding Clusters

- **Strong semantic coherence**  
  Most clusters align cleanly with semantic fields such as punctuation, numbers, units, morphology (`-tion`, `-ing`), parts of speech, and topical domains (health, geography, religion, technology).

- **Tokenizer structure is clearly exposed**  
  Very tight clusters appear for:
  - punctuation and symbols  
  - casing variants (`Word`, `word`, `WORD`)  
  - affixes and suffixes (`-tion`, `-ity`, `-ing`, `-based`)  
  These reflect tokenizer and subword effects rather than pure semantics.

- **Lexical → conceptual gradient**  
  - Small, extremely tight clusters correspond to form-based or lexical tokens.  
  - Larger, looser clusters correspond to higher-level semantic concepts.

- **Part-of-speech separation**  
  Clear grouping of:
  - function words (`of`, `and`, `to`, `with`)  
  - verbs (actions, processes)  
  - adjectives (qualitative scales: big/small, good/bad)  
  - nouns grouped by domain.

- **Domain knowledge emergence**  
  Clusters reflect real-world ontologies:
  - medicine (diseases, anatomy, treatment)  
  - geography (countries, regions)  
  - science vs humanities  
  - religion, mythology, and history.

- **Long-tail and rare tokens isolate naturally**  
  Singleton or very small clusters correspond to:
  - rare proper nouns  
  - non-Latin scripts  
  - corrupted or partial subwords.  
  This behavior is expected and healthy.

- **Consistency with global metrics**  
  Earlier diagnostics (high anisotropy, Hopkins statistic, kNN concentration) are consistent with:
  - dense semantic cores  
  - sparse tails  
  - strong overall clustering tendency.

**Overall conclusion**  
The embedding space is well-structured: linguistic form, syntax, and world knowledge are all clearly represented. Tokenizer artifacts are visible but separable from higher-level semantic organization.
