
# Error Atlas — t-SNE Walkthrough

This notebook rebuilds the error dataframe, computes query embeddings, runs PCA→t-SNE, and renders interactive Plotly views. Adjust the toggles to explore all or only incorrect rows.


In [1]:

# Optional: pip installs (uncomment if running in a fresh environment)
!pip install -q sentence-transformers scikit-learn plotly pandas pyarrow tqdm


In [2]:

import json
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
import plotly.express as px

DATA_DIR = Path('data/analysis')
EMB_DIR = Path('data/embeddings')
ERRORS_PATH = '../'/ DATA_DIR / 'errors.parquet'
encoder_name = 'intfloat/e5-small-v2'
perplexity = 30
pca_components = 50
only_incorrect = True
random_state = 42


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# Load dataframe
assert ERRORS_PATH.exists(), f"Missing {ERRORS_PATH} — run scripts/build_error_frame.py first."
df = pd.read_parquet(ERRORS_PATH)
if only_incorrect:
    df = df[df['is_correct'] == False].copy()
print(df.shape)
df.head()


(2566, 31)


Unnamed: 0,query,gold_ids,predicted_id,predicted_label,is_correct,confidence,candidate_count,candidate_labels,gold_in_candidates,gold_first_found_at_attempt,...,concurrent_requests,error_type,error,query_lower,query_len,query_tokens,query_has_digit,query_has_hyphen,query_is_upper,query_has_greek
0,glucose,[CHEBI:17234],,,False,,0,[],False,1.0,...,20,no_prediction,,glucose,7,1,False,False,False,False
2,dipotassium phosphate,[CHEBI:32031],CHEBI:131527,dipotassium hydrogen phosphate,False,0.95,30,"[dipotassium hydrogen phosphate, dipotassium b...",False,,...,20,retrieval_miss,,dipotassium phosphate,21,2,False,False,False,False
12,Alizarin red,[CHEBI:16866],CHEBI:87358,alizarin red S,False,0.85,28,"[alizarin red S, alizarin, neutral red, 3,4-di...",True,1.0,...,20,ranking_miss,,alizarin red,12,2,False,False,False,False
22,TAMRA,[CHEBI:51657],CHEBI:52282,tetramethylrhodamine,False,0.95,17,"[tetramethylrhodamine, 5-carboxytetramethylrho...",True,1.0,...,20,ranking_miss,,tamra,5,1,False,False,True,False
24,cineol,[CHEBI:23243],CHEBI:27961,"1,8-cineole",False,0.95,22,"[cineole, 1,8-cineole, 2-exo-hydroxy-1,8-cineo...",True,1.0,...,20,ranking_miss,,cineol,6,1,False,False,False,False


In [4]:

# Helper: encode unique texts with caching on disk
import hashlib

def cache_path_for(encoder: str):
    slug = encoder.replace('/', '_').replace(':', '_')
    return EMB_DIR / f"{slug}_query_embeddings.npy"

EMB_DIR.mkdir(parents=True, exist_ok=True)
cache_path = cache_path_for(encoder_name)

texts = df['query'].astype(str).tolist()
unique_texts = sorted(set(texts))
text_to_idx = {t: i for i, t in enumerate(unique_texts)}

if cache_path.exists():
    emb_unique = np.load(cache_path)
    if emb_unique.shape[0] != len(unique_texts):
        print('Cache size mismatch; recomputing embeddings...')
        emb_unique = None
else:
    emb_unique = None

if emb_unique is None:
    model = SentenceTransformer(encoder_name)
    emb_unique = model.encode(unique_texts, batch_size=64, convert_to_numpy=True,
                              show_progress_bar=True, normalize_embeddings=True)
    np.save(cache_path, emb_unique)
    print(f"Saved embeddings to {cache_path}")
else:
    print(f"Loaded embeddings from {cache_path}")

emb_full = emb_unique[[text_to_idx[t] for t in texts]]
emb_full.shape


Loaded embeddings from data/embeddings/intfloat_e5-small-v2_query_embeddings.npy


(2566, 384)

In [5]:

# PCA (SVD) before t-SNE for speed/stability
svd = TruncatedSVD(n_components=min(pca_components, emb_full.shape[1]-1), random_state=random_state)
emb_svd = svd.fit_transform(emb_full)
emb_svd.shape


  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = qr_normalizer(A @ Q)
  Q, _ = qr_normalizer(A @ Q)
  Q, _ = qr_normalizer(A @ Q)
  B = Q.T @ M
  B = Q.T @ M
  B = Q.T @ M
  U = Q @ Uhat
  U = Q @ Uhat
  U = Q @ Uhat
  ret = a @ b
  ret = a @ b
  ret = a @ b


(2566, 50)

In [6]:

# t-SNE projection
perp = min(perplexity, max(5, len(df) - 1))
tsne = TSNE(n_components=2, perplexity=perp, metric='cosine', init='pca',
            random_state=random_state, learning_rate='auto', max_iter=1500, verbose=1)
coords = tsne.fit_transform(emb_svd)
df_plot = df.copy()
df_plot['tsne_x'] = coords[:,0]
df_plot['tsne_y'] = coords[:,1]
coords[:5]


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2566 samples in 0.000s...
[t-SNE] Computed neighbors for 2566 samples in 0.104s...


  ret = a @ b
  ret = a @ b
  ret = a @ b


[t-SNE] Computed conditional probabilities for sample 1000 / 2566
[t-SNE] Computed conditional probabilities for sample 2000 / 2566
[t-SNE] Computed conditional probabilities for sample 2566 / 2566
[t-SNE] Mean sigma: 0.000000


  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = normalizer(A.T @ Q)
  Q, _ = qr_normalizer(A @ Q)
  Q, _ = qr_normalizer(A @ Q)
  Q, _ = qr_normalizer(A @ Q)
  B = Q.T @ M
  B = Q.T @ M
  B = Q.T @ M
  U = Q @ Uhat
  U = Q @ Uhat
  U = Q @ Uhat


[t-SNE] KL divergence after 250 iterations with early exaggeration: 74.271500
[t-SNE] KL divergence after 1500 iterations: -0.854049


array([[  88.650475, -192.28067 ],
       [ -18.542528,  -10.003523],
       [  29.29558 ,  -32.743183],
       [  65.381714,   52.282944],
       [  13.098146,  -54.529716]], dtype=float32)

In [7]:

# Interactive scatter
fig = px.scatter(
    df_plot,
    x='tsne_x', y='tsne_y',
    color='error_type', symbol='dataset',
    hover_data={
        'query': True,
        'predicted_label': True,
        'gold_ids': True,
        'model': True,
        'run_id': True,
        'confidence': True,
        'is_correct': True,
        'tsne_x': False,
        'tsne_y': False,
    },
    title=f"t-SNE of queries (encoder={encoder_name}, perplexity={perp})",
    opacity=0.85,
)
fig.update_traces(marker=dict(size=6, line=dict(width=0)))
fig.show()



## Explore neighborhoods
Pick a query and see its nearest neighbors in embedding space.


In [9]:

# Compute cosine similarity vs all points for an example query
example_query = df_plot.iloc[0]['query']
q_idx = df_plot.index[df_plot['query'] == example_query][0]
q_vec = emb_full[q_idx]

# use dot product because vectors are normalized
sims = emb_full @ q_vec
nn_idx = sims.argsort()[::-1][:15]

nn = df_plot.iloc[nn_idx][['query','error_type','dataset','model','confidence','predicted_label','gold_ids']]
print(f"Example query: {example_query}")

Example query: glucose



divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul




## By model / dataset faceting


In [10]:

fig_facet = px.scatter(
    df_plot,
    x='tsne_x', y='tsne_y',
    color='error_type',
    facet_col='dataset', facet_row='model',
    height=900,
    opacity=0.8,
    title='t-SNE faceted by dataset/model',
)
fig_facet.update_traces(marker=dict(size=4, line=dict(width=0)))
fig_facet.show()
