In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

import os

from pathlib import Path
from tqdm import tqdm
import pickle

import ast
import re

from graspologic.embed import ClassicalMDS as CMDS
from graspologic.cluster import GaussianCluster as GMM
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model only once: when this cell is re-run in a
# session where the name is already bound, the existing instance is kept.
try:
    embedding_model
except NameError:
    embedding_model = SentenceTransformer(
        'nomic-ai/nomic-embed-text-v1.5',
        trust_remote_code=True,
        device='cuda:0',
    )

def safe_to_list(val, dataset_name):
    """Safely convert a malformed list-like string into a real Python list.

    Parsing is attempted in three stages; the first one that succeeds wins:

    1. Whitespace is normalised, a comma is inserted between items that are
       separated by whitespace only, and the result is parsed with
       ``ast.literal_eval``.
    2. Tokens of the form ``<dataset_name>:<word chars or '-'>--id<digits>``
       are extracted with a regular expression.
    3. Leading/trailing square brackets are stripped and the remaining text
       is split on whitespace.

    Args:
        val: Value to convert. A list is returned unchanged; anything that
            is neither a list nor a string yields ``[]``.
        dataset_name: Literal prefix of the ids searched for in stage 2.

    Returns:
        A list. A successfully parsed literal that is not itself a list is
        wrapped in a one-element list.
    """
    if isinstance(val, list):
        return val
    if not isinstance(val, str):
        return []

    # Collapse every run of whitespace (newlines, tabs, repeated spaces) into
    # a single space so the later stages see one canonical separator.
    normalized = " ".join(val.split())

    # Insert a comma only where one is actually missing. Spaces that already
    # sit next to a comma or a bracket are left alone, so well-formed input
    # such as "['a', 'b']" is not turned into the unparsable "['a',, 'b']".
    # The commas must be added *before* parsing: "['a' 'b']" is valid Python
    # (adjacent string literals) and would otherwise silently become ['ab'].
    with_commas = re.sub(r"(?<![,\[(]) (?![,\])])", ", ", normalized)

    # Try to parse clean Python-like lists first
    try:
        result = ast.literal_eval(with_commas)
    except (ValueError, SyntaxError):
        pass
    else:
        return result if isinstance(result, list) else [result]

    # Handle broken or concatenated lists without commas
    # Example: "['dataset:subject=algebra--id178' 'dataset:subject=algebra--id184']"
    # NOTE(review): `[\w-]` does not match '=', so ids shaped like the example
    # are only found when `dataset_name` already contains the "subject=..."
    # part and is followed by another ':' -- confirm the real id format.
    pattern = rf"{re.escape(dataset_name)}:[\w-]+--id\d+"
    tokens = re.findall(pattern, normalized)
    if tokens:
        return tokens

    # Fallback: split the comma-free text on whitespace so that the tokens do
    # not carry the commas injected for the literal_eval attempt.
    return normalized.strip("[]").split()

from utils import make_embedding_dict, onehot_embedding
from dkps.embed import embed_api
from dkps.dkps import DataKernelPerspectiveSpace as DKPS

import nest_asyncio
nest_asyncio.apply()

from joblib import delayed, Parallel
from sklearn.linear_model import LinearRegression

def model2family(model):
    """Return the part of ``model`` that precedes its first underscore.

    A name without any underscore is returned unchanged.
    """
    family, _, _ = model.partition('_')
    return family

def predict_null(df, mode='model'):
    """Baseline prediction: the average score of the other models / families.

    Args:
        df: DataFrame; only its ``model`` and ``score`` columns are read.
        mode: ``'model'`` averages the scores of every row belonging to a
            different model; ``'family'`` averages the scores of every row
            whose model belongs to a different family (as given by
            ``model2family``).

    Returns:
        dict mapping each unique model name to that average. The value is
        NaN when no row is selected (e.g. only one model / family present).

    Raises:
        AssertionError: if ``mode`` is not ``'model'`` or ``'family'``.
    """
    assert mode in ['model', 'family']

    # The per-row family labels do not depend on the target model, so they
    # are computed once here instead of once per loop iteration.
    families = df.model.apply(model2family) if mode == 'family' else None

    out = {}
    for model in df.model.unique():
        if mode == 'model':
            sel = df.model != model
        else:
            sel = families != model2family(model)

        out[model] = df.score[sel].mean()

    return out


def _rel_err(act, pred):
    return np.abs(pred - act) / act

def _abs_err(act, pred):
    return np.abs(pred - act)

# Registry of error metrics, looked up by the short name stored under the
# "err_fn" key of each dataset's settings.
err_fns = dict(abs=_abs_err, rel=_rel_err)

def run_one(df_sample, n_samples, mode, seed, instance_ids):
    """Predict every model's score from a DKPS embedding of the other models.

    Each model present in ``df_sample`` is used in turn as the target. The
    remaining models (or, in family mode, the models of other families) are
    the training set: a DKPS embedding is fit on training models + target, a
    linear regression is trained on (embedding, actual score) pairs of the
    training models, and the target's score is predicted from its embedding.

    NOTE: two names are resolved from the enclosing module at call time
    rather than passed in: ``y_acts`` (indexed by model name) and
    ``pred_null`` (indexed by mode, then model name).

    Args:
        df_sample: DataFrame restricted to the evaluated instances. Its
            ``model`` and ``score`` columns are read here; it is also passed
            to ``make_embedding_dict`` (presumably reading an embedding
            column -- confirm against that helper).
        n_samples: number of evaluated instances; only recorded in the output.
        mode: ``'model'`` (hold out the target model) or ``'family'`` (hold
            out the target's whole family, see ``model2family``).
        seed: recorded in the output and used to seed the generator that
            draws the training models, so a run is reproducible.
        instance_ids: ids of the evaluated instances; only recorded.

    Returns:
        list with one dict per target model holding the run settings, the
        actual score (``y_act``), the baseline (``p_null``), the mean score
        on the sample (``p_sample``) and the regression prediction(s)
        (``p_lr_dkps8__...``).

    Raises:
        AssertionError: if ``mode`` is not ``'model'`` or ``'family'``.
    """
    # Validate once up front rather than on every loop iteration.
    assert mode in ['model', 'family']

    # Local generator: the draw below no longer depends on (or disturbs) the
    # global NumPy random state.
    rng = np.random.default_rng(seed)

    out = []
    model_names = df_sample.model.unique()

    embedding_dict = make_embedding_dict(df_sample)

    for target_model in model_names:

        # split data
        if mode == 'model':
            train_models = np.array([m for m in model_names if m != target_model])
        else:
            target_family = model2family(target_model)
            train_models  = np.array([m for m in model_names if model2family(m) != target_family])

        y_test = y_acts[target_model]

        # average score over the `n_samples` evaluated
        p_sample = df_sample[df_sample.model == target_model].score.mean()

        # lr on DKPS embeddings of varying dimension
        # (both grids currently hold a single value; they are kept as loops so
        # that more settings can be added without restructuring)
        p_lr_dkps = {}
        for n_components_cmds in [8]:
            for n_models in [len(train_models)]:
                _train_models   = rng.choice(train_models, size=n_models, replace=False)
                _embedding_dict = {k: embedding_dict[k] for k in (set(_train_models) | {target_model})}

                P = DKPS(n_components_cmds=n_components_cmds)
                P = P.fit_transform(_embedding_dict, return_dict=True)

                _X_train = np.vstack([P[m] for m in _train_models])
                _y_train = np.array([y_acts[m] for m in _train_models])
                _X_test  = np.vstack([P[target_model]])

                # linear regression on DKPS embeddings
                lr = LinearRegression().fit(_X_train, _y_train)

                # Using every training model is labelled "ALL" instead of a count.
                n_models_tag = 'ALL' if n_models == len(train_models) else n_models
                p_lr_dkps[f'p_lr_dkps8__n_components_cmds={n_components_cmds}__n_models={n_models_tag}'] = float(lr.predict(_X_test)[0])

        out.append({
            "seed"         : seed,
            "n_samples"    : n_samples,
            "mode"         : mode,
            "target_model" : target_model,

            "y_act"        : y_test,
            "p_null"       : pred_null[mode][target_model],
            "p_sample"     : p_sample,

            "instance_ids" : instance_ids,

            **p_lr_dkps,
        })

    return out


# Settings shared by every dataset; the groups below override single keys.
_DEFAULT_DATASET_CONFIG = {
    "score_col": "score",
    "embed_provider": "jina",
    "embed_model": None,
    "err_fn": "abs",
    "outdir": "results",
    "sample": 1,
    "seed": 1,
}

_MATH_SUBJECTS = (
    "algebra",
    "counting_and_probability",
    "geometry",
    "intermediate_algebra",
    "number_theory",
    "prealgebra",
    "precalculus",
)

_WMT14_LANGUAGE_PAIRS = ("cs-en", "de-en", "fr-en", "hi-en", "ru-en")

_LEGALBENCH_SUBSETS = (
    "abercrombie",
    "international_citizenship_questions",
    "corporate_lobbying",
    "function_of_decision_section",
    "proa",
)

# Per-dataset run settings, keyed by dataset name. Every entry is its own
# dict object, so changing one dataset's settings never affects another.
dataset_dict = {
    # MATH dataset subjects
    **{
        f"math:subject={subject}": dict(_DEFAULT_DATASET_CONFIG)
        for subject in _MATH_SUBJECTS
    },

    # WMT14 language pairs (use meteor score, sample=0.2)
    **{
        f"wmt_14:language_pair={pair}": {
            **_DEFAULT_DATASET_CONFIG,
            "score_col": "meteor",
            "sample": 0.2,
        }
        for pair in _WMT14_LANGUAGE_PAIRS
    },

    # MEDQA (embed_model=onehot)
    "med_qa": {**_DEFAULT_DATASET_CONFIG, "embed_model": "onehot"},

    # LegalBench subsets (embed_model=onehot)
    **{
        f"legalbench:subset={subset}": {
            **_DEFAULT_DATASET_CONFIG,
            "embed_model": "onehot",
        }
        for subset in _LEGALBENCH_SUBSETS
    },
}

  from .autonotebook import tqdm as notebook_tqdm
<All keys matched successfully>


dkps.embed: unable to load google-genai


In [2]:
# Numbers of representative instances (= clusters) to evaluate.
n_sample_list = [1,4,16,64]

# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source. The context manager closes the file handle afterwards.
with open('error_distributions.p', 'rb') as fh:
    instance_id_list_to_error = pickle.load(fh)

tsv_list = ['math.tsv', 'legalbench.tsv', 'med_qa.tsv', 'wmt_14.tsv']

# cluster_instance_ids[dataset][n_clusters] is a list with one entry per
# repeat; each entry is the list of representative instance ids (one per
# cluster that received at least one point).
cluster_instance_ids = {dataset: {} for dataset in dataset_dict}
for tsv_string in tsv_list:
    tsv = pd.read_csv('data/' + tsv_string, delimiter='\t')
    split = tsv_string.split('.')[0]

    for dataset in dataset_dict:
        # A dataset lives in the TSV named after the part of its key that
        # precedes the first ':' (exact match, not a substring test).
        if dataset.split(':')[0] != split:
            continue

        dataset_rows = tsv[tsv['dataset'] == dataset]
        query_list = dataset_rows['query'].unique()

        # First instance id seen for each query, built once per dataset so the
        # TSV is not re-filtered for every representative in every repeat.
        first_instance_id = (
            dataset_rows.drop_duplicates('query').set_index('query')['instance_id']
        )

        embedded_queries = embedding_model.encode(query_list, normalize_embeddings=True)

        cmds_embds = CMDS().fit_transform(embedded_queries)

        for n_clusters in n_sample_list:
            print(dataset, n_clusters)
            cluster_instance_ids[dataset][n_clusters] = []
            # 10 repeats per setting; the position of a repeat in this list is
            # what the evaluation loop later passes on as `seed`.
            for _ in range(10):
                if n_clusters == 1:
                    # Single cluster: every query gets label 0.
                    labels = np.zeros(len(cmds_embds), dtype=int)
                else:
                    gmm = GMM(min_components=n_clusters, max_components=n_clusters, n_init=10)
                    labels = gmm.fit_predict(cmds_embds)
                # Centre of a cluster = mean of the points assigned to it.
                cluster_centers_ = np.array([np.mean(cmds_embds[labels == c], axis=0) for c in np.unique(labels)])

                # The representative of a cluster is the query closest to its centre.
                closest_indices, _ = pairwise_distances_argmin_min(cluster_centers_, cmds_embds)

                rep_instance_ids = [first_instance_id.at[query_list[idx]] for idx in closest_indices]
                cluster_instance_ids[dataset][n_clusters].append(rep_instance_ids)

math:subject=algebra 1
math:subject=algebra 4
math:subject=algebra 16
math:subject=algebra 64
math:subject=counting_and_probability 1
math:subject=counting_and_probability 4
math:subject=counting_and_probability 16
math:subject=counting_and_probability 64
math:subject=geometry 1
math:subject=geometry 4
math:subject=geometry 16
math:subject=geometry 64
math:subject=intermediate_algebra 1
math:subject=intermediate_algebra 4
math:subject=intermediate_algebra 16
math:subject=intermediate_algebra 64
math:subject=number_theory 1
math:subject=number_theory 4
math:subject=number_theory 16
math:subject=number_theory 64
math:subject=prealgebra 1
math:subject=prealgebra 4
math:subject=prealgebra 16
math:subject=prealgebra 64
math:subject=precalculus 1
math:subject=precalculus 4
math:subject=precalculus 16
math:subject=precalculus 64
legalbench:subset=abercrombie 1
legalbench:subset=abercrombie 4
legalbench:subset=abercrombie 16
legalbench:subset=abercrombie 64
legalbench:subset=international_citi

In [None]:
# For every configured dataset: load and optionally sub-sample its rows,
# attach response embeddings, run `run_one` for each set of representative
# instances, and write predictions plus errors to a TSV in the output directory.
for dataset in dataset_dict:
    print(dataset)

    cfg            = dataset_dict[dataset]
    score_col      = cfg['score_col']
    embed_provider = cfg['embed_provider']
    embed_model    = cfg['embed_model']
    err_fn         = cfg['err_fn']
    sample         = cfg['sample']
    seed           = cfg['seed']

    # The input TSV is named after the part of the dataset key before ':'.
    inpath = Path('data') / f'{dataset.split(":")[0]}.tsv'
    outdir = Path(cfg['outdir'])
    outdir.mkdir(parents=True, exist_ok=True)

    df = pd.read_csv(inpath, sep='\t')
    df = df[df.dataset == dataset]

    if sample:
        # Keep a reproducible random fraction `sample` of the instances.
        # NOTE(review): the representatives in `cluster_instance_ids` were
        # picked from the *full* TSV, so with sample < 1 some of them may be
        # dropped here and `df_sample` below can hold fewer than `n_samples`
        # instances (possibly none) -- confirm this is intended.
        rng           = np.random.default_rng(seed)
        uinstance_ids = df.instance_id.unique()
        keep          = rng.choice(uinstance_ids, int(len(uinstance_ids) * sample), replace=False)
        df            = df[df.instance_id.isin(keep)]

    df = df.sort_values(['model', 'instance_id']).reset_index(drop=True)

    if score_col != 'score':
        print(f'{score_col} -> score')
        df['score'] = df[score_col]

    # --
    # QC
    print(f'{len(df.response.unique())} / {df.shape[0]} responses are unique')
    # Every model must have been evaluated on exactly the same instances.
    _instance_ids = df.groupby('model').instance_id.apply(list)
    assert all(ids == _instance_ids.iloc[0] for ids in _instance_ids), 'instance_ids are not the same for each model'

    # --
    # Get embeddings

    if embed_model == 'onehot':
        df = onehot_embedding(df, dataset=dataset)
    else:
        df['embedding'] = list(embed_api(
            provider   = embed_provider,
            input_strs = [str(xx) for xx in df.response.values],
            model      = embed_model
        ))

    # --
    # Run
    model_names  = df.model.unique()
    instance_ids = df.instance_id.unique()
    # `y_acts` and `pred_null` are read by `run_one` as module-level names.
    y_acts       = df.groupby('model').score.mean().to_dict()

    modes = ['model', 'family']
    pred_null = {mode: predict_null(df, mode=mode) for mode in modes}

    # --
    # Simple - DKPS w/ more than one example

    outpath = outdir / f'{dataset}-{score_col}-cluster-res.tsv'

    jobs = []
    for n_samples in n_sample_list:
        # The position of each representative set doubles as the run's seed.
        for i, instance_ids_sample in enumerate(cluster_instance_ids[dataset][n_samples]):
            df_sample = df[df.instance_id.isin(instance_ids_sample)]
            jobs.append(delayed(run_one)(df_sample=df_sample, n_samples=n_samples, mode='family', seed=i, instance_ids=instance_ids_sample))

    # Each job returns a list of rows; flatten them in linear time.
    job_results = Parallel(n_jobs=-2, verbose=10)(jobs)
    res         = [row for rows in job_results for row in rows]
    df_res      = pd.DataFrame(res)

    # Add one error column `e_<name>` per prediction column `p_<name>`.
    # Only the leading prefix is tested and rewritten, so a "p_" that merely
    # occurs inside a column name is never touched.
    for c in list(df_res.columns):
        if c.startswith('p_'):
            df_res['e_' + c[len('p_'):]] = err_fns[err_fn](df_res.y_act, df_res[c])

    df_res.to_csv(outpath, sep='\t', index=False)

math:subject=algebra
12045 / 12825 responses are unique


Embedding chunks: 100%|█████████████████████████████████████████████████████████████| 257/257 [00:00<00:00, 2211.21it/s]
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=-2)]: Done  1 out of 40 | elapsed:    0.5s
[Parallel(n_jobs=-2)]: Done  6 out of 40 | elapsed:    0.6s remaining:    3.4s
[Parallel(n_jobs=-2)]: Done 11 out of 40 | elapsed:    1.3s remaining:    3.4s
[Parallel(n_jobs=-2)]: Done 16 out of 40 | elapsed:    1.3s remaining:    2.0s
[Parallel(n_jobs=-2)]: Done 21 out of 40 | elapsed:    3.1s remaining:    2.8s
[Parallel(n_jobs=-2)]: Done 26 out of 40 | elapsed:    4.7s remaining:    2.5s
[Parallel(n_jobs=-2)]: Done 31 out of 40 | elapsed:   11.4s remaining:    3.3s
[Parallel(n_jobs=-2)]: Done 36 out of 40 | elapsed:   15.6s remaining:    1.7s
[Parallel(n_jobs=-2)]: Done 40 out of 40 | elapsed:   18.6s finished


math:subject=counting_and_probability
3537 / 3705 responses are unique


Embedding chunks: 100%|███████████████████████████████████████████████████████████████| 75/75 [00:00<00:00, 1836.21it/s]
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=-2)]: Done  1 out of 40 | elapsed:    0.5s
[Parallel(n_jobs=-2)]: Done  6 out of 40 | elapsed:    0.6s remaining:    3.3s
[Parallel(n_jobs=-2)]: Done 11 out of 40 | elapsed:    1.0s remaining:    2.6s
[Parallel(n_jobs=-2)]: Done 16 out of 40 | elapsed:    1.1s remaining:    1.6s
[Parallel(n_jobs=-2)]: Done 21 out of 40 | elapsed:    2.5s remaining:    2.3s
[Parallel(n_jobs=-2)]: Done 26 out of 40 | elapsed:    3.7s remaining:    2.0s
[Parallel(n_jobs=-2)]: Done 31 out of 40 | elapsed:    6.2s remaining:    1.8s
[Parallel(n_jobs=-2)]: Done 36 out of 40 | elapsed:    8.6s remaining:    1.0s
[Parallel(n_jobs=-2)]: Done 40 out of 40 | elapsed:   10.0s finished


math:subject=geometry
3436 / 3610 responses are unique


Embedding chunks: 100%|███████████████████████████████████████████████████████████████| 73/73 [00:00<00:00, 1748.31it/s]
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 25 concurrent workers.
[Parallel(n_jobs=-2)]: Done  1 out of 40 | elapsed:    0.5s
[Parallel(n_jobs=-2)]: Done  6 out of 40 | elapsed:    0.6s remaining:    3.3s
[Parallel(n_jobs=-2)]: Done 11 out of 40 | elapsed:    1.1s remaining:    2.8s
[Parallel(n_jobs=-2)]: Done 16 out of 40 | elapsed:    1.4s remaining:    2.0s
[Parallel(n_jobs=-2)]: Done 21 out of 40 | elapsed:    3.4s remaining:    3.1s
