# End-to-end SVCCA & CKA Report (Single Notebook)

This notebook:
1. Connects to MinIO (S3-compatible).
2. Discovers trials under a prefix (e.g., `exp1/`).
3. Loads per-trial activation snapshots (`.npz`) for selected epochs and layers.
4. Computes **Linear CKA** and **SVCCA** pairwise across trials.
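5. Writes the pairwise scores and a per-trial summary to CSV (optionally uploading both back to MinIO), then produces heatmaps, a CKA-vs-SVCCA scatter plot, and summary tables using **matplotlib** (no seaborn).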

In [None]:
import os

def parse_csv_list(raw, cast=str):
    """Split a comma-separated string into a list of values.

    Whitespace around each item is trimmed and empty items are dropped, so
    ``"conv1, fc,"`` yields ``["conv1", "fc"]`` rather than ``["conv1", " fc", ""]``.

    Args:
        raw: The comma-separated text (typically an environment variable value).
        cast: Callable applied to every non-empty, trimmed item (default: ``str``).

    Returns:
        List of converted items in their original order.
    """
    trimmed = (piece.strip() for piece in raw.split(","))
    return [cast(item) for item in trimmed if item]


# --- MinIO connection settings (each one overridable via environment variable) ---
# NOTE(security): the access/secret fallbacks below are hard-coded credentials.
# Set MINIO_ACCESS_KEY / MINIO_SECRET_KEY in the environment for anything that
# is not a throwaway local deployment.
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "10.249.190.44:9000")
MINIO_BUCKET   = os.getenv("MINIO_BUCKET",   "katib-artifacts")
MINIO_ACCESS   = os.getenv("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET   = os.getenv("MINIO_SECRET_KEY", "minioadmin")
MINIO_PREFIX   = os.getenv("MINIO_PREFIX",   "exp1")

# --- Which activation snapshots to analyse ---
# Both lists are parsed with the same tolerant splitter so that stray spaces or
# a trailing comma in the environment variable do not produce bogus entries.
ANALYSIS_EPOCHS = parse_csv_list(os.getenv("ANALYSIS_EPOCHS", "1,2,4,8,16,20"), int)
ANALYSIS_LAYERS = parse_csv_list(
    os.getenv(
        "ANALYSIS_LAYERS",
        "conv1,layer1.1.relu,layer2.1.relu,layer3.1.relu,layer4.1.relu,fc",
    )
)

# --- Analysis / output options ---
# Variance threshold handed to svcca_score as var_keep.
SVCCA_VAR_KEEP = float(os.getenv("SVCCA_VAR_KEEP", "0.99"))
# Uploading is enabled only when the variable is exactly "1" (which is also the default).
UPLOAD_RESULTS = os.getenv("UPLOAD_RESULTS", "1") == "1"
# NOTE: the fallback is an absolute, machine-specific path; set OUTDIR_LOCAL when
# running anywhere else.
OUTDIR_LOCAL   = os.getenv("OUTDIR_LOCAL", "/home/san/mac-linux/study/NCA-GENL/kubeflow-experiments/SVCCA/output")


# Echo the effective configuration (secrets are intentionally not printed).
print("Endpoint:", MINIO_ENDPOINT)
print("Bucket  :", MINIO_BUCKET)
print("Prefix  :", MINIO_PREFIX)
print("Epochs  :", ANALYSIS_EPOCHS)
print("Layers  :", ANALYSIS_LAYERS)
print("SVCCA_VAR_KEEP:", SVCCA_VAR_KEEP, " | Upload:", UPLOAD_RESULTS)
print("Outdir  :", OUTDIR_LOCAL)

Endpoint: 10.249.190.44:9000
Bucket  : katib-artifacts
Prefix  : exp1
Epochs  : [1, 2, 4, 8, 16, 20]
Layers  : ['conv1', 'layer1.1.relu', 'layer2.1.relu', 'layer3.1.relu', 'layer4.1.relu', 'fc']
SVCCA_VAR_KEEP: 0.99  | Upload: True
Outdir  : /home/san/mac-linux/study/NCA-GENL/kubeflow-experiments/SVCCA/output


In [9]:
import io
import itertools
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

try:
    import boto3
except Exception as e:
    raise RuntimeError("boto3 is required to access MinIO") from e

try:
    import torch
    TORCH_OK = True
except Exception:
    TORCH_OK = False

def s3_client():
    """Build a boto3 S3 client pointed at the configured MinIO endpoint.

    ``MINIO_ENDPOINT`` may be given with or without a scheme. When it does not
    start with ``http://`` or ``https://`` (case-insensitive), ``http://`` is
    prepended. Checking for the full scheme (instead of just the letters
    "http") keeps bare host names such as ``httpd:9000`` working.

    Returns:
        A boto3 S3 client authenticated with ``MINIO_ACCESS`` / ``MINIO_SECRET``.
    """
    endpoint = MINIO_ENDPOINT.strip()
    if not endpoint.lower().startswith(("http://", "https://")):
        endpoint = "http://" + endpoint
    return boto3.client(
        "s3",
        endpoint_url=endpoint,
        aws_access_key_id=MINIO_ACCESS,
        aws_secret_access_key=MINIO_SECRET,
    )

# Module-level S3 client; the listing/fetch/upload helpers in this notebook read it as a global.
s3 = s3_client()
# Create the local output directory (and any missing parents); a no-op if it already exists.
Path(OUTDIR_LOCAL).mkdir(parents=True, exist_ok=True)

In [10]:
def list_trials():
    """Return the sorted names of all trials that have activation snapshots.

    Every object under ``<MINIO_PREFIX>/`` is listed; a key counts when, relative
    to that prefix, it looks like ``<trial>/activations/<anything...>``. Keys are
    parsed relative to the prefix (rather than by fixed position in the full
    key) so a prefix containing several path segments, e.g. ``team/exp1``,
    is handled the same way as a single-segment one.

    Returns:
        Sorted list of unique trial names (empty when nothing matches).
    """
    base = f"{MINIO_PREFIX}/"
    found = set()
    # The paginator follows continuation tokens for us, so listings longer than
    # one page are fully traversed.
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=MINIO_BUCKET, Prefix=base):
        # "Contents" is absent from the response when a page has no objects.
        for obj in page.get("Contents", []):
            # Listed keys start with `base` because it was used as the Prefix filter.
            parts = obj["Key"][len(base):].split("/")
            if len(parts) >= 3 and parts[1] == "activations":
                found.add(parts[0])
    return sorted(found)

def fetch_npz_array(key: str, arr_key: str = "activations"):
    """Download an ``.npz`` object from the bucket and return one of its arrays.

    Args:
        key: Object key inside ``MINIO_BUCKET``.
        arr_key: Name of the array to read from the archive.

    Returns:
        The array stored under ``arr_key``.
    """
    # The whole object is read into memory and wrapped in a file-like buffer for np.load.
    payload = s3.get_object(Bucket=MINIO_BUCKET, Key=key)["Body"].read()
    with np.load(io.BytesIO(payload)) as archive:
        array = archive[arr_key]
    return array

def fetch_acts(trial: str, epoch: int, layer: str):
    """Load one activation snapshot and centre it along axis 0.

    The snapshot is read from
    ``<MINIO_PREFIX>/<trial>/activations/epoch_<epoch>/<layer>.npz`` (array name
    ``"activations"``) and the mean over axis 0 is subtracted from every row.

    Returns:
        The centred array, or ``None`` if anything goes wrong while fetching
        or centring (best effort: any exception is swallowed).
    """
    key = f"{MINIO_PREFIX}/{trial}/activations/epoch_{epoch}/{layer}.npz"
    try:
        raw = fetch_npz_array(key, "activations")
        centred = raw - raw.mean(axis=0, keepdims=True)
    except Exception:
        # Deliberate best effort: the caller treats None as "no data for this trial".
        return None
    return centred

def _coerce_or_none(value, cast):
    """Return ``cast(value)``, or ``None`` when the value is missing or not convertible."""
    if value is None:
        return None
    try:
        return cast(value)
    except (TypeError, ValueError):
        return None


def fetch_best(trial: str):
    """Read the best validation accuracy and epoch from a trial's checkpoint.

    Loads ``<MINIO_PREFIX>/<trial>/checkpoints/best.pt`` and, when the loaded
    object is a dict, reads its ``"val_acc"`` and ``"epoch"`` entries.

    The two fields are converted independently, so a missing or malformed
    ``val_acc`` no longer discards a perfectly valid ``epoch`` (or vice versa).

    Args:
        trial: Trial name (directory under the prefix).

    Returns:
        ``(val_acc, epoch)`` as ``(float | None, int | None)``. Both are ``None``
        when torch is unavailable, the checkpoint cannot be fetched/loaded, or
        the loaded object is not a dict.
    """
    if not TORCH_OK:
        return None, None
    key = f"{MINIO_PREFIX}/{trial}/checkpoints/best.pt"
    try:
        obj = s3.get_object(Bucket=MINIO_BUCKET, Key=key)
        buf = obj["Body"].read()
        # SECURITY NOTE: torch.load unpickles the payload, which can execute
        # arbitrary code. Only point this notebook at buckets you trust; consider
        # passing weights_only=True if the installed torch version supports it
        # and the checkpoints contain only plain tensors/containers.
        state = torch.load(io.BytesIO(buf), map_location="cpu")
    except Exception:
        # Best effort: a missing or unreadable checkpoint simply yields no summary.
        return None, None
    if not isinstance(state, dict):
        return None, None
    acc = _coerce_or_none(state.get("val_acc"), float)
    ep = _coerce_or_none(state.get("epoch"), int)
    return acc, ep

# Discover the trial names once; the pairwise-scoring cell iterates over this list.
trials = list_trials()
print("Found trials:", len(trials))
# Bare last expression so the notebook displays the first five trial names.
trials[:5]

Found trials: 16


['cifar10-reuse-discovery-2dzlbqn4',
 'cifar10-reuse-discovery-7dhrcnrv',
 'cifar10-reuse-discovery-88fkk4m2',
 'cifar10-reuse-discovery-9gwg6rqx',
 'cifar10-reuse-discovery-9lc75hwb']

In [11]:
def center_gram(X):
    """Return the double-centred Gram matrix ``H @ (X @ X.T) @ H``.

    ``H = I - 1/n`` is the centring matrix, so the result is the Gram matrix
    with its column means and then its row means removed. The centring is done
    by subtracting those means directly instead of materialising ``H`` and
    performing two dense ``n x n`` matrix products, which gives the same result
    (up to floating-point rounding) in O(n^2) instead of O(n^3).

    Args:
        X: 2-D array; the Gram matrix is taken over its rows.

    Returns:
        ``(n, n)`` float64 array, where ``n = X.shape[0]``.
    """
    # Promote to float64 to match the dtype the explicit H @ G @ H product yields.
    gram = np.array(X @ X.T, dtype=np.float64)
    # H @ G: remove each column's mean.
    gram -= gram.mean(axis=0, keepdims=True)
    # (H @ G) @ H: remove each row's mean.
    gram -= gram.mean(axis=1, keepdims=True)
    return gram

def cka_linear(X, Y):
    """Linear CKA similarity between two activation matrices.

    Computes the element-wise inner product of the two centred Gram matrices,
    normalised by the product of their Frobenius norms (plus ``1e-12`` to avoid
    dividing by zero).

    If the inputs have different lengths, both are truncated to the shorter
    one (leading rows are kept).

    Returns:
        The similarity as a Python float.
    """
    if len(X) != len(Y):
        shared = min(len(X), len(Y))
        X, Y = X[:shared], Y[:shared]

    gram_x = center_gram(X)
    gram_y = center_gram(Y)

    cross = np.sum(gram_x * gram_y)
    scale = np.sqrt(np.sum(gram_x * gram_x) * np.sum(gram_y * gram_y)) + 1e-12
    return float(cross / scale)

def svd_keep(X, var_keep=0.99):
    """Project ``X`` onto its leading singular directions.

    Keeps the smallest number of leading singular directions whose cumulative
    share of the squared singular values reaches ``var_keep``.

    Args:
        X: 2-D array to reduce.
        var_keep: Target fraction of the total squared singular values.

    Returns:
        ``U[:, :k] * S[:k]``, i.e. the left singular vectors scaled by their
        singular values, restricted to the ``k`` retained directions.
    """
    left, singular, _ = np.linalg.svd(X, full_matrices=False)
    energy = singular ** 2
    # Tiny constant keeps the ratio finite when X is all zeros.
    cumulative = np.cumsum(energy) / (energy.sum() + 1e-12)
    # searchsorted gives the first index whose cumulative share is >= var_keep;
    # +1 converts that index into a count of directions.
    n_keep = int(np.searchsorted(cumulative, var_keep) + 1)
    return left[:, :n_keep] * singular[:n_keep]

def invsqrt(mat):
    """Inverse matrix square root via a symmetric eigendecomposition.

    Eigenvalues are floored at ``1e-12`` before inversion so that singular or
    slightly negative spectra do not produce infinities or NaNs.

    Args:
        mat: Square matrix passed to ``np.linalg.eigh``.

    Returns:
        ``V @ diag(1 / sqrt(w)) @ V.T`` for the eigenpairs ``(w, V)`` of ``mat``.
    """
    spectrum, basis = np.linalg.eigh(mat)
    inv_root = 1.0 / np.sqrt(np.maximum(spectrum, 1e-12))
    # Scaling the columns of `basis` is the same as right-multiplying by diag(inv_root).
    return (basis * inv_root) @ basis.T

def svcca_score(X, Y, var_keep=0.99):
    """SVCCA similarity between two activation matrices.

    Each input is first reduced with ``svd_keep``; the reduced representations
    are then whitened with ``invsqrt`` and the singular values of the whitened
    cross-product matrix are taken as the correlations.

    If the inputs have different lengths, both are truncated to the shorter
    one (leading rows are kept).

    Args:
        X, Y: Activation matrices to compare.
        var_keep: Variance fraction forwarded to ``svd_keep`` for both inputs.

    Returns:
        ``(mean_correlation, correlations)`` - the mean as a Python float and
        the full array of singular values.
    """
    if len(X) != len(Y):
        shared = min(len(X), len(Y))
        X, Y = X[:shared], Y[:shared]

    reduced_x = svd_keep(X, var_keep)
    reduced_y = svd_keep(Y, var_keep)

    cross = reduced_x.T @ reduced_y
    whiten_x = invsqrt(reduced_x.T @ reduced_x)
    whiten_y = invsqrt(reduced_y.T @ reduced_y)

    correlations = np.linalg.svd(whiten_x @ cross @ whiten_y, compute_uv=False)
    return float(np.mean(correlations)), correlations

In [None]:
# Pairwise similarity table: one CKA row and one SVCCA row per
# (epoch, layer, trial pair); plus a per-trial checkpoint summary.
rows = []
summary_rows = []

for t in trials:
    best_acc, best_ep = fetch_best(t)
    summary_rows.append(dict(trial=t, best_val_accuracy=best_acc, best_epoch=best_ep))

for epoch, layer in itertools.product(ANALYSIS_EPOCHS, ANALYSIS_LAYERS):
    # Keep only trials whose snapshot loaded, is 2-D and has at least two rows.
    acts = {}
    for t in trials:
        A = fetch_acts(t, epoch, layer)
        if A is None or A.ndim != 2 or A.shape[0] < 2:
            continue
        acts[t] = A

    kept = sorted(acts)
    if len(kept) < 2:
        print(f"[skip] epoch={epoch} layer={layer}: not enough trials with data")
        continue

    # combinations() yields every unordered pair once, in sorted-name order.
    for ti, tj in itertools.combinations(kept, 2):
        cka_value = cka_linear(acts[ti], acts[tj])
        svcca_mean = svcca_score(acts[ti], acts[tj], var_keep=SVCCA_VAR_KEEP)[0]
        rows.append([epoch, layer, "CKA", ti, tj, cka_value])
        rows.append([epoch, layer, "SVCCA", ti, tj, svcca_mean])

df = pd.DataFrame(rows, columns=["epoch","layer","metric","trial_i","trial_j","score"])
df_summary = pd.DataFrame(summary_rows)

print("Pairwise rows:", len(df))
display(df.head())

In [None]:
from pathlib import Path

# Persist both result tables as CSV files inside the local output directory.
output_dir = Path(OUTDIR_LOCAL)
output_dir.mkdir(parents=True, exist_ok=True)

pairwise_csv = str(output_dir / "pairwise_scores.csv")
summary_csv  = str(output_dir / "trial_summary.csv")

for frame, csv_path in ((df, pairwise_csv), (df_summary, summary_csv)):
    frame.to_csv(csv_path, index=False)
for csv_path in (pairwise_csv, summary_csv):
    print("Wrote:", csv_path)

def upload_csv(local_path, s3_key):
    """Upload a local CSV file to ``MINIO_BUCKET`` under ``s3_key``.

    The file is opened in binary mode and sent as the object body with
    content type ``text/csv``.
    """
    request = {"Bucket": MINIO_BUCKET, "Key": s3_key, "ContentType": "text/csv"}
    with open(local_path, mode="rb") as stream:
        s3.put_object(Body=stream, **request)

# Mirror the two CSV files to <prefix>/analysis/ in the bucket when uploads are enabled.
if UPLOAD_RESULTS:
    uploads = (
        (pairwise_csv, f"{MINIO_PREFIX}/analysis/pairwise_scores.csv"),
        (summary_csv, f"{MINIO_PREFIX}/analysis/trial_summary.csv"),
    )
    for local_csv, remote_key in uploads:
        upload_csv(local_csv, remote_key)
    print("Uploaded to s3://%s/%s/analysis/" % (MINIO_BUCKET, MINIO_PREFIX))

In [None]:
def matrix_for(dframe, epoch, layer, metric):
    """Build a symmetric trial-by-trial score matrix for one (epoch, layer, metric).

    Args:
        dframe: Pairwise table with columns ``epoch``, ``layer``, ``metric``,
            ``trial_i``, ``trial_j`` and ``score``.
        epoch, layer, metric: Values used to select the rows.

    Returns:
        ``(labels, M)`` where ``labels`` is the sorted list of trial names that
        appear in the selected rows and ``M`` is a square array with ones on
        the diagonal and each pair's score mirrored across it. Pairs without a
        row keep the identity matrix's off-diagonal value of 0.
    """
    selector = (
        (dframe['epoch'] == epoch)
        & (dframe['layer'] == layer)
        & (dframe['metric'] == metric)
    )
    selected = dframe[selector]

    labels = sorted(set(selected['trial_i']) | set(selected['trial_j']))
    position = {name: k for k, name in enumerate(labels)}

    scores = np.eye(len(labels))
    for first, second, value in zip(selected['trial_i'], selected['trial_j'], selected['score']):
        r, c = position[first], position[second]
        scores[r, c] = float(value)
        scores[c, r] = float(value)
    return labels, scores

# Distinct epochs / layers / metrics that actually made it into the pairwise table.
epochs, layers, metrics = (
    sorted(df[column].unique().tolist()) for column in ("epoch", "layer", "metric")
)

for label, values in (("Epochs:", epochs), ("Layers:", layers), ("Metrics:", metrics)):
    print(label, values)

# Pick what the plots below show: the last epoch in sorted order, and the
# layer at sorted position 2 (or the last one when fewer than three exist).
if epochs:
    E = epochs[-1]
else:
    E = None
if layers:
    L = layers[min(2, len(layers) - 1)]
else:
    L = None
E, L

In [None]:
def show_heatmap(M, labels, title):
    """Render a square score matrix as a labelled heatmap with a colour bar.

    Args:
        M: 2-D array of scores.
        labels: Tick labels used on both axes (x labels are rotated 90 degrees).
        title: Figure title.
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    image = ax.imshow(M, aspect='auto')

    ticks = range(len(labels))
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels, rotation=90)
    ax.set_yticks(ticks)
    ax.set_yticklabels(labels)

    ax.set_title(title)
    fig.colorbar(image, ax=ax)
    fig.tight_layout()
    plt.show()

# Heatmaps for the selected epoch/layer; the trial labels from the CKA matrix
# are reused for the SVCCA plot.
if E is None or L is None:
    print("No data yet to plot.")
else:
    trials_, M_cka = matrix_for(df, E, L, 'CKA')
    M_sv = matrix_for(df, E, L, 'SVCCA')[1]
    heatmaps = (
        (M_cka, f"CKA heatmap (epoch={E}, layer={L})"),
        (M_sv, f"SVCCA heatmap (epoch={E}, layer={L})"),
    )
    for matrix, heading in heatmaps:
        show_heatmap(matrix, trials_, heading)

In [None]:
# Scatter of CKA against SVCCA for every trial pair at the selected epoch/layer.
if E is None or L is None:
    print("No data for scatter.")
else:
    d = df[(df['epoch']==E) & (df['layer']==L)]
    # One row per trial pair with a column per metric; pairs missing a metric are dropped.
    dd = (
        d.pivot_table(index=['trial_i','trial_j'], columns='metric', values='score')
        .reset_index()
        .dropna()
    )

    fig, ax = plt.subplots(figsize=(5, 4))
    ax.scatter(dd['CKA'], dd['SVCCA'])
    ax.set_xlabel("CKA")
    ax.set_ylabel("SVCCA")
    ax.set_title(f"CKA vs SVCCA (epoch={E}, layer={L})")
    fig.tight_layout()
    plt.show()

    display(dd.head())

In [None]:
# Aggregate score statistics per (layer, metric) and per (epoch, metric).
score_stats = ['mean', 'median', 'std', 'count']

summary_by_layer = (
    df.groupby(['layer', 'metric'])['score']
    .agg(score_stats)
    .reset_index()
    .sort_values(['metric', 'mean'], ascending=[True, False])
)
summary_by_epoch = (
    df.groupby(['epoch', 'metric'])['score']
    .agg(score_stats)
    .reset_index()
    .sort_values(['metric', 'epoch'], ascending=[True, True])
)

print("Summary by layer (top 10):")
display(summary_by_layer.head(10))

print("Summary by epoch:")
display(summary_by_epoch)

# The checkpoint summary is shown only when at least one trial was recorded.
if not df_summary.empty:
    print("Best validation accuracy (if available):")
    best_first = df_summary.sort_values('best_val_accuracy', ascending=False)
    display(best_first.head(10))