In [None]:
import os
import sys

import torch
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.decomposition import PCA
import numpy as np

# The GV-Rep loaders and the relative data paths used below resolve against
# external/genomic-FM, so that directory is made the working directory.
# NOTE(review): the path climbs one level ("../"), so it only resolves when the
# kernel starts in a directory that is a sibling of external/ (for example a
# notebooks/ folder), not in the project root itself -- confirm.
GENOMIC_FM_DIR = "../external/genomic-FM"

# Guard the chdir so that re-running this cell in a live kernel is a no-op
# instead of failing on a relative path that no longer exists from the new cwd.
if os.path.basename(os.getcwd()) != os.path.basename(GENOMIC_FM_DIR):
    os.chdir(GENOMIC_FM_DIR)

# Avoid stacking duplicate "." entries on sys.path across re-runs.
if "." not in sys.path:
    sys.path.append(".")

In [2]:
# Load the precomputed ClinVar embedding payload (path is relative to the
# current working directory).
# NOTE(review): torch.load unpickles the file; only load payloads from a
# trusted source (consider weights_only=True if the payload allows it).
path = "root/data/clinvar_embeddings__n155__len1024__layer-1__maskedmean.pt"
payload = torch.load(path, map_location="cpu")

print(payload.keys())
print("Embeddings shape:", payload["delta_embeddings"].shape)
print("Labels shape:", len(payload["labels"]))

X = payload["delta_embeddings"]
labels = payload["labels"]

# The boolean mask below is applied row-wise to X, so there must be exactly
# one label per embedding row.
assert len(labels) == X.shape[0], "labels and delta_embeddings are misaligned"

# Binary task on the two extreme classes only: Class 1 -> 0, Class 5 -> 1.
# Samples of every other class are dropped.
labels_arr = np.asarray(labels)
mask = np.isin(labels_arr, ["Class 1", "Class 5"])
# Derive y from the same mask that filters X so the two cannot drift apart.
y = (labels_arr[mask] == "Class 5").astype(int).tolist()
X = X[mask]
print("Encoded labels:", y)

dict_keys(['model_name', 'seq_len', 'pooling', 'layer', 'labels', 'ref_embeddings', 'alt_embeddings', 'delta_embeddings', 'ref_sequences', 'alt_sequences'])
Embeddings shape: torch.Size([155, 1024])
Labels shape: 155
Encoded labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [3]:
# Linear probe on the delta embeddings:
# standardize -> PCA to 50 components -> regularized logistic regression.
clf = make_pipeline(
    StandardScaler(),
    # With 1024 input features and n_components=50, PCA's default "auto"
    # solver selects the randomized SVD, which is stochastic; pin the seed so
    # the reported metrics are reproducible across kernel restarts.
    PCA(n_components=50, random_state=0),
    LogisticRegression(
        # NOTE(review): l1_ratio=0 is presumably meant as a pure L2 penalty;
        # whether it is honored or ignored with liblinear depends on the
        # installed scikit-learn version -- confirm.
        l1_ratio=0,
        C=0.1,               # smaller = stronger regularization; start 0.1 or 0.01
        solver="liblinear",  # good for small datasets
        max_iter=5000
    )
)

# Stratified, seeded folds keep the class ratio per fold and make splits repeatable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

res = cross_validate(
    clf, X, y,
    cv=cv,
    scoring=["roc_auc", "accuracy", "f1"],
    return_train_score=False
)

# Report mean +/- standard deviation over the 5 test folds.
print("ROC-AUC:", res["test_roc_auc"].mean(), "+/-", res["test_roc_auc"].std())
print("Acc:    ", res["test_accuracy"].mean(), "+/-", res["test_accuracy"].std())
print("F1:     ", res["test_f1"].mean(), "+/-", res["test_f1"].std())

ROC-AUC: 0.7237878787878789 +/- 0.04456606408414642
Acc:     0.6940711462450594 +/- 0.07514456688263174
F1:      0.703091111786764 +/- 0.08076593303148627


These results (ROC-AUC ≈ 0.72) suggest that the embeddings carry enough signal to partially separate the two extremes, i.e. the nucleotide-transformer last-layer representations encode information that helps distinguish benign from pathogenic variants. The intermediate ClinVar classes likely dilute separability because their labels are inherently ambiguous, rather than because the model fails.

PCA also improves performance slightly, which suggests that the discriminative information lives in a small subspace and that most dimensions carry noise or irrelevant variation.


In [20]:
# Load the per-layer pooled embeddings (path is relative to the current
# working directory).
# NOTE(review): torch.load unpickles the file; only load payloads from a
# trusted source.
path = "root/data/clinvar_pooled_embeddings__n155__bp3000__tok505__layers10.pt"
print("Loading embeddings from:", path)
payload = torch.load(path, map_location="cpu")
print(payload.keys())

layers = payload["layers"]
labels = np.array(payload["labels"])
n = len(labels)  # number of variants (N)

# pick a layer you want
# NOTE(review): 28 is assumed to be the last extracted layer -- confirm against `layers`.
layer = 28
E = payload["embeddings_by_layer"][layer]  # shape (2N, 1024), torch.Tensor
print("Size of E:", E.shape)

# The split below assumes the first N rows are REF and the last N rows are ALT.
# Fail loudly if the row count does not match instead of silently misaligning
# REF/ALT pairs.
assert E.shape[0] == 2 * n, (
    f"expected {2 * n} rows (REF + ALT per variant), got {E.shape[0]}"
)

# split REF and ALT
ref = E[:n].numpy()      # (N, 1024)
alt = E[n:].numpy()      # (N, 1024)

# delta: per-variant ALT minus REF embedding
X = alt - ref            # (N, 1024)

Loading embeddings from: root/data/clinvar_pooled_embeddings__n155__bp3000__tok505__layers10.pt
dict_keys(['model_name', 'bp_window_len', 'token_max_length', 'pooling', 'layers', 'labels', 'embeddings_by_layer'])
Size of E: torch.Size([310, 1024])


In [19]:
# Keep only the two extreme classes and build the binary target from them.
extreme_classes = ["Class 1", "Class 5"]
mask = np.isin(labels, extreme_classes)
print("Using only Class 1 and Class 5 samples:", mask.sum(), "out of", n)

X_bin = X[mask]
print("y shape before encoding:", labels.shape)

# Class 1 -> 0, Class 5 -> 1
y_bin = (labels[mask] == "Class 5").astype(int)

# Quick sanity summary of the resulting design matrix and target.
print("X shape:", X_bin.shape)
print("y counts:", len(y_bin), "with", y_bin.sum(), "positives")
print("Len of record =", len(X_bin[0]))

Using only Class 1 and Class 5 samples: 114 out of 155
y shape before encoding: (155,)
X shape: (114, 1024)
y counts: 114 with 60 positives
Len of record = 1024


In [17]:
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Baseline probe without PCA: standardize, then class-balanced logistic regression.
clf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("lr", LogisticRegression(class_weight="balanced", max_iter=5000)),
    ]
)

# Seeded, stratified 5-fold split; score each held-out fold by ROC-AUC.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
auc = cross_val_score(estimator=clf, X=X_bin, y=y_bin, scoring="roc_auc", cv=cv)

print(f"ROC-AUC: {auc.mean():.3f} +/- {auc.std():.3f}")


ROC-AUC: 0.864 +/- 0.045
