In [1]:
import os
import sys

import torch
from sklearn import linear_model
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.decomposition import PCA
import numpy as np


# NOTE:
# The GV-Rep loaders are expected to run with external/genomic-FM as the
# working directory, so this cell moves the kernel there and adds that
# directory to sys.path.
#
# The relative path below climbs one level ("..") before descending into
# external/, i.e. it is resolved against the *parent* of the directory the
# kernel starts in.
# NOTE(review): an earlier note said "execution from the project root" --
# confirm which start directory is actually intended.
#
# Both steps are guarded so that re-running this cell is a no-op: a second
# relative chdir would otherwise be resolved from inside external/genomic-FM
# and fail (or land somewhere else), and "." would be appended to sys.path
# again on every run.

_cwd_tail = os.path.normpath(os.getcwd()).split(os.sep)[-2:]
if _cwd_tail != ["external", "genomic-FM"]:
    os.chdir("../external/genomic-FM")
if "." not in sys.path:
    sys.path.append(".")

In [5]:
# Binary task: keep only the two extreme ClinVar classes and drop every
# variant carrying any other label.
NEGATIVE_LABEL = "Class 1"  # encoded as 0
POSITIVE_LABEL = "Class 5"  # encoded as 1

path = "root/data/clinvar_embeddings__n155__len1024__layer-1__maskedmean.pt"
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code -- only load files from a trusted source. If the payload turns out to
# contain nothing but tensors and plain Python containers, consider passing
# weights_only=True (TODO confirm against the file that produced it).
payload = torch.load(path, map_location="cpu")

print(payload.keys())
print("Embeddings shape:", payload["delta_embeddings"].shape)
print("Labels shape:", len(payload["labels"]))

delta_embeddings = payload["delta_embeddings"]
labels = payload["labels"]
assert len(labels) == delta_embeddings.shape[0], "labels and embeddings are misaligned"

# Build the row mask once and derive the targets from that same mask, so the
# feature rows and the label vector cannot drift apart.
labels_arr = np.asarray(labels)
mask = np.isin(labels_arr, [NEGATIVE_LABEL, POSITIVE_LABEL])

# y stays a plain list of Python ints (0 = Class 1, 1 = Class 5).
y = (labels_arr[mask] == POSITIVE_LABEL).astype(int).tolist()
X = delta_embeddings[mask]

# Sanity checks: one target per kept row, and both classes actually present.
assert len(y) == X.shape[0], "feature rows and targets are misaligned"
assert set(y) == {0, 1}, "expected both Class 1 and Class 5 samples"
print("Encoded labels:", y)

dict_keys(['model_name', 'seq_len', 'pooling', 'layer', 'labels', 'ref_embeddings', 'alt_embeddings', 'delta_embeddings', 'ref_sequences', 'alt_sequences'])
Embeddings shape: torch.Size([155, 1024])
Labels shape: 155
Encoded labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [6]:
# Evaluation settings, kept in one place so they are easy to tune.
SEED = 0               # shared by the PCA solver and the CV shuffling
N_PCA_COMPONENTS = 50  # optional dimensionality reduction before the classifier
N_SPLITS = 5

# Scaling and PCA live inside the pipeline, so cross_validate re-fits them on
# each training fold only.
clf = make_pipeline(
    StandardScaler(),
    # random_state is set explicitly: with svd_solver="auto" scikit-learn may
    # select the randomized SVD solver for wide inputs like these, and
    # without a seed the scores would change from run to run.
    PCA(n_components=N_PCA_COMPONENTS, random_state=SEED),
    LogisticRegression(
        penalty="l2",
        C=0.1,               # smaller = stronger regularization; start 0.1 or 0.01
        solver="liblinear",  # good for small datasets
        max_iter=5000
    )
)

cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

res = cross_validate(
    clf, X, y,
    cv=cv,
    scoring=["roc_auc", "accuracy", "f1"],
    return_train_score=False
)

# Report mean +/- standard deviation of each metric across the CV folds.
for heading, key in (
    ("ROC-AUC:", "test_roc_auc"),
    ("Acc:    ", "test_accuracy"),
    ("F1:     ", "test_f1"),
):
    print(heading, res[key].mean(), "+/-", res[key].std())

ROC-AUC: 0.7253030303030303 +/- 0.044787017665072246
Acc:     0.7114624505928855 +/- 0.073479525935951
F1:      0.725752508361204 +/- 0.07510328653498402




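These results indicate that the embeddings carry enough signal to partially separate the two extremes (Class 1 vs. Class 5): the nucleotide-transformer last-layer representations encode some information that distinguishes benign from pathogenic variants. A plausible explanation for the weaker separability when all ClinVar classes are used is ambiguity in the intermediate classes rather than a failure of the model, although this experiment does not test that directly.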

PCA also improves performance slightly, which suggests that the discriminative information lies in a low-dimensional subspace and that most embedding dimensions carry noise or irrelevant variation.
