# Linear probing demo 
In this notebook, you can evaluate slide embeddings for TITAN using linear probing.

In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.extend(["/home/user/wangtao/prov-gigapath/TITAN"])
import numpy as np
import pandas as pd
import torch
import yaml
from transformers import AutoModel
from titan.eval_linear_probe import train_and_evaluate_logistic_regression_with_val
from titan.utils import bootstrap

import os
os.environ["OMP_NUM_THREADS"] = "8"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload




In [3]:
# Fetch the TITAN model from the Hugging Face hub and place it on the selected device.
# trust_remote_code=True lets transformers run the model code shipped in that repository.
# NOTE(review): the cells visible in this notebook only use pre-extracted embeddings;
# `model` is not referenced again here.
model = AutoModel.from_pretrained(
    'MahmoodLab/TITAN',
    trust_remote_code=True,
).to(device)

2024-12-08 15:56:17.822169: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-08 15:56:18.075241: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Read the task configuration for the TCGA-OT probe.
# NOTE(review): yaml.FullLoader is kept as in the original; if the config is plain
# mappings/scalars, yaml.safe_load would presumably be sufficient — confirm before switching.
with open('../datasets/config_tcga-ot.yaml', 'r') as file:
    task_config = yaml.load(file, Loader=yaml.FullLoader)

# `target` is the label column name in the split tables; `label_dict` maps each raw
# label value to the integer class id used for training.
target, label_dict = task_config['target'], task_config['label_dict']

In [5]:
# Download (or reuse from the local Hugging Face cache) the pre-extracted TITAN slide
# embeddings for TCGA and wrap them in a DataFrame with one row per slide.
import pickle
from huggingface_hub import hf_hub_download

slide_feature_path = hf_hub_download("MahmoodLab/TITAN", filename="TCGA_TITAN_features.pkl")

# NOTE: pickle.load can execute arbitrary code; only load this file from a trusted source.
with open(slide_feature_path, 'rb') as file:
    data = pickle.load(file)

# 'filenames' supplies the slide ids; each entry of 'embeddings' becomes one cell value.
embeddings_df = pd.DataFrame(
    {
        'slide_id': data['filenames'],
        'embeddings': list(data['embeddings'][:]),
    }
)

In [14]:
# Directory holding the TCGA whole-slide images available on this machine
# (machine-specific path; adjust for your setup).
tcga_slide_path = "/home/user/sngp/tcga_slides/slides"
tcga_slide_list = os.listdir(tcga_slide_path)

# Remove only a trailing ".svs" extension. The previous str.strip('.svs') treated its
# argument as a SET of characters and stripped every leading/trailing '.', 's' or 'v',
# which can silently corrupt slide ids that begin or end with those characters.
tcga_slide_name = [
    slide[:-len('.svs')] if slide.endswith('.svs') else slide
    for slide in tcga_slide_list
]

# Keep only the embeddings whose slide id matches a locally available slide.
embeddings_df = embeddings_df[embeddings_df['slide_id'].isin(tcga_slide_name)]

In [15]:
# Read the train/val/test split tables, then attach the slide embeddings to each one
# by joining on the shared 'slide_id' column.
train_split, val_split, test_split = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/tcga-ot_train.csv',
        '../datasets/tcga-ot_val.csv',
        '../datasets/tcga-ot_test.csv',
    )
)
train_df, val_df, test_df = (
    pd.merge(embeddings_df, split_table, on='slide_id')
    for split_table in (train_split, val_split, test_split)
)

In [16]:
# Report the number of rows in each merged split (split rows joined with embeddings on slide_id).
print(f"len(train_df): {len(train_df)}, len(val_df): {len(val_df)}, len(test_df): {len(test_df)}")

len(train_df): 661, len(val_df): 92, len(test_df): 188


In [17]:
def _to_xy(split_df):
    """Return (stacked embedding array, integer label array) for one merged split."""
    features = np.stack(split_df.embeddings.values)
    # Translate each raw target value into its class id via label_dict.
    labels = split_df[target].apply(lambda raw: label_dict[raw]).values
    return features, labels


train_data, train_labels = _to_xy(train_df)
val_data, val_labels = _to_xy(val_df)
test_data, test_labels = _to_xy(test_df)

In [21]:
# Show the class balance of every split: one value_counts table per label array.
for tag, split_labels in (
    ("train df", train_labels),
    ("val df", val_labels),
    ("test df", test_labels),
):
    print(tag, pd.DataFrame(split_labels).value_counts())

train df 0 
21    360
20    301
Name: count, dtype: int64
val df 0 
21    47
20    45
Name: count, dtype: int64
test df 0 
20    121
21     67
Name: count, dtype: int64


In [22]:
# Run the logistic-regression linear probe on the train/val/test arrays.
# log_spaced_values=None leaves the search grid for C at the library default; per the
# original authors' note, the default used in the paper is
#   np.logspace(np.log10(10e-6), np.log10(10e5), num=45)
# A smaller custom grid can be passed instead, e.g.
#   log_spaced_values = np.logspace(np.log10(10e-2), np.log10(10e2), num=3)
results, outputs = train_and_evaluate_logistic_regression_with_val(
    train_data,
    train_labels,
    val_data,
    val_labels,
    test_data,
    test_labels,
    log_spaced_values=None,
)

# Print each metric using only the last '/'-separated part of its key, left-aligned.
for key, value in results.items():
    print(f"{key.split('/')[-1]: <12}: {value:.4f}")

Finding best C: 100%|██████████| 45/45 [00:24<00:00,  1.86it/s]

Best C: 56.23413251903491
acc         : 0.9043
bacc        : 0.9090
kappa       : 0.7967
nw_kappa    : 0.7967
weighted_f1 : 0.9053
loss        : 0.2681
auroc       : 0.9631





In [23]:
# Keyword arguments forwarded to titan.utils.bootstrap.
# Presumably n is the number of bootstrap resamples (the recorded run shows 1000 iterations)
# and alpha the confidence level — TODO confirm against titan.utils.bootstrap.
bootstrap_kwargs = {'n': 1000, 'alpha': 0.95}
# NOTE(review): the original remark about "46 imbalanced classes" appears to refer to the
# full TCGA-OT task; in the recorded run of this notebook only labels 20 and 21 are present.
results_mean, results_std = bootstrap(results_dict=outputs, **bootstrap_kwargs)  # can take a while when many imbalanced classes are bootstrapped
# Print mean ± std per metric, keyed by the last '/'-separated part of the metric name.
for keys, values in results_mean.items():
    print(f"{keys.split('/')[-1]: <12}: {values:.4f} ± {results_std[keys]:.4f}")

100%|██████████| 1000/1000 [00:08<00:00, 119.22it/s]

acc         : 0.9042 ± 0.0212
bacc        : 0.9093 ± 0.0207
kappa       : 0.7957 ± 0.0450
nw_kappa    : 0.7957 ± 0.0450
weighted_f1 : 0.9053 ± 0.0207
loss        : 0.2684 ± 0.0542
auroc       : 0.9633 ± 0.0130





In [12]:
# Bare expression: displays the local cache path returned by hf_hub_download for the feature pickle.
# NOTE(review): inspection-only cell; nothing downstream depends on it.
slide_feature_path

'/home/user/.cache/huggingface/hub/models--MahmoodLab--TITAN/snapshots/b2fb4f475256eb67c6e9ccbf2d6c9c3f25f20791/TCGA_TITAN_features.pkl'