# Linear probing demo 
In this notebook, you can evalate slide embeddings for TITAN using linear probing.

In [20]:
# %load_ext autoreload
# %autoreload 2

import sys
sys.path.extend(["/home/user/wangtao/prov-gigapath/TITAN"])
import numpy as np
import pandas as pd
import torch
import yaml
from transformers import AutoModel
from titan.eval_linear_probe import train_and_evaluate_logistic_regression_with_val, train_and_evaluate_logistic_regression_with_both_metrics
from titan.utils import bootstrap

import os
import psutil

physical_cores = psutil.cpu_count(logical=False)
print("physical_cores", physical_cores)

# Set the MKL_NUM_THREADS to the number of physical cores
os.environ["OMP_NUM_THREADS"] = str(int(physical_cores / 2))
os.environ["MKL_NUM_THREADS"] = str(int(physical_cores / 2))
os.environ["OPENBLAS_NUM_THREADS"] = str(int(physical_cores / 2))
os.environ["OPENBLAS_NUM_THREADS"] = str(int(physical_cores / 2))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

physical_cores 24


In [21]:
# load model from huggingface
model = AutoModel.from_pretrained('MahmoodLab/TITAN', trust_remote_code=True)
model = model.to(device)

In [22]:
# load task configs
with open('/home/user/wangtao/prov-gigapath/TITAN/datasets/config_tcga-ot.yaml', 'r') as file:
    task_config = yaml.load(file, Loader=yaml.FullLoader)
target = task_config['target']
label_dict = task_config['label_dict']

In [23]:
# load pre-extracted TITAN slide embeddings for TCGA
import pickle
from huggingface_hub import hf_hub_download
# slide_feature_path = hf_hub_download(
#     "MahmoodLab/TITAN",
#     filename="TCGA_TITAN_features.pkl",
# )
slide_feature_path = '/home/user/sngp/TCGA-OT/Patch512/TITAN/h5_files_slide/TCGA_TITAN_custom_features.pkl'
with open(slide_feature_path, 'rb') as file:
  data = pickle.load(file)
embeddings_df = pd.DataFrame({'slide_id': data['filenames'], 'embeddings': list(data['embeddings'][:])})
print("embeddings_df", embeddings_df.shape)

embeddings_df (11186, 2)


In [24]:
# tcga_slide_path = "/home/user/sngp/tcga_slides/slides"
# tcga_slide_list = os.listdir(tcga_slide_path)
# tcga_slide_name = [slide.strip('.svs') for slide in tcga_slide_list]
# embeddings_df = embeddings_df[embeddings_df['slide_id'].isin(tcga_slide_name)]

In [25]:
# load splits
train_split = pd.read_csv('/home/user/wangtao/prov-gigapath/TITAN/datasets/tcga-ot_train.csv')
# only change a value from TCGA-F5-6861-01Z-00-DX1.011B771B-F52E-412E-9352-1578349BEAF1 to TCGA-F5-6861-01Z-00-DX1.011B771B-F52E-412E-9352-1578349beaf1,
train_split["slide_id"] = train_split["slide_id"].str.replace("TCGA-F5-6861-01Z-00-DX1.011B771B-F52E-412E-9352-1578349beaf1", "TCGA-F5-6861-01Z-00-DX1.011B771B-F52E-412E-9352-1578349BEAF1")
train_df = pd.merge(embeddings_df, train_split, on='slide_id')
val_split = pd.read_csv('/home/user/wangtao/prov-gigapath/TITAN/datasets/tcga-ot_val.csv')
val_df = pd.merge(embeddings_df, val_split, on='slide_id')
test_split = pd.read_csv('/home/user/wangtao/prov-gigapath/TITAN//datasets/tcga-ot_test.csv')
test_df = pd.merge(embeddings_df, test_split, on='slide_id')
# Slide TCGA-F5-6861-01Z-00-DX1.011B771B-F52E-412E-9352-1578349BEAF1 not found in embeddings_df

In [26]:
print(f"len(train_df): {len(train_df)}, len(val_df): {len(val_df)}, len(test_df): {len(test_df)}")

len(train_df): 8226, len(val_df): 1612, len(test_df): 1348


In [27]:
train_data = np.stack(train_df.embeddings.values)
train_labels = train_df[target].apply(lambda x: label_dict[x]).values
val_data = np.stack(val_df.embeddings.values)
val_labels = val_df[target].apply(lambda x: label_dict[x]).values
test_data = np.stack(test_df.embeddings.values)
test_labels = test_df[target].apply(lambda x: label_dict[x]).values

In [28]:
print("train df", pd.DataFrame(train_labels).value_counts())
print("val df", pd.DataFrame(val_labels).value_counts())
print("test df", pd.DataFrame(test_labels).value_counts())

train df 0 
13    798
17    743
5     455
32    382
4     376
21    364
23    347
16    338
43    308
20    305
14    303
8     278
39    253
33    197
27    165
1     158
18    154
28    152
29    135
6     134
24    119
30    118
37    114
34    113
0     113
2     112
19    108
25     91
40     86
26     78
15     75
7      65
3      64
45     64
12     60
10     57
9      56
35     55
22     48
41     45
11     44
31     43
42     42
38     41
36     36
44     34
Name: count, dtype: int64
val df 0 
1     50
13    50
4     49
5     49
17    49
8     49
21    48
20    48
16    47
28    46
37    46
33    46
14    46
27    45
39    45
32    44
43    44
26    43
40    41
2     41
25    41
24    40
6     40
45    40
0     39
19    37
18    36
42    34
3     33
35    32
29    32
23    31
36    29
9     29
30    27
34    25
15    23
7     19
31    17
10    14
41    14
12    12
38    11
22    11
11    10
44    10
Name: count, dtype: int64
test df 0 
20    178
39    110
21    100
16     87
4

In [29]:
# log_spaced_values = np.logspace(np.log10(10e-2), np.log10(10e2), num=3)
results, outputs = train_and_evaluate_logistic_regression_with_val(train_data, train_labels, val_data, val_labels, test_data, test_labels, test_slide_id=None, test_patient_id=None, log_spaced_values=None)
# to use the default setting from our paper use the default value for searching C (log_spaced_values = np.logspace(np.log10(10e-6), np.log10(10e5), num=45))
# results = train_and_evaluate_logistic_regression_with_val(train_data, train_labels, val_data, val_labels, test_data, test_labels)
for key, value in results.items():
    print(f"{key.split('/')[-1]: <12}: {value:.4f}")

Finding best C: 100%|██████████| 45/45 [01:12<00:00,  1.61s/it]


Best C: 17.78279410038923
acc         : 0.7745
bacc        : 0.6919
kappa       : 0.8007
nw_kappa    : 0.8130
weighted_f1 : 0.7586
loss        : 0.6296
auroc       : 0.9897


In [120]:
# log_spaced_values = np.logspace(np.log10(10e-2), np.log10(10e2), num=3)
results, outputs = train_and_evaluate_logistic_regression_with_val(train_data, train_labels, val_data, val_labels, test_data, test_labels, test_slide_id=None, test_patient_id=None, log_spaced_values=[0.03162277660168379])
# to use the default setting from our paper use the default value for searching C (log_spaced_values = np.logspace(np.log10(10e-6), np.log10(10e5), num=45))
val_labels, test_data, test_labels)
for key, value in results.items():
    print(f"{key.split('/')[-1]: <12}: {value:.4f}")

val_results, test_results, outputs = train_and_evaluate_logistic_regression_with_both_metrics(
    train_data, train_labels, val_data, val_labels, None, None, test_data, test_labels,
    test_slide_id=None, test_patient_id=None, log_spaced_values=[0.03162277660168379]
)
# results = train_and_evaluate_logistic_regression_with_val(train_data, train_labels, val_data, val_labels, test_data, test_labels)

for key, value in val_results.items():
    print(f"VAL {key.split('/')[-1]: <12}: {value:.4f}")

for key, value in test_results.items():
    print(f"TEST {key.split('/')[-1]: <12}: {value:.4f}")

SyntaxError: unmatched ')' (2895622222.py, line 4)

In [23]:
bootstrap_kwargs = {'n': 1000, 'alpha': 0.95}
results_mean, results_std = bootstrap(results_dict=outputs, **bootstrap_kwargs)  # takes a while as 46 imbalanced classes are bootstrapped
for keys, values in results_mean.items():
    print(f"{keys.split('/')[-1]: <12}: {values:.4f} ± {results_std[keys]:.4f}")

100%|██████████| 1000/1000 [00:08<00:00, 119.22it/s]

acc         : 0.9042 ± 0.0212
bacc        : 0.9093 ± 0.0207
kappa       : 0.7957 ± 0.0450
nw_kappa    : 0.7957 ± 0.0450
weighted_f1 : 0.9053 ± 0.0207
loss        : 0.2684 ± 0.0542
auroc       : 0.9633 ± 0.0130





In [12]:
slide_feature_path

'/home/user/.cache/huggingface/hub/models--MahmoodLab--TITAN/snapshots/b2fb4f475256eb67c6e9ccbf2d6c9c3f25f20791/TCGA_TITAN_features.pkl'