In [1]:
import json
import numpy as np


def load_cell_features(data_path="/playpen/jesse/HE_IF/graph_comparison/cell_feature"):
    """Load the per-cell arrays and the split definition stored under ``data_path``.

    Two files are read from the directory:

    * ``cell_features.npz`` -- must contain the arrays ``cell_ids``,
      ``features`` and ``coords``.
    * ``data_splits.json`` -- parsed with ``json.load`` and returned as-is.

    Args:
        data_path: Directory containing both files. Anything that formats to
            a path string works, since the file names are appended with an
            f-string.

    Returns:
        Tuple ``(cell_ids, features, coords, data_splits)``: the three arrays
        from the archive followed by the decoded JSON object.

    Raises:
        FileNotFoundError: If either file is missing.
        KeyError: If the archive lacks one of the three expected arrays.
    """
    # np.load on an .npz returns a lazy NpzFile holding an open file handle.
    # Each array is fully read on key access, so the archive can be closed
    # deterministically as soon as the three arrays have been pulled out.
    with np.load(f"{data_path}/cell_features.npz") as feature_data:
        cell_ids = feature_data['cell_ids']
        features = feature_data['features']
        coords = feature_data['coords']

    with open(f"{data_path}/data_splits.json", 'r') as f:
        data_splits = json.load(f)

    return cell_ids, features, coords, data_splits

In [2]:
import pandas as pd


# Location of the feature archive. Not referenced again in this cell; the
# loader below is pointed at `data_path` instead.
cell_feature = "/playpen/jesse/HE_IF/graph_comparison/cell_feature/cell_features.npz"

# Directory handed to the loader (relative to the notebook's working directory).
data_path = "./cell_feature"
cell_ids, features, coords, data_splits = load_cell_features(data_path=data_path)

# Per-cell table that supplies the biomarker columns listed below.
csv_file = "/playpen/jesse/HE_IF/graph_comparison/cell_feature/CRC03_new_coordinates.csv"
df = pd.read_csv(csv_file)

# Columns of `df` to select, in order: Hoechst1..Hoechst9 followed by the
# remaining 27 named channels (36 columns in total).
biomarker_cols = [f"Hoechst{n}" for n in range(1, 10)] + [
    'A488',
    'CD3', 'Ki67', 'CD4', 'CD20', 'CD163', 'Ecadherin',
    'LaminABC', 'PCNA',
    'A555',
    'NaKATPase', 'Keratin', 'CD45', 'CD68', 'FOXP3', 'Vimentin', 'Desmin',
    'Ki67_570',
    'A647',
    'CD45RO', 'aSMA', 'PD1', 'CD8a', 'PDL1', 'CDX2', 'CD31', 'Collagen',
]

In [None]:
from sklearn.preprocessing import StandardScaler

# Row indices for each split, coerced to plain Python ints.
train_indices = [int(idx) for idx in data_splits['train']]
val_indices = [int(idx) for idx in data_splits['val']]
test_indices = [int(idx) for idx in data_splits['test']]

# Targets: log(1 + x) of the selected biomarker columns, sliced per split.
biomarker_data = np.log1p(df[biomarker_cols].values)

train_targets, val_targets, test_targets = (
    biomarker_data[rows] for rows in (train_indices, val_indices, test_indices)
)

print(f"Train: {len(train_indices)}, Val: {len(val_indices)}, Test: {len(test_indices)} cells")

# Standardise the features: the scaler is fitted on the training rows only and
# then applied unchanged to the validation and test rows.
scaler = StandardScaler()
train_features = scaler.fit_transform(features[train_indices])
val_features, test_features = (
    scaler.transform(features[rows]) for rows in (val_indices, test_indices)
)

# Coordinates are sliced per split without any rescaling.
train_coords, val_coords, test_coords = (
    coords[rows] for rows in (train_indices, val_indices, test_indices)
)

Train: 451101, Val: 56388, Test: 56388 cells


In [7]:
# Shapes of the scaled feature matrices, in train / val / test order.
tuple(split.shape for split in (train_features, val_features, test_features))


((451101, 768), (56388, 768), (56388, 768))

In [9]:
# Shapes of the coordinate arrays, in train / val / test order.
tuple(split.shape for split in (train_coords, val_coords, test_coords))

((451101, 2), (56388, 2), (56388, 2))

In [8]:
# Shapes of the log-transformed target arrays, in train / val / test order.
tuple(split.shape for split in (train_targets, val_targets, test_targets))

((451101, 36), (56388, 36), (56388, 36))