In [None]:
from huggingface_hub import hf_hub_download
import pandas as pd

# Fetch the sample metadata table from the Hugging Face dataset repository;
# hf_hub_download returns the path of the downloaded local file.
table = hf_hub_download(
    repo_id="rendeirolab/lazyslide-data",
    filename="GTEx_artery_dataset.csv.gz",
    repo_type="dataset",
)

# One row per tissue sample (sample id, sex, age bracket, pathology category).
dataset = pd.read_csv(table)
dataset.head()

Unnamed: 0,Tissue Sample Id,Sex,Age Bracket,Pathology Categories
0,GTEX-111YS-2226,male,60-69,calcification
1,GTEX-11GSP-2926,female,60-69,calcification
2,GTEX-11LCK-1426,male,30-39,clean_specimens
3,GTEX-11ONC-2726,male,60-69,calcification
4,GTEX-12126-0726,male,20-29,clean_specimens


In [None]:
# Text prompts (vascular anatomy and calcification-related vocabulary) that are
# embedded with the CONCH text encoder inside `wsi_feature_extraction` and
# compared against every tile of each slide.
terms = [
    "BMP-2",
    "Monckeberg sclerosis",
    "Runx2",
    "adventitia",
    "apoptosis",
    "arterial hardening",
    "arterial narrowing",
    "arterial remodeling",
    "arterial stiffness",
    "arteriole",
    "artery",
    "atherosclerosis",
    "basement membrane",
    "blood flow",
    "bone morphogenetic protein",
    "calcification",
    "calcified nodule",
    "calcium deposition",
    "calcium phosphate",
    "chronic kidney disease",
    "collagen",
    "compliance",
    "connective tissue",
    "elastic fibers",
    "elasticity",
    "endothelial dysfunction",
    "endothelium",
    "epithelium",
    "external elastic lamina",
    "extracellular matrix",
    "fibroblast",
    "fibrosis",
    "fibrous cap",
    "gap junction",
    "hemodynamics",
    "hydroxyapatite",
    "hyperphosphatemia",
    "inflammation",
    "internal elastic lamina",
    "interstitial space",
    "intima",
    "intimal calcification",
    "intimal thickening",
    "ischemia",
    "lamina propria",
    "lumen",
    "macrocalcification",
    "macrophage",
    "matrix vesicle",
    "mechanotransduction",
    "media",
    "medial calcification",
    "microcalcification",
    "mineralization",
    "myofibroblast",
    "necrotic core",
    "osteoblast-like cell",
    "osteocalcin",
    "osteogenic",
    "osteopontin",
    "oxidative stress",
    "pericyte",
    "phosphate transporter",
    "plaque",
    "shear stress",
    "smooth muscle",
    "tight junction",
    "tunica",
    "vasa vasorum",
    "vascular basement membrane",
    "vascular compliance",
    "vascular integrity",
    "vascular niche",
    "vascular ossification",
    "vascular remodeling",
    "vascular smooth muscle cell",
    "vascular stiffness",
    "vascular tone",
    "vascular wall",
]

In [None]:
from wsidata import open_wsi
import lazyslide as zs


def wsi_feature_extraction(slide):
    """Run the CONCH feature pipeline on one GTEx artery slide and persist it.

    Steps, in order: download ``gtex_artery_data/{slide}.svs`` from the
    ``rendeirolab/lazyslide-data`` dataset repository, open it as a WSI,
    segment tissue, tile it, extract and aggregate CONCH features, score the
    tiles against the module-level ``terms`` prompts, and write the result.

    Parameters
    ----------
    slide : str
        Tissue sample id; used to build the remote ``.svs`` file name.

    Returns
    -------
    None
        Results are persisted through ``wsi.write()``; later cells read them
        back from ``data/<slide>.zarr``.
    """
    # Download the slide (hf_hub_download returns the local file path).
    slide_path = hf_hub_download(
        repo_id="rendeirolab/lazyslide-data",
        filename=f"gtex_artery_data/{slide}.svs",
        repo_type="dataset",
    )
    wsi = open_wsi(slide_path, attach_thumbnail=False, store="data")

    # Tissue segmentation followed by tiling (256 px tiles requested at 0.5 mpp).
    zs.pp.find_tissues(wsi)
    zs.pp.tile_tissues(wsi, 256, mpp=0.5, background_fraction=0.5)

    # CONCH image features per tile, then the slide-level aggregation.
    zs.tl.feature_extraction(wsi, "conch", pbar=False)
    zs.tl.feature_aggregation(wsi, "conch")

    # Embed the text prompts with the same model and score them against the tiles.
    term_embeddings = zs.tl.text_embedding(terms, "conch")
    zs.tl.text_image_similarity(wsi, term_embeddings, "conch")

    wsi.write()

In [None]:
# Serial baseline: process every slide one after another in this kernel.
# The Dask cells further down submit the same function once per slide to a cluster.
for slide in dataset["Tissue Sample Id"]:
    wsi_feature_extraction(slide)

Here are code snippets for running the extraction on different architectures. Pick the one that matches your setup.

Run local with CPUs:

```python
from dask.distributed import LocalCluster
cluster = LocalCluster()
```

Run local with many GPUs:

```python
from dask_cuda import LocalCUDACluster
cluster = LocalCUDACluster()
```

Run on a SLURM cluster with GPUs (example script; the queue name and GPU type will likely need adjusting for your cluster):

```python
from dask_jobqueue import SLURMCluster

cluster = SLURMCluster(
    queue="gpu",
    cores=8,
    processes=1,
    memory="20 GB",
    # For SLURM, use --gres flag to get GPU
    job_extra_directives=["--gres=gpu:h100pcie:1"],
    # Each worker declares exactly one GPU resource
    worker_extra_args=["--resources GPU=1"],
)
```

In [None]:


# Option: local cluster on CPUs. Every cluster cell rebinds `cluster`;
# `Client(cluster)` below uses whichever one ran last.
# NOTE(review): the submission cell requests resources={"GPU": 1}; a default
# LocalCluster() declares no such resource here, so tasks would presumably
# never be scheduled — confirm before using this option.
from dask.distributed import LocalCluster
cluster = LocalCluster()



In [None]:
# Option: local cluster for a machine with several GPUs.
# NOTE(review): the submission cell requests resources={"GPU": 1}; no "GPU"
# resource is declared on this cluster here — confirm that tasks get scheduled.
from dask_cuda import LocalCUDACluster
cluster = LocalCUDACluster()


In [None]:
from dask_jobqueue import SLURMCluster

# Option: generic SLURM example; one worker process per job.
cluster = SLURMCluster(
    queue="gpu",
    cores=8,
    processes=1,
    memory="20 GB",
    # On SLURM, a GPU is requested through the --gres directive
    job_extra_directives=["--gres=gpu:h100pcie:1"],
    # Each worker declares exactly one "GPU" resource, matching the
    # resources={"GPU": 1} requested when the tasks are submitted
    worker_extra_args=["--resources GPU=1"],
)

In [None]:
from dask_jobqueue import SLURMCluster

# Option: site-specific SLURM configuration. This is the last cell that binds
# `cluster`, so it is the one used when the notebook is run top to bottom.
cluster = SLURMCluster(
    queue="gpu",
    cores=8,
    processes=1,
    memory="20 GB",
    # Network interface name is site-specific — NOTE(review): presumably the
    # InfiniBand link on this cluster; adjust for yours
    interface="ib1",
    # Extra job directives: GPU type/count and a 2-hour wall-time limit
    job_extra_directives=["-q gpu", "--gres=gpu:l4_gpu:1", "--time=2:00:00"],
    # Each worker declares exactly one "GPU" resource, matching the
    # resources={"GPU": 1} requested when the tasks are submitted
    worker_extra_args=["--resources GPU=1"],
    # Directory passed for job logs
    log_directory="./dask-logs",
)

In [None]:
from dask.distributed import Client

# Connect to whichever `cluster` was created above.
client = Client(cluster)
# Adaptive scaling with a minimum of 1 and a maximum of 10.
cluster.adapt(minimum=1, maximum=10)

<distributed.deploy.adaptive.Adaptive at 0x1555114dd700>

In [None]:
# Show the client summary (connection method, dashboard link, worker counts).
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://10.110.89.41:8787/status,

0,1
Dashboard: http://10.110.89.41:8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.110.89.41:36261,Workers: 0
Dashboard: http://10.110.89.41:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [None]:
# Submit one task per slide. Each task asks for one unit of the "GPU" resource,
# so it can only run on a worker that declares that resource.
futures = [
    client.submit(wsi_feature_extraction, sample_id, resources={"GPU": 1})
    for sample_id in dataset["Tissue Sample Id"]
]

In [None]:
import warnings

from dask.distributed import as_completed
from tqdm.auto import tqdm

# Map each future's key back to its slide id so failures can be reported by name
# (`futures` was built in the same order as dataset["Tissue Sample Id"]).
slide_by_key = {
    future.key: slide for future, slide in zip(futures, dataset["Tissue Sample Id"])
}

# Wait for all tasks while showing progress. as_completed also yields futures
# that errored or were cancelled, so record those instead of silently ignoring
# them; otherwise the failure only shows up later as a missing store.
failed_slides = {}
for future in tqdm(as_completed(futures), total=len(futures)):
    if future.status == "error":
        failed_slides[slide_by_key[future.key]] = repr(future.exception())
    elif future.status != "finished":
        failed_slides[slide_by_key[future.key]] = future.status

if failed_slides:
    warnings.warn(
        f"{len(failed_slides)} of {len(futures)} slides did not finish: {failed_slides}",
        RuntimeWarning,
    )

  0%|          | 0/45 [00:00<?, ?it/s]

In [None]:
# All tasks are done: shut down the Dask scheduler and workers this client is connected to.
client.shutdown()

In [None]:
from pathlib import Path
from anndata import read_zarr

# Collect, for every processed slide store under data/, one top-k (k=100)
# score per text term from the tile/text similarity table.
similarity_table = Path("tables") / "conch_tiles_text_similarity"

slide_scores = {}
for store in Path("data").glob("*.zarr"):
    adata = read_zarr(store / similarity_table)
    scores = zs.metrics.topk_score(adata, k=100)
    # Key by the store name without ".zarr"; terms come from the table's var index.
    slide_scores[store.stem] = {
        term: score for term, score in zip(adata.var.index, scores)
    }

In [None]:
# Turn the {slide: {term: score}} mapping into a slides x terms table.
# The guard keeps this cell idempotent: it rebinds `slide_scores` from its own
# value, so without it a second run would transpose the table back to terms x slides.
if not isinstance(slide_scores, pd.DataFrame):
    slide_scores = pd.DataFrame(slide_scores).T

In [None]:
from wsidata import agg_wsi

# Point each sample at the store written by `wsi_feature_extraction`.
dataset["store"] = [
    f"data/{sample_id}.zarr" for sample_id in dataset["Tissue Sample Id"]
]

# Gather the aggregated CONCH features of all slides into one AnnData object.
agg_data = agg_wsi(
    dataset,
    "conch",
    store_col="store",
    agg_key="agg_slide",
)

# Attach the per-term scores as extra obs columns, matched on the sample id.
agg_data.obs = agg_data.obs.join(slide_scores, on="Tissue Sample Id")
agg_data

AnnData object with n_obs × n_vars = 45 × 512
    obs: 'Tissue Sample Id', 'Sex', 'Age Bracket', 'Pathology Categories', 'store', 'BMP-2', 'Monckeberg sclerosis', 'Runx2', 'adventitia', 'apoptosis', 'arterial hardening', 'arterial narrowing', 'arterial remodeling', 'arterial stiffness', 'arteriole', 'artery', 'atherosclerosis', 'basement membrane', 'blood flow', 'bone morphogenetic protein', 'calcification', 'calcified nodule', 'calcium deposition', 'calcium phosphate', 'chronic kidney disease', 'collagen', 'compliance', 'connective tissue', 'elastic fibers', 'elasticity', 'endothelial dysfunction', 'endothelium', 'epithelium', 'external elastic lamina', 'extracellular matrix', 'fibroblast', 'fibrosis', 'fibrous cap', 'gap junction', 'hemodynamics', 'hydroxyapatite', 'hyperphosphatemia', 'inflammation', 'internal elastic lamina', 'interstitial space', 'intima', 'intimal calcification', 'intimal thickening', 'ischemia', 'lamina propria', 'lumen', 'macrocalcification', 'macrophage', 'mat

In [None]:
# Save the aggregated features plus metadata and term scores to an .h5ad file.
agg_data.write_h5ad("agg_conch_features.h5ad")