In [1]:
import os,sys,time
from loguru import logger 

import scimap as sm

import numpy as np
import pandas as pd
import shapely
import geopandas as gpd
import anndata as ad

import pyarrow
import ast

#plotting
import matplotlib.pyplot as plt
import seaborn as sns

#custom functions
sys.path.append(os.path.abspath('/Users/jnimoca/Jose_BI/1_Pipelines/openDVP/src/'))
import opendvp.anndata_utils
import opendvp.filtering
import importlib

# Record the name and version of each core library used in this run.
for library in (np, pd, gpd, ad, sm):
    print(library.__name__, library.__version__)

Running SCIMAP  2.2.11
numpy 1.26.4
pandas 2.2.3
geopandas 1.0.1
anndata 0.10.9
scimap 2.2.11


In [4]:
# Load the concatenated AnnData checkpoint (step "0_concat" under perSample/991_992).
adata = ad.read_h5ad(
    "../data/perSample/991_992/0_concat/20250303_1336_0_concat_adata.h5ad"
)


Observation names are not unique. To make them unique, call `.obs_names_make_unique`.



In [5]:
# Loading the file warned that observation names are not unique;
# deduplicate them on the existing object before running the spatial analysis.
adata.obs_names_make_unique()

In [7]:
# Sweep the neighbourhood size used by spatial LDA and k-means-cluster each
# result, so the clusterings can be compared across knn values afterwards.
# Each iteration takes minutes on this dataset (see the timing logs below).
knn_list = [7,14,21,30,40,50]
n_clusters = 7  # k for k-means; also embedded in the obs column name below

for knn_number in knn_list:
    logger.info(f"Processing knn={knn_number}")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during these long-running iterations.
    start = time.perf_counter()

    # Build the label once; it names the spatial LDA result (stored in
    # adata.uns) and is passed as df_name to the clustering step.
    lda_label = f'spatial_lda_knn{knn_number}'

    adata = sm.tl.spatial_lda(adata,
                              method='knn',
                              knn=knn_number,
                              label=lda_label)

    # Cluster labels end up in adata.obs as e.g. 'spatial_lda_knn7_kmeans_k7'.
    adata = sm.tl.spatial_cluster(adata,
                                  df_name=lda_label,
                                  method='kmeans',
                                  k=n_clusters,
                                  label=f'{lda_label}_kmeans_k{n_clusters}')

    logger.info(f"Done in {time.perf_counter()-start} seconds")

[32m2025-03-03 15:55:02.641[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing knn=7[0m


Processing: [991]
Identifying the 7 nearest neighbours for every cell
Processing: [992]
Identifying the 7 nearest neighbours for every cell
Pre-Processing Spatial LDA
Training Spatial LDA
Calculating the Coherence Score

Coherence Score:  0.3178528435660482
Gathering the latent weights
Kmeans clustering


[32m2025-03-03 15:57:25.338[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mDone in 142.69472813606262 seconds[0m
[32m2025-03-03 15:57:25.339[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing knn=14[0m


Processing: [991]
Identifying the 14 nearest neighbours for every cell
Processing: [992]
Identifying the 14 nearest neighbours for every cell
Pre-Processing Spatial LDA
Training Spatial LDA
Calculating the Coherence Score

Coherence Score:  0.34420324107822475
Gathering the latent weights
Kmeans clustering


[32m2025-03-03 16:00:48.351[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mDone in 203.0119869709015 seconds[0m
[32m2025-03-03 16:00:48.352[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing knn=21[0m


Processing: [991]
Identifying the 21 nearest neighbours for every cell
Processing: [992]
Identifying the 21 nearest neighbours for every cell
Pre-Processing Spatial LDA
Training Spatial LDA
Calculating the Coherence Score

Coherence Score:  0.3516907772022454
Gathering the latent weights
Kmeans clustering


[32m2025-03-03 16:04:48.706[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mDone in 240.3536880016327 seconds[0m
[32m2025-03-03 16:04:48.706[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing knn=30[0m


Processing: [991]
Identifying the 30 nearest neighbours for every cell
Processing: [992]
Identifying the 30 nearest neighbours for every cell
Pre-Processing Spatial LDA
Training Spatial LDA
Calculating the Coherence Score

Coherence Score:  0.35583355796432947
Gathering the latent weights
Kmeans clustering


[32m2025-03-03 16:09:31.288[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mDone in 282.58131408691406 seconds[0m
[32m2025-03-03 16:09:31.289[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing knn=40[0m


Processing: [991]
Identifying the 40 nearest neighbours for every cell
Processing: [992]
Identifying the 40 nearest neighbours for every cell
Pre-Processing Spatial LDA
Training Spatial LDA
Calculating the Coherence Score

Coherence Score:  0.3577975141441595
Gathering the latent weights
Kmeans clustering


[32m2025-03-03 16:15:07.620[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mDone in 336.3305866718292 seconds[0m
[32m2025-03-03 16:15:07.621[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m4[0m - [1mProcessing knn=50[0m


Processing: [991]
Identifying the 50 nearest neighbours for every cell
Processing: [992]
Identifying the 50 nearest neighbours for every cell
Pre-Processing Spatial LDA
Training Spatial LDA
Calculating the Coherence Score

Coherence Score:  0.3593188273293075
Gathering the latent weights
Kmeans clustering


[32m2025-03-03 16:38:11.138[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m18[0m - [1mDone in 1383.5169188976288 seconds[0m


In [9]:
# Show the AnnData summary to check which obs columns and uns entries the sweep added.
adata

AnnData object with n_obs × n_vars = 1615233 × 8
    obs: 'CellID', 'Y_centroid', 'X_centroid', 'Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity', 'Orientation', 'Extent', 'Solidity', 'artefact', 'Area_filter_nottoobig', 'Area_filter_nottoolow', 'Area_filter', 'DAPI_ratio', 'DAPI_ratio_pass_nottoolow', 'DAPI_ratio_pass_nottoohigh', 'DAPI_ratio_pass', 'filtering', 'imageid', 'phenotype', 'cell_id', 'spatial_lda_knn7_kmeans_k7', 'spatial_lda_knn14_kmeans_k7', 'spatial_lda_knn21_kmeans_k7', 'spatial_lda_knn30_kmeans_k7', 'spatial_lda_knn40_kmeans_k7', 'spatial_lda_knn50_kmeans_k7'
    uns: 'spatial_lda_knn7', 'spatial_lda_knn7_probability', 'spatial_lda_knn14', 'spatial_lda_knn14_probability', 'spatial_lda_knn21', 'spatial_lda_knn21_probability', 'spatial_lda_knn30', 'spatial_lda_knn30_probability', 'spatial_lda_knn40', 'spatial_lda_knn40_probability', 'spatial_lda_knn50', 'spatial_lda_knn50_probability'
    layers: 'log'

In [None]:
import opendvp.anndata_utils

In [11]:
# Persist the annotated object as checkpoint "1_spatial_LDA"; per the log output
# of this cell, the helper writes both an h5ad file and a parquet file.
opendvp.anndata_utils.save_adata_checkpoint(
    adata,
    path_to_dir="../data/perSample/991_992",
    checkpoint_name="1_spatial_LDA",
)

[32m2025-03-03 16:54:17.626[0m | [1mINFO    [0m | [36mopendvp.anndata_utils[0m:[36msave_adata_checkpoint[0m:[36m73[0m - [1mWriting h5ad[0m
[32m2025-03-03 16:54:22.675[0m | [32m[1mSUCCESS [0m | [36mopendvp.anndata_utils[0m:[36msave_adata_checkpoint[0m:[36m75[0m - [32m[1mWrote h5ad file[0m
[32m2025-03-03 16:54:22.676[0m | [1mINFO    [0m | [36mopendvp.anndata_utils[0m:[36msave_adata_checkpoint[0m:[36m82[0m - [1mWriting parquet[0m
[32m2025-03-03 16:54:23.129[0m | [32m[1mSUCCESS [0m | [36mopendvp.anndata_utils[0m:[36msave_adata_checkpoint[0m:[36m84[0m - [32m[1mWrote parquet file[0m


In [12]:
# Show the AnnData summary again after the checkpoint was written.
adata

AnnData object with n_obs × n_vars = 1615233 × 8
    obs: 'CellID', 'Y_centroid', 'X_centroid', 'Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity', 'Orientation', 'Extent', 'Solidity', 'artefact', 'Area_filter_nottoobig', 'Area_filter_nottoolow', 'Area_filter', 'DAPI_ratio', 'DAPI_ratio_pass_nottoolow', 'DAPI_ratio_pass_nottoohigh', 'DAPI_ratio_pass', 'filtering', 'imageid', 'phenotype', 'cell_id', 'spatial_lda_knn7_kmeans_k7', 'spatial_lda_knn14_kmeans_k7', 'spatial_lda_knn21_kmeans_k7', 'spatial_lda_knn30_kmeans_k7', 'spatial_lda_knn40_kmeans_k7', 'spatial_lda_knn50_kmeans_k7'
    uns: 'spatial_lda_knn7', 'spatial_lda_knn7_probability', 'spatial_lda_knn14', 'spatial_lda_knn14_probability', 'spatial_lda_knn21', 'spatial_lda_knn21_probability', 'spatial_lda_knn30', 'spatial_lda_knn30_probability', 'spatial_lda_knn40', 'spatial_lda_knn40_probability', 'spatial_lda_knn50', 'spatial_lda_knn50_probability'
    layers: 'log'