In [None]:
%load_ext autoreload
%autoreload 2

import os

import numpy as np
import pandas as pd
import scanpy as sc
import skimage.io as sio
from bidcell import BIDCellModel

In [2]:
import sys
# Extend the module search path with the scripts folder four levels up.
# This has to run before the import below, which is presumably resolved from that folder.
sys.path.append('../../../../scripts/')

from paths import get_data_paths

In [6]:
# Root of the spatial dataset: the 'human_ovarian_cancer' entry returned by get_data_paths
# for the shared data_mapping.yml. It is joined with `/` further down, so the value is
# expected to be a pathlib-style object rather than a plain string.
DATA_DIR = get_data_paths('../../../../data_mapping.yml')['human_ovarian_cancer']

## Prepare data

In [3]:
# Folder that collects every file prepared for BIDCell in this notebook
# (DAPI image, molecule table, single-cell reference, marker tables).
INPUT_DIR = f'{DATA_DIR}/seg_method_results/bidcell/input/'
# Create it from Python instead of shelling out: this also works when the path contains
# spaces or shell metacharacters, does not depend on a POSIX `mkdir`, and raises a
# regular exception if the directory cannot be created.
os.makedirs(INPUT_DIR, exist_ok=True)

In [4]:
# Transcript table of the spatial dataset. Its `feature_name` column is matched against the
# single-cell reference below and used to filter the molecules exported for BIDCell.
df = pd.read_parquet(DATA_DIR / 'transcripts.parquet')

In [None]:
# Load the morphology stack and export its first channel as the nuclear (DAPI) image for BIDCell.
# NOTE(review): indexing the last axis assumes a channel-last array. skimage.io.imread moves a
# leading 3-/4-channel axis to the end, which would make this hold for a (C, H, W) OME-TIFF —
# confirm the loaded shape and that channel 0 is DAPI.
stains = sio.imread(f'{DATA_DIR}/morphology_focus/morphology_focus_0000.ome.tif')
sio.imsave(f'{INPUT_DIR}/dapi.tif', stains[:,:, 0])

In [None]:
# Location of the matching single-cell RNA reference, taken from the same data mapping file.
RNA_PATH = get_data_paths('../../../../data_mapping.yml')['human_ovarian_cancer_rna']
annotation_csv = f'{RNA_PATH}/FLEX_Ovarian_Barcode_Cluster_Annotation.csv'
counts_h5 = f'{RNA_PATH}/17k_Ovarian_Cancer_scFFPE_count_filtered_feature_bc_matrix.h5'

# Annotation table, reduced to a Series mapping each barcode to its cell annotation.
cell_meta = pd.read_csv(annotation_csv)
cell_annot = cell_meta.set_index('Barcode')['Cell Annotation']

# Count matrix as a dense DataFrame (one row per barcode, one column per feature).
cm_full = sc.read_10x_h5(counts_h5).to_df()

In [7]:
# Genes present both in the spatial transcript table and in the single-cell reference.
panel_genes = set(df.feature_name.unique())
reference_genes = set(cm_full.columns)
common_genes = sorted(panel_genes & reference_genes)

# Keep only annotated barcodes (in annotation order) and the shared genes.
cm_full = cm_full.loc[cell_annot.index, common_genes]
cm_full.shape

(17050, 4912)

In [8]:
# Append the three metadata columns expected next to the gene counts:
#   atlas     - constant label, the reference consists of a single sample
#   cell_type - annotation looked up by barcode
#   ct_idx    - integer code per cell type; pd.factorize numbers the types in order of
#               first appearance, not in sorted order
cm_full = cm_full.assign(
    atlas='sample1',
    cell_type=cell_annot[cm_full.index],
    ct_idx=lambda frame: pd.factorize(frame['cell_type'])[0],
)

In [9]:
# Sum the counts of all cells that share (atlas, cell_type, ct_idx) into one pseudo-bulk row.
group_keys = ['atlas', 'cell_type', 'ct_idx']
bulk_counts = cm_full.groupby(group_keys).sum()

# Scale every pseudo-bulk row to a total of 10,000 and apply log1p.
genes_by_group = bulk_counts.T
bulk_fraction = (genes_by_group / genes_by_group.sum()).T
cm_bulk = np.log1p(10000 * bulk_fraction).reset_index()

# reset_index puts the three key columns first; move them behind the gene columns
# in the order ct_idx, cell_type, atlas.
trailing_meta = ['ct_idx', 'cell_type', 'atlas']
cm_bulk = pd.concat([cm_bulk.iloc[:, 3:], cm_bulk[trailing_meta]], axis=1)

In [10]:
# Save the pseudo-bulk reference. The default index is written as the first CSV column;
# the last three columns are ct_idx, cell_type and atlas.
cm_bulk.to_csv(f'{INPUT_DIR}/sc_ref.csv')

In [None]:
# Export only the transcripts whose gene is also covered by the single-cell reference.
in_reference = df.feature_name.isin(common_genes)
df[in_reference].to_csv(f'{INPUT_DIR}/molecules.csv', index=False)

In [None]:
!gzip $INPUT_DIR/molecules.csv

### Find markers

The marker-selection code below is adapted from https://github.com/SydneyBioX/BIDCell/issues/9.

In [None]:
# Settings for marker selection. numpy and pandas are already imported at the top of the
# notebook; natsort is only needed for sorting the cell-type names.
import natsort

# A gene flagged as negative / positive marker for more than this many cell types
# is dropped from the respective marker table.
max_overlaps_neg = 15
max_overlaps_pos = 4

# The pseudo-bulk reference built above: gene columns followed by the three
# metadata columns (ct_idx, cell_type, atlas), hence the "- 3".
ref_df = cm_bulk
n_genes = ref_df.shape[1] - 3
print("Ref data shape", ref_df.shape)

In [None]:
# Quick look at the reference cell types (naturally sorted by name).
# The marker matrices themselves are computed in the following cell, which derives every
# intermediate it needs on its own, so no further work is done here.
cell_types = natsort.natsorted(set(ref_df["cell_type"]))
n_cell_types = len(cell_types)
print(cell_types)

In [None]:
def _marker_matrix(expr, ct_idx, n_cell_types, percentile, max_overlaps, low):
    """Build a (cell type x gene) marker matrix of 0/1 flags.

    Parameters
    ----------
    expr : 2-D array, one row per reference row and one column per gene.
    ct_idx : 1-D integer array with the cell-type code of every row of ``expr``.
    n_cell_types : number of cell types; output row ``i`` describes ``ct_idx == i``.
    percentile : per-row percentile (computed across genes) used as the cut-off.
    max_overlaps : a gene flagged for more than this many cell types is cleared everywhere.
    low : if True flag genes at or below the cut-off, otherwise at or above it.

    Returns
    -------
    Float array of shape ``(n_cell_types, n_genes)`` containing 0.0 / 1.0.
    """
    cutoff = np.percentile(expr, percentile, axis=1, keepdims=True)
    flagged = (expr <= cutoff) if low else (expr >= cutoff)

    markers = np.zeros((n_cell_types, expr.shape[1]))
    for ct in range(n_cell_types):
        # A gene counts for a cell type only if every reference row of that type flags it.
        markers[ct] = flagged[ct_idx == ct].all(axis=0)

    # Genes flagged for too many cell types are not discriminative; clear them everywhere.
    markers[:, markers.sum(axis=0) > max_overlaps] = 0
    return markers


ref_expr = ref_df.iloc[:, :n_genes].to_numpy()
gene_names = ref_df.columns[:n_genes]
ct_idx = ref_df["ct_idx"].to_numpy()

# Row i of the marker matrices is built from the reference rows with ct_idx == i, so the
# row labels have to follow ct_idx order. Sorting the names independently (e.g. naturally
# by name) attaches the wrong cell type to a row whenever the two orders differ.
ct_names = (
    ref_df.drop_duplicates("ct_idx")
    .set_index("ct_idx")["cell_type"]
    .sort_index()
)
n_cell_types = len(ct_names)
if list(ct_names.index) != list(range(n_cell_types)):
    raise ValueError("ct_idx must enumerate the cell types as 0..n_cell_types-1 without gaps")
cell_types = ct_names.tolist()
print(cell_types)

# Negative markers: genes in the bottom 10% of expression for a cell type.
low_expr_true_agg = _marker_matrix(
    ref_expr, ct_idx, n_cell_types, percentile=10, max_overlaps=max_overlaps_neg, low=True
)
df_neg = pd.DataFrame(low_expr_true_agg, index=cell_types, columns=gene_names)

# Positive markers: genes in the top 10% of expression for a cell type.
high_expr_true_agg = _marker_matrix(
    ref_expr, ct_idx, n_cell_types, percentile=90, max_overlaps=max_overlaps_pos, low=False
)
df_pos = pd.DataFrame(high_expr_true_agg, index=cell_types, columns=gene_names)

In [None]:
# Save the positive and negative marker tables next to the other BIDCell inputs
# (rows: cell types, columns: genes, values: 0/1 flags).
df_pos.to_csv(f'{INPUT_DIR}/sc_pos_markers.csv')
df_neg.to_csv(f'{INPUT_DIR}/sc_neg_markers.csv')

## Run BIDCell

It is usually more convenient to run [`run_bidcell.py`](./run_bidcell.py) from the command line than to execute the cells in this section.

In [None]:
# Build the model from xenium_config.yaml; the path is relative, so the file has to sit in
# the notebook's working directory.
model = BIDCellModel("xenium_config.yaml")

In [None]:
import torch # Otherwise BIDCell fails version check with conda torch
# Keep only the first three dot-separated components of the version string
# (e.g. a fourth ".post…" component is dropped) before any BIDCell stage is run.
torch.__version__ = '.'.join(torch.__version__.split('.')[:3])

In [None]:
# Preprocessing stage of the model configured above.
model.preprocess()

In [None]:
# Training stage of the model configured above.
model.train()

In [None]:
# Prediction stage of the model configured above.
model.predict()

## Check results

In [None]:
# Show the test output directory recorded in the model configuration.
model.config.experiment_dirs.test_output_dir

In [None]:
# NOTE(review): the run folder (2024_11_12_06_37_26) and the checkpoint name
# (epoch_10_step_120) are hard-coded, so this cell only works for that particular run;
# adjust both after re-running BIDCell.
im = sio.imread(f'{DATA_DIR}/output/model_outputs/2024_11_12_06_37_26/test_output/epoch_10_step_120_connected.tif')
# Largest value in the image — presumably the highest cell label; confirm.
print(im.max())
sio.imshow(im)