In [1]:
import numpy as np
import pandas as pd
import pooch
import scanpy as sc

from cluster_genes import CalculateGeneDistance, ClusterGenes

In [2]:
# Figshare record (addressed by DOI) that holds the tutorial files.
EXAMPLE_DATA_DOI_URL = "doi:10.6084/m9.figshare.22716739.v1/"

# Pooch registry: files are fetched from the DOI above and stored in the
# per-user OS cache directory named "scverse_tutorials".
EXAMPLE_DATA = pooch.create(
    base_url=EXAMPLE_DATA_DOI_URL,
    path=pooch.os_cache("scverse_tutorials"),
)
# Populate the registry (file names and hashes) from the DOI record itself.
EXAMPLE_DATA.load_registry_from_doi()

In [3]:
# Resolve the 10x HDF5 file through the pooch registry, then read it with scanpy.
path = EXAMPLE_DATA.fetch("s1d1_filtered_feature_bc_matrix.h5")
adata = sc.read_10x_h5(path)

# Make variable (gene) names unique so label-based lookups are unambiguous.
adata.var_names_make_unique()

adata

  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


AnnData object with n_obs × n_vars = 8785 × 36601
    var: 'gene_ids', 'feature_types', 'genome', 'pattern', 'read', 'sequence'

In [4]:
# Basic QC thresholds used by the two filters below.
MIN_GENES_PER_CELL = 100
MIN_CELLS_PER_GENE = 3

# Both filters modify `adata` in place (their return values are not used).
sc.pp.filter_cells(adata, min_genes=MIN_GENES_PER_CELL)
sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE)

# Independent copy of the filtered object; despite the name, it is taken
# *after* the cell/gene filtering above.
raw_adata = adata.copy()

In [5]:
# Genes × cells expression table built from the `raw_adata` snapshot.
#
# The row/column labels are read from `raw_adata` as well (previously they came
# from `adata`), so the matrix and its labels always originate from the same
# object. Otherwise, re-running this cell after `adata` has been filtered or
# subset further would fail with a shape mismatch.
#
# NOTE(review): this materialises the full matrix as a dense array; if `dat`
# is not needed elsewhere, subsetting `raw_adata` to the genes of interest
# before densifying would use far less memory.
dat = pd.DataFrame(
    raw_adata.X.T.toarray() if not isinstance(raw_adata.X, np.ndarray) else raw_adata.X.T,
    index=raw_adata.var_names,       # genes
    columns=raw_adata.obs_names,     # cells
)

# Restrict to the three genes whose pairwise distances are computed afterwards.
filtered_dat = dat.loc[['IFNG', 'IGF1', 'LEF1']]

In [6]:
# Small hand-written genes × cells table (one row per gene, one column per cell).
test_matrix = pd.DataFrame(
    [
        [10, 0, 5, 0],  # GeneA
        [0, 3, 0, 7],   # GeneB
        [2, 3, 2, 3],   # GeneC
        [9, 1, 0, 0],   # GeneD
        [0, 0, 1, 9],   # GeneE
    ],
    index=["GeneA", "GeneB", "GeneC", "GeneD", "GeneE"],
    columns=["Cell1", "Cell2", "Cell3", "Cell4"],
)
test_matrix

Unnamed: 0,Cell1,Cell2,Cell3,Cell4
GeneA,10,0,5,0
GeneB,0,3,0,7
GeneC,2,3,2,3
GeneD,9,1,0,0
GeneE,0,0,1,9


In [7]:
# Pairwise gene-gene distance matrix for the selected genes, using the
# "jaccard" method of CalculateGeneDistance (imported in the top import cell).
dist = CalculateGeneDistance(filtered_dat, method="jaccard")
dist

Unnamed: 0,IFNG,IGF1,LEF1
IFNG,0.0,0.996721,0.979988
IGF1,0.996721,0.0,0.999318
LEF1,0.979988,0.999318,0.0


In [8]:
# Cluster the genes from the distance matrix (ClusterGenes is imported in the
# top import cell). With return_tree=True the recorded result is a tuple of
# (gene partition Series, linkage array).
#
# NOTE: in the recorded run all three pairwise distances are >= 0.98, the
# automatic cut height finds no merges below the cut, and the returned
# partition is empty. More genes are needed for a meaningful clustering.
gene_cl_res = ClusterGenes(dist, clustering_method="average", return_tree=True, deepSplit=1)
gene_cl_res # Could better display using a seaborn cluster heatmap

..cutHeight not given, setting it to 0.9978392779851702  ===>  99% of the (truncated) height range in dendro.
cutHeight set too low; no merges below the cut.


  med_pos = (gene_partition.index.to_series().map(label_pos).groupby(gene_partition).median().sort_values())


(Series([], dtype: category
 Categories (0, int64): []),
 array([[0.        , 2.        , 0.97998823, 2.        ],
        [1.        , 3.        , 0.99801959, 3.        ]]))