```bash
# Convert this notebook to a plain Python script and run it in the background.
{
# `conda activate` with no argument: presumably selects the default (base)
# environment that provides `jupyter nbconvert` -- confirm on the target host.
conda activate
cd ~/link/res_publish/run
jupyter nbconvert info_model_gene.ipynb --to python

# Run the exported script inside the `publish` environment; stdout goes to
# the file `nohup_info_model_gene`.
conda activate publish
nohup python info_model_gene.py > nohup_info_model_gene &
# Wait 30 s, then delete the exported script (the background job is assumed
# to have loaded it by then).
sleep 30 && rm info_model_gene.py
conda activate
clear
}
# List background jobs to confirm the run is still alive.
jobs
```

In [1]:
import sys
from pathlib import Path
# Project root directory; register it on sys.path exactly once
# (presumably so the project-local imports in the next cell resolve).
p_root = Path('~/link/res_publish').expanduser()
if str(p_root) not in sys.path:
    sys.path.append(str(p_root))

In [2]:
from func import *

import utils as ut
from utils.general import *
pl = ut.pl

# Result directory used by this notebook (created on demand, parents included).
p_res_model_gene = p_res / 'model_gene'
p_res_model_gene.mkdir(parents=True, exist_ok=True)

2024-08-19 02:05:17.713959: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.



-------------------------func_help-------------------------
> parameter
    p_root	[name] res_publish
        p_run, p_plot, p_res, p_cache, p_pdf
    p_df_varmap
    map_sp_reverse
    rng
> run
    run_cross_species_models
    h5ad_to_mtx
    load_adata
    get_path_varmap
    find_path_from_para
    load_normalized_adata

> res
    get_test_result_df
    get_res_obs
    get_adata_umap
    show_umap

> plot
    get_color_map
    show_color_map
    show_color
    plot_umap
    savefig



In [3]:
# Short aliases for the names exposed by csMAHN.pp that the cells below use.
Tensor, torch = csMAHN.pp.Tensor, csMAHN.pp.torch
with Block("run_csMAHN_info_model_gene"):
    def run_csMAHN_info_model_gene(
        path_adata1,
        path_adata2,
        key_class1,
        key_class2,
        sp1,
        sp2,
        tissue_name,
        path_varmap,
        aligned=False,
        resdir_tag=".",
        resdir=Path('.'),
        limite_func=lambda adata1, adata2: (adata1, adata2),
        **kvargs
    ):
        """
        Run only the csMAHN graph-preprocessing stage for one reference/query pair.

        version = 0.0.9

        The function loads both datasets, optionally restricts them to shared
        cell types, reads the homology table, writes ``kvargs.json`` into the
        result directory and then calls ``csMAHN.pp.process_for_graph`` and
        ``csMAHN.pp.make_graph`` with ``resdir`` forwarded. No model is trained.
        The run is skipped when ``<resdir>/info_mode_gene.json`` already exists
        and contains every expected key; that file is not written here, it is
        expected to be produced by the ``csMAHN.pp`` functions called below.

        Parameters
        ----------
        path_adata1, path_adata2:
            Locations of the reference and query datasets, passed to ``load_adata``.
        key_class1, key_class2:
            ``obs`` column holding the cell-type label; both must be equal.
        sp1, sp2:
            Species tags of reference and query, used in dataset/directory names.
        tissue_name:
            Tissue tag, used in dataset/directory names.
        path_varmap:
            CSV with the homology table; only its first three columns are read.
        aligned:
            If true, keep only the cell types shared by both datasets.
        resdir_tag:
            Suffix appended to the result directory name (skipped when empty).
        resdir:
            Parent directory in which the per-run result directory is created.
        limite_func:
            Callable ``(adata1, adata2) -> (adata1, adata2)`` that may further
            restrict the datasets; the default returns them unchanged.
        kvargs:
            n_epochs:
                default [100, 200, 200]. Per-stage epoch counts; they are
                cumulative, so the final number of epochs is their sum.
                Only recorded in ``kvargs.json`` here.
            is_1v1: bool
                default False. If true, the homology table is reduced with
                ``get_1v1_matches`` before use.
            n_hvgs:
                default 2000
            n_degs:
                default 50
            nfeats, hidden:
                default 64 each; only recorded in ``kvargs.json`` here.

        Returns
        -------
        None
        """
        n_hvgs = kvargs.setdefault('n_hvgs', 2000)
        n_degs = kvargs.setdefault('n_degs', 50)
        seed = 123
        # The next three defaults are only recorded in kvargs.json.
        kvargs.setdefault('n_epochs', [100, 200, 200])
        nfeats = kvargs.setdefault('nfeats', 64)
        hidden = kvargs.setdefault('hidden', 64)

        dsnames = (
            '{}_{}'.format(tissue_name, sp1),
            '{}_{}'.format(tissue_name, sp2),
        )
        assert key_class1 == key_class2, "key_class is not equal"
        key_class = key_class1

        # Result directory. NOTE: "corss" is a misspelling of "cross" that is
        # kept on purpose: it is part of the on-disk directory name, and the
        # skip check below looks for results under exactly that name.
        if len(resdir_tag) > 0:
            resdir_tag = "{}_{}-corss-{};{}".format(tissue_name, sp1, sp2, resdir_tag)
        else:
            resdir_tag = "{}_{}-corss-{}".format(tissue_name, sp1, sp2)
        resdir = Path(resdir).joinpath(resdir_tag)
        resdir.mkdir(exist_ok=True, parents=True)

        # Skip the run when the statistics file already holds every expected key.
        p_info = resdir.joinpath('info_mode_gene.json')
        required_keys = "reference_hvgs,reference_degs,reference_higs,query_hvgs,query_degs,query_higs,reference_gene_nodes,query_gene_nodes,ortholog_one2one,ortholog_one2many,ortholog_many2many".split(',')
        if (
            p_info.exists()
            and np.setdiff1d(
                required_keys,
                list(json.loads(p_info.read_text()).keys())
            ).size == 0
        ):
            print(
                "[has finish]{} {}".format(
                    time.strftime('%y%m%d-%H%M', time.localtime()), resdir.name
                ))
            return
        print(
            "[start]{} {}".format(
                time.strftime('%y%m%d-%H%M', time.localtime()),
                resdir.name
            ))

        print('[path_varmap] {}'.format(path_varmap))
        adata_raw1 = load_adata(path_adata1)
        adata_raw2 = load_adata(path_adata2)
        # The query may be unlabeled: give it an empty label column.
        if key_class not in adata_raw2.obs.columns:
            adata_raw2.obs[key_class] = ''
        adata_raw1.obs[key_class] = adata_raw1.obs[key_class].astype(str)
        adata_raw2.obs[key_class] = adata_raw2.obs[key_class].astype(str)
        # limite_func may further restrict both datasets (default: no-op).
        adata_raw1, adata_raw2 = limite_func(
            adata_raw1, adata_raw2
        )
        # Keep only the cell types shared by both datasets.
        if aligned:
            adata_raw1, adata_raw2 = csMAHN.pp.aligned_type(
                [adata_raw1, adata_raw2], key_class
            )

        # Per-class cell counts of both datasets (after optional alignment).
        group_counts = pd.concat([adata_raw1.obs[key_class].value_counts(),
                                  adata_raw2.obs[key_class].value_counts(),],
                                 axis=1, keys=dsnames)
        print(group_counts)

        homo = pd.read_csv(path_varmap, usecols=range(3))
        homo.columns = ["gn_ref", "gn_que", "homology_type"]
        if kvargs.setdefault("is_1v1", False):
            homo = get_1v1_matches(homo)
            homology_parameter = get_homology_parameters(
                adata_raw1, adata_raw2, homo)
            print("""
    [homology one2one]find {homology_one2one_find} genes
    [homology one2one]use {homology_one2one_use} genes""".format(
                **homology_parameter))
            kvargs.update(homology_parameter)

        # Persist the effective parameters of this run next to its results.
        kvargs.update({'path_adata1': str(path_adata1),
                       'path_adata2': str(path_adata2),
                       'key_class1': key_class1,
                       'key_class2': key_class2,
                       'sp1': sp1,
                       'sp2': sp2,
                       'tissue_name': tissue_name,
                       'path_varmap': str(path_varmap),
                       'aligned': aligned,
                       'resdir_tag': resdir_tag,
                       'resdir': str(resdir),
                       'n_hvgs': n_hvgs,
                       'n_degs': n_degs,
                       'nfeats': nfeats,
                       'hidden': hidden
                       })
        resdir.joinpath("kvargs.json").write_text(dumps(kvargs))
        print(
            """Task: refernece:{} {} cells x {} gene -> query:{} {} cells x {} gene in {}""".format(
                dsnames[0],
                adata_raw1.shape[0],
                adata_raw1.shape[1],
                dsnames[1],
                adata_raw2.shape[0],
                adata_raw2.shape[1],
                tissue_name))

        start = time.time()
        # The kNN step inside the graph construction is slow.
        print("\n[process_for_graph]\n".center(100, '-'))
        adatas, features_genes, nodes_genes, scnets, one2one, n2n = csMAHN.pp.process_for_graph(
            [adata_raw1, adata_raw2], homo, key_class, 'leiden', n_hvgs=n_hvgs, n_degs=n_degs,
            resdir=resdir)
        # The graph itself is not needed here; the call is made for the
        # statistics it produces, so its return value is dropped.
        csMAHN.pp.make_graph(
            adatas, aligned, key_class, features_genes, nodes_genes, scnets,
            one2one, n2n, has_mnn=True, seed=seed, resdir=resdir)
        end = time.time()
        # Elapsed time covers process_for_graph and make_graph.
        print('Times preprocess for graph:{:.2f}'.format(end - start))
        print('\n[end][run_csMAHN_info_model_gene]\n'.center(100,'-'))

with Block("[redefine] csMAHN.pp.get_adj_gene"):
    def _func(nodes_genes, homo,**kvarg):
        """
        Build the gene-gene edge index between reference and query gene nodes.

        Replacement for ``csMAHN.pp.get_adj_gene`` that additionally merges the
        per-homology-type edge counts into ``<resdir>/info_mode_gene.json``.

        :param nodes_genes: pair ``[reference_gene_nodes, query_gene_nodes]``
        :param homo: gene homology relation (pass one2one or many2many to control)
        :param kvarg: must contain ``resdir`` (directory of the JSON file)
        :return: ``torch.LongTensor`` with two rows; every homology edge appears
            once as (reference, query) and once as (query, reference), query
            indices being offset by the number of reference gene nodes
        """
        ref_nodes = nodes_genes[0]
        que_nodes = nodes_genes[1]
        edges = csMAHN.pp.match_bigraph(ref_nodes, que_nodes, homo)
        print("--------------homo edges---------------")
        homo_type_counts = edges.iloc[:, 2].value_counts()
        print(homo_type_counts)

        # [redefine][add] merge the edge counts into the run's statistics file
        with Block("info_mode_gene.json", context={
            'p_out': kvarg['resdir'].joinpath('info_mode_gene.json'),
            'data': {},
        }) as ctx:
            if ctx.p_out.exists():
                ctx.data.update(json.loads(ctx.p_out.read_text()))
            counts_as_dict = (
                homo_type_counts.to_frame()
                .reset_index(names='value')
                .pipe(ut.df.to_dict, 'value', 'count')
            )
            ctx.data.update(counts_as_dict)
            ctx.p_out.write_text(json.dumps(ctx.data))

        # One-row lookup tables mapping a gene name (column) to its node index.
        all_nodes = np.append(ref_nodes, que_nodes)
        ref_lookup = pd.DataFrame(data=np.arange(ref_nodes.size)[None, :], columns=ref_nodes)
        que_lookup = pd.DataFrame(
            data=np.arange(all_nodes.size)[None, :], columns=all_nodes
        ).iloc[:, len(ref_nodes):]

        ref_names = np.array(edges.iloc[:, 0]).tolist()
        que_names = np.array(edges.iloc[:, 1]).tolist()
        ref_idx = pd.Index(ref_lookup[ref_names].values.flatten())
        que_idx = pd.Index(que_lookup[que_names].values.flatten())

        forward = np.vstack((ref_idx, que_idx))
        backward = np.vstack((que_idx, ref_idx))
        return torch.LongTensor(np.concatenate((forward, backward), axis=1))

    csMAHN.pp.get_adj_gene = _func
    del _func

with Block("[redefine] csMAHN.pp.select_gene_nodes"):
    def _func(reference_all_gene, query_all_gene, higs_reference, higs_query, homo,**kvarg):
        """
        Select the gene nodes of both species.

        Replacement for ``csMAHN.pp.select_gene_nodes`` that additionally
        merges the two node counts into ``<resdir>/info_mode_gene.json``.
        For each species the nodes are its own ``higs`` plus the homologs
        (per ``homo``) of the other species' ``higs``, restricted to genes
        present in that species' dataset.

        :param reference_all_gene: all gene names of the reference dataset
        :param query_all_gene: all gene names of the query dataset
        :param higs_reference: selected genes of the reference
        :param higs_query: selected genes of the query
        :param homo: homology table; column 0 = reference gene, column 1 = query gene
        :param kvarg: must contain ``resdir`` (directory of the JSON file)
        :return: ``(reference_gene_nodes, query_gene_nodes)``
        """
        homo_ref, homo_que = homo.iloc[:, 0], homo.iloc[:, 1]

        # Union of each species' own higs with the homologs of the other's higs.
        ref_candidates = np.union1d(homo_ref[homo_que.isin(higs_query)], higs_reference)
        que_candidates = np.union1d(homo_que[homo_ref.isin(higs_reference)], higs_query)

        # Keep only names that exist in both the homology table and the dataset.
        reference_gene_nodes = np.intersect1d(reference_all_gene, ref_candidates)
        query_gene_nodes = np.intersect1d(query_all_gene, que_candidates)
        n_ref, n_que = len(reference_gene_nodes), len(query_gene_nodes)
        print("--------------gene nodes info---------------")
        print("num of reference_gene_node is {0}".format(n_ref))
        print("num of query_gene_node is {0}".format(n_que))

        # [redefine][add] merge the node counts into the run's statistics file
        with Block("info_mode_gene.json", context={
            'p_out': kvarg['resdir'].joinpath('info_mode_gene.json'),
            'data': {},
        }) as ctx:
            if ctx.p_out.exists():
                ctx.data.update(json.loads(ctx.p_out.read_text()))
            ctx.data.update({
                'reference_gene_nodes': n_ref,
                'query_gene_nodes': n_que,
            })
            ctx.p_out.write_text(json.dumps(ctx.data))

        return reference_gene_nodes, query_gene_nodes

    csMAHN.pp.select_gene_nodes = _func
    del _func
    
with Block("[redefine] csMAHN.pp.process_for_graph"):
    def _func(adatas,
                  homo,
                  key_class,
                  key_clust,
                  relation='many',
                  add_no1v1=True,
                  n_pcs=30,
                  n_hvgs=2000,
                  n_degs=50,
                  use_scnets=True,
                  n_neighbors_scnet=5,
                  n_neighbors_clust=20,
                  reso=0.4,
                  degs_method="wilcoxon",
                  **kvarg
                  ):
        """
        Replacement for ``csMAHN.pp.process_for_graph`` that also records gene-set sizes.

        Preprocesses both datasets, derives HVGs and DEGs per species, selects
        gene nodes and feature genes, and merges the HVG/DEG/HIG counts into
        ``<resdir>/info_mode_gene.json``.

        :param adatas: ``[reference, query]`` datasets
        :param homo: homology table handed to ``csMAHN.pp.biomart_process``
        :param key_class: ``obs`` column used to group the reference for DEGs
        :param key_clust: ``obs`` column used to group the query for DEGs
        :param relation: ``'many'`` uses the n2n relation for gene nodes,
            anything else uses one2one
        :param kvarg: must contain ``resdir`` (raises ``KeyError`` otherwise);
            forwarded unchanged to ``csMAHN.pp.select_gene_nodes``
        :return: ``(_adatas, features_genes, nodes_genes, scnets, one2one, n2n)``
            where ``_adatas`` are the preprocessed copies and ``scnets`` is
            ``None`` when ``use_scnets`` is false
        """
        
        print('*'*75,'\n[redefind] csMAHN.pp.process_for_graph\n{}\n'.format(kvarg),'*'*75)
        # process homo relation
        one2one, n2n = csMAHN.pp.biomart_process(homo)
        
        # process for adata
        params_preproc = dict(
            copy=True,
            target_sum=None,
            n_hvgs=n_hvgs,
            n_pcs=n_pcs,
            n_neighbors=n_neighbors_scnet,
        )
        time1 = time.time()
        adata1 = csMAHN.pp.preprocess_for_adata(adatas[0], **params_preproc)
        adata2 = csMAHN.pp.preprocess_for_adata(adatas[1], **params_preproc)
        time2 = time.time()
        print(f'the time2 of processing adatas is {time2-time1}')
        if use_scnets:
            scnets = [csMAHN.pp.get_intra_net_from_adata(adata1),
                      csMAHN.pp.get_intra_net_from_adata(adata2)]
            # overwrite every stored edge weight with 1 (unweighted networks)
            scnets[0].data[:] = 1
            scnets[1].data[:] = 1
        else:
            scnets = None
        
        reference_hvgs, query_hvgs = csMAHN.pp.get_hvgs(adata1), csMAHN.pp.get_hvgs(adata2)
        
        time1 = time.time()
        # the kNN graph may need to be recomputed here
        clust_lbs2 = csMAHN.pp.get_leiden_labels(adata2,
                                       n_neighbors=n_neighbors_clust,
                                       reso=reso,
                                       neighbors_key='clust',
                                       key_added='leiden',
                                       copy=False)
        time2 = time.time()
        print(f'the time of leiden is {time2 - time1}')
        
        
        # NOTE(review): the labels are stored on the *input* query adata, while
        # get_degs below groups the preprocessed copy `adata2` by `key_clust`.
        # This presumably relies on get_leiden_labels(key_added='leiden',
        # copy=False) having written the labels onto adata2 and on
        # key_clust == 'leiden' -- confirm.
        adatas[1].obs[key_clust] = clust_lbs2
        
        reference_degs = csMAHN.pp.get_degs(adata1, groupby=key_class, n_degs=n_degs, method=degs_method)
        query_degs = csMAHN.pp.get_degs(adata2, groupby=key_clust, n_degs=n_degs, method=degs_method)
        time3 = time.time()
        print(f'the time of degs is {time3 - time2}')
        # higs = union of HVGs and DEGs per species
        reference_higs = np.union1d(reference_hvgs, reference_degs)
        query_higs = np.union1d(query_hvgs, query_degs)
        _adatas = [adata1, adata2]
        
        print("--------------hvgs, degs info---------------")
        print("num of reference_hvgs,reference_degs,reference_higs are {0},{1},{2}".format(len(reference_hvgs),
                                                                                           len(reference_degs),
                                                                                           len(reference_higs)))
        print("num of query_hvgs,query_degs,query_higs are {0},{1},{2}".format(len(query_hvgs), len(query_degs),
                                                                                   len(query_higs)))
        # [redefine][add] merge the gene-set sizes into the run's statistics
        # file, keeping whatever keys are already stored there
        with Block("info_mode_gene.json",context=dict(
            p_out = kvarg['resdir'].joinpath('info_mode_gene.json'),
            data = {}
        )) as context:


            if context.p_out.exists():
                context.data.update(json.loads(context.p_out.read_text()))
            context.data.update(
                reference_hvgs=len(reference_hvgs),
                reference_degs=len(reference_degs),
                reference_higs=len(reference_higs),
                query_hvgs=len(query_hvgs),
                query_degs=len(query_degs),
                query_higs=len(query_higs),
                )
            context.p_out.write_text(json.dumps(context.data))
        
        # nodes gene
        # choose which homology relation (one2one or n2n) contributes gene nodes
        gene_relation = n2n if relation == 'many' else one2one
        reference_all_gene, query_all_gene = adata1.raw.var.index.tolist(), adata2.raw.var.index.tolist()
        reference_gene_nodes, query_gene_nodes = csMAHN.pp.select_gene_nodes(reference_all_gene, query_all_gene,
                                                                   reference_higs, query_higs, gene_relation,
                                                                             **kvarg)
        nodes_genes = [reference_gene_nodes, query_gene_nodes]
        # feature gene
        features_genes = csMAHN.pp.get_feature_genes(reference_all_gene, query_all_gene, reference_degs, query_degs, one2one , add_no1v1=add_no1v1, n2n=n2n)
        
        return _adatas, features_genes, nodes_genes, scnets, one2one, n2n

    csMAHN.pp.process_for_graph = _func
    del _func

with Block("[redefine] csMAHN.pp.make_graph"):
    def _func(adatas,
              aligned,
              key_class,
              features_genes,
              nodes_genes,
              scnets,
              one2one, n2n,
              gene_embedding=None,
              has_mnn=False,
              graph_type='dgl',
              seed=123, **kvarg):
        """
        Replacement for ``csMAHN.pp.make_graph`` that forwards ``**kvarg``.

        Builds the heterogeneous cell/gene graph from the two preprocessed
        datasets. The extra keyword arguments (e.g. ``resdir``) are passed on
        to ``csMAHN.pp.get_adj_gene``. The input datasets should already be
        normalized.

        :param adatas: ``[reference, query]`` preprocessed datasets
        :param aligned: if true, the number of classes is taken from the
            encoded labels (``max(cell_label) + 1``); otherwise from the
            unique reference labels
        :param key_class: ``obs`` column holding the cell-type label
        :param features_genes: feature genes used for the cell features
        :param nodes_genes: pair ``[reference_gene_nodes, query_gene_nodes]``
        :param scnets: per-dataset cell-cell networks
        :param one2one: one-to-one homology relation
        :param n2n: many-to-many homology relation
        :param gene_embedding: forwarded to ``csMAHN.pp.get_gene_feature``
        :param has_mnn: if true, the inter-dataset pairs are added to the
            cell-cell adjacency
        :param graph_type: accepted for interface compatibility; a DGL graph
            is always built
        :param seed: accepted for interface compatibility; unused here
        :return: ``(g, inter_net, one2one_gene_nodes_net, cell_label,
            n_classes, list_idx)``
        """
        # normalized and log-transformed counts
        norm_counts = csMAHN.pp.get_counts_from_adatas(adatas)
        reference_labels, query_labels = csMAHN.pp.get_labels_from_adatas(adatas, key_class)

        cell_feature = csMAHN.pp.get_feature_counts(norm_counts, features_genes)
        cell_label = csMAHN.pp.get_labelEncoder(reference_labels, query_labels)
        if aligned:
            n_classes = max(cell_label) + 1
        else:
            n_classes = len(np.unique(adatas[0].obs[key_class]))
        train_idx, val_idx, pred_idx = csMAHN.pp.get_idx_cross_classes(reference_labels, query_labels)
        list_idx = [train_idx, val_idx, pred_idx]

        all_gene_nums = len(nodes_genes[0]) + len(nodes_genes[1])
        cell_feature_nums = cell_feature.shape[1]
        gene_feature = torch.Tensor(csMAHN.pp.get_gene_feature(all_gene_nums, cell_feature_nums, gene_embedding))

        cell_gene_adj, gene_cell_adj = csMAHN.pp.get_adj_cell_gene(norm_counts[0], norm_counts[1], nodes_genes[0], nodes_genes[1])
        gene_adj = csMAHN.pp.get_adj_gene(nodes_genes, n2n, **kvarg)
        one2one_gene_nodes_net = csMAHN.pp.get_adj_gene(nodes_genes, one2one, **kvarg)

        inter_net = csMAHN.pp.get_pair_from_adatas(norm_counts, features_genes, N1=10, N2=10, N=5, n_jobs=32)
        # Share of inter-dataset pairs whose two cells carry the same label.
        acc_inter_net = (cell_label[inter_net[0]] == cell_label[inter_net[1]]).sum() / len(cell_label[inter_net[0]])
        print('Inter-net pairs ACC: {0}'.format(acc_inter_net))
        # Fix: both branches now call the csMAHN.pp-qualified function; the
        # bare name used before is not defined in this notebook's namespace.
        if has_mnn:
            cell_adj = csMAHN.pp.get_adj_cell(scnets, inter_net)
        else:
            cell_adj = csMAHN.pp.get_adj_cell(scnets)

        # print graph info
        print("-------------------nodes info-------------------")
        print(f'the num of cell feats is {cell_feature_nums}')
        print(f'the num of cell nodes is {cell_feature.shape[0]}')
        print(f'the num of gene nodes is {all_gene_nums}')
        dct = dict(
            cell_feature=cell_feature,
            cell_label=cell_label,
            list_idx=list_idx,
            gene_feature=gene_feature,

            adj_gg=gene_adj,
            adj_cc=cell_adj,
            adj_gc=gene_cell_adj,
            adj_cg=cell_gene_adj,
        )
        g = csMAHN.pp.get_dgl(dct)
        return g, inter_net, one2one_gene_nodes_net, cell_label, n_classes, list_idx

    csMAHN.pp.make_graph = _func
    del _func

In [4]:
# (file-name template, table tags) for the two families of parameter CSVs
# stored under p_cache; every row is tagged with its table in column `mask`.
_para_sources = [
    ('parameter_healthy_{}.csv', ['HCL_MCA', 'Retina', 'Retina-parameters']),
    ('parameter_{}.csv', ['LC', 'RA', 'Mtb', 'LN', 'COVID']),
]
df_para = pd.concat([
    pd.concat([
        pd.read_csv(p_cache / template.format(mask)).assign(mask=mask)
        for mask in masks
    ])
    for template, masks in _para_sources
])

# Resolve the dataset paths against p_cache.
df_para = df_para.assign(
    path_ref=lambda d: d['path_ref'].map(p_cache.joinpath),
    path_que=lambda d: d['path_que'].map(p_cache.joinpath),
)
# Use "<tissue>;<name_ref>;<name_que>" as the row key.
df_para['item'] = ut.df.apply_merge_field(df_para, '{tissue};{name_ref};{name_que}')
df_para = ut.df.reindex_with_unique_col(df_para, 'item', drop=True)
ut.df.show(df_para)

Unnamed: 0,tissue,sp_ref,path_ref,name_ref,sp_simple_ref,sp_que,path_que,name_que,sp_simple_que,key_cell_type,mask
Adrenal-Gland;h_adr;m_adr,Adrenal-Gland,human,/public/workspace/licanchengup/link/res_publis...,h_adr,h,mouse,/public/workspace/licanchengup/link/res_publis...,m_adr,m,CL,HCL_MCA
Bone-Marrow;h_bon;m_bon,Bone-Marrow,human,/public/workspace/licanchengup/link/res_publis...,h_bon,h,mouse,/public/workspace/licanchengup/link/res_publis...,m_bon,m,CL,HCL_MCA


(44, 11)

In [5]:
# Number of parameter rows contributed by each source table (`mask` tag).
df_para['mask'].value_counts()

mask
HCL_MCA              18
Retina-parameters     6
LN                    5
Mtb                   5
LC                    4
Retina                3
COVID                 2
RA                    1
Name: count, dtype: int64

# HCL_MCA

In [19]:
# Sanity check: every HCL_MCA reference AND query directory contains matrix.mtx.
# The selection is derived here so the cell does not depend on a variable that
# is only defined further down, and both checks contribute to the shown result
# (previously the path_ref result was computed and discarded).
df_para_item = df_para.query("mask == 'HCL_MCA'")
(
    df_para_item['path_ref'].apply(lambda x: x.joinpath('matrix.mtx').exists()).all()
    and df_para_item['path_que'].apply(lambda x: x.joinpath('matrix.mtx').exists()).all()
)

True

In [21]:
# Run the gene-statistics preprocessing for every HCL_MCA reference/query pair.
item = 'HCL_MCA'
df_para_item = df_para.query("mask == @item")
print(f'\n[runstart] {item}\n')

aligned = True
is_1v1 = False
n_epochs = [100, 200, 200]
for i, row in df_para_item.iterrows():
    # Homology table for this species pair.
    path_varmap = get_path_varmap(
        map_sp[row['sp_simple_ref']],
        map_sp[row['sp_simple_que']],
        model='csMAHN',
    )
    print(path_varmap)

    run_csMAHN_info_model_gene(
        path_adata1=row['path_ref'],
        path_adata2=row['path_que'],
        key_class1=row['key_cell_type'],
        key_class2=row['key_cell_type'],
        sp1=row['sp_simple_ref'],
        sp2=row['sp_simple_que'],
        path_varmap=path_varmap,
        tissue_name=row['tissue'],
        aligned=aligned,
        resdir_tag=f"{row['name_ref']}-map-{row['name_que']};epochs={sum(n_epochs)};is_1v1={is_1v1}",
        resdir=p_res_model_gene,
        # Extra keyword arguments, in this order: models, n_epochs, is_1v1.
        models=['csMAHN', 'came'],
        n_epochs=n_epochs,
        is_1v1=is_1v1,
    )

print(f'\n[run finish] {item}\n')


[runstart] HCL_MCA

/public/workspace/licanchengup/link/res_publish/homo/human_to_mouse.txt
[has finish]240819-0218 Adrenal-Gland_h-corss-m;h_adr-map-m_adr;epochs=500;is_1v1=False
/public/workspace/licanchengup/link/res_publish/homo/human_to_mouse.txt
[has finish]240819-0218 Bone-Marrow_h-corss-m;h_bon-map-m_bon;epochs=500;is_1v1=False
/public/workspace/licanchengup/link/res_publish/homo/human_to_mouse.txt
[start]240819-0218 Brain_h-corss-m;h_bra-map-m_bra;epochs=500;is_1v1=False
[path_varmap] /public/workspace/licanchengup/link/res_publish/homo/human_to_mouse.txt


  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                  reference    query
CL                                  
neural cell         15645.0  26621.0
oligodendrocyte      7546.0  11170.0
macrophage           6633.0  12633.0
astrocyte            6143.0      NaN
stem cell            4596.0   7878.0
fibroblast           1382.0      NaN
granule cell          686.0      NaN
endothelial cell      386.0   2264.0
T cell                  NaN  13425.0
ciliated cell           NaN   6732.0
neutrophil              NaN   2135.0
----new----
                  reference    query
CL                                  
neural cell         15645.0  26621.0
oligodendrocyte      7546.0  11170.0
macrophage           6633.0  12633.0
stem cell            4596.0   7878.0
endothelial cell      386.0   2264.0
                  Brain_h  Brain_m
CL                                
neural cell         15645    26621
oligodendrocyte      7546    11170
macrophage           6633    12633
stem cell            4596     7878
endothelial cell      386 

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 17.095465183258057
Leiden results:
leiden
0     10573
1     10021
2      7048
3      5422
4      5196
5      3356
6      3153
7      2773
8      2151
9      1766
10     1616
11     1458
12     1316
13     1202
14      771
15      696
16      689
17      503
18      385
19      354
20      117
Name: count, dtype: int64
the time of leiden is 18.82611918449402


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(


the time of degs is 158.77962279319763
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,242,2084
num of query_hvgs,query_degs,query_higs are 2000,628,2313
--------------gene nodes info---------------
num of reference_gene_node is 2865
num of query_gene_node is 3112
--------------homo edges---------------
homology_type
ortholog_one2one      2249
ortholog_one2many      112
ortholog_many2many      34
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    2249
Name: count, dtype: int64
knn time is 192.33667135238647 s
mnn time is 0.083251953125 s
the time of compute mnn is  194.8939187526703 s
Inter-net pairs ACC: 0.8554301833568406
-------------------nodes info-------------------
the num of cell feats is 517
the num of cell nodes is 95372
the num of gene nodes is 5977
Times preprocess for graph:533.74
--------------------------------
[end][run_csMAHN_info_model_gene]
---------------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  disp_grouped = df.groupby('mean_bin')['dispersions']


----raw----
                     reference  query
CL                                   
cardiac muscle cell     5928.0   2890
mesenchymal cell        3129.0  50417
myeloid cell            1120.0  11505
epithelial cell          606.0   3865
T cell                     NaN   6003
----new----
                     reference  query
CL                                   
cardiac muscle cell     5928.0   2890
mesenchymal cell        3129.0  50417
myeloid cell            1120.0  11505
epithelial cell          606.0   3865
                     Heart_h  Heart_m
CL                                   
cardiac muscle cell     5928     2890
mesenchymal cell        3129    50417
myeloid cell            1120    11505
epithelial cell          606     3865
Task: refernece:Heart_h 10783 cells x 11069 gene -> query:Heart_m 68677 cells x 12528 gene in Heart
---------------------------------------
[process_for_graph]
----------------------------------------
*****************************************************

  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 10.486703634262085
Leiden results:
leiden
0     14414
1      9119
2      7755
3      7351
4      5407
5      4403
6      3833
7      2589
8      2543
9      2517
10     2069
11     1684
12     1243
13     1231
14     1063
15      665
16      355
17      305
18      131
Name: count, dtype: int64
the time of leiden is 21.714441537857056


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


the time of degs is 72.27946710586548
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,180,2085
num of query_hvgs,query_degs,query_higs are 2000,538,2217
--------------gene nodes info---------------
num of reference_gene_node is 2912
num of query_gene_node is 3292
--------------homo edges---------------
homology_type
ortholog_one2one      2302
ortholog_one2many      165
ortholog_many2many      65
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    2302
Name: count, dtype: int64
knn time is 59.06409764289856 s
mnn time is 0.085113525390625 s
the time of compute mnn is  62.03797674179077 s
Inter-net pairs ACC: 0.7425983977708116
-------------------nodes info-------------------
the num of cell feats is 428
the num of cell nodes is 79460
the num of gene nodes is 6204
Times preprocess for graph:252.97
--------------------------------
[end][run_csMAHN_info_model_gene]
--------------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                      reference    query
CL                                      
enterocyte              36272.0  67108.0
B cell                  12604.0  12316.0
macrophage               8243.0   4208.0
stromal cell             6429.0      NaN
fibroblast               4843.0      NaN
endothelial cell         1227.0      NaN
enteroendocrine cell        NaN   1749.0
epithelial cell             NaN    672.0
tuft cell of colon          NaN    510.0
----new----
            reference    query
CL                            
enterocyte    36272.0  67108.0
B cell        12604.0  12316.0
macrophage     8243.0   4208.0
            Intestine_h  Intestine_m
CL                                  
enterocyte        36272        67108
B cell            12604        12316
macrophage         8243         4208
Task: refernece:Intestine_h 57119 cells x 10208 gene -> query:Intestine_m 83632 cells x 10665 gene in Intestine
---------------------------------------
[process_for_graph]
-------------

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 23.77963137626648
Leiden results:
leiden
0     16472
1     11197
2     11038
3      5430
4      5406
5      5190
6      4528
7      4411
8      4127
9      3075
10     2793
11     2709
12     2509
13     2177
14      922
15      888
16      760
Name: count, dtype: int64
the time of leiden is 40.31179070472717


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


the time of degs is 114.65888285636902
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,142,2077
num of query_hvgs,query_degs,query_higs are 2000,516,2228
--------------gene nodes info---------------
num of reference_gene_node is 2995
num of query_gene_node is 3228
--------------homo edges---------------
homology_type
ortholog_one2one      2253
ortholog_one2many      152
ortholog_many2many      46
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    2253
Name: count, dtype: int64
knn time is 472.9138283729553 s
mnn time is 0.137282133102417 s
the time of compute mnn is  476.55330896377563 s
Inter-net pairs ACC: 0.8910256410256411
-------------------nodes info-------------------
the num of cell feats is 381
the num of cell nodes is 140751
the num of gene nodes is 6223
Times preprocess for graph:978.19
--------------------------------
[end][run_csMAHN_info_model_gene]
-----------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                                                    reference    query
CL                                                                    
mesenchymal cell                                      23683.0  19649.0
kidney tubule cell                                     7482.0  17060.0
endothelial cell                                       5147.0   3716.0
pancreatic ductal cell                                 4321.0   4754.0
myeloid cell                                           2748.0   4431.0
podocyte                                               1255.0      NaN
epithelial cell                                         495.0    265.0
kidney loop of Henle ascending limb epithelial ...        NaN  25916.0
----new----
                        reference    query
CL                                        
mesenchymal cell          23683.0  19649.0
kidney tubule cell         7482.0  17060.0
endothelial cell           5147.0   3716.0
pancreatic ductal cell     4321.0   4754.0
myeloid 

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 19.807610273361206
Leiden results:
leiden
0     5744
1     5080
2     4872
3     4732
4     4542
5     3928
6     3755
7     3275
8     2904
9     2267
10    1904
11    1793
12    1780
13    1567
14     676
15     511
16     290
17     255
Name: count, dtype: int64
the time of leiden is 15.803159236907959


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


the time of degs is 91.08294034004211
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,259,2105
num of query_hvgs,query_degs,query_higs are 2000,520,2196
--------------gene nodes info---------------
num of reference_gene_node is 2997
num of query_gene_node is 2788
--------------homo edges---------------
homology_type
ortholog_one2one      2040
ortholog_one2many      119
ortholog_many2many      59
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    2040
Name: count, dtype: int64
knn time is 165.26057028770447 s
mnn time is 0.08535408973693848 s
the time of compute mnn is  168.9568247795105 s
Inter-net pairs ACC: 0.6558009864893846
-------------------nodes info-------------------
the num of cell feats is 459
the num of cell nodes is 93751
the num of gene nodes is 5785
Times preprocess for graph:426.83
--------------------------------
[end][run_csMAHN_info_model_gene]
-----------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                  reference    query
CL                                  
erythrocyte         17319.0   7612.0
mast cell            4628.0      NaN
hepatocyte           3400.0   5268.0
macrophage           2036.0   9342.0
neutrophil            981.0  12548.0
epithelial cell       425.0      NaN
stem cell              15.0      NaN
endothelial cell        NaN  20082.0
B cell                  NaN   9060.0
dendritic cell          NaN   8401.0
fibroblast              NaN   1514.0
----new----
             reference    query
CL                             
erythrocyte    17319.0   7612.0
hepatocyte      3400.0   5268.0
macrophage      2036.0   9342.0
neutrophil       981.0  12548.0
             Liver_h  Liver_m
CL                           
erythrocyte    17319     7612
hepatocyte      3400     5268
macrophage      2036     9342
neutrophil       981    12548
Task: refernece:Liver_h 23736 cells x 10120 gene -> query:Liver_m 34770 cells x 11437 gene in Liver
-----------------------

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 15.4103684425354
Leiden results:
leiden
0     5114
1     4600
2     4578
3     3254
4     2672
5     1869
6     1706
7     1559
8     1444
9     1368
10     986
11     909
12     828
13     783
14     648
15     589
16     538
17     465
18     236
19     233
20     229
21     162
Name: count, dtype: int64
the time of leiden is 12.26138973236084


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(
  self.stats[group_name, 'names'] = self.var_names[global_indices]
  self.stats[group_name, 'scores'] = scores[global_indices]
  self.stats[group_name, 'pvals'] = pvals[global_indices]
  self.stats[group_name, 'pvals_adj'] = pvals_adj[global_indices]
  self.stats[group_name, 'logfoldchanges'] = np.log2(


the time of degs is 52.75525903701782
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,178,2074
num of query_hvgs,query_degs,query_higs are 2000,510,2196
--------------gene nodes info---------------
num of reference_gene_node is 2939
num of query_gene_node is 3136
--------------homo edges---------------
homology_type
ortholog_one2one      2214
ortholog_one2many      164
ortholog_many2many      92
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    2214
Name: count, dtype: int64
knn time is 59.49287724494934 s
mnn time is 0.060976505279541016 s
the time of compute mnn is  61.99391055107117 s
Inter-net pairs ACC: 0.8598692810457517
-------------------nodes info-------------------
the num of cell feats is 394
the num of cell nodes is 58506
the num of gene nodes is 6075
Times preprocess for graph:188.88
--------------------------------
[end][run_csMAHN_info_model_gene]
-----------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                                            reference    query
CL                                                            
fibroblast                                       7573   8233.0
alveolar capillary type 2 endothelial cell       6473   2972.0
macrophage                                       5266   5659.0
endothelial cell                                 3981      NaN
mesenchymal cell                                 3293   8764.0
neutrophil                                       2787  23726.0
alveolar capillary type 1 endothelial cell       2192  15032.0
T cell                                           1960      NaN
----new----
                                            reference    query
CL                                                            
fibroblast                                       7573   8233.0
alveolar capillary type 2 endothelial cell       6473   2972.0
macrophage                                       5266   5659.0
mesenchymal cell               

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 12.369564533233643
Leiden results:
leiden
0     10097
1      7084
2      6203
3      5742
4      4647
5      4354
6      3525
7      3107
8      3008
9      2924
10     2856
11     2317
12     2059
13     1916
14     1352
15     1346
16     1341
17      330
18      128
19       50
Name: count, dtype: int64
the time of leiden is 25.8231143951416


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


the time of degs is 81.39916467666626
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,228,2114
num of query_hvgs,query_degs,query_higs are 2000,602,2237
--------------gene nodes info---------------
num of reference_gene_node is 3444
num of query_gene_node is 3025
--------------homo edges---------------
homology_type
ortholog_one2one      2490
ortholog_one2many      136
ortholog_many2many      42
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    2490
Name: count, dtype: int64
knn time is 146.6163272857666 s
mnn time is 0.10057449340820312 s
the time of compute mnn is  149.81316757202148 s
Inter-net pairs ACC: 0.4270571827057183
-------------------nodes info-------------------
the num of cell feats is 495
the num of cell nodes is 91970
the num of gene nodes is 6469
Times preprocess for graph:398.13
--------------------------------
[end][run_csMAHN_info_model_gene]
-----------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                  reference    query
CL                                  
B cell              10987.0   5481.0
T cell               1698.0  11732.0
neutrophil           1681.0   1935.0
endothelial cell      748.0      NaN
macrophage            578.0   3618.0
erythrocyte           114.0   6963.0
plasma cell             NaN   2791.0
----new----
             reference    query
CL                             
B cell         10987.0   5481.0
T cell          1698.0  11732.0
neutrophil      1681.0   1935.0
macrophage       578.0   3618.0
erythrocyte      114.0   6963.0
             Spleen_h  Spleen_m
CL                             
B cell          10987      5481
T cell           1698     11732
neutrophil       1681      1935
macrophage        578      3618
erythrocyte       114      6963
Task: refernece:Spleen_h 15058 cells x 17582 gene -> query:Spleen_m 29729 cells x 10866 gene in Spleen
---------------------------------------
[process_for_graph]
--------------------------------

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 7.155946731567383
Leiden results:
leiden
0     8202
1     3858
2     3666
3     3621
4     1742
5     1564
6     1425
7     1199
8      829
9      781
10     769
11     742
12     501
13     371
14     349
15     110
Name: count, dtype: int64
the time of leiden is 8.421804428100586


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


the time of degs is 37.18666362762451
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,221,2144
num of query_hvgs,query_degs,query_higs are 2000,456,2189
--------------gene nodes info---------------
num of reference_gene_node is 3180
num of query_gene_node is 2484
--------------homo edges---------------
homology_type
ortholog_one2one      1502
ortholog_one2many      185
ortholog_many2many     136
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    1502
Name: count, dtype: int64
knn time is 50.78779053688049 s
mnn time is 0.05328011512756348 s
the time of compute mnn is  52.11101007461548 s
Inter-net pairs ACC: 0.6114141838019304
-------------------nodes info-------------------
the num of cell feats is 393
the num of cell nodes is 44787
the num of gene nodes is 5664
Times preprocess for graph:137.28
--------------------------------
[end][run_csMAHN_info_model_gene]
------------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                                reference    query
CL                                                
cortical cell of adrenal gland     6547.0  21097.0
macrophage                         2080.0      NaN
endothelial cell                   1876.0   4421.0
dendritic cell                      714.0      NaN
fibroblast                          533.0   4467.0
chromaffin cell                      65.0    996.0
neutrophil                            NaN   8094.0
epithelial cell                       NaN   2928.0
T cell                                NaN   1473.0
----new----
                                reference    query
CL                                                
cortical cell of adrenal gland     6547.0  21097.0
endothelial cell                   1876.0   4421.0
fibroblast                          533.0   4467.0
chromaffin cell                      65.0    996.0
                                Adrenal-Gland_m  Adrenal-Gland_h
CL                                          

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 8.075146675109863
Leiden results:
leiden
0     5617
1     4398
2     3926
3     3885
4     2885
5     2088
6     1957
7     1612
8     1512
9     1307
10     592
11     575
12     540
13      87
Name: count, dtype: int64
the time of leiden is 10.507326364517212


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


the time of degs is 53.845629930496216
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,194,2055
num of query_hvgs,query_degs,query_higs are 2000,394,2149
--------------gene nodes info---------------
num of reference_gene_node is 3256
num of query_gene_node is 2456
--------------homo edges---------------
homology_type
ortholog_one2one      1915
ortholog_one2many      112
ortholog_many2many      81
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    1915
Name: count, dtype: int64
knn time is 35.23018026351929 s
mnn time is 0.07409143447875977 s
the time of compute mnn is  36.276020765304565 s
Inter-net pairs ACC: 0.8871449925261584
-------------------nodes info-------------------
the num of cell feats is 357
the num of cell nodes is 40002
the num of gene nodes is 5712
Times preprocess for graph:148.47
--------------------------------
[end][run_csMAHN_info_model_gene]
----------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                           reference    query
CL                                           
neutrophil                   23804.0  16685.0
eosinophil                    8032.0      NaN
erythrocyte                   5326.0   4813.0
fibroblast                    2931.0      NaN
macrophage                    2111.0   3394.0
basophil                       693.0      NaN
hematopoietic stem cell          NaN  20807.0
common myeloid progenitor        NaN  20447.0
B cell                           NaN   1237.0
----new----
             reference    query
CL                             
neutrophil     23804.0  16685.0
erythrocyte     5326.0   4813.0
macrophage      2111.0   3394.0
             Bone-Marrow_m  Bone-Marrow_h
CL                                       
neutrophil           23804          16685
erythrocyte           5326           4813
macrophage            2111           3394
Task: refernece:Bone-Marrow_m 31241 cells x 13315 gene -> query:Bone-Marrow_h 24892 cells x 10494 ge

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 10.514947414398193
Leiden results:
leiden
0     5032
1     3745
2     3571
3     3198
4     2821
5     2001
6     1940
7     1562
8      755
9      222
10      45
Name: count, dtype: int64
the time of leiden is 7.840217590332031


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


the time of degs is 50.23713421821594
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,150,2086
num of query_hvgs,query_degs,query_higs are 2000,303,2198
--------------gene nodes info---------------
num of reference_gene_node is 3378
num of query_gene_node is 2853
--------------homo edges---------------
homology_type
ortholog_one2one      2249
ortholog_one2many      130
ortholog_many2many      86
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    2249
Name: count, dtype: int64
knn time is 86.29372119903564 s
mnn time is 0.04094362258911133 s
the time of compute mnn is  87.50010299682617 s
Inter-net pairs ACC: 0.9032393773664282
-------------------nodes info-------------------
the num of cell feats is 256
the num of cell nodes is 56133
the num of gene nodes is 6231
Times preprocess for graph:211.09
--------------------------------
[end][run_csMAHN_info_model_gene]
------------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                  reference    query
CL                                  
neural cell         26621.0  15645.0
T cell              13425.0      NaN
macrophage          12633.0   6633.0
oligodendrocyte     11170.0   7546.0
stem cell            7878.0   4596.0
ciliated cell        6732.0      NaN
endothelial cell     2264.0    386.0
neutrophil           2135.0      NaN
astrocyte               NaN   6143.0
fibroblast              NaN   1382.0
granule cell            NaN    686.0
----new----
                  reference    query
CL                                  
neural cell         26621.0  15645.0
macrophage          12633.0   6633.0
oligodendrocyte     11170.0   7546.0
stem cell            7878.0   4596.0
endothelial cell     2264.0    386.0
                  Brain_m  Brain_h
CL                                
neural cell         26621    15645
macrophage          12633     6633
oligodendrocyte     11170     7546
stem cell            7878     4596
endothelial cell     2264 

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 15.892364978790283
Leiden results:
leiden
0     7452
1     6513
2     4877
3     4035
4     3903
5     1917
6     1493
7     1040
8      999
9      814
10     800
11     360
12     333
13     155
14     115
Name: count, dtype: int64
the time of leiden is 9.285213947296143


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


the time of degs is 117.32652378082275
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,244,2108
num of query_hvgs,query_degs,query_higs are 2000,511,2198
--------------gene nodes info---------------
num of reference_gene_node is 3031
num of query_gene_node is 2860
--------------homo edges---------------
homology_type
ortholog_one2one      2265
ortholog_one2many       82
ortholog_many2many      19
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    2265
Name: count, dtype: int64
knn time is 214.11131811141968 s
mnn time is 0.06461286544799805 s
the time of compute mnn is  217.33705878257751 s
Inter-net pairs ACC: 0.8638466622604097
-------------------nodes info-------------------
the num of cell feats is 490
the num of cell nodes is 95372
the num of gene nodes is 5891
Times preprocess for graph:497.45
--------------------------------
[end][run_csMAHN_info_model_gene]
---------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                     reference   query
CL                                    
mesenchymal cell         50417  3129.0
myeloid cell             11505  1120.0
T cell                    6003     NaN
epithelial cell           3865   606.0
cardiac muscle cell       2890  5928.0
----new----
                     reference   query
CL                                    
mesenchymal cell         50417  3129.0
myeloid cell             11505  1120.0
epithelial cell           3865   606.0
cardiac muscle cell       2890  5928.0
                     Heart_m  Heart_h
CL                                   
mesenchymal cell       50417     3129
myeloid cell           11505     1120
epithelial cell         3865      606
cardiac muscle cell     2890     5928
Task: refernece:Heart_m 68677 cells x 12528 gene -> query:Heart_h 10783 cells x 11069 gene in Heart
---------------------------------------
[process_for_graph]
----------------------------------------
****************************************

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 11.072198152542114
Leiden results:
leiden
0     1858
1     1753
2     1243
3     1124
4      948
5      910
6      732
7      687
8      595
9      446
10     410
11      77
Name: count, dtype: int64
the time of leiden is 2.840965509414673


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


the time of degs is 70.73217988014221
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,199,2084
num of query_hvgs,query_degs,query_higs are 2000,407,2213
--------------gene nodes info---------------
num of reference_gene_node is 3274
num of query_gene_node is 2935
--------------homo edges---------------
homology_type
ortholog_one2one      2368
ortholog_one2many       99
ortholog_many2many      51
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    2368
Name: count, dtype: int64
knn time is 79.87338709831238 s
mnn time is 0.038923025131225586 s
the time of compute mnn is  83.80200695991516 s
Inter-net pairs ACC: 0.7166310950582827
-------------------nodes info-------------------
the num of cell feats is 365
the num of cell nodes is 79460
the num of gene nodes is 6209
Times preprocess for graph:271.24
--------------------------------
[end][run_csMAHN_info_model_gene]
-----------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                      reference    query
CL                                      
enterocyte              67108.0  36272.0
B cell                  12316.0  12604.0
macrophage               4208.0   8243.0
enteroendocrine cell     1749.0      NaN
epithelial cell           672.0      NaN
tuft cell of colon        510.0      NaN
stromal cell                NaN   6429.0
fibroblast                  NaN   4843.0
endothelial cell            NaN   1227.0
----new----
            reference    query
CL                            
enterocyte    67108.0  36272.0
B cell        12316.0  12604.0
macrophage     4208.0   8243.0
            Intestine_m  Intestine_h
CL                                  
enterocyte        67108        36272
B cell            12316        12604
macrophage         4208         8243
Task: refernece:Intestine_m 83632 cells x 10665 gene -> query:Intestine_h 57119 cells x 10208 gene in Intestine
---------------------------------------
[process_for_graph]
-------------

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 22.639941453933716
Leiden results:
leiden
0     15016
1      9113
2      6267
3      5787
4      5692
5      4060
6      3179
7      2455
8      2321
9      1637
10     1227
11      365
Name: count, dtype: int64
the time of leiden is 18.847252368927002


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)


the time of degs is 111.4465262889862
--------------hvgs, degs info---------------
num of reference_hvgs,reference_degs,reference_higs are 2000,149,2088
num of query_hvgs,query_degs,query_higs are 2000,355,2171
--------------gene nodes info---------------
num of reference_gene_node is 3148
num of query_gene_node is 2965
--------------homo edges---------------
homology_type
ortholog_one2one      2268
ortholog_one2many       94
ortholog_many2many      27
Name: count, dtype: int64
--------------homo edges---------------
homology_type
ortholog_one2one    2268
Name: count, dtype: int64
knn time is 479.2949640750885 s
mnn time is 0.11287164688110352 s
the time of compute mnn is  482.82108759880066 s
Inter-net pairs ACC: 0.8883681295270558
-------------------nodes info-------------------
the num of cell feats is 283
the num of cell nodes is 140751
the num of gene nodes is 6113
Times preprocess for graph:950.98
--------------------------------
[end][run_csMAHN_info_model_gene]
----------------

  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))
  type_counts_list.append(pd.value_counts(adatas[i].obs[key_class]))


----raw----
                                                    reference    query
CL                                                                    
kidney loop of Henle ascending limb epithelial ...    25916.0      NaN
mesenchymal cell                                      19649.0  23683.0
kidney tubule cell                                    17060.0   7482.0
pancreatic ductal cell                                 4754.0   4321.0
myeloid cell                                           4431.0   2748.0
endothelial cell                                       3716.0   5147.0
epithelial cell                                         265.0    495.0
podocyte                                                  NaN   1255.0
----new----
                        reference    query
CL                                        
mesenchymal cell          19649.0  23683.0
kidney tubule cell        17060.0   7482.0
pancreatic ductal cell     4754.0   4321.0
myeloid cell               4431.0   2748.0
endothel

  disp_grouped = df.groupby('mean_bin')['dispersions']
  disp_grouped = df.groupby('mean_bin')['dispersions']


the time2 of processing adatas is 12.11421537399292
Leiden results:
leiden
0     9310
1     6135
2     4559
3     3999
4     3549
5     2924
6     2878
7     2473
8     2417
9     2288
10    1754
11     548
12     374
13     312
14     211
15     145
Name: count, dtype: int64
the time of leiden is 14.987849473953247


  adatas[1].obs[key_clust] = clust_lbs2
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)

KeyboardInterrupt



# Retina

In [None]:
# Retina section: call run_csMAHN_info_model_gene for every Retina parameter
# row, once with is_1v1=False and once with is_1v1=True.
item = 'Retina'
# NOTE(review): rows are selected on the `mask` column here, whereas the
# LC / RA / LN sections select on `tissue` -- confirm this is intentional.
df_para_item = df_para.query("mask == '{}'".format(item))
# Same '[run start]' marker as the LC / RA / LN sections, so every section can
# be located with one pattern in the nohup log.
print('\n[run start] {}\n'.format(item))

aligned = True
# Epochs per training stage; their sum is embedded in the result-directory tag.
n_epochs = [100, 200, 200]
# is_1v1 is bound by the loop itself, so it needs no separate initial value.
for is_1v1 in [False, True]:
    # Only the row contents are used; the index label from iterrows() is not.
    for _row_label, row in df_para_item.iterrows():
        path_varmap = get_path_varmap(
            map_sp[row['sp_simple_ref']], map_sp[row['sp_simple_que']], model='csMAHN')
        print(path_varmap)

        run_csMAHN_info_model_gene(
            path_adata1=row['path_ref'],
            path_adata2=row['path_que'],
            key_class1=row['key_cell_type'],
            key_class2=row['key_cell_type'],
            sp1=row['sp_simple_ref'],
            sp2=row['sp_simple_que'],
            path_varmap=path_varmap,
            tissue_name=row['tissue'],
            aligned=aligned,
            resdir_tag='{}-map-{};epochs={};is_1v1={}'.format(
                row['name_ref'],
                row['name_que'],
                sum(n_epochs),
                is_1v1),
            resdir=p_res_model_gene,
            # Not named parameters of run_csMAHN_info_model_gene: these three
            # are collected by its **kvargs.
            models=['csMAHN', 'came'],
            n_epochs=n_epochs,
            is_1v1=is_1v1)

print('\n[run finish] {}\n'.format(item))

# LC

In [None]:
# LC section: one run_csMAHN_info_model_gene call per LC row of df_para.
item = 'LC'
aligned = False
is_1v1 = False
# Epochs per training stage; their sum is embedded in the result-directory tag.
n_epochs = [100, 200, 200]

df_para_item = df_para.query("tissue == '{}'".format(item))
# Alternative selection left disabled (picks a single pair by index label):
# df_para_item = df_para.loc[['LC;LChDCs;LCmDCs'],:]

print('\n[run start] {}\n'.format(item))
# NOTE(review): `display` must be in scope when this notebook is exported and
# run as a plain script (see the nbconvert recipe at the top) -- confirm the
# star imports provide it.
display(df_para_item)

# Only the row contents are used; the index label from iterrows() is not.
for _row_label, row in df_para_item.iterrows():
    path_varmap = get_path_varmap(
        map_sp[row['sp_simple_ref']],
        map_sp[row['sp_simple_que']],
        model='csMAHN',
    )
    print(path_varmap)

    # Result-directory tag: "<ref>-map-<que>;epochs=<total>;is_1v1=<flag>".
    _resdir_tag = '{}-map-{};epochs={};is_1v1={}'.format(
        row['name_ref'], row['name_que'], sum(n_epochs), is_1v1)

    run_csMAHN_info_model_gene(
        path_adata1=row['path_ref'],
        path_adata2=row['path_que'],
        key_class1=row['key_cell_type'],
        key_class2=row['key_cell_type'],
        sp1=row['sp_simple_ref'],
        sp2=row['sp_simple_que'],
        tissue_name=row['tissue'],
        path_varmap=path_varmap,
        aligned=aligned,
        resdir_tag=_resdir_tag,
        resdir=p_res_model_gene,
        # Not named parameters of run_csMAHN_info_model_gene: these three are
        # collected by its **kvargs.
        models='csMAHN,came'.split(','),
        n_epochs=n_epochs,
        is_1v1=is_1v1,
    )

print('\n[run finish] {}\n'.format(item))

# RA

In [None]:
# RA section: one run_csMAHN_info_model_gene call per RA row of df_para.
item = 'RA'
aligned = False
is_1v1 = False
# Epochs per training stage; their sum is embedded in the result-directory tag.
n_epochs = [100, 100, 100]

df_para_item = df_para.query("tissue == '{}'".format(item))

print('\n[run start] {}\n'.format(item))
# NOTE(review): `display` must be in scope when this notebook is exported and
# run as a plain script (see the nbconvert recipe at the top) -- confirm the
# star imports provide it.
display(df_para_item)

# Only the row contents are used; the index label from iterrows() is not.
for _row_label, row in df_para_item.iterrows():
    path_varmap = get_path_varmap(
        map_sp[row['sp_simple_ref']],
        map_sp[row['sp_simple_que']],
        model='csMAHN',
    )
    print(path_varmap)

    # Result-directory tag: "<ref>-map-<que>;epochs=<total>;is_1v1=<flag>".
    _resdir_tag = '{}-map-{};epochs={};is_1v1={}'.format(
        row['name_ref'], row['name_que'], sum(n_epochs), is_1v1)

    run_csMAHN_info_model_gene(
        path_adata1=row['path_ref'],
        path_adata2=row['path_que'],
        key_class1=row['key_cell_type'],
        key_class2=row['key_cell_type'],
        sp1=row['sp_simple_ref'],
        sp2=row['sp_simple_que'],
        tissue_name=row['tissue'],
        path_varmap=path_varmap,
        aligned=aligned,
        resdir_tag=_resdir_tag,
        resdir=p_res_model_gene,
        # Not named parameters of run_csMAHN_info_model_gene: these three are
        # collected by its **kvargs.
        models='csMAHN,came'.split(','),
        n_epochs=n_epochs,
        is_1v1=is_1v1,
    )

print('\n[run finish] {}\n'.format(item))

# LN

In [None]:
# LN section: one run_csMAHN_info_model_gene call per LN row of df_para.
item = 'LN'
aligned = False
is_1v1 = False
# Epochs per training stage; their sum is embedded in the result-directory tag.
n_epochs = [100, 200, 200]

df_para_item = df_para.query("tissue == '{}'".format(item))

print('\n[run start] {}\n'.format(item))
# NOTE(review): `display` must be in scope when this notebook is exported and
# run as a plain script (see the nbconvert recipe at the top) -- confirm the
# star imports provide it.
display(df_para_item)

# Only the row contents are used; the index label from iterrows() is not.
for _row_label, row in df_para_item.iterrows():
    path_varmap = get_path_varmap(
        map_sp[row['sp_simple_ref']],
        map_sp[row['sp_simple_que']],
        model='csMAHN',
    )
    print(path_varmap)

    # Result-directory tag: "<ref>-map-<que>;epochs=<total>;is_1v1=<flag>".
    _resdir_tag = '{}-map-{};epochs={};is_1v1={}'.format(
        row['name_ref'], row['name_que'], sum(n_epochs), is_1v1)

    run_csMAHN_info_model_gene(
        path_adata1=row['path_ref'],
        path_adata2=row['path_que'],
        key_class1=row['key_cell_type'],
        key_class2=row['key_cell_type'],
        sp1=row['sp_simple_ref'],
        sp2=row['sp_simple_que'],
        tissue_name=row['tissue'],
        path_varmap=path_varmap,
        aligned=aligned,
        resdir_tag=_resdir_tag,
        resdir=p_res_model_gene,
        # Not named parameters of run_csMAHN_info_model_gene: these three are
        # collected by its **kvargs.
        models='csMAHN,came'.split(','),
        n_epochs=n_epochs,
        is_1v1=is_1v1,
    )

print('\n[run finish] {}\n'.format(item))