In [1]:
import pandas as pd
import os
from scipy.sparse import csc_matrix

In [2]:
import scanpy as sc

In [3]:
from arboreto.utils import load_tf_names

In [4]:
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
import subprocess as sp

In [5]:
from arboreto.algo import grnboost2
import distributed

In [6]:
def fetch_adata(adata):
    """Unpack an annotated data object into plain arrays.

    Parameters
    ----------
    adata
        Object exposing ``X``, ``var_names`` and ``obs_names``
        (presumably an AnnData instance -- confirm against callers).

    Returns
    -------
    tuple
        ``(dense_matrix, var_name_values, obs_name_values)`` where
        ``dense_matrix`` is ``adata.X`` converted to a dense array via a
        CSC sparse matrix, and the other two entries are the ``.values``
        of ``adata.var_names`` and ``adata.obs_names``.

    Notes
    -----
    ``toarray()`` materialises the full dense matrix in memory.
    """
    dense_matrix = csc_matrix(adata.X).toarray()
    var_labels = adata.var_names.values
    obs_labels = adata.obs_names.values
    return dense_matrix, var_labels, obs_labels

In [7]:
# Folder that receives the per-subtype network tables written by the inference loop.
network_dir = "/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/grn"
# Text file of transcription-factor names, read with load_tf_names.
tf_dir = "/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/grn/Common_files/TF_names_v_1.01.txt"
# Input .h5ad file read with sc.read_h5ad (a file path, despite the "_dir" suffix).
data_dir = "/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/Integration/adata/adata_GenAnno.h5ad"

In [8]:
# Read the .h5ad file at data_dir; the result is subset per subtype further down.
adata = sc.read_h5ad(data_dir)
# Load the transcription-factor names from the file at tf_dir; they are passed
# to grnboost2 as the candidate regulators.
tf_names = load_tf_names(tf_dir)

In [9]:
# Dashboard port of the Dask scheduler; identical for every subtype, so set once.
portdash = 40748
# Fixed seed handed to GRNBoost2 so repeated runs produce the same network.
grn_seed = 42

# Infer one gene regulatory network per tumor subtype and write it to
# <network_dir>/<subtype>_network.tsv (tab-separated, no header, no index).
for tumor_type in ['ER', 'HER2', 'TNBC']:
    adata_sub = adata[adata.obs['subtype'] == tumor_type,:].copy()
    if adata_sub.n_obs == 0:
        # Nothing to infer from; avoid requesting SLURM resources for an empty matrix.
        print(f'No cells found for subtype {tumor_type}; skipping')
        continue

    mat, genes, cells = fetch_adata(adata_sub)
    n_genes = len(genes)
    gene_names = pd.Series(genes)
    n_matching_genes = gene_names.isin(tf_names).sum() # how many of the expression-matrix genes are TFs
    print(type(mat))
    print(mat.shape)
    print(f'the number of genes is {n_genes}')
    print(f'Out of {n_genes} genes, {n_matching_genes} are TFs')

    cluster = SLURMCluster(queue = "short", cores=16, processes=1,
                       memory="16GB", walltime="05:00:00",
                       scheduler_options={"dashboard_address": f":{portdash}"})
    try:
        cluster.scale(10)
        client = Client(cluster)
        try:
            network = grnboost2(
                        expression_data=mat,
                        gene_names=genes,
                        tf_names=tf_names,
                        client_or_address=client,
                        seed=grn_seed)
            # Persist the result before tearing anything down so a failure
            # while closing cannot lose a finished network.
            network_file = os.path.join(network_dir, f"{tumor_type}_network.tsv")
            network.to_csv(network_file, sep='\t', header=False, index=False)
        finally:
            # Always disconnect the client, even if inference or writing failed.
            client.close()
    finally:
        # Always release the SLURM jobs; otherwise a failed run leaves them allocated.
        cluster.close()

<class 'numpy.ndarray'>
(59062, 18088)
the number of genes is 18088
Out of 18088 genes, 1375 are TFs


This may cause some slowdown.
Consider scattering data ahead of time and using futures.
