# TUMOR SELECTION: 

In [36]:
# Subtype analysed in this run: compared against `obs.subtype` when subsetting
# the cells and used as the prefix of the network and loom file names.
TUMOR = 'TNBC'

# SET UP ENVIRONMENT

In [33]:
import sys
# Record which Python interpreter (kernel) executed the notebook.
print(f'This notebook was last run with this kernel {sys.executable}')

This notebook was last run with this kernel /opt/venv/bin/python


In [34]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
from MulticoreTSNE import MulticoreTSNE as TSNE

In [49]:
# Work from the run directory so the relative output paths defined below
# (network CSV, loom file) are created here.
os.chdir('/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/04_grn/Jan19Try')

In [41]:
# Input .h5ad file read with scanpy in the next cell.
ADATA_FNAME = "/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/02_Integration/adata/adata_scanvi_cuda_refinement.h5ad"

In [42]:
# Load the annotated dataset from ADATA_FNAME.
adata = sc.read_h5ad(ADATA_FNAME)

In [43]:
# Rebuild an AnnData object from the matrix kept in `.raw`, then retain only
# the cells whose `subtype` annotation equals TUMOR.
# NOTE(review): the loom file derived from this is described as raw counts —
# confirm that `.raw` really holds unnormalised counts.
adata_raw = adata.raw.to_adata()
is_selected_subtype = adata_raw.obs["subtype"] == TUMOR
adata_tumor = adata_raw[is_selected_subtype, :].copy()

In [45]:
# Files used by the GRN inference step. Relative paths resolve against the
# working directory selected with os.chdir above.
TF_FNAME = "/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/04_grn/Common_files/TF_names_v_1.01.txt"  # transcription-factor list given to `pyscenic grn`
NETWORK_FNAME = f'./{TUMOR}_network.csv'  # adjacency table written by `pyscenic grn -o`
NETWORK_FAME = NETWORK_FNAME  # backward-compatible alias for the original, misspelled name
f_loom_path_scenic = f"{TUMOR}_data.loom"  # expression matrix exported in loom format

In [50]:
# Export the selected cells to a loom file. Genes are described by the row
# attributes and cells by the column attributes, so the cells x genes matrix
# is transposed once and reused for every derived quantity.
expr_genes_by_cells = adata_tumor.X.transpose()

row_attrs = {"Gene": np.array(adata_tumor.var_names)}
col_attrs = {
    "CellID": np.array(adata_tumor.obs_names),
    # per cell: how many genes have a value > 0
    "nGene": np.array(np.sum(expr_genes_by_cells > 0, axis=0)).flatten(),
    # per cell: sum of all values in the matrix column
    "nUMI": np.array(np.sum(expr_genes_by_cells, axis=0)).flatten(),
}
lp.create(f_loom_path_scenic, expr_genes_by_cells, row_attrs, col_attrs)


# STEP 1: Network inference based on GRNBoost2 from CLI

https://github.com/aertslab/SCENICprotocol/blob/master/notebooks/PBMC10k_SCENIC-protocol-CLI.ipynb

For this step the CLI version of SCENIC is used. This step can be deployed on a High-Performance Computing system. We use the counts matrix (without log transformation or further processing) from the loom file we wrote earlier. Output: a list of adjacencies between each TF and its targets, written to the file named by `NETWORK_FAME`.



In [None]:
!pyscenic grn {f_loom_path_scenic} {TF_FNAME} -o {NETWORK_FAME} --num_workers 16


  if type(slice_) is not tuple or len(slice_) is not 2:

2025-01-19 19:51:24,826 - pyscenic.cli.pyscenic - INFO - Loading expression matrix.

2025-01-19 19:51:38,356 - pyscenic.cli.pyscenic - INFO - Inferring regulatory networks.
preparing dask client
parsing input
creating dask graph
16 partitions
computing dask graph


In [3]:
import pandas as pd
import os
from scipy.sparse import csc_matrix
import scanpy as sc

In [3]:
from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

In [4]:
from dask.distributed import Client
from dask_jobqueue import SLURMCluster
# import subprocess as sp
# import distributed
# import socket
# hostname = socket.gethostname()


In [13]:
def fetch_adata(adata):
    """Split an AnnData-like object into the three pieces passed to GRNBoost2.

    Returns a tuple ``(matrix, gene_names, cell_names)``: ``adata.X`` converted
    to a CSC sparse matrix and then densified, followed by the underlying
    arrays of ``adata.var_names`` and ``adata.obs_names``.
    """
    dense_expr = csc_matrix(adata.X).toarray()
    gene_ids = adata.var_names.values
    cell_ids = adata.obs_names.values
    return dense_expr, gene_ids, cell_ids
# NOTE(review): `data_dir` is not defined in any visible cell — presumably the
# path to the .h5ad file; confirm it is set before this cell runs.
adata = sc.read_h5ad(data_dir)

In [7]:

# Load the transcription-factor names later passed to grnboost2.
# NOTE(review): `tf_dir` is not defined in any visible cell — confirm.
tf_names = load_tf_names(tf_dir)

In [None]:
# Dashboard port requested for every scheduler; it never changes, so it is
# defined once outside the loop.
portdash = 40748

# Infer one GRNBoost2 network per subtype, each on its own short-lived SLURM
# dask cluster. `adata`, `tf_names` and `network_dir` must already be defined
# by earlier cells.
for tumor_type in ['TNBC', 'HER2', 'ER']:

    adata_sub = adata[adata.obs.subtype == tumor_type,:].copy()
    mat, genes, cells = fetch_adata(adata_sub)

    cluster = SLURMCluster(queue = "short", cores=8, processes=1,
                       memory="16GB", walltime="05:00:00", scheduler_options={"dashboard_address": f":{portdash}", "host": 'nodo05'})
    client = None
    try:
        cluster.scale(6)
        client = Client(cluster)

        # Diagnostics only: workers may not have joined yet at this point.
        print(client)
        print(cluster)
        print(client.scheduler_info())

        network = grnboost2(
                    expression_data=mat,
                    gene_names=genes,
                    tf_names=tf_names,
                    client_or_address=client,
                    verbose = True,)

        # NOTE(review): the table is written without a header row; confirm that
        # whatever reads this file does not expect named columns.
        network_file = os.path.join(network_dir, f"{tumor_type}_network.tsv")
        network.to_csv(network_file, sep='\t', header=False, index=False)
    finally:
        # Always release the client and the cluster, even when inference fails
        # or the cell is interrupted, so nothing is left running for the next
        # iteration or the next run.
        if client is not None:
            client.close()
        cluster.close()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 41229 instead


<Client: 'tcp://172.16.2.5:38102' processes=0 threads=0, memory=0 B>
SLURMCluster(3339a63a, 'tcp://172.16.2.5:38102', workers=0, threads=0, memory=0 B)
{'type': 'Scheduler', 'id': 'Scheduler-08713568-f752-4fad-a504-0803b0d1a0bb', 'address': 'tcp://172.16.2.5:38102', 'services': {'dashboard': 41229}, 'started': 1737233253.5331335, 'workers': {}}
preparing dask client
parsing input
creating dask graph
6 partitions
computing dask graph


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [10]:
# Stand-alone dask cluster for the interactive cells below: SLURM jobs on the
# "short" queue, scaled to two workers, dashboard requested on a fixed port.
portdash = 40748
cluster = SLURMCluster(
    queue="short",
    cores=16,
    processes=1,
    memory="16GB",
    walltime="05:00:00",
    scheduler_options={"dashboard_address": f":{portdash}", "host": "nodo05"},
)
cluster.scale(2)
client = Client(cluster)

In [32]:
# Inspect the scheduler state to check which workers have joined.
client.scheduler_info()

0,1
Comm: tcp://172.16.2.5:34936,Workers: 2
Dashboard: http://172.16.2.5:40748/status,Total threads: 32
Started: Just now,Total memory: 29.80 GiB

0,1
Comm: tcp://172.16.2.7:40742,Total threads: 16
Dashboard: http://172.16.2.7:36240/status,Memory: 14.90 GiB
Nanny: tcp://172.16.2.7:46531,
Local directory: /tmp/dask-scratch-space/worker-kss9s3qg,Local directory: /tmp/dask-scratch-space/worker-kss9s3qg
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 12.3%,Last seen: Just now
Memory usage: 127.92 MiB,Spilled bytes: 0 B
Read bytes: 19.16 MiB,Write bytes: 132.95 kiB

0,1
Comm: tcp://172.16.2.7:46182,Total threads: 16
Dashboard: http://172.16.2.7:39084/status,Memory: 14.90 GiB
Nanny: tcp://172.16.2.7:37627,
Local directory: /tmp/dask-scratch-space/worker-ssuapi85,Local directory: /tmp/dask-scratch-space/worker-ssuapi85
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 13.2%,Last seen: Just now
Memory usage: 128.15 MiB,Spilled bytes: 0 B
Read bytes: 19.16 MiB,Write bytes: 132.96 kiB


In [29]:
# Display the client summary (connection method, dashboard URL, worker totals).
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.SLURMCluster
Dashboard: http://172.16.2.5:40748/status,

0,1
Dashboard: http://172.16.2.5:40748/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://172.16.2.5:34936,Workers: 0
Dashboard: http://172.16.2.5:40748/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [6]:
# Display the cluster summary (dashboard URL, workers, threads, memory).
cluster

0,1
Dashboard: http://172.16.2.5:40748/status,Workers: 2
Total threads: 32,Total memory: 29.80 GiB

0,1
Comm: tcp://172.16.2.5:43447,Workers: 2
Dashboard: http://172.16.2.5:40748/status,Total threads: 32
Started: Just now,Total memory: 29.80 GiB

0,1
Comm: tcp://172.16.2.7:38642,Total threads: 16
Dashboard: http://172.16.2.7:36403/status,Memory: 14.90 GiB
Nanny: tcp://172.16.2.7:46697,
Local directory: /tmp/dask-scratch-space/worker-ztvt6ent,Local directory: /tmp/dask-scratch-space/worker-ztvt6ent

0,1
Comm: tcp://172.16.2.6:45498,Total threads: 16
Dashboard: http://172.16.2.6:45813/status,Memory: 14.90 GiB
Nanny: tcp://172.16.2.6:33965,
Local directory: /tmp/dask-scratch-space/worker-a5ulnw6_,Local directory: /tmp/dask-scratch-space/worker-a5ulnw6_


In [41]:
# Keep only the TNBC cells; the copy makes the subset independent of `adata`.
tnbc_mask = adata.obs['subtype'] == 'TNBC'
adata_sub = adata[tnbc_mask, :].copy()

In [42]:
# Show the dimensions and annotation fields of the subset.
adata_sub

AnnData object with n_obs × n_vars = 20721 × 22788
    obs: 'batch', 'subtype', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'complexity', 'n_genes', 'n_counts', 'predicted_labels', 'over_clustering', 'majority_voting', 'doublet_score', 'predicted_doublet', 'leiden', 'IGA_First_GenAnno', 'scanvi_prediction', 'IGA_PostScAnvi_GenAnno_colors'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'IGA_PostScAnvi_GenAnno_colors_colors', 'batch_colors', 'hvg', 'leiden', 'log1p', 'majority_voting_colors', 'neighbors', 'predicted_labels_colors', 'scanvi_prediction_colors', 'subtype_colors', 'umap'
    obsm: 'X_scANVI', 'X_scVI', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [43]:
# Dense expression matrix plus gene and cell name arrays for the subset.
mat, genes, cells = fetch_adata(adata_sub)


In [36]:
# Wrap the dense matrix in a DataFrame for inspection: rows are labelled
# "1".."n" (1-based, as strings) and columns carry the gene names.
# The former G1..Gn column labels were never used and have been removed.
row_index = [str(i) for i in range(1, mat.shape[0] + 1)]
df = pd.DataFrame(mat, index=row_index, columns=genes)

In [37]:
# Preview the expression table (the notebook display truncates rows and columns).
df

Unnamed: 0,A1BG,A1BG-AS1,A2M,A2M-AS1,A2ML1,A4GALT,AAAS,AACS,AADAC,AADACL2-AS1,...,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1
1,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.000000
2,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,1.067101,0.000000
3,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.71297,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.71297,0.000000,0.0,0.0,0.000000,0.000000
4,0.710277,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.710277,0.000000
5,0.000000,0.0,1.91206,0.0,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,1.356738,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20717,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.00000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.000000
20718,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.80246,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.343831,0.0,0.0,0.599211,0.343831
20719,0.000000,0.0,0.00000,0.0,0.0,0.000000,0.552708,0.00000,0.0,0.0,...,0.552708,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.906606,0.000000
20720,0.962858,0.0,0.00000,0.0,0.0,0.962858,0.000000,0.00000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.000000,0.000000


In [45]:
# Check the gene-name array that labels the matrix columns.
genes

array(['A1BG', 'A1BG-AS1', 'A2M', ..., 'ZYG11B', 'ZYX', 'ZZEF1'],
      dtype=object)

In [46]:
# Infer the network for the subset built in the cells above, using the
# already-running dask client.
# The output name is tied to an explicit subtype rather than `tumor_type`,
# which is only a leftover loop variable from an earlier cell and is not
# guaranteed to match the data held in `mat`.
subtype = 'TNBC'  # must match the filter used to build `adata_sub`
network = grnboost2(
                expression_data=mat,
                gene_names=genes,
                tf_names=tf_names,
                client_or_address=client, verbose = True)

network_file = os.path.join(network_dir, f"{subtype}_network.tsv")
network.to_csv(network_file, sep='\t', header=False, index=False)
    

preparing dask client
parsing input
creating dask graph
2 partitions
computing dask graph


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


not shutting down client, client was created externally
finished


KeyboardInterrupt: 

In [59]:
# Disconnect the client from the scheduler.
client.close()

In [60]:
# Shut the cluster down.
cluster.close()

In [56]:
import dask
from dask import delayed
import dask.array as da

from dask_jobqueue import SLURMCluster
from dask.distributed import Client

In [50]:
# Reload the transcription-factor names (same statement as in an earlier cell).
# NOTE(review): `tf_dir` is not defined in any visible cell — confirm.
tf_names = load_tf_names(tf_dir)

In [52]:
def fetch_adata(adata):
    """Return ``(dense matrix, gene names, cell names)`` for an AnnData-like object.

    The matrix is ``adata.X`` passed through a CSC sparse matrix and expanded
    to a dense array; the names are the value arrays of ``adata.var_names``
    and ``adata.obs_names``.

    NOTE(review): this repeats a definition from an earlier cell; keeping a
    single copy would avoid silent shadowing.
    """
    expression = csc_matrix(adata.X).toarray()
    return (
        expression,
        adata.var_names.values,
        adata.obs_names.values,
    )

In [48]:
def run_grnboost_for_subtype(tumor_type, adata, tf_names, network_dir, client_or_address='local'):
    """Infer a GRNBoost2 network for one tumor subtype and write it to disk.

    Parameters
    ----------
    tumor_type : str
        Value of ``adata.obs['subtype']`` selecting the cells to use; also the
        prefix of the output file name.
    adata
        Annotated data object providing ``obs['subtype']``, ``X``,
        ``var_names`` and ``obs_names``.
    tf_names
        Transcription-factor names forwarded to ``grnboost2``.
    network_dir : str
        Directory in which ``<tumor_type>_network.tsv`` is written.
    client_or_address : optional
        Forwarded to ``grnboost2``. The default ``'local'`` matches the value
        grnboost2 uses when the argument is omitted, so existing callers are
        unaffected; pass an existing dask ``Client`` to run on that cluster.

    Returns
    -------
    str
        ``"<tumor_type> done"``.

    Raises
    ------
    ValueError
        If no cell is annotated with ``tumor_type``.
    """
    adata_sub = adata[adata.obs['subtype'] == tumor_type,:].copy()
    if adata_sub.n_obs == 0:
        # Fail early with a clear message instead of running on an empty matrix.
        raise ValueError(f"no cells with subtype {tumor_type!r}")

    # The cell names are not needed for network inference.
    mat, genes, _ = fetch_adata(adata_sub)

    network = grnboost2(
        expression_data=mat,
        gene_names=genes,
        tf_names=tf_names,
        client_or_address=client_or_address,
        verbose = True
    )

    # Tab-separated, no header and no index column.
    network_file = os.path.join(network_dir, f"{tumor_type}_network.tsv")
    network.to_csv(network_file, sep='\t', header=False, index=False)

    return f"{tumor_type} done"

In [49]:
### Step 2: in your main code (the single Jupyter cell or script)
def main():
    """Run GRNBoost2 for every tumor subtype on one SLURM-backed dask cluster.

    Reads the notebook-level names ``adata``, ``tf_names`` and ``network_dir``.
    One delayed task per subtype is built and all are computed together. The
    client and the cluster are always closed on exit, including when a task
    fails or the run is interrupted.

    NOTE(review): each task calls ``run_grnboost_for_subtype``, which does not
    hand this client to ``grnboost2``; confirm the inference really runs on
    this cluster's workers.
    """
    tumor_types = ['ER','HER2','TNBC']

    # Set up the cluster once for all subtypes.
    portdash = 40748
    cluster = SLURMCluster(
        queue="short",
        cores=16,
        processes=1,
        memory="16GB",
        walltime="05:00:00",
        scheduler_options={
            "dashboard_address": f":{portdash}",
            "host": 'nodo05'
        }
    )
    client = None
    try:
        cluster.scale(6)  # request 6 workers
        client = Client(cluster)
        # Diagnostics only: workers may not have joined yet at this point.
        print(client)
        print(cluster)
        print(client.scheduler_info())

        # One lazy task per subtype.
        tasks = [
            delayed(run_grnboost_for_subtype)(tumor_type, adata, tf_names, network_dir)
            for tumor_type in tumor_types
        ]

        # Compute all tasks together.
        results = dask.compute(*tasks)
        print("All done:", results)
    finally:
        # Release the client and the cluster whatever happened above.
        if client is not None:
            client.close()
        cluster.close()


In [57]:
# Launch the per-subtype inference defined in main().
main()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 33306 instead


<Client: 'tcp://172.16.2.5:46508' processes=0 threads=0, memory=0 B>
SLURMCluster(9a303f70, 'tcp://172.16.2.5:46508', workers=0, threads=0, memory=0 B)
{'type': 'Scheduler', 'id': 'Scheduler-3dbc2b98-6c25-4503-81a5-7e4a3fe88ad8', 'address': 'tcp://172.16.2.5:46508', 'services': {'dashboard': 33306}, 'started': 1737231295.0733726, 'workers': {}}


This may cause some slowdown.
Consider scattering data ahead of time and using futures.
Task exception was never retrieved
future: <Task finished name='Task-101499' coro=<Client._gather.<locals>.wait() done, defined at /home/igarzonalva/.conda/envs/pyscenic-git-env/lib/python3.10/site-packages/distributed/client.py:2209> exception=AllExit()>
Traceback (most recent call last):
  File "/home/igarzonalva/.conda/envs/pyscenic-git-env/lib/python3.10/site-packages/distributed/client.py", line 2218, in wait
    raise AllExit()
distributed.client.AllExit


KeyboardInterrupt: 