In [1]:
import scanpy as sc
# Location of the .h5ad file to load. Despite its name, `data_dir` points at a
# single file, not at a directory.
data_dir = (
    "/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/02_Integration/adata/"
    "adata_scanvi_cuda_refinement.h5ad"
)

# Read the whole AnnData object into memory.
adata = sc.read_h5ad(data_dir)

In [2]:
from arboreto.utils import load_tf_names
# Text file holding the transcription-factor names. `tf_dir` is a file path,
# not a directory.
tf_dir = (
    "/home/igarzonalva/Proyecto_SC_TNBC/GSE161529/04_grn/Common_files/"
    "TF_names_v_1.01.txt"
)

# Parse the file with arboreto's loader.
tf_names = load_tf_names(tf_dir)

In [3]:
from dask.distributed import Client
from dask_jobqueue import SLURMCluster

In [6]:
# Describe the SLURM jobs that will back the Dask workers: "short" queue,
# 8 cores, 1 process, 16GB of memory and a 5-hour walltime per job.
# The scheduler is told to use host "nodo10".
cluster = SLURMCluster(
    queue="short",
    cores=8,
    processes=1,
    memory="16GB",
    walltime="05:00:00",
    scheduler_options={"host": "nodo10"},
)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 39459 instead


In [7]:
# Display the cluster summary (dashboard address, worker count, threads, memory).
# NOTE(review): the recorded output shows 0 workers, and no cluster.scale()/adapt()
# or Client(cluster) call is visible in this part of the notebook -- confirm that
# workers are requested somewhere before any training is launched.
cluster

0,1
Dashboard: http://172.16.2.10:39459/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://172.16.2.10:40951,Workers: 0
Dashboard: http://172.16.2.10:39459/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [27]:
# Inspect the per-cell subtype annotation (a categorical column of `adata.obs`).
adata.obs["subtype"]

AAACCTGCAAGCGTAG-1-GSM4909281    TNBC
AAACCTGCAATAGAGT-1-GSM4909281    TNBC
AAACCTGCAATTGCTG-1-GSM4909281    TNBC
AAACCTGCACTCAGGC-1-GSM4909281    TNBC
AAACCTGCAGTGACAG-1-GSM4909281    TNBC
                                 ... 
TTTGTTGGTGGGTATG-1-GSM4909317      ER
TTTGTTGTCACTTTGT-1-GSM4909317      ER
TTTGTTGTCGCACGGT-1-GSM4909317      ER
TTTGTTGTCTCCCTAG-1-GSM4909317      ER
TTTGTTGTCTCGGTAA-1-GSM4909317      ER
Name: subtype, Length: 202731, dtype: category
Categories (4, object): ['ER', 'HER2', 'TNBC', 'TNBC_BRCA']

In [34]:
# Display the AnnData summary: dimensions plus the obs/var/uns/obsm/layers/obsp keys.
adata

AnnData object with n_obs × n_vars = 202731 × 22788
    obs: 'batch', 'subtype', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'complexity', 'n_genes', 'n_counts', 'predicted_labels', 'over_clustering', 'majority_voting', 'doublet_score', 'predicted_doublet', 'leiden', 'IGA_First_GenAnno', 'scanvi_prediction', 'IGA_PostScAnvi_GenAnno_colors'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm'
    uns: 'IGA_PostScAnvi_GenAnno_colors_colors', 'batch_colors', 'hvg', 'leiden', 'leiden_colors', 'log1p', 'majority_voting_colors', 'neighbors', 'predicted_labels_colors', 'scanvi_prediction_colors', 'subtype_colors', 'umap'
    obsm: 'X_scANVI', 'X_scVI', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [39]:
# Keep only the cells annotated exactly as 'TNBC' and detach them from `adata`
# with a copy. Cells labelled 'TNBC_BRCA' are a separate category and are not
# selected by this equality test.
adata_tnbc = adata[
    adata.obs["subtype"].eq("TNBC")
].copy()

In [42]:
# Densify the TNBC expression matrix into a plain array (cells x genes).
# `.toarray()` is only available when `X` is stored as a sparse matrix.
# NOTE(review): `X` presumably holds log-normalised values (uns contains 'log1p'
# and raw counts live in layers['counts']) -- confirm this is the intended input.
exp_mat = adata_tnbc.X.toarray()

In [43]:
# Sanity-check the dense matrix dimensions (rows = TNBC cells, columns = genes).
exp_mat.shape

(20721, 22788)

In [59]:
import pandas as pd
# Wrap the dense matrix in a labelled table: one row per cell barcode,
# one column per gene name.
df_expr = pd.DataFrame(
    data=exp_mat,
    index=adata_tnbc.obs.index,
    columns=adata_tnbc.var_names,
)

In [61]:
import dask.dataframe as dd

# Hand the table to Dask in partitions of roughly 1000 rows each; a table
# shorter than 1000 rows still gets one partition.
df_expr_dask = dd.from_pandas(
    df_expr,
    npartitions=max(1, len(df_expr) // 1000),
)


In a future release, Dask DataFrame will use a new implementation that
contains several improvements including a logical query planning.
The user-facing DataFrame API will remain unchanged.

The new implementation is already available and can be enabled by
installing the dask-expr library:

    $ pip install dask-expr

and turning the query planning option on:

    >>> import dask
    >>> dask.config.set({'dataframe.query-planning': True})
    >>> import dask.dataframe as dd

API documentation for the new implementation is available at
https://docs.dask.org/en/stable/dask-expr-api.html

Any feedback can be reported on the Dask issue tracker
https://github.com/dask/dask/issues 


    # via Python

    # via CLI


  import dask.dataframe as dd


In [51]:
import numpy as np

In [53]:
# Boolean flag per gene column: True where the gene name is a known TF.
mask = adata_tnbc.var_names.isin(tf_names)

# Integer column positions of those TFs.
tf_indices = np.flatnonzero(mask)
tf_indices

array([ 1215,  1891,  2906, ..., 22781, 22782, 22783])

In [63]:
import dask.array as da
import xgboost.dask  # binds the top-level `xgboost` name used as `xgboost.dask.*`
from dask_ml.metrics import mean_squared_error
from xgboost.dask import DaskXGBRegressor

def train_models_for_genes(client, df_expr_dask, tf_list, gene_list):
    """Fit one distributed XGBoost regressor per target gene.

    Parameters
    ----------
    client : distributed.Client or None
        Dask client the models should run on. When ``None`` each model keeps
        its default client.
    df_expr_dask : dask.dataframe.DataFrame
        Expression table; its columns must contain every name in ``tf_list``
        and ``gene_list``.
    tf_list : iterable of str
        Candidate regulator (TF) column names used as features.
    gene_list : iterable of str
        Target gene column names; one model is fitted for each.

    Returns
    -------
    dict
        ``{gene: fitted DaskXGBRegressor}``. A gene with no predictor left
        after removing itself from ``tf_list`` is skipped and has no entry.
    """
    tf_list = list(tf_list)
    results = {}

    for gene in gene_list:
        # A target that is itself a TF must not be among its own predictors,
        # otherwise the model trivially learns the identity.
        features = [tf for tf in tf_list if tf != gene]
        if not features:
            # Nothing left to regress on for this target.
            continue

        X = df_expr_dask[features]
        y = df_expr_dask[gene]

        # Histogram-based tree construction.
        model = DaskXGBRegressor(tree_method="hist")
        if client is not None:
            # Run on the client the caller handed in, not on the default one.
            model.client = client

        # Fit on the distributed dataframe and keep the fitted model.
        model.fit(X, y)
        results[gene] = model

    return results

ModuleNotFoundError: No module named 'xgboost'

In [None]:
def train_gbr_for_each_gene(client, df, tf_list, gene_list, n_folds=5):
    """Cross-validate a distributed XGBoost regressor for every target gene.

    For each gene in ``gene_list``:
      - the columns in ``tf_list`` (minus the gene itself, when it is a TF)
        are the features (X);
      - the gene column is the target (y);
      - a booster is trained per fold with XGBoost (hist mode) and scored
        with the mean squared error on the held-out part.

    Parameters
    ----------
    client : distributed.Client
        Dask client passed to every ``xgboost.dask`` call.
    df : dask.dataframe.DataFrame
        Expression table containing all ``tf_list`` and ``gene_list`` columns.
    tf_list : iterable of str
        Candidate regulator (TF) column names.
    gene_list : iterable of str
        Target gene column names.
    n_folds : int, default 5
        Number of folds requested from ``make_cv_splits``.

    Returns
    -------
    dict
        ``{gene: (mean_mse, std_mse)}`` over the folds. A gene with no
        predictor left after removing itself from ``tf_list`` is skipped.

    Raises
    ------
    ValueError
        If ``make_cv_splits`` yields no fold for a gene.

    Notes
    -----
    NOTE(review): ``make_cv_splits(df, n_folds=...)`` is not defined in the
    visible part of the notebook; it is expected to yield
    ``(train_frame, test_frame)`` pairs -- confirm it exists before running.
    """
    tf_list = list(tf_list)
    results = {}

    for gene in gene_list:
        print(f"\n=== Training for target gene: {gene} ===")

        # Never let a target predict itself when it is also listed as a TF.
        features = [tf for tf in tf_list if tf != gene]
        if not features:
            print(f"Gene {gene}: skipped (no predictor TFs left)")
            continue

        # Lazy per-fold scores, computed together once all folds are queued.
        scores = []

        for fold_idx, (train_dd, test_dd) in enumerate(make_cv_splits(df, n_folds=n_folds)):
            print(f"  Fold {fold_idx+1} of {n_folds} ...")

            # X = TF columns, y = current target gene column
            X_train = train_dd[features]
            y_train = train_dd[gene]
            X_test = test_dd[features]
            y_test = test_dd[gene]

            # Create DaskDMatrix
            dtrain = xgboost.dask.DaskDMatrix(client, X_train, y_train)

            # Train gradient boosting regressor
            model_out = xgboost.dask.train(
                client,
                params={"tree_method": "hist"},
                dtrain=dtrain,
                num_boost_round=50,   # Example hyperparameter
            )

            # Predict on the held-out fold
            preds = xgboost.dask.predict(client, model_out, X_test)

            # Compute MSE on this fold (lazy)
            fold_mse = mean_squared_error(
                y_test.to_dask_array(),
                preds.to_dask_array(),
                squared=True,  # or False if you want RMSE
                compute=False
            )

            # Persist so the work starts now and is collected later
            scores.append(fold_mse.persist())

        if not scores:
            raise ValueError(
                f"make_cv_splits produced no folds for gene {gene!r} (n_folds={n_folds})"
            )

        # Each fold score is a 0-d array: these can be stacked into a vector
        # but not concatenated.
        scores_array = da.stack(scores).compute()
        mean_mse = scores_array.mean()
        std_mse = scores_array.std()

        print(f"Gene {gene}: MSE={mean_mse:.4f} +/- {std_mse:.4f}")
        results[gene] = (mean_mse, std_mse)

    return results
