In [1]:
import pereggrn_perturbations
import pereggrn_networks

import os 
import anndata as ad 
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
import scanpy as sc
from sklearn.model_selection import train_test_split 

from scipy.sparse import csr_matrix

task_grn_inference_dir = '../../task_grn_inference'
pereggrn_dir = "../../pereggrn"

sys.path.append(task_grn_inference_dir)
# from src.helper import *
# from src.exp_analysis.helper import *
from src.utils.util import process_links
pereggrn_perturbations.set_data_path(f"{pereggrn_dir}/perturbation_data/perturbations")



inference_data_dir = f'{task_grn_inference_dir}/resources/inference_data/'
evaluation_data_dir = f'{task_grn_inference_dir}/resources/evaluation_data/'
raw_datasets_dir = f'{task_grn_inference_dir}/resources/datasets_raw/'

# os.makedirs(inference_data_dir, exist_ok=True)
# os.makedirs(evaluation_data_dir, exist_ok=True)

In [2]:
if False: # get global models
    par = {
        'global_models': [
                            'ANANSE_tissue/networks/lung.parquet',
                            'ANANSE_tissue/networks/stomach.parquet', 
                            'ANANSE_tissue/networks/heart.parquet',
                            'ANANSE_tissue/networks/bone_marrow.parquet',
                            
                            'gtex_rna/networks/Whole_Blood.parquet',
                            'gtex_rna/networks/Brain_Amygdala.parquet', 
                            'gtex_rna/networks/Breast_Mammary_Tissue.parquet', 
                            'gtex_rna/networks/Lung.parquet',
                            'gtex_rna/networks/Stomach.parquet',

                            'cellnet_human_Hg1332/networks/bcell.parquet',
                            'cellnet_human_Hg1332/networks/tcell.parquet',
                            'cellnet_human_Hg1332/networks/skin.parquet',
                            'cellnet_human_Hg1332/networks/neuron.parquet',
                            'cellnet_human_Hg1332/networks/heart.parquet',
                            ],
        'read_dir': f'{pereggrn_dir}/network_collection/networks/',
        'write_dir': f'{task_grn_inference_dir}/resources/grn_models/global/',
        'max_n_links': 50_000
    }
    os.makedirs(par['write_dir'], exist_ok=True)

    names = []
    for model in par['global_models']:
        net = pd.read_parquet(f"{par['read_dir']}/{model}")
        net.columns = ['source','target','weight']
        method = model.split('/')[0].split('_')[0].capitalize()
        tissue = model.split('/')[-1].split('.')[0].replace('_', ' ').capitalize()
        name = method+':'+tissue

        net = process_links(net, par)


        net.to_csv(f"{par['write_dir']}/{name}.csv")

        names.append(name)
    names

In [3]:
def psedudobulk_fun(adata):
    # control_perturbs = adata.obs[adata.obs['is_control']]['perturbation'].unique()
    # test_perturbs = adata.obs[adata.obs['is_test']]['perturbation'].unique()

    # Aggregate metadata (mean or mode as appropriate)
    metadata = (
        adata.obs.groupby('perturbation')
        .agg(lambda x: x.mode()[0] if x.nunique() == 1 else x.iloc[0])  # Adjust for categorical columns
    )

    # if 'counts' in adata.layers:
    #     adata.X = adata.layers['counts'].copy()

    # Pseudobulk the main layer
    pseudobulk_data = adata.to_df().groupby(adata.obs['perturbation']).sum()
    # Ensure the metadata index matches pseudobulk counts
    metadata = metadata.loc[pseudobulk_data.index]

    # Create a new AnnData object for pseudobulked data
    adata_bulked = sc.AnnData(
            X=pseudobulk_data.values,
            obs=metadata.reset_index(),
            var=adata.var.copy()
        )

    # if 'counts' in adata.layers:
    #     # Add pseudobulked layers
    #     adata_bulked.layers['counts'] = csr_matrix(adata_bulked.X)
    #     adata_bulked.layers['X_norm'] = sc.experimental.pp.normalize_pearson_residuals(adata_bulked, layer='counts', inplace=False)['X']
    #     mask = ~(adata_bulked.X.std(axis=0)==0)
    #     adata_bulked = adata_bulked[:, mask]
    #     lognorm = sc.pp.normalize_total(adata_bulked, layer='counts', inplace=False)['X']
    #     adata_bulked.layers['lognorm'] = sc.pp.log1p(lognorm, copy=True)
    # else:
    #     adata_bulked.layers['X_norm'] = adata_bulked.X.copy()
    # adata_bulked.layers['X_norm'] = adata_bulked.X.copy()
        
    return adata_bulked
# - make data sparse
# def sparsify(adata):
#     if (not isinstance(adata.X, csr_matrix)):
#         adata.X = csr_matrix(adata.X) 
#     if (not isinstance(adata.layers['counts'], csr_matrix)):
#         adata.layers['counts'] = csr_matrix(adata.layers['counts']) 
#     return adata

In [4]:
def process_all(file_name):
    # - get the data
    pereggrn_perturbations.load_perturbation_metadata()
    adata = pereggrn_perturbations.load_perturbation(file_name) 
    # print(file_name,type(adata.X), adata.X[:5])
    # return
    pereggrn_perturbations.check_perturbation_dataset(ad = adata)

    # - clearn up 
    del adata.obsp 
    del adata.varm
    del adata.uns
    del adata.obsm
    if 'gene_name' in adata.var.columns:
        adata.var = adata.var[['gene_name']]
        adata.var = adata.var.set_index('gene_name')
    else:
        adata.var = adata.var[[]]
    adata.obs = adata.obs[['perturbation', 'is_control', 'perturbation_type']]

    # - data split 
    if file_name == 'replogle2':
        ctr_samples = adata.obs.is_control
        samples = adata.obs.index[~ctr_samples] 
        _, test_samples = train_test_split(samples, test_size=.2, random_state=32)
        adata.obs['is_test'] = adata.obs.index.isin(test_samples)
    elif file_name == 'norman':
        ctr_samples = adata.obs.is_control
        samples = adata[adata.obs.index[~ctr_samples]].obs.perturbation.unique()
        _, test_samples = train_test_split(samples, test_size=.5, random_state=32)
        adata.obs['is_test'] = adata.obs.perturbation.isin(test_samples)
    elif file_name == 'nakatake':
        samples = adata.obs.perturbation.unique()
        _, test_samples = train_test_split(samples, test_size=.5, random_state=32)
        adata.obs['is_test'] = adata.obs.perturbation.isin(test_samples)
    elif file_name == 'adamson':
        ctr_samples = adata.obs.is_control
        samples = adata[adata.obs.index[~ctr_samples]].obs.perturbation.unique()
        _, test_samples = train_test_split(samples, test_size=.8, random_state=32)
        adata.obs['is_test'] = adata.obs.perturbation.isin(test_samples)

    adata_train = adata[~adata.obs['is_test']] # we use single cells for train (if not already bulked)
    
    if file_name in ['norman', 'adamson']: # these two are single cells. for norman, we have .counts but not for adamson -> different preprocessing
        adata_bulked = psedudobulk_fun(adata) # also normalize
    else:
        adata_bulked = adata
        # adata_bulked.layers['X_norm'] = adata_bulked.X.copy()

    adata_test = adata_bulked[adata_bulked.obs['is_test']] # we use bulked data for test 


    # - duplicated gene names
    duplicates = adata_train.var_names[adata_train.var_names.duplicated()].unique()
    adata_train = adata_train[:, ~adata_train.var_names.isin(duplicates)]

    duplicates = adata_test.var_names[adata_test.var_names.duplicated()].unique()
    adata_test = adata_test[:, ~adata_test.var_names.isin(duplicates)]

    
    # - normalize adata_train 
    # if file_name in ['norman']: # only norman needs this. the others are normalized already
    #     lognorm = sc.pp.normalize_total(adata_train, layer='counts', inplace=False)['X']
    #     adata_train.layers['X_norm'] = sc.pp.log1p(lognorm, copy=True)
    # if file_name in ['adamson']: 
    #     adata_train.layers['X_norm'] = adata_train.X
    adata_train.layers['X_norm'] = adata_train.X
    adata_test.layers['X_norm'] = adata_test.X
    adata.layers['X_norm'] = adata.X

    # if file_name in ['norman']:
    #     adata = sparsify(adata)    
    #     adata_train = sparsify(adata_train)

    if file_name in ['norman', 'adamson']:
        adata.write(f'{raw_datasets_dir}/{file_name}_sc_counts.h5ad')
        # adata_test_sc = adata[adata.obs['is_test']] # we also store singe cell data for these two datasets -> for WS distance
        # duplicates = adata_test_sc.var_names[adata_test_sc.var_names.duplicated()].unique()
        # adata_test_sc = adata_test_sc[:, ~adata_test_sc.var_names.isin(duplicates)]
        # if 'X_norm' not in adata_test_sc.layers:
        #     if 'counts' in adata_test_sc.layers:
        #         lognorm = sc.pp.normalize_total(adata_test_sc, layer='counts', inplace=False)['X']
        #         adata_test_sc.layers['X_norm'] = sc.pp.log1p(lognorm, copy=True)
        #     else:
        #         adata_test_sc.layers['X_norm'] = adata_test_sc.X
        # adata_test_sc.write(f'{evaluation_data_dir}/{file_name}_perturbation_sc.h5ad')
    # if file_name == 'replogle2':
    #     file_name == 'replogle':
    adata_bulked.write(f'{raw_datasets_dir}/{file_name}_bulked.h5ad')
    adata_train.write(f'{inference_data_dir}/{file_name}_rna.h5ad')
    adata_test.write(f'{evaluation_data_dir}/{file_name}_perturbation.h5ad')
    
    return adata_train 

for file_name in ['adamson', "nakatake", "replogle2", "norman"]:
    adata = process_all(file_name)

Checking gene metadata...
Checking perturbation labels...
Checking control labels...
Checking which genes are measured...
Checking for log-transform and raw data...
... done.


  adata.obs.groupby('perturbation')
  pseudobulk_data = adata.to_df().groupby(adata.obs['perturbation']).sum()
  adata_train.layers['X_norm'] = adata_train.X
  adata_test.layers['X_norm'] = adata_test.X


Checking gene metadata...
Checking perturbation labels...
Checking control labels...
Checking which genes are measured...
Checking for log-transform and raw data...
... done.


  adata_train.layers['X_norm'] = adata_train.X
  adata_test.layers['X_norm'] = adata_test.X


Checking gene metadata...
Checking perturbation labels...
Checking control labels...
Checking which genes are measured...
Checking for log-transform and raw data...
... done.


AnnData expects .var.index to contain strings, but got values like:
    ['LINC01409', 'LINC01128', 'NOC2L', 'KLHL17', 'HES4']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)
  adata_train.layers['X_norm'] = adata_train.X
  adata_test.layers['X_norm'] = adata_test.X


Checking gene metadata...
Checking perturbation labels...
Checking control labels...
Checking which genes are measured...
Checking for log-transform and raw data...
... done.


AnnData expects .var.index to contain strings, but got values like:
    ['RP11-34P13.8', 'RP11-54O7.3', 'SAMD11', 'PERM1', 'HES4']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)
  adata.obs.groupby('perturbation')
  pseudobulk_data = adata.to_df().groupby(adata.obs['perturbation']).sum()
  adata_train.layers['X_norm'] = adata_train.X
  adata_test.layers['X_norm'] = adata_test.X
