# task-perturb-multiomics-grn
## Creating resources
### by Jalil Nourisa

# Multiomics

In [4]:
import anndata as ad
import pandas as pd

data_dir = '../output/'

resource_dir = '../resources/'

In [27]:
adata_rna = ad.read_h5ad(f'{data_dir}/scRNA/adata_rna_new.h5ad')
adata_atac = ad.read_h5ad(f'{data_dir}/scATAC/adata_atac.h5ad')



In [28]:
adata_atac.obs = adata_atac.obs[['obs_id']]
adata_atac.obs = adata_atac.obs.set_index('obs_id')

In [29]:
adata_rna.var['feature_type'] = 'GEX'
adata_atac.var['feature_type'] = 'ATAC'

In [30]:
mask = adata_atac.obs.index.isin(adata_rna.obs.index) # keep only those cells that are shared 
adata_atac = adata_atac[mask,:]

In [31]:
adata = ad.concat([adata_rna, adata_atac], axis=1)
adata

AnnData object with n_obs × n_vars = 25034 × 158136
    var: 'feature_type'

In [32]:
adata.obs = pd.merge(adata_rna.obs, adata_atac.obs, left_index=True, right_index=True, how='outer')

In [33]:
adata.obs

Unnamed: 0_level_0,cell_type,donor_id
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1
000225c1151ab841,B cells,donor_0
0003c40a54367871,T cells,donor_2
0004bf574b822c3c,T cells,donor_2
000d59b5478f28e2,B cells,donor_0
0011b7473923d7b5,T cells,donor_2
...,...,...
fff2ca1f64c10339,T cells,donor_0
fff87e64f509b570,T cells,donor_0
fff9778b31bc2539,Myeloid cells,donor_2
fffa92f71d2440de,T cells,donor_1


In [34]:
adata.write(f'{resource_dir}/multiomics.h5ad')

In [41]:
adata_atac.obs = adata.obs

In [42]:
adata_atac.write_h5ad(f'{data_dir}/scATAC/adata_atac.h5ad')

# Pertub data

In [5]:
adata_perturb = ad.read_h5ad(f'{data_dir}/preprocess/bulk_adata_f.h5ad')
adata_perturb.obs = adata_perturb.obs[['cell_type', 'sm_name', 'donor_id', 'plate_name', 'row']]
adata_perturb.var = adata_perturb.var[[]]
adata_perturb

AnnData object with n_obs × n_vars = 2170 × 15215
    obs: 'cell_type', 'sm_name', 'donor_id', 'plate_name', 'row'
    layers: 'X_norm_SL', 'X_norm_pearson', 'counts'

In [7]:
def create_df_train(norm_method:str):
    import anndata as ad 
    import pandas as pd
    import os 
    work_dir = '../output'
    bulk_adata = ad.read_h5ad(f'{work_dir}/preprocess/bulk_adata_f.h5ad')
    bulk_index = ['sm_name', 'cell_type', 'donor_id', 'plate_name', 'row']
    print(bulk_adata)
    if norm_method is None:
        pass
    else:
        bulk_adata.X = bulk_adata.layers[f'X_norm_{norm_method}']

    bulk_adata_df = pd.DataFrame(bulk_adata.X, columns=bulk_adata.var.index, index=bulk_adata.obs.index)
    bulk_adata_df = pd.concat([bulk_adata.obs[bulk_index],  bulk_adata_df], axis=1).set_index(bulk_index)
    
    os.makedirs(f'../output/benchmark/df_train/', exist_ok=True)
    if norm_method is None:
        bulk_adata_df.to_csv(f'../output/benchmark/df_train/df_train.csv')
    else:
        bulk_adata_df.to_csv(f'../output/benchmark/df_train/df_train_{norm_method}.csv')
    
create_df_train('pearson')      
create_df_train('SL')  
create_df_train(None)    

AnnData object with n_obs × n_vars = 2170 × 15215
    obs: 'cell_type', 'sm_name', 'donor_id', 'plate_name', 'row', 'well', 'cell_count', 'plate_well_cell_type'
    layers: 'X_norm_SL', 'X_norm_pearson', 'counts'
AnnData object with n_obs × n_vars = 2170 × 15215
    obs: 'cell_type', 'sm_name', 'donor_id', 'plate_name', 'row', 'well', 'cell_count', 'plate_well_cell_type'
    layers: 'X_norm_SL', 'X_norm_pearson', 'counts'
AnnData object with n_obs × n_vars = 2170 × 15215
    obs: 'cell_type', 'sm_name', 'donor_id', 'plate_name', 'row', 'well', 'cell_count', 'plate_well_cell_type'
    layers: 'X_norm_SL', 'X_norm_pearson', 'counts'


# Baseline GRNs

In [47]:
def create_negative_control(tf_n: int=800, sparsity: float=.98):
    import pandas as pd 
    import os
    import numpy as np
    df_train = pd.read_csv(f'../resources/df_train/df_train.csv')

    genes_n = df_train.columns.shape[0]
    X_random = np.random.choice([0, -1, 1], size=(genes_n, tf_n), p=[sparsity, (1-sparsity)/2, (1-sparsity)/2])
    row_names = df_train.columns
    col_names = [f'TF_{i}' for i in range(tf_n)]
    grn_random = pd.DataFrame(X_random, index=row_names, columns=col_names)
    grn_random.reset_index(inplace=True)
    grn_random = pd.melt(grn_random, id_vars=['index'], var_name='TF', value_name='weight')
    grn_random.columns = ['target', 'source', 'weight']
    grn_random = grn_random[['source', 'target', 'weight']]
    grn_random = grn_random[grn_random.weight!=0].reset_index(drop=True)

    print(grn_random.shape)

    grn_random.to_csv(f'../output/benchmarking/baseline_models/negative_control.csv')

create_negative_control()


(243232, 3)


In [13]:
import pandas as pd 
import os
import numpy as np
norm_method = 'pearson'
tfs_list = np.loadtxt(f'../output/utoronto_human_tfs_v_1.01.txt', dtype=str)
df_train = pd.read_csv(f'../resources/df_train/df_train.csv').set_index(['sm_name','cell_type','plate_name','row'])

In [23]:
gene_names = df_train.columns

In [25]:
tfs_list =  np.asarray([gene for gene in tfs_list if gene in df_train.columns])
tf_idx = np.asarray([idx for idx, gene in enumerate(df_train.columns) if gene in tfs_list])

In [17]:
import portia as pt
dataset = pt.GeneExpressionDataset()
for exp_id, data in enumerate(df_train.values):
    dataset.add(pt.Experiment(exp_id, data))
M_bar, S = pt.run(dataset, tf_idx=tf_idx, method='fast', return_sign=True)


Gene expression matrix of shape (2170, 15215)


In [30]:
def format_grn(scores, S, gene_names):
    tfs = []
    targets = []
    weights = []
    for row_i in range(len(scores)):
        for col_i in range(len(scores[0])):
            score = scores[row_i][col_i]
            if score == 0:
                continue
            s = S[row_i][col_i]
            tf = gene_names[row_i]
            gene = gene_names[col_i]

            tfs.append(tf)
            targets.append(gene)
            weights.append(score*s)

    return pd.DataFrame({'source':tfs, 'target':targets, 'weight':weights})

positive_control = format_grn(M_bar, S, gene_names)

positive_control


Unnamed: 0,source,target,weight
0,AC092835.1,A1BG,0.010819
1,AC092835.1,A1BG-AS1,-0.012011
2,AC092835.1,A2M,-0.001585
3,AC092835.1,A2M-AS1,-0.006012
4,AC092835.1,A2MP1,-0.012426
...,...,...,...
16065979,ZXDC,ZXDA,-0.023699
16065980,ZXDC,ZXDB,0.007010
16065981,ZXDC,ZYG11B,-0.003351
16065982,ZXDC,ZYX,0.004515


In [45]:
positive_control_f = positive_control[positive_control.index.isin(positive_control.weight.abs().sort_values()[-200000:,].index)].reset_index()

In [46]:
# np.setdiff1d(positive_control.target.unique(), tfs_list).shape
# positive_control.source.unique().shape
positive_control_f.to_csv(f'../output/benchmarking/baseline_models/positive_control.csv')

# GRNS

In [34]:
work_dir = '../output'
os.makedirs('../output/benchmarking/grn_models/', exist_ok=True)
os.makedirs('../output/benchmarking/peak_gene_models/', exist_ok=True)
grn_model_names = ['figr', 'celloracle', 'granie', 'ananse', 'scglue', 'scenicplus']
for name in grn_model_names:
    grn = pd.read_csv(f'{work_dir}/infer/{name}/grn/{name}_grn.csv', index_col=0)
    if 'cell_type' in grn.columns:
        grn['cell_type'].replace({'agg_type': 'T cells'}, inplace=True)
    
    grn.to_csv(f'../output/benchmarking/grn_models/{name}.csv')
collectRI = pd.read_csv("https://github.com/pablormier/omnipath-static/raw/main/op/collectri-26.09.2023.zip")
collectRI.to_csv(f'../output/benchmarking/grn_models/collectri.csv')

peak_gene_model = ['figr', 'celloracle', 'granie']

for name in peak_gene_model:
    peak_gene = pd.read_csv(f'{work_dir}/infer/{name}/peak_gene.csv', index_col=0)
    peak_gene.to_csv(f'../output/benchmarking/peak_gene_models/{name}.csv')


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  grn['cell_type'].replace({'agg_type': 'T cells'}, inplace=True)
