#TODO: check those smiles that were not given in BindingDB

In [1]:
import json 
import pandas as pd 
import os 
import anndata as ad
import decoupler as dc
import numpy as np
import sys 
from local_utils import  map 
warnings.filterwarnings('ignore')

work_dir = '../output'
os.makedirs(f'{work_dir}/affinity', exist_ok=True)
# read smiles map 
with open(f'{work_dir}/affinity/map_smile_sm_name.json', 'r') as f:
    map_smile_sm_name = json.load(f)

smiles_de_train = list(map_smile_sm_name.keys())
sm_names_de_train = list(map_smile_sm_name.values())



## Map smile to sm_name: pass

In [28]:
df_train = pd.read_parquet('../input/kaggle/input/open-problems-single-cell-perturbations/de_train.parquet')
map_smile_sm_name = {}
for smile, sm_name in zip(df_train.SMILES, df_train.sm_name):
    if smile not in map_smile_sm_name.keys():
        map_smile_sm_name[smile] = sm_name
with open(f'{work_dir}/affinity/map_smile_sm_name.json', 'w') as f:
    json.dump(map_smile_sm_name, f)

## Process binding data of BindingDB
We save it as sm_name:targets 

In [2]:
data = pd.read_csv(f'{work_dir}/affinity/database/BindingDB_All_202406.tsv', sep='\t', on_bad_lines='skip')

  data = pd.read_csv(f'{work_dir}/affinity/database/BindingDB_All_202406.tsv', sep='\t', on_bad_lines='skip')


In [26]:
# subset to those smiles that are given in de_train. #TODO: check the compounds with names 
data_subset = data[data['Ligand SMILES'].isin(smiles_de_train)] 
# rename columns
data_subset = data_subset[['Ligand SMILES', 'UniProt (SwissProt) Entry Name of Target Chain']].reset_index(drop=True)
data_subset.columns = ['SMILES', 'prot']

n_initial_prots = len(data_subset.prot.unique())
# map smile to sm_name
data_subset['sm_name'] = data_subset.SMILES.map(map_smile_sm_name)

data_subset = data_subset.loc[~data_subset.prot.isna(), :]
prots = data_subset.prot.unique()

# map protname to genename 
prot_to_gene_map = map.protname_genename(prots)

data_subset['gene'] = data_subset['prot'].map(prot_to_gene_map)

data_subset = data_subset[~data_subset.gene.isna()].reset_index(drop=True)

print(f"From {n_initial_prots} proteins, {len(prots)} has prot name in UniProt, from which {len(data_subset['gene'].unique())} has gene names")

# group targets for compounds
targets = data_subset.groupby('sm_name')['gene'].apply(list).to_dict()
all_targets_bDB = np.unique(np.concatenate(list(targets.values())))
# save
with open(f'{work_dir}/affinity/curated/targets_bindingDB_all.json', 'w') as f:
    json.dump(targets, f)

Fetched: 500 / 559
Fetched: 559 / 559
From 565 proteins, 564 has prot name in UniProt, from which 552 has gene names


### How to map smiles to some names

In [31]:
import requests
def smiles_to_iupac(smiles_list):
    rep = "Names"
    results_map = {}
    CACTUS = "https://cactus.nci.nih.gov/chemical/structure/{0}/{1}"
    for smiles in smiles_list:
        url = CACTUS.format(smiles, rep)
        try:
            response = requests.get(url)
            response.raise_for_status()
            rr = response.text.split('\n')
            results_map[smiles] = rr
        except requests.exceptions.RequestException as e:
            # Handle HTTP request errors by adding NaN to the results map
            results_map[smiles] = float('nan')
    return results_map
smiles_all = data_subset.SMILES.unique()
rr = smiles_to_iupac(smiles_all)


In [32]:
for smile, names in rr.items():
    if type(names)==list:
        continue
    print(names, smile)

nan O=C(Nc1nc2cccc(-c3ccc(CN4CCS(=O)(=O)CC4)cc3)n2n1)C1CC1
nan CC[C@H](Nc1ncnc2[nH]cnc12)c1nc2cccc(F)c2c(=O)n1-c1ccccc1
nan CC(C)C[C@H](NC(=O)CNC(=O)c1cc(Cl)ccc1Cl)B(O)O


## Broad data
Very low on targets (only 25)

In [2]:
with open(f'{work_dir}/affinity/curated/targets_bindingDB_all.json', 'r') as f:
    targets_bindingDB = json.load(f)

In [17]:
data_broad = pd.read_csv(f'{work_dir}/affinity/database/Repurposing_Hub_export.txt', sep='\t')
data_broad_subset = data_broad[data_broad.Name.isin(sm_names_de_train)]
data_broad_subset = data_broad_subset[['Name', 'Target']].reset_index(drop=True)
data_broad_subset.columns = ['sm_name', 'target']
data_broad_subset = data_broad_subset.groupby('sm_name')['target'].apply(list).to_dict()
# save
with open(f'{work_dir}/affinity/curated/targets_broad.json', 'w') as f:
    json.dump(data_broad_subset, f)

## Coverage on genes and TFs

In [11]:
with open(f'{work_dir}/affinity/curated/targets_bindingDB_all.json', 'r') as f:
    targets_bindingDB = json.load(f)
    all_targets_bDB = np.unique(np.concatenate(list(targets_bindingDB.values())))
    print('number of targets: ', len(all_targets_bDB))
with open(f'{work_dir}/affinity/curated/targets_broad.json', 'r') as f:
    targets_broad  = json.load(f)
    print('number of targets: ', len(np.unique(np.concatenate(list(targets_broad.values())))))
tfs_list = np.loadtxt(f'{work_dir}/utoronto_human_tfs_v_1.01.txt', dtype=str)
print('number of TFs: ', len(tfs_list))

number of targets:  552
number of targets:  25
number of TFs:  1639


In [54]:
np.intersect1d(tfs_list, all_targets_bDB).shape


(12,)

In [47]:
# lets try with protnames
from local_utils import map
all_targets_bDB_prots = map.genename_protname(all_targets_bDB)
tfs_list_prot = map.genename_protname(tfs_list)

all_targets_bDB_prots = np.asarray(list(all_targets_bDB_prots.values()))
tfs_list_prot = np.asarray(list(tfs_list_prot.values()))


In [53]:
np.intersect1d(tfs_list_prot, all_targets_bDB_prots).shape


(12,)

## Coverage on grn

In [55]:
# read nets
grn_model_names = ['collectRI', 'figr', 'celloracle', 'granie', 'ananse', 'scglue', 'scenicplus']

grn_models_dict = {}
for name in grn_model_names:
    grn_models_dict[name] = pd.read_csv(f'{work_dir}/benchmark/grn_models/{name}.csv', index_col=0)

In [113]:
# check the presence of target genes among tfs 
targets = all_targets_bDB
all_targets = targets
for name, grn in grn_models_dict.items():
    print(name, np.intersect1d(all_targets, grn.target.unique()).shape) 

collectRI (356,)
figr (244,)
celloracle (283,)
granie (204,)
ananse (135,)
scglue (347,)
scenicplus (335,)


: 

# Some analysis

In [110]:
# targets as hvgs
# bulk_adata = ad.read_h5ad(f'{work_dir}/preprocess/bulk_adata_integrated.h5ad')
import warnings
for method in ['lognorm', 'pearson', 'seurat_lognorm', 'seurat_pearson', 'scgen_lognorm', 'scgen_pearson']:
    bulk_adata.X = bulk_adata.layers[method]
    sc.pp.highly_variable_genes(bulk_adata, n_top_genes=3000)
    hvgs = bulk_adata.var_names[bulk_adata.var.highly_variable]
    genes_overlap = np.intersect1d(all_targets, bulk_adata.var_names)
    n_targets = len(genes_overlap)
    n_all_genes = bulk_adata.shape[1]

    y_true = np.intersect1d(genes_overlap, hvgs).shape[0]

    y_preds = []
    for i in range(1000):
        mask = np.random.choice([True, False], n_all_genes,  p=[n_targets/n_all_genes, (n_all_genes-n_targets)/n_all_genes])
        random_genes = bulk_adata.var_names[mask]
        y_preds.append(np.intersect1d(random_genes, hvgs).shape[0])
    y_preds = np.asarray(y_preds)

    print(method, (y_preds > y_true).sum())

lognorm 1000
pearson 281
seurat_lognorm 985
seurat_pearson 31
scgen_lognorm 967
scgen_pearson 452


In [111]:
n_targets

352

## Enrichment analysis

In [97]:
# read de_data
de_train = pd.read_parquet(f'{work_dir}/affinity/de_train.parquet')
de_train = de_train.reset_index()
sm_names = de_train.sm_name
sample = de_train.sm_name + '@'  + de_train.cell_type
de_train['sample'] = sample
de_train.set_index('sample', inplace=True)
de_train.drop(columns=['cell_type', 'sm_name'], inplace=True)
de_train.head(3)

gene,A1BG,A1BG-AS1,A2M,A2M-AS1,AAAS,AACS,AAGAB,AAK1,AAMDC,AAMP,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine@B cells,0.826571,0.366722,0.605799,0.577766,-0.287037,0.293762,0.606627,0.367315,0.955483,-0.579767,...,-0.573711,0.549571,0.077775,0.411108,1.571559,0.803124,0.950774,-0.067148,0.171418,1.943
5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine@Myeloid cells,1.144635,0.888033,0.294882,1.034652,-0.079176,0.521397,-2.105741,1.811037,0.213352,0.065167,...,-1.100933,0.367605,0.088639,-0.498617,0.899551,0.092172,-0.212104,0.403848,2.683611,0.591665
5-(9-Isopropyl-8-methyl-2-morpholino-9H-purin-6-yl)pyrimidin-2-amine@NK cells,-0.022487,-2.22907,0.078174,2.232399,0.386491,-0.312335,0.108862,-0.570145,0.384764,0.008786,...,0.111267,-2.372358,-0.228849,-1.313962,0.349919,1.20787,0.11907,0.869728,-0.756822,-0.155182


In [98]:
net = grn_models_dict['scenicplus']
tf_act, p_values = dc.run_ulm(de_train, net=net)

# we only want sm_name as sample
tf_act['sm_name'] = sm_names.values
tf_act.set_index('sm_name', inplace=True)

tf_act.shape

(537, 65)

## Evaluate presence of targets 

In [106]:
ep_count = {}
for sm_name, target_genes in targets.items():
    target_genes_n = len(target_genes)
    TFs = tf_act.columns
    print(target_genes_n, len(np.intersect1d(TFs, target_genes)))
    # aa
    # 
    # # get tf act for given compound
    # mask = tf_act.index.get_level_values('sm_name') == sm_name
    # tf_act_subset = tf_act[mask]
    # for index, activities in  tf_act_subset.iterrows():
    #     np.intersect()
    #     # print(np.sort(np.abs(activities)))
    #     aa


8 0
2 0
52 0
39 0
5 0
852 0
31 0
31 0
15 0
47 0
6 0
38 0
106 0
4 0
18 0
6 0
17 0
6 0
3 0
22 0
11 0
1011 0
