In [120]:
import itertools
from itertools import permutations

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

%matplotlib inline

## Input data

* Tox21 compounds with cluster assignment 
* Compound to target annotations from pharos and drug hub
* Target enrichment results for each cluster

In [121]:
# Input files, named once so the paths are easy to find and change.
TOX21_CSV = "data_w_identifiers.csv"
TARGET_ANNOTATION_CSV = "pharos_and_drughub.csv"
ENRICHMENT_CSV = "enrichment_analysis_results.csv"

# Tox21 compounds with their cluster assignment.
tox21_data = pd.read_csv(TOX21_CSV)
# Compound-to-target annotations.
target_data = pd.read_csv(TARGET_ANNOTATION_CSV)
# Per-cluster target enrichment results.
enrichment_results = pd.read_csv(ENRICHMENT_CSV)

In [122]:
tox21_data.head()

Unnamed: 0,cluster,cas,chembl,pubchem_cid,smiles,inchikey,sample_name,mechanism_of_action
0,1,129299-90-7,CHEMBL3185034,180494.0,Cl.Cc4ncnc4C[C@H]3CCc2c(C)c1ccccc1n2C3=O,,Fabesetron hydrochloride,
1,1,NOCAS_48522,,60196348.0,FC(F)(F)C(=O)O.CCN(CC)C(=O)c1cc(c(cc1N(CC)CCN(...,XDSUEKSKAPHPTN-UHFFFAOYSA-N,HMR1171 trifluoroacetate (1:1),
2,1,33414-30-1,CHEMBL1620387,969494.0,CN1CCN(CC1)CCC(=O)N2c4ccccc4Sc3ccc(cc23)C(F)(F)F,DOUQJBPSTIKRPH-UHFFFAOYSA-N,Ftormetazine,
3,1,84-02-6,CHEMBL1314751,5281032.0,O=C(O)/C=C\C(=O)O.O=C(O)/C=C\C(=O)O.CN1CCN(CC1...,DSKIOWHQLUWFLG-SPIKMXEPSA-N,Prochlorperazine dimaleate,Dopamine D2 receptor antagonist
4,1,289716-94-5,CHEMBL488060,9861124.0,Clc2ccc(Oc1ccc(F)cc1CNC)cc2Cl,FQEBOQLYHASAOY-UHFFFAOYSA-N,CP-607366,


In [123]:
target_data.head()

Unnamed: 0,cas,cluster,inchikey,target_gene
0,303-49-1,1,GDLIGKIOYRNHDA-UHFFFAOYSA-N,GSTP1
1,303-49-1,1,GDLIGKIOYRNHDA-UHFFFAOYSA-N,HTR2A
2,303-49-1,1,GDLIGKIOYRNHDA-UHFFFAOYSA-N,HTR2B
3,303-49-1,1,GDLIGKIOYRNHDA-UHFFFAOYSA-N,HTR2C
4,303-49-1,1,GDLIGKIOYRNHDA-UHFFFAOYSA-N,SLC6A2


In [124]:
enrichment_results.head()

Unnamed: 0,cluster,target_gene,TruePos,FalsePos,FalseNeg,TrueNeg,TotalCmpdsTested,OddsRatio,p_val
0,1,ABCB1,1,11,11,1078,1101,8.909091,0.123811
1,1,ACHE,1,27,11,1062,1101,3.575758,0.267066
2,1,ADRA1A,1,35,11,1054,1101,2.737662,0.33033
3,1,ADRA1B,1,26,11,1063,1101,3.716783,0.258785
4,1,ADRA1D,3,32,9,1057,1101,11.010417,0.005324


## Filter and merge

* Filter enrichments based on p-value (TODO: use better cutoff)
* Merge target and cluster data

In [125]:
# Attach compound identifiers (chembl, pubchem_cid, sample_name) to every
# target annotation, joining on inchikey. The inner join drops annotations
# whose inchikey has no match in tox21_data.
#
# NOTE(review): this cell overwrites `target_data`, so it is not idempotent;
# running it twice merges the identifier columns in a second time.
# NOTE(review): pandas matches null join keys against each other, and an
# inchikey that appears under more than one sample_name in tox21_data
# (e.g. Salbutamol sulfate / Albuterol) yields one merged row per name, so
# annotation rows may be multiplied here -- confirm this is intended.
identifier_columns = ['inchikey','chembl','pubchem_cid','sample_name']
compound_identifiers = tox21_data[identifier_columns].drop_duplicates()
target_data = pd.merge(
    target_data,
    compound_identifiers,
    on='inchikey',
    how='inner',
)

In [128]:
target_data

Unnamed: 0,cas,cluster,inchikey,target_gene,chembl,pubchem_cid,sample_name
0,303-49-1,1,GDLIGKIOYRNHDA-UHFFFAOYSA-N,GSTP1,CHEMBL415,2801.0,Clomipramine hydrochloride
1,303-49-1,1,GDLIGKIOYRNHDA-UHFFFAOYSA-N,HTR2A,CHEMBL415,2801.0,Clomipramine hydrochloride
2,303-49-1,1,GDLIGKIOYRNHDA-UHFFFAOYSA-N,HTR2B,CHEMBL415,2801.0,Clomipramine hydrochloride
3,303-49-1,1,GDLIGKIOYRNHDA-UHFFFAOYSA-N,HTR2C,CHEMBL415,2801.0,Clomipramine hydrochloride
4,303-49-1,1,GDLIGKIOYRNHDA-UHFFFAOYSA-N,SLC6A2,CHEMBL415,2801.0,Clomipramine hydrochloride
...,...,...,...,...,...,...,...
5385,521-74-4,144,ZDASUJMDVPTNTF-UHFFFAOYSA-N,HSP90AA1,CHEMBL223448,2453.0,Broxyquinoline
5386,140-89-6,144,JCBJVAJGLKENNC-UHFFFAOYSA-M,CA1,CHEMBL2380738,2735045.0,Potassium ethyl xanthate
5387,140-89-6,144,JCBJVAJGLKENNC-UHFFFAOYSA-M,CA2,CHEMBL2380738,2735045.0,Potassium ethyl xanthate
5388,140-89-6,144,JCBJVAJGLKENNC-UHFFFAOYSA-M,CA12,CHEMBL2380738,2735045.0,Potassium ethyl xanthate


In [126]:
# Significance threshold: 0.05 divided by the number of rows in the merged
# annotation table.
# NOTE(review): the divisor is the row count of `target_data` (after the
# identifier merge), not the number of enrichment tests; a multiple-testing
# correction would presumably divide by the number of rows in
# `enrichment_results` -- confirm which was intended (see the TODO above).
cutoff = 0.05 / len(target_data)

# Keep only the enrichment rows whose p-value is strictly below the cutoff.
passes_cutoff = enrichment_results['p_val'] < cutoff
significant_enrichments = enrichment_results[passes_cutoff]

In [127]:
significant_enrichments

Unnamed: 0,cluster,target_gene,TruePos,FalsePos,FalseNeg,TrueNeg,TotalCmpdsTested,OddsRatio,p_val
12,1,CALM1,4,4,8,1085,1101,135.625,5.56e-07
20,1,CYP2D6,6,37,6,1052,1101,28.432432,1.93e-06
34,1,HTR2C,6,36,6,1053,1101,29.25,1.67e-06
36,1,HTR6,5,22,7,1067,1101,34.642857,4.25e-06
39,1,KCNH2,7,56,5,1033,1101,25.825,9.13e-07
140,4,NR3C1,31,19,1,1050,1101,1713.157895,6.1e-46
143,4,PLA2G1B,4,2,28,1067,1101,76.214286,8.5e-06
144,4,SERPINA6,7,9,25,1060,1101,32.977778,8.41e-08
442,16,PGR,5,17,1,1078,1101,317.058824,1.17e-08
691,24,PTGS1,10,39,8,1044,1101,33.461538,3.99e-10


# Find novel compound - target candidates 

* A novel compound-target pair candidate is defined as:
    * The compound is in the cluster
    * The target is enriched in the cluster
    * The compound is not known to have activity against the target
* I will then tier the candidate pairs in the following way:
    * Tier 1: If the compound is known to have activity against a different enriched target
        * Justification: the enriched targets may be related and so activity against one may suggest activity against another 
    * Tier 2: If the compound is known to have activity against ANY target
        * Justification: if a compound has a target annotation then it has been studied in a bioassay and at least has some biological activity, thus is more likely to have activity against other targets too
    * Tier 3: Pairs with compounds having no annotation
* Within each tier, the candidate pairs could be sorted by the number of compounds with known activity against the target -- but this only makes sense as a global analysis, not one restricted to the Tox21 compounds.

In [130]:
# Map each compound, keyed by (cas, inchikey), to the list of target genes
# it is annotated with in `target_data`. Genes are appended in row order and
# are not de-duplicated.
compound_targets = {}
for i, row in target_data.iterrows():
    compound_key = (row['cas'], row['inchikey'])
    compound_targets.setdefault(compound_key, []).append(row['target_gene'])


In [131]:
# NOTE(review): `focal_cluster_enrichments` is only assigned inside the
# per-cluster loop in a later cell, so this cell raises NameError on
# Restart & Run All. The value shown is whatever the last loop iteration left
# behind in a previous kernel session. Move or remove this cell.
focal_cluster_enrichments

Unnamed: 0,cluster,target_gene,TruePos,FalsePos,FalseNeg,TrueNeg,TotalCmpdsTested,OddsRatio,p_val
916,31,DRD3,10,38,8,1045,1101,34.375,3.19e-10
917,31,DRD4,6,25,12,1058,1101,21.16,4.42e-06
927,31,HTR2A,9,52,9,1031,1101,19.826923,8.96e-08


In [132]:
#not every cluster has significant targets
def _assign_tier(target_gene, known_targets, enriched_genes):
    """Classify one (compound, enriched target) pair.

    Parameters
    ----------
    target_gene : the enriched target being considered for the compound.
    known_targets : list of genes the compound is annotated with, or None
        when the compound has no annotation at all.
    enriched_genes : set of all genes enriched in the compound's cluster.

    Returns
    -------
    'Tier 1' if the compound is annotated with a different enriched target,
    'Tier 2' if it is annotated only with non-enriched targets,
    'Tier 3' if it has no annotation,
    None if the pair is already a known annotation (not a candidate).
    """
    if known_targets is None:
        return 'Tier 3'
    if target_gene in known_targets:
        # compound has known activity against this target
        return None
    if enriched_genes.intersection(known_targets):
        # compound has activity against a different enriched target
        return 'Tier 1'
    # compound has activity against another target (not enriched in this cluster)
    return 'Tier 2'


candidates = []
for focal_cluster in set(significant_enrichments['cluster']):
    focal_cluster_compounds = tox21_data[tox21_data['cluster']==focal_cluster]
    focal_cluster_enrichments = significant_enrichments[significant_enrichments['cluster']==focal_cluster]
    # Depends only on the cluster, so build it once rather than per pair.
    enriched_genes = set(focal_cluster_enrichments['target_gene'])

    for compound_idx, compound_row in focal_cluster_compounds.iterrows():
        # The annotation lookup and the compound's fields depend only on the
        # compound, so compute them once per compound.
        known_targets = compound_targets.get((compound_row['cas'],compound_row['inchikey']))
        compound_fields = dict(compound_row)

        for target_idx, target_row in focal_cluster_enrichments.iterrows():
            tier = _assign_tier(target_row['target_gene'], known_targets, enriched_genes)
            if tier is None:
                continue
            # Enrichment fields override compound fields that share a name;
            # 'tier' is added last so it ends up as the final column.
            candidates.append({**compound_fields, **dict(target_row), 'tier': tier})

compound_target_candidates = pd.DataFrame(candidates)


In [133]:
compound_target_candidates

Unnamed: 0,cluster,cas,chembl,pubchem_cid,smiles,inchikey,sample_name,mechanism_of_action,target_gene,TruePos,FalsePos,FalseNeg,TrueNeg,TotalCmpdsTested,OddsRatio,p_val,tier
0,64,54239-37-1,CHEMBL1374751,2755.0,N#Cc1cc(ccc1N)C(O)CNC(C)C,BUXRLJCGHZZYNE-UHFFFAOYSA-N,Cimaterol,,ADRB2,6,9,8,1078,1101,89.833333,5.820000e-09,Tier 1
1,64,72332-33-3,CHEMBL1478530,688561.0,CC(C)N[C@@H](CC)[C@H](O)c2ccc(O)c1NC(=O)C=Cc12,FKNXQNWAXFXVNW-BLLLJJGKSA-N,Procaterol,,ADRB1,5,13,9,1074,1101,45.897436,1.170000e-06,Tier 1
2,64,94749-08-3,CHEMBL1082607,56801.0,OCc1cc(ccc1O)C(O)CNCCCCCCOCCCCc2ccccc2.O=C(O)c...,XTZNCVSCVHTPAI-UHFFFAOYSA-N,Salmeterol xinafoate,Beta-2 adrenergic receptor agonist,ADRB1,5,13,9,1074,1101,45.897436,1.170000e-06,Tier 3
3,64,94749-08-3,CHEMBL1082607,56801.0,OCc1cc(ccc1O)C(O)CNCCCCCCOCCCCc2ccccc2.O=C(O)c...,XTZNCVSCVHTPAI-UHFFFAOYSA-N,Salmeterol xinafoate,Beta-2 adrenergic receptor agonist,ADRB2,6,9,8,1078,1101,89.833333,5.820000e-09,Tier 3
4,64,101975-10-4,CHEMBL313842,5723.0,FC(F)Oc1ccc(cc1OC)C=2C=CC(=O)NN=2,HJMQDJPMQIHLPB-UHFFFAOYSA-N,Zardaverine,,ADRB1,5,13,9,1074,1101,45.897436,1.170000e-06,Tier 2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1840,31,537-12-2,CHEMBL1456312,10830.0,Cl.O=C(Nc1ccccc1)OCC(OC(=O)Nc2ccccc2)CN3CCCCC3,OWULVAZDMWJBLB-UHFFFAOYSA-N,Diperodon hydrochloride,,DRD4,6,25,12,1058,1101,21.160000,4.420000e-06,Tier 3
1841,31,537-12-2,CHEMBL1456312,10830.0,Cl.O=C(Nc1ccccc1)OCC(OC(=O)Nc2ccccc2)CN3CCCCC3,OWULVAZDMWJBLB-UHFFFAOYSA-N,Diperodon hydrochloride,,HTR2A,9,52,9,1031,1101,19.826923,8.960000e-08,Tier 3
1842,31,5560-59-8,CHEMBL1408594,21718.0,O=C(O)C(O)(CC(=O)O)CC(=O)O.CCN(CCCc1ccccc1)CCC...,RYHCACJBKCOBTJ-UHFFFAOYSA-N,Alverine citrate,,DRD3,10,38,8,1045,1101,34.375000,3.190000e-10,Tier 2
1843,31,5560-59-8,CHEMBL1408594,21718.0,O=C(O)C(O)(CC(=O)O)CC(=O)O.CCN(CCCc1ccccc1)CCC...,RYHCACJBKCOBTJ-UHFFFAOYSA-N,Alverine citrate,,DRD4,6,25,12,1058,1101,21.160000,4.420000e-06,Tier 2


In [134]:
compound_target_candidates[compound_target_candidates['tier']=='Tier 1']

Unnamed: 0,cluster,cas,chembl,pubchem_cid,smiles,inchikey,sample_name,mechanism_of_action,target_gene,TruePos,FalsePos,FalseNeg,TrueNeg,TotalCmpdsTested,OddsRatio,p_val,tier
0,64,54239-37-1,CHEMBL1374751,2755.0,N#Cc1cc(ccc1N)C(O)CNC(C)C,BUXRLJCGHZZYNE-UHFFFAOYSA-N,Cimaterol,,ADRB2,6,9,8,1078,1101,89.833333,5.820000e-09,Tier 1
1,64,72332-33-3,CHEMBL1478530,688561.0,CC(C)N[C@@H](CC)[C@H](O)c2ccc(O)c1NC(=O)C=Cc12,FKNXQNWAXFXVNW-BLLLJJGKSA-N,Procaterol,,ADRB1,5,13,9,1074,1101,45.897436,1.170000e-06,Tier 1
20,64,18559-94-9,CHEMBL714,2083.0,OCc1cc(ccc1O)C(O)CNC(C)(C)C,NDAUXUAQIAJITI-UHFFFAOYSA-N,Salbutamol sulfate,Beta-2 adrenergic receptor agonist,ADRB1,5,13,9,1074,1101,45.897436,1.170000e-06,Tier 1
21,64,18559-94-9,CHEMBL714,2083.0,OCc1cc(ccc1O)C(O)CNC(C)(C)C,NDAUXUAQIAJITI-UHFFFAOYSA-N,Albuterol,Beta-2 adrenergic receptor agonist,ADRB1,5,13,9,1074,1101,45.897436,1.170000e-06,Tier 1
64,1,289716-94-5,CHEMBL488060,9861124.0,Clc2ccc(Oc1ccc(F)cc1CNC)cc2Cl,FQEBOQLYHASAOY-UHFFFAOYSA-N,CP-607366,,CALM1,4,4,8,1085,1101,135.625000,5.560000e-07,Tier 1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1816,31,52-53-9,CHEMBL6966,2520.0,COc1cc(ccc1OC)C(C#N)(CCCN(C)CCc2ccc(OC)c(OC)c2...,SGTNSNPWRIOYBX-UHFFFAOYSA-N,Verapamil,,DRD4,6,25,12,1058,1101,21.160000,4.420000e-06,Tier 1
1817,31,52-53-9,CHEMBL6966,2520.0,COc1cc(ccc1OC)C(C#N)(CCCN(C)CCc2ccc(OC)c(OC)c2...,SGTNSNPWRIOYBX-UHFFFAOYSA-N,Verapamil,,HTR2A,9,52,9,1031,1101,19.826923,8.960000e-08,Tier 1
1821,31,5588-33-0,CHEMBL1088,4078.0,CS(=O)c2cc3N(CCC1CCCCN1C)c4ccccc4Sc3cc2,SLVMESMUVMCQIY-UHFFFAOYSA-N,Mesoridazine,,DRD4,6,25,12,1058,1101,21.160000,4.420000e-06,Tier 1
1834,31,1649-18-9,CHEMBL340211,15443.0,Fc1ccc(cc1)C(=O)CCCN2CCN(CC2)c3ccccn3,XTKDAFGWCDAMPY-UHFFFAOYSA-N,Azaperone,,DRD4,6,25,12,1058,1101,21.160000,4.420000e-06,Tier 1


In [147]:
#Confirm no overlap between candidate set and target annotations
# A candidate must never be an already-annotated (compound, target) pair, so
# the inner join on the pair columns has to be empty. The assertion enforces
# this instead of relying on someone inspecting the displayed frame.
pair_columns = ['cas','inchikey','target_gene']
known_pair_overlap = compound_target_candidates[pair_columns].merge(
    target_data[pair_columns], on=pair_columns, how='inner')
assert known_pair_overlap.empty, (
    f"{len(known_pair_overlap)} candidate pairs are already known annotations")
known_pair_overlap

Unnamed: 0,cas,inchikey,target_gene


In [148]:
# Write the tiered candidate table to CSV; index=False leaves the DataFrame
# index out of the file.
compound_target_candidates.to_csv("tox21_cluster_compound_target_candidates.csv",index=False)