## Installs and imports

In [1]:
!pip install mmpdb


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np

## Read in HepG2 data 
Data are from ChEMBL; see RetrieveChEMBLData.ipynb for details.

In [3]:
# Load the HepG2 table. The displayed frame shows two leftover index columns
# ('Unnamed: 0.1', 'Unnamed: 0') in this file; the cells below use only
# smiles, molecule_chembl_id and IC50.
output_df = pd.read_csv('hepg2_output_df.csv')

In [4]:
# Display the loaded table; the rendered output elides the middle rows.
output_df

Unnamed: 0.1,Unnamed: 0,molecule_chembl_id,IC50,units,smiles
0,0,CHEMBL1315471,35481.3,nM,CCN(CC(=O)NCc1cccs1)S(=O)(=O)c1cc(NC(C)=O)ccc1OC
1,1,CHEMBL1511566,2818.4,nM,O=C(CN(c1ccc(F)cc1)S(=O)(=O)c1ccc2c(c1)OCCO2)N...
2,2,CHEMBL1904108,35481.3,nM,O=C(CCN1C(=O)/C(=C\c2ccccc2F)SC1=S)N1CCOCC1
3,3,CHEMBL1369478,35481.3,nM,COc1cc2c(cc1OC)C(c1ccc(-c3ccc(Cl)c(Cl)c3)o1)=NCC2
4,4,CHEMBL1200833,12589.3,nM,CNCC(O)c1ccc(OC(=O)C(C)(C)C)c(OC(=O)C(C)(C)C)c...
...,...,...,...,...,...
58130,58130,CHEMBL1894857,35481.3,nM,CCCCCCCCC(c1nc2ccccc2[nH]1)N1CCN=C1c1cc2ccccc2...
58131,58131,CHEMBL1871947,6309.6,nM,CN=C(S)N1CCN(C(=O)c2ccco2)CC1
58132,58132,CHEMBL3189459,707.9,nM,CNC(=S)N/N=C/c1cn(Cc2ccc(C)cc2)c2ccccc12
58133,58133,CHEMBL1378106,7943.3,nM,Cc1ccc(Cn2c(CO)cnc2SCC(=O)Nc2ccc(C)c(Cl)c2)cc1


## Create a tab-delimited SMILES file (.smi) as input for mmpdb

In [6]:
# One compound per line: SMILES, a tab, then the ChEMBL ID.
# No header row and no index column are written.
output_df[["smiles", "molecule_chembl_id"]].to_csv(
    "AllHepG2Cmpds.smi",
    sep="\t",
    header=False,
    index=False,
)

## Fragment the HepG2 data (this takes a while)

In [7]:
# Run mmpdb's `fragment` command on the HepG2 SMILES file and write the
# result to AllHepG2Cmpds.fragdb (consumed by the `mmpdb index` cell below).
!mmpdb fragment AllHepG2Cmpds.smi -o AllHepG2Cmpds.fragdb

                                      

## Compute the pCC50 to use as the property in the mmpdb analysis and save it to a tab-delimited file

In [8]:
import math
def compute_pIC50(IC50):
    """Convert a concentration in nanomolar to its negative log10 in molar units.

    Args:
        IC50: Concentration in nM. NaN is passed through and yields NaN.

    Returns:
        float: ``-log10(IC50 * 1e-9)``; for example 1000 nM gives 6.0.

    Raises:
        ValueError: If ``IC50`` is zero or negative, where the logarithm is
            undefined.
    """
    if IC50 <= 0:
        # math.log10 would raise a bare "math domain error" here; raise the
        # same exception type but name the offending value so a bad row in
        # the input table can be located.
        raise ValueError(f"IC50 must be a positive concentration in nM, got {IC50!r}")
    # 1e-9 converts nM to M before taking the logarithm.
    return -math.log10(IC50 * 1e-9)

In [9]:
# Apply compute_pIC50 to every IC50 value and store the result as a new
# 'pCC50' column on output_df.
# NOTE(review): compute_pIC50 scales by 1e-9, i.e. it treats every value as nM;
# the displayed rows show units == 'nM', but confirm this holds for the whole table.
output_df['pCC50'] = output_df['IC50'].apply(compute_pIC50)

In [10]:
# Property table for mmpdb: the compound identifier (renamed to "id") and pCC50.
data_df = (
    output_df[["molecule_chembl_id", "pCC50"]]
    .rename(columns={"molecule_chembl_id": "id"})
)
# The file is tab-delimited despite its .csv extension.
data_df.to_csv("AllHepG2Data.csv", sep="\t", index=False)

## Create the mmp database using pCC50 values

In [11]:
# Run mmpdb's `index` command on the HepG2 fragment file to produce
# AllHepG2.mmpdb, passing the pCC50 table via --properties.
!mmpdb index AllHepG2Cmpds.fragdb -o AllHepG2.mmpdb --properties AllHepG2Data.csv

                                                                   

## Repeat the above steps but this time with Tox21 data

In [12]:
# Load the Tox21 table from tox21.csv.gz and preview its first rows.
tox21_df = pd.read_csv("tox21.csv.gz")
tox21_df.head()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,mol_id,smiles
0,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,TOX3021,CCOc1ccc2nc(S(N)(=O)=O)sc2c1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3020,CCN1C(=O)NC(c2ccccc2)C1=O
2,,,,,,,,0.0,,0.0,,,TOX3024,CC[C@]1(O)CC[C@H]2[C@@H]3CCC4=CCCC[C@@H]4[C@H]...
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,TOX3027,CCCN(CC)C(CC)C(=O)Nc1c(C)cccc1C
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TOX20800,CC(O)(P(=O)(O)O)P(=O)(O)O


In [13]:
# One compound per line: SMILES, a tab, then the Tox21 mol_id.
# No header row and no index column are written.
tox21_df[["smiles", "mol_id"]].to_csv(
    "tox21.smi",
    sep="\t",
    header=False,
    index=False,
)

In [14]:
# Run mmpdb's `fragment` command on the Tox21 SMILES file and write the
# result to Tox21.fragdb (consumed by the `mmpdb index` cell below).
!mmpdb fragment tox21.smi -o Tox21.fragdb

                                   

## Compute the combined toxicity count (combined_tox), the per-compound sum of the Tox21 assay labels, to use as a property for mmpdb

In [15]:
# The twelve Tox21 label columns that are summed into a single count per compound.
tox21_assay_cols = ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD',
                    'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']

# .copy() makes this an independent frame, so adding a column below does not
# raise pandas' SettingWithCopyWarning (assignment onto a slice of tox21_df).
tox21_data_df = tox21_df.loc[:, ["mol_id"] + tox21_assay_cols].copy()

# Row-wise sum of the label columns. sum() skips NaN by default, so a missing
# label counts as 0 and a compound with no labels at all gets combined_tox == 0.
tox21_data_df['combined_tox'] = tox21_data_df[tox21_assay_cols].sum(axis=1)

# Rename the identifier column to "id" for the mmpdb property file.
tox21_data_df = tox21_data_df.rename(columns={"mol_id": "id"})

# The file is tab-delimited despite its .csv extension.
tox21_data_df.loc[:, ["id", "combined_tox"]].to_csv("Tox21Data.csv", sep="\t", index=False)

In [16]:
# Run mmpdb's `index` command on the Tox21 fragment file to produce
# Tox21.mmpdb, passing the combined_tox table via --properties.
!mmpdb index Tox21.fragdb -o Tox21.mmpdb --properties Tox21Data.csv

                                                                            