## G1 merge datasets

Simulate the ChEMBL drug information from the initial ChEMBL dataset:
- create all the experimental conditions

In [1]:
import pandas as pd
import os
import ds_utils

In [2]:
# Folder that every CSV path in this notebook is joined onto
WorkingFolder = './datasets/'

# Column names, presumably the ones that identify an experimental
# condition (TODO confirm); kept one per line for easier diffing
ExperimCondList = [
    'STANDARD_TYPE_UNITSj',
    'ASSAY_CHEMBLID',
    'ASSAY_TYPE',
    'ASSAY_ORGANISM',
    'ORGANISM',
    'TARGET_CHEMBLID',
]

In [5]:
# Input tables: the details file (with MAs) and the G1 raw file
# (ds_G1_raw0.csv was created manually)
details_csv = os.path.join(WorkingFolder, 'ds_details.csv')
g1_raw_csv = os.path.join(WorkingFolder, 'ds_G1_raw0.csv')

df_det = pd.read_csv(details_csv)
df_new = pd.read_csv(g1_raw_csv)

Get PROTEIN_ACCESSION from the details file:

In [6]:
# Lookup table: one row per distinct (PROTEIN_ACCESSION, TARGET_CHEMBLID)
# pair present in the details table
accession_cols = ['PROTEIN_ACCESSION', 'TARGET_CHEMBLID']
df_pa = df_det.loc[:, accession_cols].drop_duplicates()
df_pa

Unnamed: 0,PROTEIN_ACCESSION,TARGET_CHEMBLID
0,O15264,CHEMBL2094115
842,Q16539,CHEMBL260
7893,P36888,CHEMBL1974
12513,Q96BR1,CHEMBL6186
13640,Q9H4B4,CHEMBL4897
14359,P13500,CHEMBL1649052
14424,P17181,CHEMBL1887
14425,Q99988,CHEMBL3120039
14432,P13501,CHEMBL1275217
14461,O35235,CHEMBL3596084


Merge PROTEIN_ACCESSION into the G1 dataset:

In [7]:
# Join the accession lookup onto the G1 rows by target id.
# The pandas defaults are written out: an inner join, and the
# ('_x', '_y') suffixes that produce PROTEIN_ACCESSION_x (from df_new)
# and PROTEIN_ACCESSION_y (from df_pa) in the result.
df_last = df_new.merge(
    df_pa,
    how='inner',
    on=['TARGET_CHEMBLID'],
    suffixes=('_x', '_y'),
)
df_last.head()

Unnamed: 0,CMPD_CHEMBLID,CANONICAL_SMILES,PROTEIN_ACCESSION_x,ACTIVITY_ID,STANDARD_TYPE_UNITSj,STANDARD_VALUE,ASSAY_CHEMBLID,ASSAY_TYPE,ASSAY_ORGANISM,CURATED_BY,...,nRotB,LipinskiFailures,TopoPSA,VAdjMat,MW,WPATH,WPOL,XLogP,Zagreb,PROTEIN_ACCESSION_y
0,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,,,Inhibition (%),,CHEMBL819807,F,Homo sapiens,,...,2,0,56.28,4.584963,294.847967,211,12,2.817,56,O15264
1,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,,,Inhibition (%),,CHEMBL819808,F,Homo sapiens,,...,2,0,56.28,4.584963,294.847967,211,12,2.817,56,O15264
2,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,,,Inhibition (%),,CHEMBL818993,F,Homo sapiens,,...,2,0,56.28,4.584963,294.847967,211,12,2.817,56,O15264
3,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,,,IC50 (nM),,CHEMBL818992,F,Homo sapiens,,...,2,0,56.28,4.584963,294.847967,211,12,2.817,56,O15264
4,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,,,IC50 (nM),,CHEMBL769660,F,Homo sapiens,,...,2,0,56.28,4.584963,294.847967,211,12,2.817,56,O15264


In [8]:
# Give the df_new-side accession column (suffix _x) its plain name back
accession_rename = {'PROTEIN_ACCESSION_x': 'PROTEIN_ACCESSION'}
df_last = df_last.rename(columns=accession_rename)
df_last.columns

Index(['CMPD_CHEMBLID', 'CANONICAL_SMILES', 'PROTEIN_ACCESSION', 'ACTIVITY_ID',
       'STANDARD_TYPE_UNITSj', 'STANDARD_VALUE', 'ASSAY_CHEMBLID',
       'ASSAY_TYPE', 'ASSAY_ORGANISM', 'CURATED_BY',
       ...
       'nRotB', 'LipinskiFailures', 'TopoPSA', 'VAdjMat', 'MW', 'WPATH',
       'WPOL', 'XLogP', 'Zagreb', 'PROTEIN_ACCESSION_y'],
      dtype='object', length=162)

In [9]:
# place the real values: overwrite PROTEIN_ACCESSION with the accession
# that came from the lookup table (the merge's _y column), then discard
# the _y column.
# The presence check keeps this cell re-runnable: without it a second run
# raises KeyError, because the first run already removed the _y column.
if 'PROTEIN_ACCESSION_y' in df_last.columns:
    # pop() returns the column and removes it from df_last in one step
    df_last['PROTEIN_ACCESSION'] = df_last.pop('PROTEIN_ACCESSION_y')
df_last.head()

Unnamed: 0,CMPD_CHEMBLID,CANONICAL_SMILES,PROTEIN_ACCESSION,ACTIVITY_ID,STANDARD_TYPE_UNITSj,STANDARD_VALUE,ASSAY_CHEMBLID,ASSAY_TYPE,ASSAY_ORGANISM,CURATED_BY,...,topoShape,nRotB,LipinskiFailures,TopoPSA,VAdjMat,MW,WPATH,WPOL,XLogP,Zagreb
0,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,Inhibition (%),,CHEMBL819807,F,Homo sapiens,,...,0.75,2,0,56.28,4.584963,294.847967,211,12,2.817,56
1,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,Inhibition (%),,CHEMBL819808,F,Homo sapiens,,...,0.75,2,0,56.28,4.584963,294.847967,211,12,2.817,56
2,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,Inhibition (%),,CHEMBL818993,F,Homo sapiens,,...,0.75,2,0,56.28,4.584963,294.847967,211,12,2.817,56
3,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,IC50 (nM),,CHEMBL818992,F,Homo sapiens,,...,0.75,2,0,56.28,4.584963,294.847967,211,12,2.817,56
4,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,IC50 (nM),,CHEMBL769660,F,Homo sapiens,,...,0.75,2,0,56.28,4.584963,294.847967,211,12,2.817,56


Merge the protein descriptors and save the final G1 file, which is later used to add descriptors, MA, and predictions with the best model:

In [10]:
# Attach the first protein descriptor file, matching rows on the accession
# (inner join, the pandas default, written out explicitly)
prot1_csv = os.path.join(WorkingFolder, 'Protein_descriptors.csv')
df_prot1 = pd.read_csv(prot1_csv)
df_last2 = df_last.merge(df_prot1, how='inner', on='PROTEIN_ACCESSION')
df_last2.head()

Unnamed: 0,CMPD_CHEMBLID,CANONICAL_SMILES,PROTEIN_ACCESSION,ACTIVITY_ID,STANDARD_TYPE_UNITSj,STANDARD_VALUE,ASSAY_CHEMBLID,ASSAY_TYPE,ASSAY_ORGANISM,CURATED_BY,...,CHAM810101.lag21,CHAM810101.lag22,CHAM810101.lag23,CHAM810101.lag24,CHAM810101.lag25,CHAM810101.lag26,CHAM810101.lag27,CHAM810101.lag28,CHAM810101.lag29,CHAM810101.lag30
0,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,Inhibition (%),,CHEMBL819807,F,Homo sapiens,,...,0.986033,1.072765,0.962658,0.952297,1.022723,1.014284,1.027107,1.045441,0.967998,0.987162
1,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,Inhibition (%),,CHEMBL819808,F,Homo sapiens,,...,0.986033,1.072765,0.962658,0.952297,1.022723,1.014284,1.027107,1.045441,0.967998,0.987162
2,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,Inhibition (%),,CHEMBL818993,F,Homo sapiens,,...,0.986033,1.072765,0.962658,0.952297,1.022723,1.014284,1.027107,1.045441,0.967998,0.987162
3,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,IC50 (nM),,CHEMBL818992,F,Homo sapiens,,...,0.986033,1.072765,0.962658,0.952297,1.022723,1.014284,1.027107,1.045441,0.967998,0.987162
4,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,IC50 (nM),,CHEMBL769660,F,Homo sapiens,,...,0.986033,1.072765,0.962658,0.952297,1.022723,1.014284,1.027107,1.045441,0.967998,0.987162


In [11]:
# Attach the second protein descriptor file, matching rows on the accession
# (inner join, the pandas default, written out explicitly)
prot2_csv = os.path.join(WorkingFolder, 'Protein_descriptors2.csv')
df_prot2 = pd.read_csv(prot2_csv)
df_last3 = df_last2.merge(df_prot2, how='inner', on='PROTEIN_ACCESSION')
df_last3.head()

Unnamed: 0,CMPD_CHEMBLID,CANONICAL_SMILES,PROTEIN_ACCESSION,ACTIVITY_ID,STANDARD_TYPE_UNITSj,STANDARD_VALUE,ASSAY_CHEMBLID,ASSAY_TYPE,ASSAY_ORGANISM,CURATED_BY,...,comp_L,comp_K,comp_M,comp_F,comp_P,comp_S,comp_T,comp_W,comp_Y,comp_V
0,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,Inhibition (%),,CHEMBL819807,F,Homo sapiens,,...,0.10411,0.087671,0.035616,0.049315,0.046575,0.057534,0.052055,0.013699,0.041096,0.063014
1,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,Inhibition (%),,CHEMBL819808,F,Homo sapiens,,...,0.10411,0.087671,0.035616,0.049315,0.046575,0.057534,0.052055,0.013699,0.041096,0.063014
2,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,Inhibition (%),,CHEMBL818993,F,Homo sapiens,,...,0.10411,0.087671,0.035616,0.049315,0.046575,0.057534,0.052055,0.013699,0.041096,0.063014
3,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,IC50 (nM),,CHEMBL818992,F,Homo sapiens,,...,0.10411,0.087671,0.035616,0.049315,0.046575,0.057534,0.052055,0.013699,0.041096,0.063014
4,G1,Br\C(=C\c1oc(Br)cc1)[N+]([O-])=O,O15264,,IC50 (nM),,CHEMBL769660,F,Homo sapiens,,...,0.10411,0.087671,0.035616,0.049315,0.046575,0.057534,0.052055,0.013699,0.041096,0.063014


In [12]:
# Write the fully merged G1 table to the working folder, without the index
g1_out_csv = os.path.join(WorkingFolder, 'ds.G1_raw.csv')

print('-> Saving the final G1 file ...')
df_last3.to_csv(g1_out_csv, index=False)
print('Done!')

-> Saving the final G1 file ...
Done!
