In [1]:
import json
import os
import pickle
import sqlite3 as sqlite
import time

import pandas as pd
import requests

In [2]:
# Set parameters
# Project root. Defaults to the original checkout location; set the FAERS_BASEDIR
# environment variable to run the notebook from a different directory.
basedir = os.environ.get('FAERS_BASEDIR', '/Users/ines/FAERS_y2')
# Location of the sqlite database used for the mapping process
mapping_process_db = basedir + '/data/interim/201903_drug_mapping_process.db'
# Location/name of the new database to be created for storing mapped compounds
mapped_compounds_db = basedir + '/data/interim/201903_mapped_compounds.db'

### Open currently available InChIs (which were done via DrugBank)

In [3]:
# Connect to the sqlite database used for the mapping process; the only
# statement issued on this connection is the select in the next cell
conn = sqlite.connect(mapping_process_db)
cur = conn.cursor()

In [4]:
# Every distinct drug concept that already carries an InChI (the DrugBank-derived mappings)
initial_mapped_sql = (
    "select distinct aeolus_concept, rxnorm_concept, rxnorm_name, inchi, inchikey, drugbank_id "
    "from drug_concepts where inchi is not NULL"
)
initial_mapped = list(cur.execute(initial_mapped_sql).fetchall())

In [5]:
# All rows were fetched into initial_mapped, so the mapping-process database can be released
conn.close()

In [6]:
initial_mapped[:10]

[(700253,
  10493,
  'Thiopental',
  'InChI=1S/C11H18N2O2S/c1-4-6-7(3)11(5-2)8(14)12-10(16)13-9(11)15/h7H,4-6H2,1-3H3,(H2,12,13,14,15,16)',
  'IUJDSEJGGMCXSG-UHFFFAOYSA-N',
  'DB00599'),
 (700299,
  10502,
  'Thioridazine',
  'InChI=1S/C21H26N2S2/c1-22-13-6-5-7-16(22)12-14-23-18-8-3-4-9-20(18)25-21-11-10-17(24-2)15-19(21)23/h3-4,8-11,15-16H,5-7,12-14H2,1-2H3',
  'KLBQZWRITKRQQV-UHFFFAOYSA-N',
  'DB00679'),
 (700465,
  10510,
  'Thiothixene',
  'InChI=1S/C23H29N3O2S2/c1-24(2)30(27,28)18-10-11-23-21(17-18)19(20-7-4-5-9-22(20)29-23)8-6-12-26-15-13-25(3)14-16-26/h4-5,7-11,17H,6,12-16H2,1-3H3/b19-8-',
  'GFBKORZTTCHDGY-UWVJOHFNSA-N',
  'DB01623'),
 (701322,
  6719,
  'Memantine',
  'InChI=1S/C12H21N/c1-10-3-9-4-11(2,6-10)8-12(13,5-9)7-10/h9H,3-8,13H2,1-2H3',
  'BUGYDGFZZOZRHP-UHFFFAOYSA-N',
  'DB01043'),
 (702661,
  6757,
  'Mephenytoin',
  'InChI=1S/C12H14N2O2/c1-3-12(9-7-5-4-6-8-9)10(15)14(2)11(16)13-12/h4-8H,3H2,1-2H3,(H,13,16)',
  'GMHKMTDVRCWUDX-UHFFFAOYSA-N',
  'DB00532'),
 (702685,
 

In [7]:
len(initial_mapped)

1994

In [8]:
# aeolus_concept -> identifiers of that concept; filled from initial_mapped in the next cell
initial_mapped_dict = dict()

In [9]:
# Index the fetched rows by aeolus_concept (first column); a later row with the
# same concept replaces the earlier one.
initial_mapped_dict.update(
    (
        aeolus_concept,
        {
            'rxnorm_concept': rxnorm_concept,
            'rxnorm_name': rxnorm_name,
            'standard_inchi': inchi,
            'standard_inchi_key': inchikey,
            'drugbank_id': drugbank_id,
        },
    )
    for aeolus_concept, rxnorm_concept, rxnorm_name, inchi, inchikey, drugbank_id in initial_mapped
)

In [10]:
len(initial_mapped_dict)

1994

In [11]:
# Keep only the InChIKey of every mapped row (fifth column of the select; duplicates are kept)
inchikeys = [inchikey for (_c0, _c1, _c2, _c3, inchikey, _c5) in initial_mapped]

In [12]:
inchikeys[:10]

['IUJDSEJGGMCXSG-UHFFFAOYSA-N',
 'KLBQZWRITKRQQV-UHFFFAOYSA-N',
 'GFBKORZTTCHDGY-UWVJOHFNSA-N',
 'BUGYDGFZZOZRHP-UHFFFAOYSA-N',
 'GMHKMTDVRCWUDX-UHFFFAOYSA-N',
 'ALARQZQTBTVLJV-UHFFFAOYSA-N',
 'INWLQCZOYSRPNW-UHFFFAOYSA-N',
 'NPPQSCRMBWNHMW-UHFFFAOYSA-N',
 'SLVMESMUVMCQIY-UHFFFAOYSA-N',
 'PMXMIIMHBWHSKN-UHFFFAOYSA-N']

In [14]:
# Look up every Drugbank-derived InChIKey in ChEMBL together with its parent compound.
# Values are rendered as standard SQL string literals (single quotes, embedded
# single quotes doubled); double quotes denote identifiers in ANSI SQL.
inchikey_literals = ', '.join("'" + i.replace("'", "''") + "'" for i in inchikeys)

parent_query = """select distinct str.standard_inchi
, str.standard_inchi_key
, md.chembl_id
, md.max_phase
, str.molregno
, hier.parent_molregno
, str2.standard_inchi as parent_inchi
, str2.standard_inchi_key as parent_inchi_key
, md2.chembl_id as parent_chembl_id
, md2.max_phase as parent_max_phase
from compound_structures str
join molecule_dictionary md on md.molregno = str.molregno
left join molecule_hierarchy hier on hier.molregno = str.molregno
left join compound_structures str2 on str2.molregno = hier.parent_molregno
left join molecule_dictionary md2 on md2.molregno = hier.parent_molregno
where str.standard_inchi_key in ({})""".format(inchikey_literals)

In [15]:
# Save the query text so it can be executed against ChEMBL outside this notebook
inchi_query_path = basedir + '/src/sql/inchi_to_chembl_query.sql'
with open(inchi_query_path, 'w') as sql_file:
    sql_file.write(parent_query)

In [15]:
# Manual step: execute the saved query (CHEMBL_24) on calculon and store the result as tab-separated text in data/interim/inchi_to_chembl_query.txt

In [16]:
# Insert parent structures and identifiers into compound_structures db

In [16]:
# Load the tab-separated ChEMBL result for the InChIKey query
inchis_results_path = basedir + '/data/interim/inchi_to_chembl_query.txt'
inchis_results = pd.read_csv(inchis_results_path, sep='\t')

In [17]:
inchis_results.head()

Unnamed: 0,standard_inchi,standard_inchi_key,chembl_id,max_phase,molregno,parent_molregno,parent_inchi,parent_inchi_key,parent_chembl_id,parent_max_phase
0,InChI=1S/C19H21N5O4/c1-26-15-10-12-13(11-16(15...,IENZQIKPVFGBNW-UHFFFAOYSA-N,CHEMBL2,4,97,97.0,InChI=1S/C19H21N5O4/c1-26-15-10-12-13(11-16(15...,IENZQIKPVFGBNW-UHFFFAOYSA-N,CHEMBL2,4.0
1,InChI=1S/C10H14N2/c1-12-7-3-5-10(12)9-4-2-6-11...,SNICXCGAKADSCV-JTQLQIEISA-N,CHEMBL3,4,115,115.0,InChI=1S/C10H14N2/c1-12-7-3-5-10(12)9-4-2-6-11...,SNICXCGAKADSCV-JTQLQIEISA-N,CHEMBL3,4.0
2,InChI=1S/C18H20FN3O4/c1-10-9-26-17-14-11(16(23...,GSDSWSVVBLHKDQ-UHFFFAOYSA-N,CHEMBL4,4,146,146.0,InChI=1S/C18H20FN3O4/c1-10-9-26-17-14-11(16(23...,GSDSWSVVBLHKDQ-UHFFFAOYSA-N,CHEMBL4,4.0
3,InChI=1S/C19H16ClNO4/c1-11-15(10-18(22)23)16-9...,CGIGDMFJXJATDK-UHFFFAOYSA-N,CHEMBL6,4,173,173.0,InChI=1S/C19H16ClNO4/c1-11-15(10-18(22)23)16-9...,CGIGDMFJXJATDK-UHFFFAOYSA-N,CHEMBL6,4.0
4,InChI=1S/C8H11NO5S/c1-8(2)6(7(11)12)9-4(10)3-5...,FKENQMMABCRJMK-RITPCOANSA-N,CHEMBL403,4,194,194.0,InChI=1S/C8H11NO5S/c1-8(2)6(7(11)12)9-4(10)3-5...,FKENQMMABCRJMK-RITPCOANSA-N,CHEMBL403,4.0


In [18]:
len(inchis_results)

1801

In [19]:
len(initial_mapped_dict)

1994

In [20]:
# Insert in the dictionary only those compounds that existed in ChEMBL

# Build the InChIKey lookup once (as a set) instead of once per compound.
chembl_inchi_keys = set(inchis_results['standard_inchi_key'])

# Each kept entry is a fresh copy, so the parent_* fields added in later cells
# do not leak back into initial_mapped_dict.
mapped_compounds = {
    key: dict(entry)
    for key, entry in initial_mapped_dict.items()
    if entry['standard_inchi_key'] in chembl_inchi_keys
}

In [21]:
len(mapped_compounds)

1835

In [22]:
len(set([mapped_compounds[key]['standard_inchi_key'] for key in mapped_compounds]))

1801

In [23]:
# ChEMBL hits whose parent compound is approved (parent_max_phase == 4)
has_approved_parent = inchis_results['parent_max_phase'] == 4
results_approved = inchis_results.loc[has_approved_parent, :]

In [24]:
# Sanity check: list any chembl_id that occurs on more than one approved-parent row
# (the displayed result is empty, i.e. none does)
counts = results_approved.groupby('chembl_id').count()
counts[counts['parent_inchi'] > 1]

Unnamed: 0_level_0,standard_inchi,standard_inchi_key,max_phase,molregno,parent_molregno,parent_inchi,parent_inchi_key,parent_chembl_id,parent_max_phase
chembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1


In [25]:
# Copy the approved parent's identifiers onto every mapped compound whose
# Drugbank InChI equals the ChEMBL structure of the row.
for row_label, chembl_hit in results_approved.iterrows():
    hit_inchi = chembl_hit['standard_inchi']
    parent_details = {
        'parent_molregno': chembl_hit['parent_molregno'],
        'parent_inchi': chembl_hit['parent_inchi'],
        'parent_inchi_key': chembl_hit['parent_inchi_key'],
        'parent_chembl_id': chembl_hit['parent_chembl_id'],
        'parent_max_phase': chembl_hit['parent_max_phase'],
        'mapping_comment': 'parent via Drugbank mapping',
    }
    for compound in mapped_compounds.values():
        if compound['standard_inchi'] == hit_inchi:
            compound.update(parent_details)

In [26]:
# Approved ChEMBL hits (max_phase == 4) for which no parent phase was returned
is_approved = inchis_results['max_phase'] == 4
lacks_parent = inchis_results['parent_max_phase'].isnull()
results_approved_nonparent = inchis_results.loc[is_approved & lacks_parent, :]
results_approved_nonparent

Unnamed: 0,standard_inchi,standard_inchi_key,chembl_id,max_phase,molregno,parent_molregno,parent_inchi,parent_inchi_key,parent_chembl_id,parent_max_phase
1456,InChI=1S/C35H62N4O4/c1-24(40)42-32-21-26-9-10-...,OWWLUIWOFHMHOQ-XGHATYIMSA-N,CHEMBL1201206,4,675157,,,,,
1457,InChI=1S/C11H15NO/c1-9-11(13-8-7-12-9)10-5-3-2...,OOBHFESNSZDWIU-UHFFFAOYSA-N,CHEMBL1201208,4,675159,,,,,
1502,InChI=1S/C37H61N2O4/c1-6-20-39(21-12-9-13-22-3...,HTIKWNNIPGXLGM-YLINKJIISA-N,CHEMBL1201352,4,675303,,,,,
1524,InChI=1S/C13H21O5P/c1-9(2)11-6-5-7-12(10(3)4)1...,QVNNONOFASOXQV-UHFFFAOYSA-N,CHEMBL1201766,4,675717,,,,,
1615,InChI=1S/C12H17NO/c1-10-12(14-9-8-13(10)2)11-6...,MFOCDFTXLCYLKU-CMPLNLGQSA-N,CHEMBL1615439,4,1038390,,,,,
1616,InChI=1S/C27H32N2O6S2/c1-5-28(6-2)22-13-9-20(1...,YFKDCGWIINMRQY-UHFFFAOYSA-O,CHEMBL1615783,4,1038734,,,,,
1642,InChI=1S/C22H29NO2/c1-5-21(24)25-22(18(2)17-23...,XLMALTXPSGQGBX-PGRDOPGGSA-N,CHEMBL1738990,4,1118041,,,,,
1659,InChI=1S/C20H23N7O7/c21-20-25-16-15(18(32)26-2...,VVIAGPKUTFNRDU-STQMWFEESA-N,CHEMBL1908841,4,1248760,,,,,
1751,InChI=1S/C24H31NO/c1-3-23(26)24(21-13-7-4-8-14...,SVDHSZFEQYXRDC-UHFFFAOYSA-N,CHEMBL2111157,4,1383597,,,,,
1756,InChI=1S/2ClH.Sr/h2*1H;/q;;+2/p-2,AHBGXTDRMVNFER-UHFFFAOYSA-L,CHEMBL2219640,4,1449451,,,,,


In [27]:
# Approved hits without a parent act as their own parent: copy the hit's own
# identifiers onto every mapped compound with the same InChI.
for row_label, chembl_hit in results_approved_nonparent.iterrows():
    hit_inchi = chembl_hit['standard_inchi']
    own_details = {
        'parent_molregno': chembl_hit['molregno'],
        'parent_inchi': chembl_hit['standard_inchi'],
        'parent_inchi_key': chembl_hit['standard_inchi_key'],
        'parent_chembl_id': chembl_hit['chembl_id'],
        'parent_max_phase': chembl_hit['max_phase'],
        'mapping_comment': 'direct Drugbank mapping',
    }
    for compound in mapped_compounds.values():
        if compound['standard_inchi'] == hit_inchi:
            compound.update(own_details)

### For compounds without max_phase 4, see if another compound is available with that pref_name and max_phase 4

In [28]:
# ChEMBL hits where neither the compound nor its parent has max_phase 4
# (a missing phase compares unequal to 4, so such rows are included)
neither_approved = (inchis_results['parent_max_phase'] != 4) & (inchis_results['max_phase'] != 4)
nonapproved = inchis_results.loc[neither_approved, :]

In [29]:
# InChIs of the non-approved hits, as a plain list
inchis_nonapproved = list(nonapproved['standard_inchi'])

In [30]:
# RxNorm names of mapped compounds that have no parent yet and whose Drugbank
# InChI is among the non-approved ChEMBL hits.
# A set makes the InChI test constant-time; "parent_inchi_key" presence is
# checked directly rather than by catching KeyError.
nonapproved_inchi_lookup = set(inchis_nonapproved)
nonapproved_names = [
    compound['rxnorm_name']
    for compound in mapped_compounds.values()
    if 'parent_inchi_key' not in compound
    and compound['standard_inchi'] in nonapproved_inchi_lookup
]

In [31]:
len(nonapproved_names)

319

In [32]:
# query chembl on these names
# Render each name as a standard SQL string literal: single quotes, with any
# embedded single quote doubled, so a name containing a quote character cannot
# break the statement (double quotes denote identifiers in ANSI SQL).
name_literals = ', '.join("'" + name.replace("'", "''") + "'" for name in nonapproved_names)

chembl_name_query1 = """
select distinct md.pref_name
, md.chembl_id
, str.standard_inchi
, str.standard_inchi_key
, md.molregno
, hier.parent_molregno
, str2.standard_inchi as parent_inchi
, str2.standard_inchi_key as parent_inchi_key
, md2.chembl_id as parent_chembl_id
from molecule_dictionary md
left join compound_structures str on str.molregno = md.molregno
left join molecule_hierarchy hier on hier.molregno = str.molregno
left join compound_structures str2 on str2.molregno = hier.parent_molregno
left join molecule_dictionary md2 on md2.molregno = hier.parent_molregno
where md.pref_name in ({})
and md.max_phase = 4
""".format(name_literals)

In [33]:
# Save the pref_name query so it can be executed against ChEMBL outside this notebook
name_query_path = basedir + '/src/sql/nonapproved_names_to_chembl.sql'
with open(name_query_path, 'w') as sql_file:
    sql_file.write(chembl_name_query1)

In [34]:
# Load the tab-separated ChEMBL result for the pref_name query
name_mappings_path = basedir + '/data/interim/nonapproved_names_to_chembl.txt'
name_mappings = pd.read_csv(name_mappings_path, sep='\t')

In [35]:
# Hits that are not their own parent; rows with no parent_molregno also show up here
# because a missing value never compares equal to molregno
name_mappings.loc[name_mappings['molregno']!=name_mappings['parent_molregno'],:]

Unnamed: 0,pref_name,chembl_id,standard_inchi,standard_inchi_key,molregno,parent_molregno,parent_inchi,parent_inchi_key,parent_chembl_id
29,CISAPRIDE,CHEMBL1200788,InChI=1S/C23H29ClFN3O4.H2O/c1-30-21-13-19(26)1...,QBYYXIDJOFZORM-UHFFFAOYSA-N,674739,557741.0,InChI=1S/C23H29ClFN3O4/c1-30-21-13-19(26)18(24...,DCSUBABJRXZOMT-UHFFFAOYSA-N,CHEMBL1729
32,GRAMICIDIN,CHEMBL1201469,,,675500,,,,
46,VITAMIN E,CHEMBL3989727,,,2197598,,,,


In [36]:
# Hits without any parent entry (in the displayed output these rows also lack a structure)
name_mappings.loc[name_mappings['parent_molregno'].isnull(),:]

Unnamed: 0,pref_name,chembl_id,standard_inchi,standard_inchi_key,molregno,parent_molregno,parent_inchi,parent_inchi_key,parent_chembl_id
32,GRAMICIDIN,CHEMBL1201469,,,675500,,,,
46,VITAMIN E,CHEMBL3989727,,,2197598,,,,


In [37]:
name_mappings.loc[name_mappings['pref_name'].isin(['CISAPRIDE', 'GRAMICIDIN', 'VITAMIN E']),:]

Unnamed: 0,pref_name,chembl_id,standard_inchi,standard_inchi_key,molregno,parent_molregno,parent_inchi,parent_inchi_key,parent_chembl_id
26,CISAPRIDE,CHEMBL1729,InChI=1S/C23H29ClFN3O4/c1-30-21-13-19(26)18(24...,DCSUBABJRXZOMT-UHFFFAOYSA-N,557741,557741.0,InChI=1S/C23H29ClFN3O4/c1-30-21-13-19(26)18(24...,DCSUBABJRXZOMT-UHFFFAOYSA-N,CHEMBL1729
29,CISAPRIDE,CHEMBL1200788,InChI=1S/C23H29ClFN3O4.H2O/c1-30-21-13-19(26)1...,QBYYXIDJOFZORM-UHFFFAOYSA-N,674739,557741.0,InChI=1S/C23H29ClFN3O4/c1-30-21-13-19(26)18(24...,DCSUBABJRXZOMT-UHFFFAOYSA-N,CHEMBL1729
32,GRAMICIDIN,CHEMBL1201469,,,675500,,,,
46,VITAMIN E,CHEMBL3989727,,,2197598,,,,


In [38]:
# Enter concepts in dictionary: name hits that have a parent.
# Any mapped compound whose upper-cased rxnorm_name equals the hit's pref_name
# receives the hit's parent; when several rows share a pref_name the last one wins.
hits_with_parent = name_mappings[name_mappings['parent_molregno'].notnull()]
for row_label, name_hit in hits_with_parent.iterrows():
    hit_name = name_hit['pref_name']
    parent_details = {
        'parent_molregno': name_hit['parent_molregno'],
        'parent_inchi': name_hit['parent_inchi'],
        'parent_inchi_key': name_hit['parent_inchi_key'],
        'parent_chembl_id': name_hit['parent_chembl_id'],
        'parent_max_phase': 4,
        'mapping_comment': 'replaced Drugbank mapping with max_phase_4 on pref_name matching rxnorm_name',
    }
    for compound in mapped_compounds.values():
        if compound['rxnorm_name'].upper() == hit_name:
            compound.update(parent_details)

In [39]:
# Enter concepts in dictionary: name hits without a parent use their own identifiers.
# NOTE(review): in the displayed output these rows (GRAMICIDIN, VITAMIN E) carry no
# structure, so parent_inchi / parent_inchi_key become NaN for the matched
# compounds — confirm this is intended.
hits_without_parent = name_mappings[name_mappings['parent_molregno'].isnull()]
for row_label, name_hit in hits_without_parent.iterrows():
    hit_name = name_hit['pref_name']
    own_details = {
        'parent_molregno': name_hit['molregno'],
        'parent_inchi': name_hit['standard_inchi'],
        'parent_inchi_key': name_hit['standard_inchi_key'],
        'parent_chembl_id': name_hit['chembl_id'],
        'parent_max_phase': 4,
        'mapping_comment': 'replaced Drugbank mapping with max_phase_4 on pref_name matching rxnorm_name',
    }
    for compound in mapped_compounds.values():
        if compound['rxnorm_name'].upper() == hit_name:
            compound.update(own_details)

In [40]:
# Print every compound whose Drugbank mapping was replaced, with a running counter;
# compounds that have no mapping_comment yet are skipped.
count = 0
for compound in mapped_compounds.values():
    if 'replaced' in compound.get('mapping_comment', ''):
        count += 1
        print(count, compound)

1 {'rxnorm_concept': 10734, 'rxnorm_name': 'Tranylcypromine', 'standard_inchi': 'InChI=1S/C9H11N/c10-9-6-8(9)7-4-2-1-3-5-7/h1-5,8-9H,6,10H2', 'standard_inchi_key': 'AELCINSCMGFISI-UHFFFAOYSA-N', 'drugbank_id': 'DB00752', 'parent_molregno': 2197714.0, 'parent_inchi': 'InChI=1S/2C9H11N/c2*10-9-6-8(9)7-4-2-1-3-5-7/h2*1-5,8-9H,6,10H2/t2*8-,9+/m10/s1', 'parent_inchi_key': 'IGLYMJRIWWIQQE-QUOODJBBSA-N', 'parent_chembl_id': 'CHEMBL3989843', 'parent_max_phase': 4, 'mapping_comment': 'replaced Drugbank mapping with max_phase_4 on pref_name matching rxnorm_name'}
2 {'rxnorm_concept': 3105, 'rxnorm_name': 'Dantrolene', 'standard_inchi': 'InChI=1S/C14H10N4O5/c19-13-8-17(14(20)16-13)15-7-11-5-6-12(23-11)9-1-3-10(4-2-9)18(21)22/h1-7H,8H2,(H,16,19,20)', 'standard_inchi_key': 'OZOMQRBLCMDCEG-UHFFFAOYSA-N', 'drugbank_id': 'DB01219', 'parent_molregno': 675239.0, 'parent_inchi': 'InChI=1S/C14H10N4O5/c19-13-8-17(14(20)16-13)15-7-11-5-6-12(23-11)9-1-3-10(4-2-9)18(21)22/h1-7H,8H2,(H,16,19,20)/b15-7+', 'pare

### Next, look for approved compounds by compound name

In [41]:
# RxNorm names of all mapped compounds that still have no parent assigned.
# Presence of "parent_inchi_key" is tested directly rather than by catching KeyError.
nonapproved_names2 = [
    compound['rxnorm_name']
    for compound in mapped_compounds.values()
    if 'parent_inchi_key' not in compound
]

In [42]:
nonapproved_names2[:5]

['Alclometasone', 'Mometasone', 'Oxyquinoline', 'Glutamate', '1-octacosanol']

In [43]:
len(nonapproved_names2)

270

In [44]:
# Now check the compound name (compound_records.compound_name) of approved molecules

# Render each name as a standard SQL string literal: single quotes, with any
# embedded single quote doubled, so a name containing a quote character cannot
# break the statement (double quotes denote identifiers in ANSI SQL).
compound_name_literals = ', '.join("'" + name.replace("'", "''") + "'" for name in nonapproved_names2)

nonapproved_rest_query = """select distinct 
md.chembl_id
, str.standard_inchi
, md.molregno
, hier.parent_molregno
, cr.compound_name
, md2.pref_name as parent_pref_name
, md2.chembl_id as parent_chembl_id
, str2.standard_inchi as parent_inchi
, str2.standard_inchi_key as parent_inchi_key
from molecule_dictionary md
join compound_records cr on cr.molregno = md.molregno
left join compound_structures str on str.molregno = md.molregno
left join molecule_hierarchy hier on hier.molregno = md.molregno
left join compound_structures str2 on str2.molregno = hier.parent_molregno
left join molecule_dictionary md2 on md2.molregno = hier.parent_molregno
where cr.compound_name in ({0})
and md.max_phase = 4
""".format(compound_name_literals)

# Save the query so it can be executed against ChEMBL outside this notebook
with open(basedir + '/src/sql/nonapproved_names_to_chembl2.sql', 'w') as f:
    f.write(nonapproved_rest_query)

In [45]:
# Load the tab-separated ChEMBL result for the compound_name query
names_rest_path = basedir + '/data/interim/nonapproved_names_to_chembl2.txt'
names_rest = pd.read_csv(names_rest_path, sep='\t')

In [46]:
names_rest

Unnamed: 0,chembl_id,standard_inchi,molregno,parent_molregno,compound_name,parent_pref_name,parent_chembl_id,parent_inchi,parent_inchi_key
0,CHEMBL607710,InChI=1S/C10H12ClNO4/c11-7-1-3-9(4-2-7)15-5-8(...,604618,604618,chlorphenesin,CHLORPHENESIN CARBAMATE,CHEMBL607710,InChI=1S/C10H12ClNO4/c11-7-1-3-9(4-2-7)15-5-8(...,SKPLBLUECSEIFO-UHFFFAOYSA-N


In [47]:
# Chlorphenesin case is ambiguous but going to give preference to the approved case
# Update dictionary with parent info: names are compared case-insensitively
for row_label, name_hit in names_rest.iterrows():
    hit_name = name_hit['compound_name'].lower()
    parent_details = {
        'parent_molregno': name_hit['parent_molregno'],
        'parent_inchi': name_hit['parent_inchi'],
        'parent_inchi_key': name_hit['parent_inchi_key'],
        'parent_chembl_id': name_hit['parent_chembl_id'],
        'parent_max_phase': 4,
        'mapping_comment': 'replaced Drugbank mapping with max_phase_4 on compound_name',
    }
    for compound in mapped_compounds.values():
        if compound['rxnorm_name'].lower() == hit_name:
            compound.update(parent_details)

### Next, check synonyms

In [48]:
# RxNorm names of the mapped compounds still without a parent after the
# compound_name pass.
# Presence of "parent_inchi_key" is tested directly rather than by catching KeyError.
nonapproved_names3 = [
    compound['rxnorm_name']
    for compound in mapped_compounds.values()
    if 'parent_inchi_key' not in compound
]

In [49]:
len(nonapproved_names3)

269

In [50]:
# Now check the synonyms

# Render each name as a standard SQL string literal: single quotes, with any
# embedded single quote doubled, so a name containing a quote character cannot
# break the statement (double quotes denote identifiers in ANSI SQL).
synonym_literals = ', '.join("'" + name.replace("'", "''") + "'" for name in nonapproved_names3)

# NOTE(review): the select list yields two columns named parent_molregno
# (hier.parent_molregno and md2.molregno); pandas shows the second one as
# "parent_molregno.1" when the result is read back.
nonapproved_synonyms = """
select distinct 
md.chembl_id
, str.standard_inchi
, str.standard_inchi_key
, md.molregno
, hier.parent_molregno
, md.pref_name
, syn.synonyms
, md2.pref_name as parent_pref_name
, md2.chembl_id as parent_chembl_id
, md2.molregno as parent_molregno
, str2.standard_inchi as parent_inchi
, str2.standard_inchi_key as parent_inchi_key
from molecule_dictionary md
join molecule_synonyms syn on md.molregno = syn.molregno
left join compound_structures str on str.molregno = md.molregno
left join molecule_hierarchy hier on hier.molregno = md.molregno
left join compound_structures str2 on str2.molregno = hier.parent_molregno
left join molecule_dictionary md2 on md2.molregno = hier.parent_molregno
where syn.synonyms in ({0})
and md.max_phase = 4
and syn.syn_type != 'TRADE_NAME'
""".format(synonym_literals)

# Save the query so it can be executed against ChEMBL outside this notebook
with open(basedir + '/src/sql/nonapproved_names_to_chembl3.sql', 'w') as f:
    f.write(nonapproved_synonyms)

In [51]:
# Load the tab-separated ChEMBL result for the synonym query
names_rest2_path = basedir + '/data/interim/nonapproved_names_to_chembl3.txt'
names_rest2 = pd.read_csv(names_rest2_path, sep='\t')

In [52]:
names_rest2.sort_values(by='synonyms')

Unnamed: 0,chembl_id,standard_inchi,standard_inchi_key,molregno,parent_molregno,pref_name,synonyms,parent_pref_name,parent_chembl_id,parent_molregno.1,parent_inchi,parent_inchi_key
9,CHEMBL1200989,InChI=1S/C28H37ClO7/c1-6-22(33)35-14-21(32)28(...,DJHCCTTVDRAMEH-DUUJBDRPSA-N,674940,674940,ALCLOMETASONE DIPROPIONATE,Alclometasone,ALCLOMETASONE DIPROPIONATE,CHEMBL1200989,674940,InChI=1S/C28H37ClO7/c1-6-22(33)35-14-21(32)28(...,DJHCCTTVDRAMEH-DUUJBDRPSA-N
2,CHEMBL95889,"InChI=1S/C5H11NO2/c1-6(2,3)4-5(7)8/h4H2,1-3H3/p+1",KWIUHFFTVRNATP-UHFFFAOYSA-O,160056,160056,BETAINE,"Betaine, anhydrous",BETAINE,CHEMBL95889,160056,"InChI=1S/C5H11NO2/c1-6(2,3)4-5(7)8/h4H2,1-3H3/p+1",KWIUHFFTVRNATP-UHFFFAOYSA-O
12,CHEMBL2107567,"InChI=1S/Ca.H3O4P/c;1-5(2,3)4/h;(H3,1,2,3,4)/q...",FUFJGUQYACFECW-UHFFFAOYSA-L,1379837,1379837,"CALCIUM PHOSPHATE, DIBASIC",Calcium phosphate,"CALCIUM PHOSPHATE, DIBASIC",CHEMBL2107567,1379837,"InChI=1S/Ca.H3O4P/c;1-5(2,3)4/h;(H3,1,2,3,4)/q...",FUFJGUQYACFECW-UHFFFAOYSA-L
11,CHEMBL1201016,InChI=1S/C21H27N5O9S2/c1-9(2)33-21(30)35-10(3)...,LTINZAODLRIQIX-FBXRGJNPSA-N,674967,674967,CEFPODOXIME PROXETIL,Cefpodoxime,CEFPODOXIME PROXETIL,CHEMBL1201016,674967,InChI=1S/C21H27N5O9S2/c1-9(2)33-21(30)35-10(3)...,LTINZAODLRIQIX-FBXRGJNPSA-N
7,CHEMBL1159650,InChI=1S/C25H32ClFO5/c1-5-21(31)32-25(20(30)13...,CBGUOGMQLZIXBE-XGQKBEPLSA-N,641189,641189,CLOBETASOL PROPIONATE,Clobetasol,CLOBETASOL PROPIONATE,CHEMBL1159650,641189,InChI=1S/C25H32ClFO5/c1-5-21(31)32-25(20(30)13...,CBGUOGMQLZIXBE-XGQKBEPLSA-N
8,CHEMBL1200545,InChI=1S/C26H32F2O7/c1-13-8-17-18-10-20(27)19-...,BOBLHFUVNSFZPJ-JOYXJVLSSA-N,674496,674496,DIFLORASONE DIACETATE,Diflorasone,DIFLORASONE DIACETATE,CHEMBL1200545,674496,InChI=1S/C26H32F2O7/c1-13-8-17-18-10-20(27)19-...,BOBLHFUVNSFZPJ-JOYXJVLSSA-N
6,CHEMBL1096882,InChI=1S/C10H13FN5O7P/c11-10-14-7(12)4-8(15-10...,GIUYCYHIANZCFB-FJFJXFQQSA-N,624161,624161,FLUDARABINE PHOSPHATE,Fludarabine,FLUDARABINE PHOSPHATE,CHEMBL1096882,624161,InChI=1S/C10H13FN5O7P/c11-10-14-7(12)4-8(15-10...,GIUYCYHIANZCFB-FJFJXFQQSA-N
10,CHEMBL1201010,InChI=1S/C23H31FO6/c1-13(25)30-12-19(28)22(29)...,SYWHXTATXSMDSB-GSLJADNHSA-N,674961,674961,FLUDROCORTISONE ACETATE,Fludrocortisone,FLUDROCORTISONE ACETATE,CHEMBL1201010,674961,InChI=1S/C23H31FO6/c1-13(25)30-12-19(28)22(29)...,SYWHXTATXSMDSB-GSLJADNHSA-N
1,CHEMBL989,InChI=1S/C24H30F2O6/c1-20(2)31-19-9-13-14-8-16...,FEBLZLNTKCEFIT-VSXGLTOVSA-N,105252,105252,FLUOCINOLONE ACETONIDE,Fluocinolone,FLUOCINOLONE ACETONIDE,CHEMBL989,105252,InChI=1S/C24H30F2O6/c1-20(2)31-19-9-13-14-8-16...,FEBLZLNTKCEFIT-VSXGLTOVSA-N
4,CHEMBL1473,InChI=1S/C25H31F3O5S/c1-5-20(31)33-25(21(32)34...,WMWTYOKRWGGJOA-CENSZEJFSA-N,401141,401141,FLUTICASONE PROPIONATE,Fluticasone,FLUTICASONE PROPIONATE,CHEMBL1473,401141,InChI=1S/C25H31F3O5S/c1-5-20(31)33-25(21(32)34...,WMWTYOKRWGGJOA-CENSZEJFSA-N


In [53]:
# Check all of these manually
# FLUTICASONE has two forms in chembl, both approved, can't know which. FLUTICASONE PROPIONATE has more activities.
# Tenofovir not sure, so do not replace
# All of the rest have only that one form as an approved drug upon manual checking of 'browse drugs' (no other one with the same name), so happy to accept the above mappings

In [54]:
# Update dictionary from the synonym hits, except for tenofovir.
# NOTE(review): unlike the pref_name and compound_name cells, this copies the
# hit's own molregno / structure / chembl_id rather than its parent_* columns —
# confirm this is intended.
count = 0
for row_label, synonym_hit in names_rest2.iterrows():
    synonym = synonym_hit['synonyms'].lower()
    hit_details = {
        'parent_molregno': synonym_hit['molregno'],
        'parent_inchi': synonym_hit['standard_inchi'],
        'parent_inchi_key': synonym_hit['standard_inchi_key'],
        'parent_chembl_id': synonym_hit['chembl_id'],
        'parent_max_phase': 4,
        'mapping_comment': 'replaced Drugbank mapping with max_phase_4 on synonyms',
    }
    for compound in mapped_compounds.values():
        compound_name = compound['rxnorm_name'].lower()
        # tenofovir is deliberately left on its Drugbank mapping
        if compound_name == 'tenofovir' or compound_name != synonym:
            continue
        count += 1
        compound.update(hit_details)
        print(count, compound)

1 {'rxnorm_concept': 9100, 'rxnorm_name': 'Racemethionine', 'standard_inchi': 'InChI=1S/C5H11NO2S/c1-9-3-2-4(6)5(7)8/h4H,2-3,6H2,1H3,(H,7,8)', 'standard_inchi_key': 'FFEARJCKVFRZRR-UHFFFAOYSA-N', 'drugbank_id': 'DB13972', 'parent_molregno': 63949, 'parent_inchi': 'InChI=1S/C5H11NO2S/c1-9-3-2-4(6)5(7)8/h4H,2-3,6H2,1H3,(H,7,8)/t4-/m0/s1', 'parent_inchi_key': 'FFEARJCKVFRZRR-BYPYZUCNSA-N', 'parent_chembl_id': 'CHEMBL42336', 'parent_max_phase': 4, 'mapping_comment': 'replaced Drugbank mapping with max_phase_4 on synonyms'}
2 {'rxnorm_concept': 25126, 'rxnorm_name': 'fluocinolone', 'standard_inchi': 'InChI=1S/C21H26F2O6/c1-18-4-3-10(25)5-13(18)14(22)6-12-11-7-15(26)21(29,17(28)9-24)19(11,2)8-16(27)20(12,18)23/h3-5,11-12,14-16,24,26-27,29H,6-9H2,1-2H3/t11-,12-,14-,15+,16-,18-,19-,20-,21-/m0/s1', 'standard_inchi_key': 'UUOUOERPONYGOS-CLCRDYEYSA-N', 'drugbank_id': 'DB12553', 'parent_molregno': 105252, 'parent_inchi': 'InChI=1S/C24H30F2O6/c1-20(2)31-19-9-13-14-8-16(25)15-7-12(28)5-6-21(15,3)23(

### For the rest of the compounds with an InChI, accept the non-max-phase-4 mapping and retrieve ChEMBL details

In [55]:
# InChIKeys of the mapped compounds that still have no parent assigned.
# Presence of "parent_inchi_key" is tested directly rather than by catching KeyError.
unassigned_inchi_keys = [
    compound['standard_inchi_key']
    for compound in mapped_compounds.values()
    if 'parent_inchi_key' not in compound
]

In [56]:
unassigned_inchi_keys[:5]

['MCJGNVYPOGVAJF-UHFFFAOYSA-N',
 'WHUUTDBJXJRKMK-VKHMYHEASA-N',
 'CNNRPFQICPFDPO-UHFFFAOYSA-N',
 'SIYLLGKDQZGJHK-UHFFFAOYSA-N',
 'FGIUAXJPYTZDNR-UHFFFAOYSA-N']

In [57]:
len(unassigned_inchi_keys)

257

In [58]:
# Retrieve ChEMBL details (and parent, if any) for the remaining InChIKeys.
# Values are rendered as standard SQL string literals (single quotes, embedded
# single quotes doubled); double quotes denote identifiers in ANSI SQL.
unassigned_key_literals = ', '.join("'" + i.replace("'", "''") + "'" for i in unassigned_inchi_keys)

# molregno is taken from compound_structures (str), not from the left-joined
# molecule_hierarchy: for a structure without a hierarchy row hier.molregno is
# NULL, which left molregno empty and made the later "use the compound itself
# as parent" step store a missing parent_molregno.
# The result file must be regenerated for this change to take effect.
unassigned_query = """select distinct md.pref_name
, str.standard_inchi
, str.standard_inchi_key
, md.chembl_id
, md.max_phase
, str.molregno
, hier.parent_molregno
, str2.standard_inchi as parent_inchi
, str2.standard_inchi_key as parent_inchi_key
, md2.chembl_id as parent_chembl_id
, md2.max_phase as parent_max_phase
from compound_structures str
join molecule_dictionary md on md.molregno = str.molregno
left join molecule_hierarchy hier on hier.molregno = str.molregno
left join compound_structures str2 on str2.molregno = hier.parent_molregno
left join molecule_dictionary md2 on md2.molregno = hier.parent_molregno
where str.standard_inchi_key in ({})""".format(unassigned_key_literals)

# Save the query so it can be executed against ChEMBL outside this notebook
with open(basedir + '/src/sql/unassigned_inchi_keys_chembl.sql', 'w') as f:
    f.write(unassigned_query)

In [59]:
# Load the tab-separated ChEMBL result for the remaining InChIKeys
unassigned_inchis_path = basedir + '/data/interim/unassigned_inchi_keys_chembl.txt'
unassigned_inchis = pd.read_csv(unassigned_inchis_path, sep='\t')

In [60]:
unassigned_inchis.loc[unassigned_inchis['parent_molregno'].isnull(),:]

Unnamed: 0,pref_name,standard_inchi,standard_inchi_key,chembl_id,max_phase,molregno,parent_molregno,parent_inchi,parent_inchi_key,parent_chembl_id,parent_max_phase
245,,InChI=1S/C31H41N5O5/c1-16(2)26-28(38)35-11-7-1...,SEALOBQTUQIVGU-QNIJNHAOSA-N,CHEMBL2365712,0,,,,,,
252,,"InChI=1S/Ni.H2O4S/c;1-5(2,3)4/h;(H2,1,2,3,4)/q...",LGQLOGILCSXPEA-UHFFFAOYSA-L,CHEMBL3990696,0,,,,,,


In [61]:
# chembl_ids of hits whose molregno differs from parent_molregno; rows where
# either value is missing compare unequal and are therefore included
differs_from_parent = unassigned_inchis['molregno'] != unassigned_inchis['parent_molregno']
select_ids = unassigned_inchis.loc[differs_from_parent, 'chembl_id'].drop_duplicates()

In [62]:
# Checking that the ones without a parent do not have another row in the table
unassigned_inchis.loc[unassigned_inchis['chembl_id'].isin(select_ids),:]

Unnamed: 0,pref_name,standard_inchi,standard_inchi_key,chembl_id,max_phase,molregno,parent_molregno,parent_inchi,parent_inchi_key,parent_chembl_id,parent_max_phase
203,,InChI=1S/C6H11FO5/c7-3(1-8)5(11)6(12)4(10)2-9/...,AOYNUTHNTBLRMT-MXWOLSILSA-N,CHEMBL1808698,0,1166395.0,1248859.0,InChI=1S/C6H11FO5/c7-3(1-8)5(11)6(12)4(10)2-9/...,AOYNUTHNTBLRMT-SLPGGIOYSA-N,CHEMBL1908940,0.0
245,,InChI=1S/C31H41N5O5/c1-16(2)26-28(38)35-11-7-1...,SEALOBQTUQIVGU-QNIJNHAOSA-N,CHEMBL2365712,0,,,,,,
249,CALCIUM CARBIMIDE,InChI=1S/CN2.Ca/c2-1-3;/q-2;+2,MYFXBBAEXORJNB-UHFFFAOYSA-N,CHEMBL3301667,0,1763638.0,191091.0,InChI=1S/CH2N2/c2-1-3/h2-3H,VPKDCDLSJZCGKE-UHFFFAOYSA-N,CHEMBL116583,0.0
250,CALCIUM GLYCERYLPHOSPHATE,"InChI=1S/C3H9O6P.Ca/c4-1-3(2-5)9-10(6,7)8;/h3-...",UHHRFSOMMCWGSO-UHFFFAOYSA-L,CHEMBL3707206,0,2039177.0,695253.0,"InChI=1S/C3H9O6P/c4-1-3(2-5)9-10(6,7)8/h3-5H,1...",DHCLVCXQIBBOPH-UHFFFAOYSA-N,CHEMBL1232903,0.0
252,,"InChI=1S/Ni.H2O4S/c;1-5(2,3)4/h;(H2,1,2,3,4)/q...",LGQLOGILCSXPEA-UHFFFAOYSA-L,CHEMBL3990696,0,,,,,,


In [63]:
unassigned_inchis.head()

Unnamed: 0,pref_name,standard_inchi,standard_inchi_key,chembl_id,max_phase,molregno,parent_molregno,parent_inchi,parent_inchi_key,parent_chembl_id,parent_max_phase
0,ALPRENOLOL,InChI=1S/C15H23NO2/c1-4-7-13-8-5-6-9-15(13)18-...,PAZJSJFMUHDSTF-UHFFFAOYSA-N,CHEMBL266195,0,822.0,822.0,InChI=1S/C15H23NO2/c1-4-7-13-8-5-6-9-15(13)18-...,PAZJSJFMUHDSTF-UHFFFAOYSA-N,CHEMBL266195,0.0
1,SEROTONIN,InChI=1S/C10H12N2O/c11-4-3-7-6-12-10-2-1-8(13)...,QZAYGJVTTNCVMB-UHFFFAOYSA-N,CHEMBL39,0,2214.0,2214.0,InChI=1S/C10H12N2O/c11-4-3-7-6-12-10-2-1-8(13)...,QZAYGJVTTNCVMB-UHFFFAOYSA-N,CHEMBL39,0.0
2,QUINACRINE,InChI=1S/C23H30ClN3O/c1-5-27(6-2)13-7-8-16(3)2...,GPKJTRJOBQGKQK-UHFFFAOYSA-N,CHEMBL7568,2,2355.0,2355.0,InChI=1S/C23H30ClN3O/c1-5-27(6-2)13-7-8-16(3)2...,GPKJTRJOBQGKQK-UHFFFAOYSA-N,CHEMBL7568,2.0
3,APROBARBITAL,InChI=1S/C10H14N2O3/c1-4-5-10(6(2)3)7(13)11-9(...,UORJNBVJVRLXMQ-UHFFFAOYSA-N,CHEMBL7863,0,2407.0,2407.0,InChI=1S/C10H14N2O3/c1-4-5-10(6(2)3)7(13)11-9(...,UORJNBVJVRLXMQ-UHFFFAOYSA-N,CHEMBL7863,0.0
4,GENISTEIN,InChI=1S/C15H10O5/c16-9-3-1-8(2-4-9)11-7-20-13...,TZBJGXHYKVUXJN-UHFFFAOYSA-N,CHEMBL44,0,2658.0,2658.0,InChI=1S/C15H10O5/c16-9-3-1-8(2-4-9)11-7-20-13...,TZBJGXHYKVUXJN-UHFFFAOYSA-N,CHEMBL44,0.0


In [64]:
# Update dictionary with parent info: remaining hits that do have a parent.
# Every mapped compound with the hit's InChI receives the parent's identifiers.
hits_with_parent = unassigned_inchis[unassigned_inchis['parent_molregno'].notnull()]
for row_label, chembl_hit in hits_with_parent.iterrows():
    hit_inchi = chembl_hit['standard_inchi']
    parent_details = {
        'parent_molregno': chembl_hit['parent_molregno'],
        'parent_inchi': chembl_hit['parent_inchi'],
        'parent_inchi_key': chembl_hit['parent_inchi_key'],
        'parent_chembl_id': chembl_hit['parent_chembl_id'],
        'parent_max_phase': chembl_hit['parent_max_phase'],
        'mapping_comment': 'parent from direct Drugbank mapping (for non-approved, no replacement approved found)',
    }
    for compound in mapped_compounds.values():
        if compound['standard_inchi'] == hit_inchi:
            compound.update(parent_details)

In [65]:
# Update dictionary with parent info: remaining hits without a parent use their
# own identifiers.
# NOTE(review): in the displayed query output the molregno column is empty for
# these rows, in which case parent_molregno is stored as a missing value — verify.
hits_without_parent = unassigned_inchis[unassigned_inchis['parent_molregno'].isnull()]
for row_label, chembl_hit in hits_without_parent.iterrows():
    hit_inchi = chembl_hit['standard_inchi']
    own_details = {
        'parent_molregno': chembl_hit['molregno'],
        'parent_inchi': chembl_hit['standard_inchi'],
        'parent_inchi_key': chembl_hit['standard_inchi_key'],
        'parent_chembl_id': chembl_hit['chembl_id'],
        'parent_max_phase': chembl_hit['max_phase'],
        'mapping_comment': 'direct Drugbank mapping (for non-approved, no replacement approved found)',
    }
    for compound in mapped_compounds.values():
        if compound['standard_inchi'] == hit_inchi:
            compound.update(own_details)

In [66]:
# Final check: print and collect any mapped compound that still has no parent.
# Presence of "parent_inchi_key" is tested directly rather than by catching KeyError.
still_unassigned = []
for compound in mapped_compounds.values():
    if 'parent_inchi_key' not in compound:
        print(compound)
        still_unassigned.append(compound['standard_inchi_key'])

In [67]:
still_unassigned

[]

### Make new database and insert all compounds from mapped_compounds dictionary

In [69]:
# Open the database that will hold the mapped compounds; this rebinds conn / cur,
# which earlier pointed at the (already closed) mapping-process database
conn = sqlite.connect(mapped_compounds_db)
cur = conn.cursor()

In [70]:
# Target table: one row per AEOLUS concept, holding its original Drugbank
# structure and the ChEMBL parent selected in the cells above.
# NOTE(review): a plain "create table" raises if the table already exists, so
# this cell cannot be re-run against an existing database file — confirm that
# is acceptable. The notes column is not populated by the insert in this notebook.
create_statement = """create table compound_structures
(aeolus_concept integer
, rxnorm_name text
, rxnorm_concept integer
, drugbank_id text
, original_drugbank_unichem_inchi text
, original_drugbank_unichem_inchi_key text
, mapped_parent_chembl_id text
, mapped_parent_standard_inchi text
, mapped_parent_standard_inchi_key text
, mapped_parent_molregno integer
, mapped_parent_max_phase integer
, mapping_notes text
, notes text
)"""
cur.execute(create_statement)

<sqlite3.Cursor at 0x11d7fd9d0>

In [71]:
# Parameterized insert: one placeholder per listed column (notes is left unset)
insert_row = """insert into compound_structures 
(aeolus_concept
, rxnorm_name
, rxnorm_concept
, drugbank_id
, original_drugbank_unichem_inchi
, original_drugbank_unichem_inchi_key
, mapped_parent_chembl_id
, mapped_parent_standard_inchi
, mapped_parent_standard_inchi_key
, mapped_parent_molregno 
, mapped_parent_max_phase
, mapping_notes)
values 
(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"""

# Dictionary fields in the order of the placeholders that follow aeolus_concept
value_fields = (
    'rxnorm_name',
    'rxnorm_concept',
    'drugbank_id',
    'standard_inchi',
    'standard_inchi_key',
    'parent_chembl_id',
    'parent_inchi',
    'parent_inchi_key',
    'parent_molregno',
    'parent_max_phase',
    'mapping_comment',
)

# The dictionary key is the aeolus_concept; a compound missing any field raises KeyError
for aeolus_concept, compound in mapped_compounds.items():
    values = (aeolus_concept,) + tuple(compound[field] for field in value_fields)
    cur.execute(insert_row, values)

In [72]:
cur.execute('select count(*) from compound_structures').fetchall()

[(1835,)]

In [73]:
# Persist the inserted rows, then release the database
conn.commit()
conn.close()