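- Remove the zero padding in SIDER STITCH IDs to get the PubChem Compound ID
- Use UniChem to look up the PubChem ID (planned: get the InChIKey; note that the request used below returns the ChEMBL ID, not an InChIKey)
- Also get the chembl_id
- Query ChEMBL for the parent molregno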

In [1]:
import pandas as pd
import requests
import re
import json
import time
import pickle
import datetime

In [2]:
# Today's date as a compact YYYYMMDD string.
current_date = f"{datetime.date.today():%Y%m%d}"

In [3]:
# Show up to 1000 rows, and truncate cell text at 30 characters, when displaying frames.
pd.options.display.max_rows = 1000
pd.options.display.max_colwidth = 30

In [4]:
# Root directory of the project; the input and output paths in the cells below are built from it.
basedir = "/scratch/ias41/ae_code"

### Whole of SIDER

In [5]:
# Load the SIDER side-effect table: a tab-separated file without a header row,
# so the six columns are named by hand right after reading.
all_se = pd.read_table(basedir + '/sider/data/meddra_all_se.tsv', header=None)
all_se.columns = [
    'STITCH_flat',
    'STITCH_stereo',
    'UMLS_concept_label',
    'MedDRA_concept_type',
    'UMLS_concept',
    'side_effect_name',
]

In [6]:
all_se.head()

Unnamed: 0,STITCH_flat,STITCH_stereo,UMLS_concept_label,MedDRA_concept_type,UMLS_concept,side_effect_name
0,CID100000085,CID000010917,C0000729,LLT,C0000729,Abdominal cramps
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain
2,CID100000085,CID000010917,C0000737,LLT,C0000737,Abdominal pain
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain


In [7]:
len(set(all_se['STITCH_stereo']))

1556

### Convert STITCH ids to InChIs

In [8]:
# A STITCH stereo id looks like 'CID000010917': the prefix 'CID', zero padding,
# then the PubChem compound id, which is captured by the group.
p = re.compile('CID0*([0-9]+)')
base = 'https://www.ebi.ac.uk/unichem/rest/'

# Filled in by find_inchi(): 'CID<pubchem id>' -> {'original_id': ..., 'chembl_id': ...}
inchi_dict = dict()
# STITCH ids for which no mapping could be retrieved
errors = []

def find_inchi(stitch_id):
    """For a STITCH compound id, look up the matching ChEMBL id via UniChem.

    Despite its name, this function does not retrieve an InChI or InChIKey:
    the UniChem request maps the PubChem compound id to a ChEMBL id.

    stitch_id -- str, STITCH compound id, which has an arbitrary number of 0s
                 inserted between the 'CID' and the PubChem id.

    Returns None. On success the mapping is stored in the module-level
    ``inchi_dict`` under the key 'CID<pubchem id>'. On any failure (id does
    not match the expected pattern, request error or timeout, non-200 status,
    or a response with no usable entry) ``stitch_id`` is appended to the
    module-level ``errors`` list instead.
    """
    # Guard against values that do not look like a STITCH id (e.g. NaN from pandas).
    match = p.match(stitch_id) if isinstance(stitch_id, str) else None
    if match is None:
        errors.append(stitch_id)
        return
    compound_id = match.group(1)

    try:
        # source 22 is PubChem, target source 1 is ChEMBL; the timeout keeps a
        # stalled connection from blocking the whole loop indefinitely.
        response = requests.get(base + 'src_compound_id/{}/22/1'.format(compound_id), timeout=30)
    except requests.RequestException:
        errors.append(stitch_id)
        return

    # Explicit check instead of `assert`, which is removed when Python runs with -O.
    if response.status_code != 200:
        errors.append(stitch_id)
        return

    try:
        result = json.loads(response.content.decode())
        # Only the first entry returned by UniChem is used.
        chembl_id = result[0]['src_compound_id']
    except (ValueError, IndexError, KeyError, TypeError):
        # Undecodable body, empty list, or a payload that is not a list of records.
        errors.append(stitch_id)
        return
    # Brief pause between successful lookups to avoid hammering the service.
    time.sleep(0.2)

    inchi_dict['CID' + compound_id] = {'original_id': stitch_id, 'chembl_id': chembl_id}

In [9]:
%%time
# Query UniChem once per distinct STITCH stereo id (first-seen order).
for stereo_id in all_se['STITCH_stereo'].unique():
    find_inchi(stereo_id)

CPU times: user 22.9 s, sys: 1.06 s, total: 24 s
Wall time: 6min 33s


In [11]:
len(inchi_dict)

1217

In [12]:
# Many of the errors are biologicals/peptides or are compounds that have been downgraded - checked some manually
len(errors)

339

In [18]:
339/1556

0.217866323907455

In [15]:
# Save the collected mapping to disk (presumably so the slow web lookups need not be repeated).
with open(basedir + '/sider/data/stitch2inchi_dict.pkl', 'wb') as f:
    pickle.dump(inchi_dict, f)

In [13]:
# Read the pickled mapping back from disk.
# NOTE(review): the cells visible below keep using `inchi_dict`, not `inchi_from_pkl` - confirm which is intended.
# pickle.load can execute arbitrary code from the file; only load pickles written by this notebook.
with open(basedir + '/sider/data/stitch2inchi_dict.pkl', 'rb') as f:
    inchi_from_pkl = pickle.load(f)

### Make mapping dataframe

In [15]:
# One row per entry of inchi_dict: the dict key becomes the first column after
# reset_index(), followed by the values of the inner dict.
# The rename below is positional, so it relies on the inner dicts having the
# key order 'original_id', 'chembl_id'.
compound_mappings = pd.DataFrame.from_dict(inchi_dict, orient='index').reset_index()
compound_mappings.columns = ['pubchem_id', 'stitch_id', 'chembl_id']

In [16]:
compound_mappings.head()

Unnamed: 0,pubchem_id,stitch_id,chembl_id
0,CID10917,CID000010917,CHEMBL1149
1,CID119,CID000000119,CHEMBL96
2,CID137,CID000000137,CHEMBL601
3,CID5280360,CID005280360,CHEMBL548
4,CID5282411,CID005282411,CHEMBL1139


### Find parent compounds in ChEMBL

In [17]:
len(compound_mappings['stitch_id'].drop_duplicates())

1217

In [18]:
len(compound_mappings['chembl_id'].drop_duplicates())

1219

In [19]:
# Quote every ChEMBL id and join them into the contents of the SQL IN (...) list.
quoted_ids = "'" + "', '".join(compound_mappings['chembl_id']) + "'"

# For each mapped compound: its molregno, the parent molregno and the parent's
# ChEMBL id. The left joins keep compounds that have no hierarchy entry.
myquery = """use chembl_25;
select distinct
md.chembl_id
, md.molregno
, hier.parent_molregno
, md2.chembl_id as parent_chembl_id
from molecule_dictionary md
left join molecule_hierarchy hier on hier.molregno = md.molregno
left join molecule_dictionary md2 on md2.molregno = hier.parent_molregno
where md.chembl_id in ({});""".format(quoted_ids)

In [21]:
# Write the generated SQL to a file so it can be run against the database outside this notebook.
with open(basedir + '/sider/src/sider_parent_molregnos.sql', 'w') as f:
    f.write(myquery)

#### Executed query on local MySQL installation of ChEMBL

In [22]:
# Load the tab-separated output of the parent-molregno query.
parent_results = pd.read_table(basedir + '/sider/data/sider_parent_molregnos.txt')

In [None]:
# Sometimes parent molregnos are missing - replace by molregno

In [24]:
# Where no parent ChEMBL id is present, fall back to the compound's own ChEMBL id.
parent_results['parent_chembl_id'] = parent_results['parent_chembl_id'].fillna(parent_results['chembl_id'])

In [25]:
# Where no parent molregno is present, fall back to the compound's own molregno.
parent_results['parent_molregno'] = parent_results['parent_molregno'].fillna(parent_results['molregno'])

In [26]:
# Cast to integers (the column is presumably float while it still holds missing values).
parent_results['parent_molregno'] = parent_results.parent_molregno.astype(int)

In [27]:
parent_results.head()

Unnamed: 0,chembl_id,molregno,parent_molregno,parent_chembl_id
0,CHEMBL1000,111185,111185,CHEMBL1000
1,CHEMBL1002,111482,111482,CHEMBL1002
2,CHEMBL1006,112480,112480,CHEMBL1006
3,CHEMBL1008,112651,112651,CHEMBL1008
4,CHEMBL1014,116349,116349,CHEMBL1014


In [28]:
len(parent_results['chembl_id'].drop_duplicates())

1219

In [29]:
len(parent_results['parent_molregno'].drop_duplicates())

1201

In [30]:
# Need to add back original identifiers 

In [31]:
# Reverse the inchi dict: key by ChEMBL id, keeping the original STITCH id and
# the 'CID<pubchem id>' key for each entry.
# NOTE(review): if two entries share a ChEMBL id, the later one overwrites the
# earlier one (unchanged behaviour) - confirm this is acceptable.
chembl_id_dict = {
    entry['chembl_id']: {'stitch_id': entry['original_id'], 'pubchem_id': pubchem_id}
    for pubchem_id, entry in inchi_dict.items()
}

In [32]:
chembl_id_dict

{'CHEMBL1149': {'stitch_id': 'CID000010917', 'pubchem_id': 'CID10917'},
 'CHEMBL96': {'stitch_id': 'CID000000119', 'pubchem_id': 'CID119'},
 'CHEMBL601': {'stitch_id': 'CID000000137', 'pubchem_id': 'CID137'},
 'CHEMBL548': {'stitch_id': 'CID005280360', 'pubchem_id': 'CID5280360'},
 'CHEMBL1139': {'stitch_id': 'CID005282411', 'pubchem_id': 'CID5282411'},
 'CHEMBL815': {'stitch_id': 'CID005280363', 'pubchem_id': 'CID5280363'},
 'CHEMBL667': {'stitch_id': 'CID000000187', 'pubchem_id': 'CID187'},
 'CHEMBL477': {'stitch_id': 'CID000060961', 'pubchem_id': 'CID60961'},
 'CHEMBL495': {'stitch_id': 'CID005280723', 'pubchem_id': 'CID5280723'},
 'CHEMBL1160819': {'stitch_id': 'CID000000222', 'pubchem_id': 'CID222'},
 'CHEMBL1485': {'stitch_id': 'CID000006322', 'pubchem_id': 'CID6322'},
 'CHEMBL720': {'stitch_id': 'CID000000244', 'pubchem_id': 'CID244'},
 'CHEMBL1182': {'stitch_id': 'CID000000247', 'pubchem_id': 'CID247'},
 'CHEMBL17564': {'stitch_id': 'CID000000297', 'pubchem_id': 'CID297'},
 'CH

In [33]:
# Attach the original STITCH id to every row by looking up the row's ChEMBL id.
parent_results['stitch_stereo'] = parent_results['chembl_id'].map(lambda cid: chembl_id_dict[cid]['stitch_id'])

In [34]:
parent_results.head()

Unnamed: 0,chembl_id,molregno,parent_molregno,parent_chembl_id,stitch_stereo
0,CHEMBL1000,111185,111185,CHEMBL1000,CID000002678
1,CHEMBL1002,111482,111482,CHEMBL1002,CID000123600
2,CHEMBL1006,112480,112480,CHEMBL1006,CID000002141
3,CHEMBL1008,112651,112651,CHEMBL1008,CID000002351
4,CHEMBL1014,116349,116349,CHEMBL1014,CID000002540


In [35]:
# Just check the parent query has worked
parent_results.loc[parent_results['parent_molregno']!=parent_results['molregno'],:].head()

Unnamed: 0,chembl_id,molregno,parent_molregno,parent_chembl_id,stitch_stereo
48,CHEMBL1092067,622931,613911,CHEMBL1086863,CID000450503
121,CHEMBL1200328,674279,192908,CHEMBL1175,CID000060834
122,CHEMBL1200330,674281,14406,CHEMBL550,CID000005909
123,CHEMBL1200331,674282,581859,CHEMBL1757,CID000054331
125,CHEMBL1200343,674294,453579,CHEMBL1581,CID000441313


In [36]:
# check if any were null
parent_results.loc[parent_results['parent_chembl_id'].isnull()]

Unnamed: 0,chembl_id,molregno,parent_molregno,parent_chembl_id,stitch_stereo


In [37]:
# Convenience file: STITCH id -> parent ChEMBL molregno mapping, tab-separated, without the row index.
parent_results.to_csv(basedir + '/sider/data/sider_mapped_stitch2parent.txt', sep='\t', index=False)

In [42]:
# Join the side-effect rows to the compound mapping on the STITCH stereo id
# (inner join: rows without a mapping are dropped), then keep only the rows
# whose MedDRA concept type is 'PT'.
pt_mapped = (
    all_se
    .merge(parent_results, left_on='STITCH_stereo', right_on='stitch_stereo')
    .loc[lambda merged: merged['MedDRA_concept_type'] == 'PT', :]
)

In [43]:
pt_mapped.head()

Unnamed: 0,STITCH_flat,STITCH_stereo,UMLS_concept_label,MedDRA_concept_type,UMLS_concept,side_effect_name,chembl_id,molregno,parent_molregno,parent_chembl_id,stitch_stereo
1,CID100000085,CID000010917,C0000729,PT,C0000737,Abdominal pain,CHEMBL1149,181939,181939,CHEMBL1149,CID000010917
3,CID100000085,CID000010917,C0000737,PT,C0687713,Gastrointestinal pain,CHEMBL1149,181939,181939,CHEMBL1149,CID000010917
4,CID100000085,CID000010917,C0000737,PT,C0000737,Abdominal pain,CHEMBL1149,181939,181939,CHEMBL1149,CID000010917
6,CID100000085,CID000010917,C0002418,PT,C0002418,Amblyopia,CHEMBL1149,181939,181939,CHEMBL1149,CID000010917
8,CID100000085,CID000010917,C0002871,PT,C0002871,Anaemia,CHEMBL1149,181939,181939,CHEMBL1149,CID000010917


In [44]:
# Write the PT-level side-effect rows that have a molregno mapping, tab-separated, without the row index.
pt_mapped.to_csv(basedir + '/sider/results/sider_all_se_pt_mapped.txt', sep='\t', index=False)