In [None]:
"""Mapping compounds from extracted unbound concentrations, Fu, and Plasma protein binding to ChEMBL approved drugs"""

In [1]:
import pandas as pd
import numpy as np

In [2]:
basedir = '/scratch/ias41/ae_code/plasma_concentrations'

In [3]:
pd.set_option('display.max_colwidth',300)
pd.set_option('display.max_rows',300)

In [4]:
dataset1 = pd.read_csv(basedir + '/results/interim/PMID22210121_data.txt', sep='\t')
dataset2 = pd.read_csv(basedir + '/results/interim/PMID15637086_data.txt', sep='\t')
dataset3 = pd.read_csv(basedir + '/results/interim/PMID30115648_data.txt', sep='\t')
dataset4 = pd.read_csv(basedir + '/results/interim/PMID12667944_data.txt', sep='\t')
concatenated = pd.concat([dataset1, dataset2, dataset3, dataset4], ignore_index=True, sort=False)

In [5]:
concatenated.head()

Unnamed: 0,Original drug name,Original synonyms,PPB lower,PPB upper,PMID,ETCP unbound (nM),Fu,inchi_key,SMILES,PPB,ETCP unbound (nM) upper
0,3-Carboxy-4-methyl-5-propyl-2-furan-propionic acid,,99.0,,PMID22210121,,,,,,
1,Acenocoumarin,,99.0,,PMID22210121,,,,,,
2,Acetaminophen,,0.0,,PMID22210121,,,,,,
3,Alprazolam,,71.0,,PMID22210121,,,,,,
4,Alprenolol,,85.0,,PMID22210121,,,,,,


In [6]:
drug_df = concatenated[['Original drug name', 'Original synonyms', 'inchi_key','PMID']].drop_duplicates()
drug_df['Original drug name'] = drug_df['Original drug name'].apply(lambda x: x.upper().strip(' '))
drug_df['Original synonyms'] = drug_df['Original synonyms'].apply(lambda x: str(x).upper().strip(' '))

In [7]:
drug_df.head()

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID
0,3-CARBOXY-4-METHYL-5-PROPYL-2-FURAN-PROPIONIC ACID,NAN,,PMID22210121
1,ACENOCOUMARIN,NAN,,PMID22210121
2,ACETAMINOPHEN,NAN,,PMID22210121
3,ALPRAZOLAM,NAN,,PMID22210121
4,ALPRENOLOL,NAN,,PMID22210121


In [8]:
approved = pd.read_csv(basedir + '/data/chembl_approved_drugs.txt', sep='\t')
approved['synonyms'] = approved['synonyms'].apply(lambda x: str(x).upper())

In [9]:
approved.head()

Unnamed: 0,chembl_id,molregno,pref_name,compound_name,compound_key,synonyms,syn_type,standard_inchi_key,canonical_smiles,mw_freebase
0,CHEMBL2,97,PRAZOSIN,"[4-(4-Amino-6,7-dimethoxy-quinazolin-2-yl)-piperazin-1-yl]-furan-2-yl-methanone",1,PRAZOSIN,ATC,IENZQIKPVFGBNW-UHFFFAOYSA-N,COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4,383.41
1,CHEMBL2,97,PRAZOSIN,"[4-(4-Amino-6,7-dimethoxy-quinazolin-2-yl)-piperazin-1-yl]-furan-2-yl-methanone",1,PRAZOSIN,BAN,IENZQIKPVFGBNW-UHFFFAOYSA-N,COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4,383.41
2,CHEMBL2,97,PRAZOSIN,"[4-(4-Amino-6,7-dimethoxy-quinazolin-2-yl)-piperazin-1-yl]-furan-2-yl-methanone",1,PRAZOSIN,FDA,IENZQIKPVFGBNW-UHFFFAOYSA-N,COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4,383.41
3,CHEMBL2,97,PRAZOSIN,"[4-(4-Amino-6,7-dimethoxy-quinazolin-2-yl)-piperazin-1-yl]-furan-2-yl-methanone",1,PRAZOSIN,INN,IENZQIKPVFGBNW-UHFFFAOYSA-N,COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4,383.41
4,CHEMBL2,97,PRAZOSIN,"[4-(4-Amino-6,7-dimethoxy-quinazolin-2-yl)-piperazin-1-yl]-furan-2-yl-methanone",1,CP-12299,RESEARCH_CODE,IENZQIKPVFGBNW-UHFFFAOYSA-N,COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4,383.41


In [10]:
# Pref_name in approved are not unique - some compounds with different chembl_id have same name
len(approved[['chembl_id', 'pref_name']].drop_duplicates()), len(approved[['pref_name']].drop_duplicates())

(3813, 3741)

### Round 1 - mapping on inchi key

In [11]:
inchi_key_mappings = pd.merge(drug_df, approved[['pref_name', 'chembl_id', 'molregno', 'standard_inchi_key', 'canonical_smiles']].drop_duplicates(), left_on = 'inchi_key', right_on='standard_inchi_key')
inchi_key_mappings = inchi_key_mappings.loc[~inchi_key_mappings['inchi_key'].isnull(),:].drop_duplicates()

In [12]:
inchi_key_mappings

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID,pref_name,chembl_id,molregno,standard_inchi_key,canonical_smiles
206388,ABACAVIR,NAN,MCGSCOLBFJQGHM-SCZZXKLOSA-N,PMID30115648,ABACAVIR,CHEMBL1380,321707,MCGSCOLBFJQGHM-SCZZXKLOSA-N,Nc1nc(NC2CC2)c3ncn([C@@H]4C[C@H](CO)C=C4)c3n1
206389,ACAMPROSATE,NAN,AFCGFAGUEYAMAO-UHFFFAOYSA-N,PMID30115648,ACAMPROSATE,CHEMBL1201293,675244,AFCGFAGUEYAMAO-UHFFFAOYSA-N,CC(=O)NCCCS(=O)(=O)O
206390,ACEBUTOLOL,NAN,GOEMGAFJFRBGGG-UHFFFAOYSA-N,PMID30115648,ACEBUTOLOL,CHEMBL642,27347,GOEMGAFJFRBGGG-UHFFFAOYSA-N,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(=O)C
206391,ACETAMINOPHEN,NAN,RZVAJINKPMORJF-UHFFFAOYSA-N,PMID30115648,ACETAMINOPHEN,CHEMBL112,16450,RZVAJINKPMORJF-UHFFFAOYSA-N,CC(=O)Nc1ccc(O)cc1
206392,ACETAZOLAMIDE,NAN,BZKPWHYZMXOIDC-UHFFFAOYSA-N,PMID30115648,ACETAZOLAMIDE,CHEMBL20,1125,BZKPWHYZMXOIDC-UHFFFAOYSA-N,CC(=O)Nc1nnc(s1)S(=O)(=O)N
...,...,...,...,...,...,...,...,...,...
206968,ZIPRASIDONE,NAN,MVWVFYHBGMAFLY-UHFFFAOYSA-N,PMID30115648,ZIPRASIDONE,CHEMBL708,33664,MVWVFYHBGMAFLY-UHFFFAOYSA-N,Clc1cc2NC(=O)Cc2cc1CCN3CCN(CC3)c4nsc5ccccc45
206969,ZOLEDRONIC ACID,NAN,XRASPMIURGNCCH-UHFFFAOYSA-N,PMID30115648,ZOLEDRONIC ACID,CHEMBL924,83838,XRASPMIURGNCCH-UHFFFAOYSA-N,OC(Cn1ccnc1)(P(=O)(O)O)P(=O)(O)O
206970,ZOLMITRIPTAN,NAN,ULSDMUVEXKOYBU-ZDUSSCGKSA-N,PMID30115648,ZOLMITRIPTAN,CHEMBL1185,196215,ULSDMUVEXKOYBU-ZDUSSCGKSA-N,CN(C)CCc1c[nH]c2ccc(C[C@H]3COC(=O)N3)cc12
206971,ZOLPIDEM,NAN,ZAFYATHCZYHLPB-UHFFFAOYSA-N,PMID30115648,ZOLPIDEM,CHEMBL911,80965,ZAFYATHCZYHLPB-UHFFFAOYSA-N,CN(C)C(=O)Cc1c(nc2ccc(C)cn12)c3ccc(C)cc3


### Round 2 - mapping on pref_name

In [13]:
pref_name_mappings = pd.merge(drug_df.loc[~drug_df['inchi_key'].isin(list(inchi_key_mappings['inchi_key'])),:], approved[['pref_name', 'chembl_id','molregno', 'standard_inchi_key', 'canonical_smiles']].drop_duplicates(), left_on = 'Original drug name', right_on='pref_name').drop_duplicates()
pref_name_mappings

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID,pref_name,chembl_id,molregno,standard_inchi_key,canonical_smiles
0,ACETAMINOPHEN,NAN,,PMID22210121,ACETAMINOPHEN,CHEMBL112,16450,RZVAJINKPMORJF-UHFFFAOYSA-N,CC(=O)Nc1ccc(O)cc1
1,ALPRAZOLAM,NAN,,PMID22210121,ALPRAZOLAM,CHEMBL661,27648,VREFGVBLTWBCJP-UHFFFAOYSA-N,Cc1nnc2CN=C(c3ccccc3)c4cc(Cl)ccc4n12
2,AMILORIDE,NAN,,PMID22210121,AMILORIDE,CHEMBL945,89526,XSDQTOBWRPYKKA-UHFFFAOYSA-N,NC(=N)NC(=O)c1nc(Cl)c(N)nc1N
3,AMIODARONE,NAN,,PMID22210121,AMIODARONE,CHEMBL633,27185,IYIKLHRQXLHMJQ-UHFFFAOYSA-N,CCCCc1oc2ccccc2c1C(=O)c3cc(I)c(OCCN(CC)CC)c(I)c3
4,AMIODARONE,NAN,,PMID15637086,AMIODARONE,CHEMBL633,27185,IYIKLHRQXLHMJQ-UHFFFAOYSA-N,CCCCc1oc2ccccc2c1C(=O)c3cc(I)c(OCCN(CC)CC)c(I)c3
...,...,...,...,...,...,...,...,...,...
430,DOXORUBICIN,NAN,,PMID12667944,DOXORUBICIN,CHEMBL53463,78759,AOJJSUZBOXZQNB-TZSSRYMLSA-N,COc1cccc2C(=O)c3c(O)c4C[C@](O)(C[C@H](O[C@H]5C[C@H](N)[C@H](O)[C@H](C)O5)c4c(O)c3C(=O)c12)C(=O)CO
431,ENCAINIDE,NAN,,PMID12667944,ENCAINIDE,CHEMBL315838,139046,PJWPNDMDCLXCOM-UHFFFAOYSA-N,COc1ccc(cc1)C(=O)Nc2ccccc2CCC3CCCCN3C
432,HYDROXYZINE,NAN,,PMID12667944,HYDROXYZINE,CHEMBL896,77459,ZQDWXGKKHFNSQK-UHFFFAOYSA-N,OCCOCCN1CCN(CC1)C(c2ccccc2)c3ccc(Cl)cc3
433,PYRILAMINE,NAN,,PMID12667944,PYRILAMINE,CHEMBL511,10661,YECBIJXISLIIDS-UHFFFAOYSA-N,COc1ccc(CN(CCN(C)C)c2ccccn2)cc1


In [14]:
all_mappings = pd.concat([inchi_key_mappings, pref_name_mappings],sort=False,ignore_index=True).drop_duplicates()

In [15]:
all_mappings.sort_values(by='pref_name')

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID,pref_name,chembl_id,molregno,standard_inchi_key,canonical_smiles
0,ABACAVIR,NAN,MCGSCOLBFJQGHM-SCZZXKLOSA-N,PMID30115648,ABACAVIR,CHEMBL1380,321707,MCGSCOLBFJQGHM-SCZZXKLOSA-N,Nc1nc(NC2CC2)c3ncn([C@@H]4C[C@H](CO)C=C4)c3n1
1,ACAMPROSATE,NAN,AFCGFAGUEYAMAO-UHFFFAOYSA-N,PMID30115648,ACAMPROSATE,CHEMBL1201293,675244,AFCGFAGUEYAMAO-UHFFFAOYSA-N,CC(=O)NCCCS(=O)(=O)O
2,ACEBUTOLOL,NAN,GOEMGAFJFRBGGG-UHFFFAOYSA-N,PMID30115648,ACEBUTOLOL,CHEMBL642,27347,GOEMGAFJFRBGGG-UHFFFAOYSA-N,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(=O)C
892,ACENOCOUMAROL,NAN,VABCILAOYCMVPS-OAHLLOKOSA-N,PMID30115648,ACENOCOUMAROL,CHEMBL397420,394206,VABCILAOYCMVPS-UHFFFAOYSA-N,CC(=O)CC(C1=C(O)c2ccccc2OC1=O)c3ccc(cc3)[N+](=O)[O-]
3,ACETAMINOPHEN,NAN,RZVAJINKPMORJF-UHFFFAOYSA-N,PMID30115648,ACETAMINOPHEN,CHEMBL112,16450,RZVAJINKPMORJF-UHFFFAOYSA-N,CC(=O)Nc1ccc(O)cc1
...,...,...,...,...,...,...,...,...,...
581,ZOLEDRONIC ACID,NAN,XRASPMIURGNCCH-UHFFFAOYSA-N,PMID30115648,ZOLEDRONIC ACID,CHEMBL924,83838,XRASPMIURGNCCH-UHFFFAOYSA-N,OC(Cn1ccnc1)(P(=O)(O)O)P(=O)(O)O
582,ZOLMITRIPTAN,NAN,ULSDMUVEXKOYBU-ZDUSSCGKSA-N,PMID30115648,ZOLMITRIPTAN,CHEMBL1185,196215,ULSDMUVEXKOYBU-ZDUSSCGKSA-N,CN(C)CCc1c[nH]c2ccc(C[C@H]3COC(=O)N3)cc12
583,ZOLPIDEM,NAN,ZAFYATHCZYHLPB-UHFFFAOYSA-N,PMID30115648,ZOLPIDEM,CHEMBL911,80965,ZAFYATHCZYHLPB-UHFFFAOYSA-N,CN(C)C(=O)Cc1c(nc2ccc(C)cn12)c3ccc(C)cc3
825,ZOMEPIRAC,NAN,,PMID22210121,ZOMEPIRAC,CHEMBL19490,23001,ZXVNMYWKKDOREA-UHFFFAOYSA-N,Cc1cc(CC(=O)O)n(C)c1C(=O)c2ccc(Cl)cc2


### Round 3 - mapping of source synonym to chembl pref_name

In [16]:
unmapped = drug_df.loc[(~drug_df['Original drug name'].isin(list(all_mappings['Original drug name'])))&(~drug_df['inchi_key'].isin(list(all_mappings['standard_inchi_key']))),:]

In [17]:
len(unmapped)

216

In [18]:
synonym_pref_name_mappings_initial = pd.merge(drug_df, approved[['pref_name', 'chembl_id','molregno', 'standard_inchi_key','canonical_smiles']].drop_duplicates(), left_on='Original synonyms', right_on='pref_name').drop_duplicates().sort_values(by='pref_name')

In [19]:
synonym_pref_name_mappings_initial

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID,pref_name,chembl_id,molregno,standard_inchi_key,canonical_smiles


# Round 4

In [20]:
# Original drug name to ChEMBL synonyms
substance_synonyms_mappings = pd.merge(unmapped, approved, left_on='Original drug name', right_on='synonyms')[['pref_name','Original drug name','syn_type','chembl_id','molregno', 'standard_inchi_key','canonical_smiles']].drop_duplicates()

In [21]:
# Inspect manually and decide which ones to drop
substance_synonyms_mappings.sort_values(by='Original drug name')

Unnamed: 0,pref_name,Original drug name,syn_type,chembl_id,molregno,standard_inchi_key,canonical_smiles
0,CANDESARTAN CILEXETIL,CANDESARTAN,BAN,CHEMBL1014,116349,GHOSNRCGJFBJIB-UHFFFAOYSA-N,CCOc1nc2cccc(C(=O)OC(C)OC(=O)OC3CCCCC3)c2n1Cc4ccc(cc4)c5ccccc5c6nn[nH]n6
11,FLUDARABINE PHOSPHATE,FLUDARABINE,ATC,CHEMBL1096882,624161,GIUYCYHIANZCFB-FJFJXFQQSA-N,Nc1nc(F)nc2c1ncn2[C@@H]3O[C@H](COP(=O)(O)O)[C@@H](O)[C@@H]3O
12,FLUDARABINE PHOSPHATE,FLUDARABINE,INN,CHEMBL1096882,624161,GIUYCYHIANZCFB-FJFJXFQQSA-N,Nc1nc(F)nc2c1ncn2[C@@H]3O[C@H](COP(=O)(O)O)[C@@H](O)[C@@H]3O
15,GANIRELIX ACETATE,GANIRELIX,ATC,CHEMBL1251,236376,GJNXBNATEDXMAK-PFLSVRRQSA-N,CCN\\C(=N/CC)\\NCCCC[C@@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@@H](Cc2cccnc2)NC(=O)[C@@H](Cc3ccc(Cl)cc3)NC(=O)[C@@H](Cc4ccc5ccccc5c4)NC(=O)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN\\C(=N\\CC)\\NCC)C(=O)N6CCC[C@H]6C(=O)N[C@H](C)C(=O)N
16,GANIRELIX ACETATE,GANIRELIX,BAN,CHEMBL1251,236376,GJNXBNATEDXMAK-PFLSVRRQSA-N,CCN\\C(=N/CC)\\NCCCC[C@@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@@H](Cc2cccnc2)NC(=O)[C@@H](Cc3ccc(Cl)cc3)NC(=O)[C@@H](Cc4ccc5ccccc5c4)NC(=O)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN\\C(=N\\CC)\\NCC)C(=O)N6CCC[C@H]6C(=O)N[C@H](C)C(=O)N
17,GANIRELIX ACETATE,GANIRELIX,BNF,CHEMBL1251,236376,GJNXBNATEDXMAK-PFLSVRRQSA-N,CCN\\C(=N/CC)\\NCCCC[C@@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@@H](Cc2cccnc2)NC(=O)[C@@H](Cc3ccc(Cl)cc3)NC(=O)[C@@H](Cc4ccc5ccccc5c4)NC(=O)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN\\C(=N\\CC)\\NCC)C(=O)N6CCC[C@H]6C(=O)N[C@H](C)C(=O)N
18,GANIRELIX ACETATE,GANIRELIX,INN,CHEMBL1251,236376,GJNXBNATEDXMAK-PFLSVRRQSA-N,CCN\\C(=N/CC)\\NCCCC[C@@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@@H](Cc2cccnc2)NC(=O)[C@@H](Cc3ccc(Cl)cc3)NC(=O)[C@@H](Cc4ccc5ccccc5c4)NC(=O)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN\\C(=N\\CC)\\NCC)C(=O)N6CCC[C@H]6C(=O)N[C@H](C)C(=O)N
23,TENOFOVIR DISOPROXIL FUMARATE,TENOFOVIR,BAN,CHEMBL1486,417163,VCMJCVGFSROFHV-WZGZYPNHSA-N,CC(C)OC(=O)OCOP(=O)(CO[C@H](C)Cn1cnc2c(N)ncnc12)OCOC(=O)OC(C)C.OC(=O)\\C=C\\C(=O)O
24,TENOFOVIR DISOPROXIL FUMARATE,TENOFOVIR,INN,CHEMBL1486,417163,VCMJCVGFSROFHV-WZGZYPNHSA-N,CC(C)OC(=O)OCOP(=O)(CO[C@H](C)Cn1cnc2c(N)ncnc12)OCOC(=O)OC(C)C.OC(=O)\\C=C\\C(=O)O
35,ZIMELDINE,ZIMELIDINE,OTHER,CHEMBL37744,54070,OYPPVKRFBIWMSX-SXGWCWSVSA-N,CN(C)C\\C=C(\\c1ccc(Br)cc1)/c2cccnc2


In [22]:
# accept all but tenofovir and candesartan - these are phase 3
to_drop = ['CHEMBL1014', 'CHEMBL1486']
substance_synonyms_mappings_to_add = substance_synonyms_mappings.loc[~substance_synonyms_mappings['chembl_id'].isin(to_drop),all_mappings.columns].drop_duplicates()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [23]:
substance_synonyms_mappings_to_add

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID,pref_name,chembl_id,molregno,standard_inchi_key,canonical_smiles
11,FLUDARABINE,,,,FLUDARABINE PHOSPHATE,CHEMBL1096882,624161,GIUYCYHIANZCFB-FJFJXFQQSA-N,Nc1nc(F)nc2c1ncn2[C@@H]3O[C@H](COP(=O)(O)O)[C@@H](O)[C@@H]3O
15,GANIRELIX,,,,GANIRELIX ACETATE,CHEMBL1251,236376,GJNXBNATEDXMAK-PFLSVRRQSA-N,CCN\\C(=N/CC)\\NCCCC[C@@H](NC(=O)[C@H](Cc1ccc(O)cc1)NC(=O)[C@H](CO)NC(=O)[C@@H](Cc2cccnc2)NC(=O)[C@@H](Cc3ccc(Cl)cc3)NC(=O)[C@@H](Cc4ccc5ccccc5c4)NC(=O)C)C(=O)N[C@@H](CC(C)C)C(=O)N[C@@H](CCCCN\\C(=N\\CC)\\NCC)C(=O)N6CCC[C@H]6C(=O)N[C@H](C)C(=O)N
35,ZIMELIDINE,,,,ZIMELDINE,CHEMBL37744,54070,OYPPVKRFBIWMSX-SXGWCWSVSA-N,CN(C)C\\C=C(\\c1ccc(Br)cc1)/c2cccnc2


In [24]:
len(all_mappings)

1020

In [25]:
all_mappings = pd.concat([all_mappings, substance_synonyms_mappings_to_add], sort=False,ignore_index=True).drop_duplicates()

In [26]:
len(all_mappings)

1023

In [27]:
all_mappings.head()

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID,pref_name,chembl_id,molregno,standard_inchi_key,canonical_smiles
0,ABACAVIR,NAN,MCGSCOLBFJQGHM-SCZZXKLOSA-N,PMID30115648,ABACAVIR,CHEMBL1380,321707,MCGSCOLBFJQGHM-SCZZXKLOSA-N,Nc1nc(NC2CC2)c3ncn([C@@H]4C[C@H](CO)C=C4)c3n1
1,ACAMPROSATE,NAN,AFCGFAGUEYAMAO-UHFFFAOYSA-N,PMID30115648,ACAMPROSATE,CHEMBL1201293,675244,AFCGFAGUEYAMAO-UHFFFAOYSA-N,CC(=O)NCCCS(=O)(=O)O
2,ACEBUTOLOL,NAN,GOEMGAFJFRBGGG-UHFFFAOYSA-N,PMID30115648,ACEBUTOLOL,CHEMBL642,27347,GOEMGAFJFRBGGG-UHFFFAOYSA-N,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(=O)C
3,ACETAMINOPHEN,NAN,RZVAJINKPMORJF-UHFFFAOYSA-N,PMID30115648,ACETAMINOPHEN,CHEMBL112,16450,RZVAJINKPMORJF-UHFFFAOYSA-N,CC(=O)Nc1ccc(O)cc1
4,ACETAZOLAMIDE,NAN,BZKPWHYZMXOIDC-UHFFFAOYSA-N,PMID30115648,ACETAZOLAMIDE,CHEMBL20,1125,BZKPWHYZMXOIDC-UHFFFAOYSA-N,CC(=O)Nc1nnc(s1)S(=O)(=O)N


# Round 5

In [28]:
unmapped_2 = drug_df.loc[(~drug_df['Original drug name'].isin(list(all_mappings['Original drug name'])))&(~drug_df['inchi_key'].isin(list(all_mappings['standard_inchi_key']))),:]

In [29]:
len(unmapped_2)

213

In [30]:
unmapped_2.head()

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID
275,16-ACETYL GITOXIN,NAN,NEBPBFLVSYFRQE-ZYMOIYFGSA-N,PMID30115648
276,5-HYDROXYMETHYL TOLTERODINE,NAN,DUXZAXCGJSBGDW-HXUWFJFHSA-N,PMID30115648
280,ACECAINIDE,N-ACETYLPROCAINAMIDE,KEECCEWTUVWFCV-UHFFFAOYSA-N,PMID30115648
287,ACTISOMIDE,NAN,QAHRRCMLXFLZTF-FYYLOGMGSA-N,PMID30115648
289,ADEFOVIR,NAN,SUPKOOSCJHTBAH-UHFFFAOYSA-N,PMID30115648


In [31]:
# Substance to compound name
# Original drug name to ChEMBL synonyms
round5_mappings = pd.merge(unmapped_2, approved, left_on='Original drug name', right_on='compound_name')[['pref_name','Original drug name','chembl_id','molregno',]].drop_duplicates()

In [32]:
round5_mappings

Unnamed: 0,pref_name,Original drug name,chembl_id,molregno


# Round 6
Synonyms to synonyms

In [33]:
# Synonyms to synonyms
syn_to_syn_mappings = pd.merge(unmapped_2.loc[~unmapped_2['Original synonyms'].isnull(),:], approved, left_on='Original synonyms',right_on='synonyms')[['Original drug name','pref_name','Original synonyms','syn_type','chembl_id','molregno', 'standard_inchi_key','canonical_smiles']].drop_duplicates()
syn_to_syn_mappings = syn_to_syn_mappings.loc[syn_to_syn_mappings['Original synonyms']!='NAN',:]

In [34]:
syn_to_syn_mappings

Unnamed: 0,Original drug name,pref_name,Original synonyms,syn_type,chembl_id,molregno,standard_inchi_key,canonical_smiles


In [35]:
len(all_mappings)

1023

In [36]:
all_mappings.loc[all_mappings['Original drug name']=='DOXYCYCLINE']

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID,pref_name,chembl_id,molregno,standard_inchi_key,canonical_smiles
190,DOXYCYCLINE,NAN,JBIWCJUYHHGXTC-AKNGSSGZSA-N,PMID30115648,DOXYCYCLINE,CHEMBL1433,371546,JBIWCJUYHHGXTC-AKNGSSGZSA-N,C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H](N(C)C)C(=C(C(=O)N)C(=O)[C@@]3(O)C(=C2C(=O)c4c(O)cccc14)O)O
658,DOXYCYCLINE,NAN,,PMID22210121,DOXYCYCLINE,CHEMBL1433,371546,JBIWCJUYHHGXTC-AKNGSSGZSA-N,C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H](N(C)C)C(=C(C(=O)N)C(=O)[C@@]3(O)C(=C2C(=O)c4c(O)cccc14)O)O
659,DOXYCYCLINE,NAN,,PMID22210121,DOXYCYCLINE,CHEMBL1200699,674650,XQTWDDCIUJNLTR-CVHRZJFOSA-N,O.C[C@@H]1[C@H]2[C@H](O)[C@H]3[C@H](N(C)C)C(=C(C(=O)N)C(=O)[C@@]3(O)C(=C2C(=O)c4c(O)cccc14)O)O


In [37]:
# Checking that all 'substance' only have one mapping to chembl_id
for name in list(set(all_mappings['Original drug name'])):
    subset_df = all_mappings.loc[all_mappings['Original drug name']==name,'molregno'].drop_duplicates()
    if len(subset_df)>1:
        print(name)

DOXYCYCLINE
LORACARBEF
CISAPRIDE
CEFACLOR
TACROLIMUS
OXYPHENBUTAZONE
CEPHALEXIN
PINACIDIL
CEFPROZIL
SAXAGLIPTIN
INDOCYANINE GREEN


In [38]:
# Drop non-parents
to_be_dropped = ['CHEMBL1200788', 'CHEMBL1200544', 'CHEMBL1646', 'CHEMBL3184906', 'CHEMBL3989887', 'CHEMBL2103745', 'CHEMBL1201018', 'CHEMBL3989676', 'CHEMBL1200610', 'CHEMBL1200338', 'CHEMBL1200699']

In [39]:
all_mappings = all_mappings.loc[~all_mappings['chembl_id'].isin(to_be_dropped),:]

In [40]:
len(all_mappings)

1009

In [41]:
# Checking that all 'substance' only have one mapping to chembl_id
for name in list(set(all_mappings['Original drug name'])):
    subset_df = all_mappings.loc[all_mappings['Original drug name']==name,'molregno'].drop_duplicates()
    if len(subset_df)>1:
        print(name)

# See currently unmapped

In [42]:
unmapped_3 = drug_df.loc[(~drug_df['Original drug name'].isin(list(all_mappings['Original drug name'])))&(~drug_df['inchi_key'].isin(list(all_mappings['standard_inchi_key']))),:]

In [43]:
unmapped_3.head()

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID
275,16-ACETYL GITOXIN,NAN,NEBPBFLVSYFRQE-ZYMOIYFGSA-N,PMID30115648
276,5-HYDROXYMETHYL TOLTERODINE,NAN,DUXZAXCGJSBGDW-HXUWFJFHSA-N,PMID30115648
280,ACECAINIDE,N-ACETYLPROCAINAMIDE,KEECCEWTUVWFCV-UHFFFAOYSA-N,PMID30115648
287,ACTISOMIDE,NAN,QAHRRCMLXFLZTF-FYYLOGMGSA-N,PMID30115648
289,ADEFOVIR,NAN,SUPKOOSCJHTBAH-UHFFFAOYSA-N,PMID30115648


In [44]:
unmapped_3['short substance'] = unmapped_3['Original drug name'].apply(lambda x: x.rstrip('E'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [45]:
unmapped_3.head()

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID,short substance
275,16-ACETYL GITOXIN,NAN,NEBPBFLVSYFRQE-ZYMOIYFGSA-N,PMID30115648,16-ACETYL GITOXIN
276,5-HYDROXYMETHYL TOLTERODINE,NAN,DUXZAXCGJSBGDW-HXUWFJFHSA-N,PMID30115648,5-HYDROXYMETHYL TOLTERODIN
280,ACECAINIDE,N-ACETYLPROCAINAMIDE,KEECCEWTUVWFCV-UHFFFAOYSA-N,PMID30115648,ACECAINID
287,ACTISOMIDE,NAN,QAHRRCMLXFLZTF-FYYLOGMGSA-N,PMID30115648,ACTISOMID
289,ADEFOVIR,NAN,SUPKOOSCJHTBAH-UHFFFAOYSA-N,PMID30115648,ADEFOVIR


## Attach mappings after considering trailing 'e'

In [46]:
# Noticed some names had an 'e' on the end that prevented synonym mapping
my_columns = [i for i in all_mappings.columns]
short_names = pd.merge(unmapped_3, approved, left_on='short substance', right_on='pref_name')[my_columns].drop_duplicates()
short_names

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID,pref_name,chembl_id,molregno,standard_inchi_key,canonical_smiles


In [47]:
len(set(drug_df['Original drug name'])), len(set(all_mappings['Original drug name'])), len(set(all_mappings['molregno']))

(1027, 778, 773)

In [48]:
all_mappings.head()

Unnamed: 0,Original drug name,Original synonyms,inchi_key,PMID,pref_name,chembl_id,molregno,standard_inchi_key,canonical_smiles
0,ABACAVIR,NAN,MCGSCOLBFJQGHM-SCZZXKLOSA-N,PMID30115648,ABACAVIR,CHEMBL1380,321707,MCGSCOLBFJQGHM-SCZZXKLOSA-N,Nc1nc(NC2CC2)c3ncn([C@@H]4C[C@H](CO)C=C4)c3n1
1,ACAMPROSATE,NAN,AFCGFAGUEYAMAO-UHFFFAOYSA-N,PMID30115648,ACAMPROSATE,CHEMBL1201293,675244,AFCGFAGUEYAMAO-UHFFFAOYSA-N,CC(=O)NCCCS(=O)(=O)O
2,ACEBUTOLOL,NAN,GOEMGAFJFRBGGG-UHFFFAOYSA-N,PMID30115648,ACEBUTOLOL,CHEMBL642,27347,GOEMGAFJFRBGGG-UHFFFAOYSA-N,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(c1)C(=O)C
3,ACETAMINOPHEN,NAN,RZVAJINKPMORJF-UHFFFAOYSA-N,PMID30115648,ACETAMINOPHEN,CHEMBL112,16450,RZVAJINKPMORJF-UHFFFAOYSA-N,CC(=O)Nc1ccc(O)cc1
4,ACETAZOLAMIDE,NAN,BZKPWHYZMXOIDC-UHFFFAOYSA-N,PMID30115648,ACETAZOLAMIDE,CHEMBL20,1125,BZKPWHYZMXOIDC-UHFFFAOYSA-N,CC(=O)Nc1nnc(s1)S(=O)(=O)N


In [50]:
# Need to match back to original datasets

In [51]:
ds1_mappings = all_mappings.loc[all_mappings['PMID']=='PMID22210121',:][['Original drug name', 'chembl_id', 'molregno']]
dataset1['Original drug name'] = dataset1['Original drug name'].apply(lambda x: x.upper().strip(' '))
ds1_result = ds1_mappings.merge(dataset1, on='Original drug name', how='inner')
ds1_result.to_csv(basedir + '/results/interim/PMID22210121_mapped_data.txt', sep='\t', index=False)

In [52]:
ds2_mappings = all_mappings.loc[all_mappings['PMID']=='PMID15637086',:][['Original drug name', 'chembl_id', 'molregno']]
dataset2['Original drug name'] = dataset2['Original drug name'].apply(lambda x: x.upper().strip(' '))
ds2_result = ds2_mappings.merge(dataset2, on='Original drug name', how='inner')
ds2_result.to_csv(basedir + '/results/interim/PMID15637086_mapped_data.txt', sep='\t', index=False)

In [53]:
ds3_mappings = all_mappings.loc[all_mappings['PMID']=='PMID30115648',:][['Original drug name', 'chembl_id', 'molregno']]
dataset3['Original drug name'] = dataset3['Original drug name'].apply(lambda x: x.upper().strip(' '))
ds3_result = ds3_mappings.merge(dataset3, on='Original drug name', how='inner')
ds3_result.to_csv(basedir + '/results/interim/PMID30115648_mapped_data.txt', sep='\t', index=False)

In [54]:
ds4_mappings = all_mappings.loc[all_mappings['PMID']=='PMID12667944',:][['Original drug name', 'chembl_id', 'molregno']]
dataset4['Original drug name'] = dataset4['Original drug name'].apply(lambda x: x.upper().strip(' '))
ds4_result = ds4_mappings.merge(dataset4, on='Original drug name', how='inner')
ds4_result.to_csv(basedir + '/results/interim/PMID12667944_mapped_data.txt', sep='\t', index=False)