In [1]:
"""Mapping drugs from Schulz ea publication with plasma concentrations to ChEMBL approved drugs"""

'Mapping drugs from Schulz ea publication with plasma concentrations to ChEMBL approved drugs'

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option('display.max_colwidth',300)
pd.set_option('display.max_rows',300)

In [3]:
basedir = '/scratch/ias41/ae_code/plasma_concentrations'

In [4]:
plasma = pd.read_csv(basedir + '/data/Schulz_ea_upper_values_formatted.txt', sep='\t')
plasma['Substance'] = plasma['Substance'].apply(lambda x: x.rstrip(' ').upper())
plasma['Synonym'] = plasma['Synonym'].apply(lambda x: x.upper() if type(x)==str else np.NaN)

In [5]:
plasma.head(20)

Unnamed: 0,Substance,Salts,Synonym,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,Unit
0,ABACAVIR,,,3.9,
1,ACAMPROSATE,,,0.7,
2,ACEBUTOLOL,,,2.0,
3,ACENOCOUMAROL,,,0.5,
4,ACETALDEHYDE,,,30.0,
5,ACETAZOLAMIDE,,,20.0,
6,ACETOHEXAMIDE,,,70.0,
7,ACETONE,,,20.0,
8,ACETYLDIGOXIN,,,0.0008,
9,ACETYLSALICYLIC ACID,,,200.0,


In [6]:
approved = pd.read_csv(basedir + '/data/chembl_approved_drugs.txt', sep='\t')
approved['synonyms'] = approved['synonyms'].apply(lambda x: str(x).upper())

In [7]:
approved.head()

Unnamed: 0,chembl_id,molregno,pref_name,compound_name,compound_key,synonyms,syn_type,standard_inchi_key,canonical_smiles,mw_freebase
0,CHEMBL2,97,PRAZOSIN,"[4-(4-Amino-6,7-dimethoxy-quinazolin-2-yl)-piperazin-1-yl]-furan-2-yl-methanone",1,PRAZOSIN,ATC,IENZQIKPVFGBNW-UHFFFAOYSA-N,COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4,383.41
1,CHEMBL2,97,PRAZOSIN,"[4-(4-Amino-6,7-dimethoxy-quinazolin-2-yl)-piperazin-1-yl]-furan-2-yl-methanone",1,PRAZOSIN,BAN,IENZQIKPVFGBNW-UHFFFAOYSA-N,COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4,383.41
2,CHEMBL2,97,PRAZOSIN,"[4-(4-Amino-6,7-dimethoxy-quinazolin-2-yl)-piperazin-1-yl]-furan-2-yl-methanone",1,PRAZOSIN,FDA,IENZQIKPVFGBNW-UHFFFAOYSA-N,COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4,383.41
3,CHEMBL2,97,PRAZOSIN,"[4-(4-Amino-6,7-dimethoxy-quinazolin-2-yl)-piperazin-1-yl]-furan-2-yl-methanone",1,PRAZOSIN,INN,IENZQIKPVFGBNW-UHFFFAOYSA-N,COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4,383.41
4,CHEMBL2,97,PRAZOSIN,"[4-(4-Amino-6,7-dimethoxy-quinazolin-2-yl)-piperazin-1-yl]-furan-2-yl-methanone",1,CP-12299,RESEARCH_CODE,IENZQIKPVFGBNW-UHFFFAOYSA-N,COc1cc2nc(nc(N)c2cc1OC)N3CCN(CC3)C(=O)c4occc4,383.41


In [8]:
# Substance in plasma_renamed are unique
len(plasma), len(plasma['Substance'].drop_duplicates())

(873, 873)

In [9]:
# Pref_name in approved are not unique - some compounds with different chembl_id have same name
len(approved[['chembl_id', 'pref_name']].drop_duplicates()), len(approved[['pref_name']].drop_duplicates())

(3813, 3741)

In [10]:
# Find names with more than one chembl_id and are overlapping with plasma concentration data
approved_name_counts = approved[['chembl_id', 'pref_name','canonical_smiles']].drop_duplicates().groupby(by='pref_name').count()
approved_name_counts.loc[approved_name_counts['chembl_id']>1,:]

multiples = approved.loc[approved['pref_name'].isin(approved_name_counts.loc[approved_name_counts['chembl_id']>1,:].index),['chembl_id','pref_name','canonical_smiles']].drop_duplicates().sort_values(by='pref_name')

multiples_overlapping = approved.loc[(approved['pref_name'].isin(approved_name_counts.loc[approved_name_counts['chembl_id']>1,:].index))&(approved['pref_name'].isin(plasma['Substance'])),['chembl_id','pref_name','canonical_smiles']].drop_duplicates().sort_values(by='pref_name')

In [11]:
# Manually inspect these, easy to see which ones are hydrated in smiles
multiples_overlapping

Unnamed: 0,chembl_id,pref_name,canonical_smiles
455010,CHEMBL3544909,CALCIFEDIOL,O.C[C@H](CCCC(C)(C)O)[C@H]1CC[C@H]2\\C(=C\\C=C/3\\C[C@@H](O)CCC3=C)\\CCC[C@]12C
269079,CHEMBL1040,CALCIFEDIOL,C[C@H](CCCC(C)(C)O)[C@H]1CC[C@H]2\\C(=C\\C=C/3\\C[C@@H](O)CCC3=C)\\CCC[C@]12C
19577,CHEMBL14,CARBACHOL,[Cl-].C[N+](C)(C)CCOC(=O)N
254955,CHEMBL965,CARBACHOL,C[N+](C)(C)CCOC(=O)N
432690,CHEMBL1201236,CARBIDOPA,C[C@@](Cc1ccc(O)c(O)c1)(NN)C(=O)O
420439,CHEMBL1200748,CARBIDOPA,O.C[C@@](Cc1ccc(O)c(O)c1)(NN)C(=O)O
187236,CHEMBL680,CEFACLOR,N[C@@H](C(=O)N[C@H]1[C@H]2SCC(=C(N2C1=O)C(=O)O)Cl)c3ccccc3
427988,CHEMBL1201018,CEFACLOR,O.N[C@@H](C(=O)N[C@H]1[C@H]2SCC(=C(N2C1=O)C(=O)O)Cl)c3ccccc3
85505,CHEMBL88,CYCLOPHOSPHAMIDE,ClCCN(CCCl)P1(=O)NCCCO1
422118,CHEMBL1200796,CYCLOPHOSPHAMIDE,O.ClCCN(CCCl)P1(=O)NCCCO1


In [12]:
# Here are the ones to keep
keep = ['CHEMBL269732','CHEMBL3833412','CHEMBL1412', 'CHEMBL1228', 'CHEMBL1425', 'CHEMBL1237', 'CHEMBL148', 'CHEMBL1433', 'CHEMBL88', 'CHEMBL680', 'CHEMBL1201236', 'CHEMBL965', 'CHEMBL1040']

In [13]:
# Remove all multiples
# Add back in the ones to keep

approved_unique = approved.loc[~approved['chembl_id'].isin(multiples['chembl_id']),['chembl_id', 'pref_name']].drop_duplicates()
approved_keep = approved.loc[approved['chembl_id'].isin(keep),['chembl_id','pref_name']].drop_duplicates()
approved_unique = pd.concat([approved_unique, approved_keep])

In [14]:
len(approved_unique)

3700

In [15]:
pref_name_mappings = pd.merge(plasma, approved_unique, left_on = 'Substance', right_on='pref_name')
pref_name_mappings

Unnamed: 0,Substance,Salts,Synonym,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,Unit,chembl_id,pref_name
0,ABACAVIR,,,3.90,,CHEMBL1380,ABACAVIR
1,ACAMPROSATE,,,0.70,,CHEMBL1201293,ACAMPROSATE
2,ACEBUTOLOL,,,2.00,,CHEMBL642,ACEBUTOLOL
3,ACENOCOUMAROL,,,0.50,,CHEMBL397420,ACENOCOUMAROL
4,ACETAZOLAMIDE,,,20.00,,CHEMBL20,ACETAZOLAMIDE
...,...,...,...,...,...,...,...
625,ZOLPIDEM,,,0.15,,CHEMBL911,ZOLPIDEM
626,ZOMEPIRAC,,,4.00,,CHEMBL19490,ZOMEPIRAC
627,ZONISAMIDE,,,40.00,,CHEMBL750,ZONISAMIDE
628,ZOPICLONE,,,0.05,,CHEMBL135400,ZOPICLONE


In [16]:
# Currently unmapped, Substance name is not in mappings based on pref_name
unmapped1 = plasma.loc[~plasma['Substance'].isin(pref_name_mappings['Substance']),:]

In [17]:
unmapped1.head()

Unnamed: 0,Substance,Salts,Synonym,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,Unit
4,ACETALDEHYDE,,,30.0,
7,ACETONE,,,20.0,
8,ACETYLDIGOXIN,,,0.0008,
9,ACETYLSALICYLIC ACID,,,200.0,
14,ADIPIODONE,meglumine,,1200.0,


In [18]:
# Try synonym from plasma concentrations on chembl pref_name
synonym_pref_name_mappings_initial = pd.merge(unmapped1, approved[['pref_name', 'chembl_id','canonical_smiles']].drop_duplicates(), left_on='Synonym', right_on='pref_name').drop_duplicates().sort_values(by='Substance')
synonym_pref_name_mappings_initial = synonym_pref_name_mappings_initial.loc[~synonym_pref_name_mappings_initial['Synonym'].isnull(),:]

In [19]:
synonym_pref_name_mappings_initial

Unnamed: 0,Substance,Salts,Synonym,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,Unit,pref_name,chembl_id,canonical_smiles
2030,ALENDRONATE,,ALENDRONIC ACID,0.265,,ALENDRONIC ACID,CHEMBL870,NCCCC(O)(P(=O)(O)O)P(=O)(O)O
2031,AZAPROPAZONE,,APAZONE,90.0,,APAZONE,CHEMBL1565476,CCCC1C(=O)N2N(C1=O)c3cc(C)ccc3N=C2N(C)C
2032,DEXTROPROPOXYPHENE,,PROPOXYPHENE,0.5,,PROPOXYPHENE,CHEMBL1213351,CCC(=O)O[C@@](Cc1ccccc1)([C@H](C)CN(C)C)c2ccccc2
2033,ETAMSYLATE,,ETHAMSYLATE,20.0,,ETHAMSYLATE,CHEMBL1514715,CCNCC.Oc1ccc(O)c(c1)S(=O)(=O)O
2034,INDOMETACIN,,INDOMETHACIN,3.0,,INDOMETHACIN,CHEMBL6,COc1ccc2c(c1)c(CC(=O)O)c(C)n2C(=O)c3ccc(Cl)cc3
2035,MESALAZINE,,MESALAMINE,1.0,,MESALAMINE,CHEMBL704,Nc1ccc(O)c(c1)C(=O)O
2037,METAMIZOLE,,DIPYRONE,10.0,,DIPYRONE,CHEMBL3989803,O.[Na+].CN(CS(=O)(=O)[O-])C1=C(C)N(C)N(C1=O)c2ccccc2
2036,METAMIZOLE,,DIPYRONE,10.0,,DIPYRONE,CHEMBL461522,CN(CS(=O)(=O)O)C1=C(C)N(C)N(C1=O)c2ccccc2
2038,PAMIDRONATE,,PAMIDRONIC ACID,0.25,,PAMIDRONIC ACID,CHEMBL834,NCCC(O)(P(=O)(O)O)P(=O)(O)O
2039,PHENAZONE,,ANTIPYRINE,25.0,,ANTIPYRINE,CHEMBL277474,CN1N(C(=O)C=C1C)c2ccccc2


In [20]:
# Need to drop CHEMBL3989803, not sure about CHEMBL136478 either

In [21]:
synonym_pref_name_mappings = synonym_pref_name_mappings_initial.loc[~synonym_pref_name_mappings_initial['chembl_id'].isin(['CHEMBL3989803','CHEMBL136478']),['Substance','pref_name', 'Blood-plasma concentration (mg/L) therapeutic (normal) upper limit', 'chembl_id']]

In [22]:
all_mappings = pd.concat([synonym_pref_name_mappings,pref_name_mappings],sort=False,ignore_index=True)

In [23]:
all_mappings.loc[all_mappings['pref_name']=='DIPYRONE',:]

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit
6,METAMIZOLE,DIPYRONE,10.0,CHEMBL461522,,,


In [24]:
unmapped2 = plasma.loc[(~plasma['Substance'].isin(all_mappings['Substance']))&(~plasma['Synonym'].isin(all_mappings['pref_name'])),:]

In [25]:
len(unmapped2)

229

In [26]:
all_mappings.head()

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit
0,ALENDRONATE,ALENDRONIC ACID,0.265,CHEMBL870,,,
1,AZAPROPAZONE,APAZONE,90.0,CHEMBL1565476,,,
2,DEXTROPROPOXYPHENE,PROPOXYPHENE,0.5,CHEMBL1213351,,,
3,ETAMSYLATE,ETHAMSYLATE,20.0,CHEMBL1514715,,,
4,INDOMETACIN,INDOMETHACIN,3.0,CHEMBL6,,,


# Third round

In [27]:
# Plasma Substance to ChEMBL synonyms
substance_synonyms_mappings = pd.merge(unmapped2, approved, left_on='Substance', right_on='synonyms')[['pref_name','Substance','syn_type','chembl_id','canonical_smiles', 'Blood-plasma concentration (mg/L) therapeutic (normal) upper limit', 'Unit']].drop_duplicates()

In [28]:
# Inspect manually and decide which ones to drop
substance_synonyms_mappings.sort_values(by='Substance')

Unnamed: 0,pref_name,Substance,syn_type,chembl_id,canonical_smiles,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,Unit
869,FLUOROURACIL,5-FLUOROURACIL,OTHER,CHEMBL185,FC1=CNC(=O)NC1=O,0.3,
3,ASPIRIN,ACETYLSALICYLIC ACID,TRADE_NAME,CHEMBL25,CC(=O)Oc1ccccc1C(=O)O,200.0,
0,ASPIRIN,ACETYLSALICYLIC ACID,ATC,CHEMBL25,CC(=O)Oc1ccccc1C(=O)O,200.0,
1,ASPIRIN,ACETYLSALICYLIC ACID,INN,CHEMBL25,CC(=O)Oc1ccccc1C(=O)O,200.0,
2,ASPIRIN,ACETYLSALICYLIC ACID,OTHER,CHEMBL25,CC(=O)Oc1ccccc1C(=O)O,200.0,
304,IODIPAMIDE,ADIPIODONE,ATC,CHEMBL1165268,OC(=O)c1c(I)cc(I)c(NC(=O)CCCCC(=O)Nc2c(I)cc(I)c(C(=O)O)c2I)c1I,1200.0,
305,IODIPAMIDE,ADIPIODONE,BAN,CHEMBL1165268,OC(=O)c1c(I)cc(I)c(NC(=O)CCCCC(=O)Nc2c(I)cc(I)c(C(=O)O)c2I)c1I,1200.0,
306,IODIPAMIDE,ADIPIODONE,INN,CHEMBL1165268,OC(=O)c1c(I)cc(I)c(NC(=O)CCCCC(=O)Nc2c(I)cc(I)c(C(=O)O)c2I)c1I,1200.0,
307,IODIPAMIDE,ADIPIODONE,JAN,CHEMBL1165268,OC(=O)c1c(I)cc(I)c(NC(=O)CCCCC(=O)Nc2c(I)cc(I)c(C(=O)O)c2I)c1I,1200.0,
328,METHYLPROMAZINE,ALIMEMAZINE,ATC,CHEMBL829,CC(CN(C)C)CN1c2ccccc2Sc3ccccc13,0.4,


In [29]:
to_drop = ['CHEMBL1200544','CHEMBL1256696', 'CHEMBL1701', 'CHEMBL1201059', 'CHEMBL1536']
substance_synonyms_mappings_to_add = substance_synonyms_mappings.loc[~substance_synonyms_mappings['chembl_id'].isin(to_drop),all_mappings.columns].drop_duplicates()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [30]:
substance_synonyms_mappings_to_add

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit
0,ACETYLSALICYLIC ACID,ASPIRIN,200.0,CHEMBL25,,,
304,ADIPIODONE,IODIPAMIDE,1200.0,CHEMBL1165268,,,
328,ALIMEMAZINE,METHYLPROMAZINE,0.4,CHEMBL829,,,
349,AMINOPHENAZONE,AMINOPYRINE,20.0,CHEMBL288470,,,
387,AMMONIA,"AMMONIA SOLUTION, STRONG",1.7,CHEMBL1160819,,,
392,AMRINONE,INAMRINONE,4.0,CHEMBL12856,,,
497,BENDROFLUAZIDE,BENDROFLUMETHIAZIDE,0.1,CHEMBL1684,,,
507,BETACAROTENE,BETA CAROTENE,6.0,CHEMBL1293,,,
577,CANDESARTAN,CANDESARTAN CILEXETIL,0.18,CHEMBL1014,,,
588,CEFALEXIN,CEPHALEXIN,65.0,CHEMBL1727,,,


In [31]:
len(all_mappings)

644

In [32]:
all_mappings = pd.concat([all_mappings, substance_synonyms_mappings_to_add], sort=False,ignore_index=True).drop_duplicates()

In [33]:
len(all_mappings)

672

In [34]:
all_mappings.head()

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit
0,ALENDRONATE,ALENDRONIC ACID,0.265,CHEMBL870,,,
1,AZAPROPAZONE,APAZONE,90.0,CHEMBL1565476,,,
2,DEXTROPROPOXYPHENE,PROPOXYPHENE,0.5,CHEMBL1213351,,,
3,ETAMSYLATE,ETHAMSYLATE,20.0,CHEMBL1514715,,,
4,INDOMETACIN,INDOMETHACIN,3.0,CHEMBL6,,,


# Round 4

In [35]:
# Substance to compound name
round4_mappings = pd.merge(unmapped2, approved, left_on='Substance',right_on='compound_name')

In [36]:
round4_mappings

Unnamed: 0,Substance,Salts,Synonym,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,Unit,chembl_id,molregno,pref_name,compound_name,compound_key,synonyms,syn_type,standard_inchi_key,canonical_smiles,mw_freebase
0,CHLORTALIDONE,,,1.4,,CHEMBL1055,134333,CHLORTHALIDONE,CHLORTALIDONE,CHLORTALIDONE,CHLORTALIDONE,ATC,JIVPVXMEBJLZRO-UHFFFAOYSA-N,NS(=O)(=O)c1cc(ccc1Cl)C2(O)NC(=O)c3ccccc23,338.77
1,CHLORTALIDONE,,,1.4,,CHEMBL1055,134333,CHLORTHALIDONE,CHLORTALIDONE,CHLORTALIDONE,CHLORTALIDONE,BAN,JIVPVXMEBJLZRO-UHFFFAOYSA-N,NS(=O)(=O)c1cc(ccc1Cl)C2(O)NC(=O)c3ccccc23,338.77
2,CHLORTALIDONE,,,1.4,,CHEMBL1055,134333,CHLORTHALIDONE,CHLORTALIDONE,CHLORTALIDONE,CHLORTALIDONE,BNF,JIVPVXMEBJLZRO-UHFFFAOYSA-N,NS(=O)(=O)c1cc(ccc1Cl)C2(O)NC(=O)c3ccccc23,338.77
3,CHLORTALIDONE,,,1.4,,CHEMBL1055,134333,CHLORTHALIDONE,CHLORTALIDONE,CHLORTALIDONE,CHLORTHALIDONE,FDA,JIVPVXMEBJLZRO-UHFFFAOYSA-N,NS(=O)(=O)c1cc(ccc1Cl)C2(O)NC(=O)c3ccccc23,338.77
4,CHLORTALIDONE,,,1.4,,CHEMBL1055,134333,CHLORTHALIDONE,CHLORTALIDONE,CHLORTALIDONE,CHLORTALIDONE,INN,JIVPVXMEBJLZRO-UHFFFAOYSA-N,NS(=O)(=O)c1cc(ccc1Cl)C2(O)NC(=O)c3ccccc23,338.77
5,CHLORTALIDONE,,,1.4,,CHEMBL1055,134333,CHLORTHALIDONE,CHLORTALIDONE,CHLORTALIDONE,CHLORTALIDONE,JAN,JIVPVXMEBJLZRO-UHFFFAOYSA-N,NS(=O)(=O)c1cc(ccc1Cl)C2(O)NC(=O)c3ccccc23,338.77
6,CHLORTALIDONE,,,1.4,,CHEMBL1055,134333,CHLORTHALIDONE,CHLORTALIDONE,CHLORTALIDONE,CHLORTHALIDONE,OTHER,JIVPVXMEBJLZRO-UHFFFAOYSA-N,NS(=O)(=O)c1cc(ccc1Cl)C2(O)NC(=O)c3ccccc23,338.77
7,CHLORTALIDONE,,,1.4,,CHEMBL1055,134333,CHLORTHALIDONE,CHLORTALIDONE,CHLORTALIDONE,G-33182,RESEARCH_CODE,JIVPVXMEBJLZRO-UHFFFAOYSA-N,NS(=O)(=O)c1cc(ccc1Cl)C2(O)NC(=O)c3ccccc23,338.77
8,CHLORTALIDONE,,,1.4,,CHEMBL1055,134333,CHLORTHALIDONE,CHLORTALIDONE,CHLORTALIDONE,CHLORTHALIDONE,TRADE_NAME,JIVPVXMEBJLZRO-UHFFFAOYSA-N,NS(=O)(=O)c1cc(ccc1Cl)C2(O)NC(=O)c3ccccc23,338.77
9,CHLORTALIDONE,,,1.4,,CHEMBL1055,134333,CHLORTHALIDONE,CHLORTALIDONE,CHLORTALIDONE,HYGROTON,TRADE_NAME,JIVPVXMEBJLZRO-UHFFFAOYSA-N,NS(=O)(=O)c1cc(ccc1Cl)C2(O)NC(=O)c3ccccc23,338.77


In [37]:
# Only two compounds in above, which have not been mapped yet (so don't need to regenerate unmapped for this round)
all_mappings.loc[all_mappings['pref_name'].isin(['CHLORTALIDONE', 'PIPERACILLINE']),:]

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit


In [38]:
len(all_mappings)

672

In [39]:
# Append these to all_mappings
all_mappings = pd.concat([all_mappings, round4_mappings[all_mappings.columns].drop_duplicates()], sort=False, ignore_index=True).drop_duplicates()

In [40]:
len(all_mappings)

673

# Round 5
Synonyms to synonyms

In [41]:
all_mappings.head()

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit
0,ALENDRONATE,ALENDRONIC ACID,0.265,CHEMBL870,,,
1,AZAPROPAZONE,APAZONE,90.0,CHEMBL1565476,,,
2,DEXTROPROPOXYPHENE,PROPOXYPHENE,0.5,CHEMBL1213351,,,
3,ETAMSYLATE,ETHAMSYLATE,20.0,CHEMBL1514715,,,
4,INDOMETACIN,INDOMETHACIN,3.0,CHEMBL6,,,


In [42]:
unmapped5 = plasma.loc[~plasma['Substance'].isin(all_mappings['Substance']),:]

In [43]:
len(unmapped5)

200

In [44]:
# Synonyms to synonyms
syn_to_syn_mappings = pd.merge(unmapped5.loc[~unmapped5['Synonym'].isnull(),:], approved, left_on='Synonym',right_on='synonyms')[['Substance','pref_name','Synonym','chembl_id','canonical_smiles', 'Blood-plasma concentration (mg/L) therapeutic (normal) upper limit', 'Unit']].drop_duplicates()
syn_to_syn_mappings

Unnamed: 0,Substance,pref_name,Synonym,chembl_id,canonical_smiles,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,Unit
0,4-AMINOPYRIDINE,DALFAMPRIDINE,FAMPRIDINE,CHEMBL284348,Nc1ccncc1,0.075,
108,"3,4-DIAMINOPYRIDIN",AMIFAMPRIDINE,DAP,CHEMBL354077,Nc1ccncc1N,0.1,
116,GOLD,GOLD SODIUM THIOMALATE,SODIUM AUROTHIOMALATE,CHEMBL306043,,8.0,
128,THIOCYANATE FROM NITROPRUSSIDE,SODIUM NITROPRUSSIDE,SODIUM NITROPRUSSIDE,CHEMBL136478,,30.0,


In [45]:
len(all_mappings)

673

In [46]:
all_mappings = pd.concat([all_mappings, syn_to_syn_mappings], sort=False, ignore_index=True)

In [47]:
len(all_mappings)

677

In [48]:
# Check these manually?
unmapped5.loc[(~unmapped5['Synonym'].isnull())&(unmapped5['Synonym'].str.contains(',')),:]

Unnamed: 0,Substance,Salts,Synonym,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,Unit
95,BISMUTH,,"WISMUT, BISMUT",0.1,
396,BETA-HEXACHLOROCYCLOHEXANE,,"BETA-HCH, BETA-LINDANE",0.0001,
405,4-HYDROXYBUTYRATE,,"GHB, SODIUM OXYBATE",120.0,
469,LYSERGIDE,,"LYSERGIC ACID DIETHYL AMIDE, LSD",0.005,
510,"3,4-METHYLENEDIOXYETHYLAMPHETAMINE",,"MDEA, MDE; EVE",0.2,
511,"3,4-METHYLENEDIOXYMETHYLAMPHETAMINE",,"MDMA; ECSTASY, XTC; ADAM",0.35,


In [49]:
all_mappings.head()

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit,canonical_smiles
0,ALENDRONATE,ALENDRONIC ACID,0.265,CHEMBL870,,,,
1,AZAPROPAZONE,APAZONE,90.0,CHEMBL1565476,,,,
2,DEXTROPROPOXYPHENE,PROPOXYPHENE,0.5,CHEMBL1213351,,,,
3,ETAMSYLATE,ETHAMSYLATE,20.0,CHEMBL1514715,,,,
4,INDOMETACIN,INDOMETHACIN,3.0,CHEMBL6,,,,


In [50]:
# hydroxybutyrate hasn't been mapped yet
all_mappings.loc[all_mappings['Substance']=='4-HYDROXYBUTYRATE',:]

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit,canonical_smiles


In [51]:
oxybate = plasma.loc[plasma['Substance']=='4-HYDROXYBUTYRATE',:]

In [52]:
approved.loc[approved['pref_name']=='OXYBATE',['chembl_id', 'pref_name']].drop_duplicates()

Unnamed: 0,chembl_id,pref_name
301588,CHEMBL1342,OXYBATE


In [53]:
oxybate['chembl_id'] = 'CHEMBL1342'
oxybate['pref_name'] = 'OXYBATE'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [54]:
len(all_mappings)

677

In [55]:
all_mappings = pd.concat([all_mappings, oxybate], sort=False, ignore_index=True)

In [56]:
len(all_mappings)

678

In [57]:
all_mappings.tail()

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit,canonical_smiles
673,4-AMINOPYRIDINE,DALFAMPRIDINE,0.075,CHEMBL284348,,FAMPRIDINE,,Nc1ccncc1
674,"3,4-DIAMINOPYRIDIN",AMIFAMPRIDINE,0.1,CHEMBL354077,,DAP,,Nc1ccncc1N
675,GOLD,GOLD SODIUM THIOMALATE,8.0,CHEMBL306043,,SODIUM AUROTHIOMALATE,,
676,THIOCYANATE FROM NITROPRUSSIDE,SODIUM NITROPRUSSIDE,30.0,CHEMBL136478,,SODIUM NITROPRUSSIDE,,
677,4-HYDROXYBUTYRATE,OXYBATE,120.0,CHEMBL1342,,"GHB, SODIUM OXYBATE",,


In [58]:
# Checking that all 'substance' only have one mapping to chembl_id
test_counts = all_mappings.groupby('Substance').count()

In [59]:
test_counts.loc[test_counts['pref_name']>1,:]

Unnamed: 0_level_0,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit,canonical_smiles
Substance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [60]:
len(all_mappings)

678

# See currently unmapped

In [61]:
unmapped6 = plasma.loc[~plasma['Substance'].isin(all_mappings['Substance']),:]
len(unmapped6)

195

In [62]:
unmapped6.head()

Unnamed: 0,Substance,Salts,Synonym,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,Unit
4,ACETALDEHYDE,,,30.0,
7,ACETONE,,,20.0,
8,ACETYLDIGOXIN,,,0.0008,
16,AJMALINE,,,2.21,
19,ALDRIN,,,0.0015,


In [63]:
unmapped6['short substance'] = unmapped6['Substance'].apply(lambda x: x.rstrip('E'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


## Attach mappings after considering trailing 'e'

In [64]:
# Noticed some names had an 'e' on the end that prevented synonym mapping
my_columns = [i for i in all_mappings.columns]
short_names = pd.merge(unmapped6, approved, left_on='short substance', right_on='pref_name')[my_columns].drop_duplicates()
short_names

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit,canonical_smiles
0,ALFUZOSINE,ALFUZOSIN,0.06,CHEMBL709,,,,COc1cc2nc(nc(N)c2cc1OC)N(C)CCCNC(=O)C3CCCO3
60,DIBENZEPINE,DIBENZEPIN,0.5,CHEMBL1442422,,,,CN(C)CCN1C(=O)c2ccccc2N(C)c3ccccc13
72,LINEZOLIDE,LINEZOLID,4.0,CHEMBL126,,,,CC(=O)NC[C@H]1CN(C(=O)O1)c2ccc(N3CCOCC3)c(F)c2


In [65]:
len(all_mappings)

678

In [66]:
all_mappings = pd.concat([all_mappings, short_names], sort = False, ignore_index=True)

In [67]:
len(all_mappings)

681

In [68]:
all_mappings.tail()

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit,canonical_smiles
676,THIOCYANATE FROM NITROPRUSSIDE,SODIUM NITROPRUSSIDE,30.0,CHEMBL136478,,SODIUM NITROPRUSSIDE,,
677,4-HYDROXYBUTYRATE,OXYBATE,120.0,CHEMBL1342,,"GHB, SODIUM OXYBATE",,
678,ALFUZOSINE,ALFUZOSIN,0.06,CHEMBL709,,,,COc1cc2nc(nc(N)c2cc1OC)N(C)CCCNC(=O)C3CCCO3
679,DIBENZEPINE,DIBENZEPIN,0.5,CHEMBL1442422,,,,CN(C)CCN1C(=O)c2ccccc2N(C)c3ccccc13
680,LINEZOLIDE,LINEZOLID,4.0,CHEMBL126,,,,CC(=O)NC[C@H]1CN(C(=O)O1)c2ccc(N3CCOCC3)c(F)c2


## Attach the mw_freebase

In [69]:
all_mappings.head()

Unnamed: 0,Substance,pref_name,Blood-plasma concentration (mg/L) therapeutic (normal) upper limit,chembl_id,Salts,Synonym,Unit,canonical_smiles
0,ALENDRONATE,ALENDRONIC ACID,0.265,CHEMBL870,,,,
1,AZAPROPAZONE,APAZONE,90.0,CHEMBL1565476,,,,
2,DEXTROPROPOXYPHENE,PROPOXYPHENE,0.5,CHEMBL1213351,,,,
3,ETAMSYLATE,ETHAMSYLATE,20.0,CHEMBL1514715,,,,
4,INDOMETACIN,INDOMETHACIN,3.0,CHEMBL6,,,,


In [70]:
with_freebase_initial = pd.merge(all_mappings, approved, on='chembl_id', suffixes=('', '_y'))

In [71]:
mylist = ['Substance'
 , 'pref_name'
 , 'Blood-plasma concentration (mg/L) therapeutic (normal) upper limit'
 , 'Unit'
 , 'chembl_id'
 , 'molregno'
 , 'mw_freebase']

In [72]:
with_freebase = with_freebase_initial.loc[:,mylist].drop_duplicates().reset_index(drop=True,inplace=False)
with_freebase['Unit'] = with_freebase['Unit'].fillna('mg/L')

In [73]:
with_freebase.columns

Index(['Substance', 'pref_name',
       'Blood-plasma concentration (mg/L) therapeutic (normal) upper limit',
       'Unit', 'chembl_id', 'molregno', 'mw_freebase'],
      dtype='object')

In [74]:
with_freebase.columns = ['Original source Substance', 'pref_name',
       'Blood-plasma concentration, therapeutic (normal) upper limit',
       'Unit', 'chembl_id', 'molregno','mw_freebase']

In [75]:
all_info = with_freebase[['Original source Substance','pref_name', 'chembl_id', 'molregno','mw_freebase','Blood-plasma concentration, therapeutic (normal) upper limit', 'Unit']].sort_values(by='pref_name')

In [76]:
all_info.head()

Unnamed: 0,Original source Substance,pref_name,chembl_id,molregno,mw_freebase,"Blood-plasma concentration, therapeutic (normal) upper limit",Unit
14,ABACAVIR,ABACAVIR,CHEMBL1380,321707,286.34,3.9,mg/L
15,ACAMPROSATE,ACAMPROSATE,CHEMBL1201293,675244,181.21,0.7,mg/L
16,ACEBUTOLOL,ACEBUTOLOL,CHEMBL642,27347,336.43,2.0,mg/L
17,ACENOCOUMAROL,ACENOCOUMAROL,CHEMBL397420,394206,353.33,0.5,mg/L
665,PARACETAMOL,ACETAMINOPHEN,CHEMBL112,16450,151.16,25.0,mg/L


In [77]:
all_info.loc[all_info['Unit']!='mg/L',:]

Unnamed: 0,Original source Substance,pref_name,chembl_id,molregno,mw_freebase,"Blood-plasma concentration, therapeutic (normal) upper limit",Unit
174,CYSTEAMINE,CYSTEAMINE,CHEMBL602,21036,77.15,20.0,umol/L
193,DIDANOSINE,DIDANOSINE,CHEMBL1460,390877,236.23,30.0,umol/L
285,GEMCITABINE,GEMCITABINE,CHEMBL888,75901,263.2,20.0,umol/L


In [78]:
all_info.to_csv(basedir + '/results/interim/Schulz_ea_mapped_upper_plasma_concentrations.txt', sep='\t',index=None)