In [1]:
"""Clean up various characters etc from the extracted datasets of Fu/PPB from literature"""

'Clean up various characters etc from the datasets'

In [2]:
#### RDKIT env ###

In [3]:
import pandas as pd
import re
from rdkit import Chem



In [4]:
basedir = '/scratch/ias41/ae_code/plasma_concentrations'

In [5]:
pd.set_option('display.max_rows', 1000)

### Dataset 1 - PMID15637086

In [6]:
dataset1 = pd.read_excel(basedir + '/data/PMID15637086_ETPC_from_HTML.xlsx')
dataset1.columns = [i.strip('\xa0') for i in dataset1.columns]

In [7]:
dataset1 = dataset1.applymap(lambda x: x.replace('\xa0','').replace('\u2002', '').replace('\u20021',''))

In [8]:
def remove_numbers(x):
    pattern = re.compile('[0-9]+\.(.*)')
    drug_name = re.match(pattern, x).group(1)
    return drug_name

In [9]:
dataset1['Drug name'] = dataset1['Drug name'].apply(lambda x: remove_numbers(x))

In [10]:
dataset1['ETCPun-bound (nM)'] = dataset1['ETCPun-bound (nM)'].apply(lambda x: x.replace(' ', ''))

In [11]:
# drop drug combination
dataset1 = dataset1.drop(42)

In [12]:
dataset1['Drug name'] = dataset1['Drug name'].apply(lambda x: x.replace(' i.v.', ''))

In [13]:
dataset1_selection = dataset1[['Drug name', 'ETCPun-bound (nM)']]
dataset1_selection.columns = ['Original drug name', 'ETCP unbound (nM)']
dataset1_selection['PMID'] = 'PMID15637086'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# Save to file
dataset1_selection.to_csv(basedir + '/results/interim/PMID15637086_data.txt', sep='\t', index=None)

### Dataset 2 - PMID22210121

In [15]:
dataset2 = pd.read_excel(basedir + '/data/PMID22210121_PPB_from_HTML.xlsx')

In [16]:
dataset2['fb(%)'] = dataset2['fb(%)'].apply(lambda x: str(x).replace('<', '').replace('>', ''))

In [17]:
def remove_approx(x):
    x = x.replace('\xa0', ' ').replace('\xa06', ' ')
    pattern = re.compile('([0-9\.]+) ± [0-9/.]+')
    match = re.match(pattern, x)
    if match:
        return match.group(1)
    else:
        return x

In [18]:
dataset2['fb(%)'] = dataset2['fb(%)'].apply(lambda x: remove_approx(x))

In [19]:
def find_upper_bound(x):
    match = re.match('([0-9]+)–([0-9]+)', str(x))
    if match:
        return float(match.group(1)), float(match.group(2))
    else:
        return float(x), None

In [20]:
dataset2['fb upper'] = dataset2['fb(%)'].apply(lambda x: find_upper_bound(x)[1])

In [21]:
dataset2['fb lower'] = dataset2['fb(%)'].apply(lambda x: find_upper_bound(x)[0])

In [22]:
dataset2_selection = dataset2[['Drugs', 'fb lower', 'fb upper']]
dataset2_selection['PMID'] = 'PMID22210121'
dataset2_selection.columns = ['Drug name', 'PPB lower', 'PPB upper', 'PMID']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [23]:
def separate_synonyms(x):
    pattern = '(.+)\((.+)\)'
    match = re.match(pattern, x)
    if match:
        return match.group(1).strip(' '), match.group(2)
    else:
        return x, None

In [24]:
dataset2_selection['Original drug name'] = dataset2_selection['Drug name'].apply(lambda x: separate_synonyms(x)[0])
dataset2_selection['Original synonyms'] = dataset2_selection['Drug name'].apply(lambda x: separate_synonyms(x)[1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [25]:
# Save to file
dataset2_selection[['Original drug name', 'Original synonyms', 'PPB lower', 'PPB upper', 'PMID']].to_csv(basedir + '/results/interim/PMID22210121_data.txt', sep='\t', index=None)

### Dataset 3 - PMID30115648

In [26]:
dataset3 = pd.read_excel(basedir + '/data/PMID30115648_Fu.xlsx')

In [27]:
dataset3_selection = dataset3.loc[~dataset3['fraction unbound \nin plasma (fu)'].isnull(),:][['Name', 'SMILES', 'fraction unbound \nin plasma (fu)']]

In [28]:
dataset3_selection['Original drug name'] = dataset3_selection['Name'].apply(lambda x: separate_synonyms(x)[0])
dataset3_selection['Original synonyms'] = dataset3_selection['Name'].apply(lambda x: separate_synonyms(x)[1])

In [29]:
def smiles_to_inchikey(x):
    try:
        mol = Chem.MolFromSmiles(x)
        if mol:
            inchikey = Chem.InchiToInchiKey(Chem.inchi.MolToInchi(mol))
            return inchikey
    except NameError:
        return None

In [30]:
dataset3_selection['inchi_key'] = dataset3_selection['SMILES'].apply(lambda x: smiles_to_inchikey(x))

RDKit ERROR: [22:14:02] Explicit valence for atom # 3 O, 3, is greater than permitted
RDKit ERROR: [22:14:02] Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [22:14:02] Explicit valence for atom # 6 N, 4, is greater than permitted
RDKit ERROR: [22:14:02] SMILES Parse Error: syntax error while parsing: C(C([N+]([Gd+++]([N+]1(C2)C3)([N+](C2)(C2)C4)([O-]5)([O-]6)([O-]C3=O)([O-]C4=O)[O-]C2=O)(CC6=O)CC5=O)C1)C(=CC=C1OCC)C=C1
RDKit ERROR: [22:14:02] SMILES Parse Error: Failed parsing SMILES 'C(C([N+]([Gd+++]([N+]1(C2)C3)([N+](C2)(C2)C4)([O-]5)([O-]6)([O-]C3=O)([O-]C4=O)[O-]C2=O)(CC6=O)CC5=O)C1)C(=CC=C1OCC)C=C1' for input: 'C(C([N+]([Gd+++]([N+]1(C2)C3)([N+](C2)(C2)C4)([O-]5)([O-]6)([O-]C3=O)([O-]C4=O)[O-]C2=O)(CC6=O)CC5=O)C1)C(=CC=C1OCC)C=C1'
RDKit ERROR: [22:14:02] Explicit valence for atom # 3 N, 4, is greater than permitted
RDKit ERROR: [22:14:02] Explicit valence for atom # 6 O, 3, is greater than permitted
RDKit ERROR: [22:14:02] SMILES Parse Error: syntax err

In [31]:
dataset3_selection.columns = ['Name', 'SMILES', 'Fu',
       'Original drug name', 'Original synonyms', 'inchi_key']

In [32]:
# save to file
dataset3_selection['PMID'] = 'PMID30115648'
dataset3_selection[['Original drug name', 'Original synonyms', 'Fu', 'PMID', 'inchi_key', 'SMILES']].to_csv(basedir + '/results/interim/PMID30115648_data.txt', sep='\t', index=None)

### Dataset 4 - Redfern et al.

In [33]:
dataset4 = pd.read_excel(basedir + '/data/PMID12667944_PPB.xlsx')

In [34]:
dataset4.drop(labels = ['ETPC ng/ml lower bound','ETPC ng/ml higher bound'], axis=1, inplace=True)

In [35]:
dataset4.columns = ['Original drug name', 'PPB', 'PPB upper', 'ETCP unbound (nM)',
       'ETCP unbound (nM) upper']

In [36]:
dataset4['PMID'] = 'PMID12667944'
dataset4.to_csv(basedir + '/results/interim/PMID12667944_data.txt', sep='\t', index=False)