In [3]:
"""Combine previously extracted Fu/PPB data with Fu/PPB data from ChEMBL into one file"""

'Combine previously extracted Fu/PPB data with Fu/PPB data from ChEMBL into one file'

In [1]:
import os

import pandas as pd

In [3]:
# Root directory of the project tree; the input and output paths in the cells
# below are built by string concatenation onto it, so it must stay a plain
# string without a trailing slash.
# The location can be overridden through the PLASMA_CONCENTRATIONS_DIR
# environment variable so the notebook also runs outside the original machine;
# when the variable is unset the original path is used.
basedir = os.environ.get('PLASMA_CONCENTRATIONS_DIR', '/scratch/ias41/ae_code/plasma_concentrations')

In [5]:
# Read the four literature-derived tables (one file per PMID) from the interim
# results folder, followed by the ChEMBL extract. All files are tab-separated.
interim_dir = basedir + '/results/interim/'
dataset1, dataset2, dataset3, dataset4 = (
    pd.read_csv(interim_dir + mapped_file, sep='\t')
    for mapped_file in (
        'PMID22210121_mapped_data.txt',
        'PMID15637086_mapped_data.txt',
        'PMID30115648_mapped_data.txt',
        'PMID12667944_mapped_data.txt',
    )
)
chembl_data = pd.read_csv(basedir + '/data/chembl_fu_ppb.txt', sep='\t')

In [6]:
# Keep only ChEMBL records that (1) come from a human assay, (2) carry no
# data-validity comment and (3) have a non-missing standard_value.
is_human = chembl_data['assay_organism'] == 'Homo sapiens'
no_validity_comment = chembl_data['data_validity_comment'].isnull()
has_value = chembl_data['standard_value'].notnull()
chembl_human = chembl_data.loc[is_human & no_validity_comment & has_value, :]

In [7]:
# Print the column labels of every input table so the schemas can be compared
# before they are harmonised below.
for frame in (dataset1, dataset2, dataset3, dataset4, chembl_human):
    print(frame.columns)

Index(['Original drug name', 'chembl_id', 'molregno', 'Original synonyms',
       'PPB lower', 'PPB upper', 'PMID'],
      dtype='object')
Index(['Original drug name', 'chembl_id', 'molregno', 'ETCP unbound (nM)',
       'PMID'],
      dtype='object')
Index(['Original drug name', 'chembl_id', 'molregno', 'Original synonyms',
       'Fu', 'PMID', 'inchi_key', 'SMILES'],
      dtype='object')
Index(['Original drug name', 'chembl_id', 'molregno', 'PPB', 'PPB upper',
       'ETCP unbound (nM)', 'ETCP unbound (nM) upper', 'PMID'],
      dtype='object')
Index(['parent_molregno', 'parent_chembl_id', 'pref_name', 'max_phase',
       'standard_type', 'standard_relation', 'standard_value', 'upper_value',
       'standard_flag', 'assay_test_type', 'description', 'assay_organism',
       'assay_tissue', 'activity_comment', 'data_validity_comment',
       'assay_id'],
      dtype='object')


In [8]:
# Number of distinct parent_molregno values among the human ChEMBL records.
len(chembl_human['parent_molregno'].unique())

5542

In [9]:
# Stack the raw tables row-wise without harmonising their columns; columns that
# a table does not have are filled with NaN for its rows.
frames_to_stack = [dataset1, dataset2, dataset3, dataset4, chembl_human]
concatenated = pd.concat(frames_to_stack, sort=False, ignore_index=True)

In [10]:
# Distinct parent_molregno values after stacking.
# NOTE(review): only the ChEMBL table has a 'parent_molregno' column (the
# literature tables use 'molregno'), so the literature rows are all NaN here
# and contribute a single extra value. This is not a count of distinct
# compounds across sources - confirm that is what was meant.
len(concatenated['parent_molregno'].unique())

5543

In [11]:
# Target columns shared by every dataset prepared below: value, upper_value, type, assay_id (ChEMBL rows only), reference

### Prepare each dataset

In [12]:
# New frame derived from dataset1 with a constant 'type' column appended that
# labels every row as a PPB measurement; dataset1 itself is left untouched.
dataset1_selection = dataset1.assign(type='PPB')

In [13]:
# Map the source-specific labels onto the shared schema. Renaming by name
# (rather than overwriting the whole column list by position) keeps each label
# attached to the right data even if the column order of the input file changes.
dataset1_selection = dataset1_selection.rename(columns={
    'PPB lower': 'value',
    'PPB upper': 'upper_value',
    'PMID': 'reference',
})

In [14]:
# Work on a copy so that later changes to dataset2_selection leave dataset2 untouched.
dataset2_selection = dataset2.copy()

In [15]:
# Label every row with its measurement type, then map the source-specific
# column labels onto the shared schema. Renaming by name (instead of
# overwriting the column list by position) does not depend on column order.
# The rename only touches column labels; the 'type' values stay as assigned.
dataset2_selection = (
    dataset2_selection
    .assign(type='ETCP unbound (nM)')
    .rename(columns={'ETCP unbound (nM)': 'value', 'PMID': 'reference'})
)

In [16]:
# Keep only the columns needed downstream (the remaining dataset3 columns are
# left out) and tag every row as an Fu measurement.
dataset3_columns = ['Original drug name', 'chembl_id', 'molregno', 'Original synonyms',
                    'Fu', 'PMID']
dataset3_selection = dataset3[dataset3_columns].assign(type='Fu')

In [17]:
# Map the source-specific labels onto the shared schema by name, so the result
# does not depend on the column order of the frame.
dataset3_selection = dataset3_selection.rename(columns={'Fu': 'value', 'PMID': 'reference'})

In [18]:
# dataset4 holds both PPB and ETCP measurements in one row, so it is split
# into one frame per measurement type. This cell builds the PPB part:
#   1. remove the ETCP columns,
#   2. discard rows where both PPB and PPB upper are missing,
#   3. tag the rows as PPB,
#   4. map the column labels onto the shared schema by name (not by position,
#      so the result does not depend on column order).
dataset4_ppb = (
    dataset4
    .drop(columns=['ETCP unbound (nM)', 'ETCP unbound (nM) upper'])
    .dropna(subset=['PPB', 'PPB upper'], how='all')
    .assign(type='PPB')
    .rename(columns={'PPB': 'value', 'PPB upper': 'upper_value', 'PMID': 'reference'})
)

In [19]:
# Display dataset4's column labels as a reference for the PPB / ETCP split.
dataset4.columns

Index(['Original drug name', 'chembl_id', 'molregno', 'PPB', 'PPB upper',
       'ETCP unbound (nM)', 'ETCP unbound (nM) upper', 'PMID'],
      dtype='object')

In [20]:
# ETCP part of dataset4: remove the PPB columns, discard rows where both ETCP
# columns are missing, and tag the remaining rows with their measurement type.
dataset4_etcp = (
    dataset4
    .drop(columns=['PPB', 'PPB upper'])
    .dropna(subset=['ETCP unbound (nM)', 'ETCP unbound (nM) upper'], how='all')
    .assign(type='ETCP unbound (nM)')
)

In [21]:
# Map the source-specific labels onto the shared schema by name, so the result
# does not depend on the column order of the frame. Only column labels are
# renamed; the values in the 'type' column are unaffected.
dataset4_etcp = dataset4_etcp.rename(columns={
    'ETCP unbound (nM)': 'value',
    'ETCP unbound (nM) upper': 'upper_value',
    'PMID': 'reference',
})

In [22]:
# Exclude assays whose tissue annotation is Serum, Kidney, Liver or Urine.
# Rows with any other tissue, or with no tissue annotation at all, are kept.
excluded_tissues = ['Serum', 'Kidney', 'Liver', 'Urine']
in_excluded_tissue = chembl_human['assay_tissue'].isin(excluded_tissues)
chembl_selection = chembl_human.loc[~in_excluded_tissue, :]

In [23]:
# Remove the ChEMBL columns that are not carried into the combined table.
# drop() is used without inplace=True: chembl_selection was obtained by slicing
# another frame, and mutating such a slice in place raises pandas'
# SettingWithCopyWarning. Rebinding the name to the returned frame avoids that.
# NOTE(review): 'parent_chembl_id' is dropped here, so the ChEMBL rows have no
# 'chembl_id' value in the combined table - confirm this is intended.
chembl_selection = chembl_selection.drop(columns=[
    'parent_chembl_id', 'max_phase', 'standard_relation', 'standard_flag',
    'assay_test_type', 'description', 'assay_organism', 'assay_tissue',
    'activity_comment', 'data_validity_comment',
])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [24]:
# Map the ChEMBL labels onto the shared schema by name, so the result does not
# depend on column order. 'upper_value' and 'assay_id' already carry their
# final names and are left as they are.
chembl_selection = chembl_selection.rename(columns={
    'parent_molregno': 'molregno',
    'pref_name': 'Original drug name',
    'standard_type': 'type',
    'standard_value': 'value',
})

In [25]:
# Stack all harmonised tables row-wise. Columns that a source lacks are filled
# with NaN for its rows, and the row index is renumbered from 0.
harmonised_frames = [
    dataset1_selection,
    dataset2_selection,
    dataset3_selection,
    dataset4_ppb,
    dataset4_etcp,
    chembl_selection,
]
all_data = pd.concat(harmonised_frames, sort=False, ignore_index=True)

In [27]:
# Only the drug-name column gets a new label in the output. Renaming it by name
# avoids overwriting all labels by position, which would silently mislabel the
# data if the concatenated column order ever differed.
all_data = all_data.rename(columns={'Original drug name': 'Drug name'})

# Fail loudly if the combined table does not have exactly the expected schema
# (for example after a change to one of the input files).
expected_columns = ['Drug name', 'chembl_id', 'molregno', 'Original synonyms',
                    'value', 'upper_value', 'reference', 'type', 'assay_id']
assert list(all_data.columns) == expected_columns, list(all_data.columns)

In [28]:
# Write the combined table as a tab-separated file without the row index.
output_path = basedir + '/results/interim/combined_fu_ppb_etcp.txt'
all_data.to_csv(output_path, sep='\t', index=False)