In [1]:
"""Take mean of lower-upper Fu/PPB per publication, take medians of Fus from different publications, calculate Fu and multiply by total plasma conc. to get unbound. Add the few extracted unbound concentrations back and take median per compound. Add back a few total concentrations without unbound info."""

'Take mean of lower-upper Fu/PPB per publication, take medians of Fus from different publications, calculate Fu and multiply by total plasma conc. to get unbound. Add the few extracted unbound concentrations back and take median per compound. Add back a few total concentrations without unbound info.'

In [1]:
import pandas as pd
import numpy as np

1. if unbound concentration available, keep apart to combine later
2. for other data, take median of various fu/ppb figures
3. merge that data with plasma concentrations and use to calculate 'calculated unbound concentration'
4. combined with etcp data
4. convert to -log(molar unbound) 


In [2]:
basedir = '/scratch/ias41/ae_code/plasma_concentrations'

In [32]:
combined_data = pd.read_csv(basedir + '/results/interim/combined_fu_ppb_etcp.txt', sep='\t')
plasma_conc = pd.read_csv(basedir + '/results/interim/molregno2median_total_plasma_conc.txt', sep='\t')
approved_drugs = pd.read_csv(basedir + '/data/chembl_approved_drugs.txt', sep='\t')

In [4]:
molregno2pref_name = dict()
for row in approved_drugs[['molregno','pref_name']].drop_duplicates().iterrows():
    molregno2pref_name[row[1]['molregno']] = row[1]['pref_name']

In [5]:
molregno2pref_name[97]

'PRAZOSIN'

In [6]:
plasma_conc.head()

Unnamed: 0,molregno,pref_name,parent_chembl_id,median_Molar_value,median_pMolar_value
0,97,PRAZOSIN,CHEMBL2,5.216348e-08,7.282633
1,115,NICOTINE,CHEMBL3,1.849112e-07,6.733037
2,146,OFLOXACIN,CHEMBL4,8.025016e-06,5.095554
3,147,NALIDIXIC ACID,CHEMBL5,0.0001291767,3.888816
4,173,INDOMETHACIN,CHEMBL6,5.589871e-06,5.252598


In [33]:
plasma_conc.columns = ['molregno', 'pref_name','chembl_id','median Molar total plasma concentration', 'median pMolar total plasma concentration']

In [9]:
etcp_data = combined_data.loc[combined_data['type']=='ETCP unbound (nM)',:]
other_data = combined_data.loc[combined_data['type']!='ETCP unbound (nM)',:]

#### For ETCP data, convert to concentrations to Molar value. If there are two values, take mean

In [10]:
def value_nM_to_M(row):
    # Take higher range value
    if pd.isna(row['upper_value']):
        return (row['value'] * (10**-9))
    if pd.isna(row['upper_value']) == False:
        return (np.mean([row['value'], row['upper_value']]) * (10**-9))

In [11]:
etcp_data['unbound_molar_value'] = etcp_data.apply(value_nM_to_M, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


#### Take mean of available upper and lower PPB measurements per compound from same publication

In [12]:
# Invert PPB
def calculate_fu(x):
    if x['type'] == 'PPB':
        if pd.isna(x['upper_value']):
            Fu = (100 - x['value'])/100
        elif pd.isna(x['upper_value']) == False:
            # in case a range is reported - take average of value and upper_value
            ppb_average = (x['value']+x['upper_value'])/2
            fu = Fu = (100 - ppb_average)/100
        return Fu
    elif x['type'] == 'Fu':
        return x['value']

In [13]:
other_data['Fu'] = other_data.apply(calculate_fu, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
# Take median of Fu data from different sources
#Fus = other_data.groupby(by='molregno')['Fu'].median().reset_index()

In [15]:
other_data.sort_values(by='molregno')

Unnamed: 0,Drug name,chembl_id,molregno,Original synonyms,value,upper_value,reference,type,assay_id,Fu
677,PRAZOSIN,CHEMBL2,97,,0.060,,PMID30115648,Fu,,0.060
134,PRAZOSIN,CHEMBL2,97,,95.000,,PMID22210121,PPB,,0.050
3601,PRAZOSIN,,97,,0.053,,,Fu,CHEMBL3591953,0.053
1652,PRAZOSIN,,97,,0.060,,,Fu,CHEMBL1614672,0.060
1362,PRAZOSIN,,97,,0.060,,,Fu,CHEMBL1043580,0.060
...,...,...,...,...,...,...,...,...,...,...
7552,,,2198150,,99.300,,,PPB,CHEMBL3864372,0.007
7550,,,2198573,,98.300,,,PPB,CHEMBL3864372,0.017
7551,,,2198627,,98.800,,,PPB,CHEMBL3864372,0.012
3876,,,2198868,,1.000,,,Fu,CHEMBL3863238,1.000


In [16]:
# Merge with plasma concentrations
Fus_merged = other_data[['molregno','Fu']].merge(plasma_conc, on='molregno')

In [17]:
Fus_merged.loc[Fus_merged['chembl_id'].isnull()]

Unnamed: 0,molregno,Fu,pref_name,chembl_id,Molar total plasma concentration,pMolar total plasma concentration


In [18]:
# Calculate unbound concentration
Fus_merged['Molar unbound plasma concentration'] = Fus_merged['Fu'] * Fus_merged['Molar total plasma concentration']

In [19]:
etcp_data.columns

Index(['Drug name', 'chembl_id', 'molregno', 'Original synonyms', 'value',
       'upper_value', 'reference', 'type', 'assay_id', 'unbound_molar_value'],
      dtype='object')

In [20]:
etcp_data.loc[etcp_data['chembl_id'].isnull()]

Unnamed: 0,Drug name,chembl_id,molregno,Original synonyms,value,upper_value,reference,type,assay_id,unbound_molar_value


In [21]:
# combine with etcp data
ds1 = etcp_data[['molregno', 'chembl_id','unbound_molar_value']]
ds1.columns = ['molregno','chembl_id','Molar unbound plasma concentration']

In [22]:
ds1['pref_name'] = ds1['molregno'].apply(lambda x: molregno2pref_name[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [23]:
Fus_merged.columns

Index(['molregno', 'Fu', 'pref_name', 'chembl_id',
       'Molar total plasma concentration', 'pMolar total plasma concentration',
       'Molar unbound plasma concentration'],
      dtype='object')

In [24]:
ds2 = Fus_merged[['pref_name','molregno','chembl_id','Molar total plasma concentration','Molar unbound plasma concentration']]
combined_unbound = pd.concat([ds1,ds2], ignore_index=True, sort=False)

In [25]:
combined_unbound.head()

Unnamed: 0,molregno,chembl_id,Molar unbound plasma concentration,pref_name,Molar total plasma concentration
0,27185,CHEMBL633,5e-10,AMIODARONE,
1,6243,CHEMBL72,1.08e-07,DESIPRAMINE,
2,11143,CHEMBL517,7.42e-07,DISOPYRAMIDE,
3,2223,CHEMBL41,2.9e-08,FLUOXETINE,
4,3859,CHEMBL54,3.6e-09,HALOPERIDOL,


In [26]:
combined_unbound.tail()

Unnamed: 0,molregno,chembl_id,Molar unbound plasma concentration,pref_name,Molar total plasma concentration
1923,228121,CHEMBL1229,2.218559e-07,OSELTAMIVIR,3.825102e-07
1924,469866,CHEMBL521686,1.442217e-06,OLAPARIB,5.984303e-06
1925,52087,CHEMBL806,1.031824e-07,FLUTAMIDE,1.719706e-06
1926,453579,CHEMBL1581,1.628355e-07,PERINDOPRIL,4.070888e-07
1927,21523,CHEMBL603,8.051208e-09,ZAFIRLUKAST,8.051208e-07


In [27]:
# Now, after adding unbound datapoints, calculate median
# Some of the total columns have NA values because some values were reported as unbound directly
# The default of 'median' is to skip NAs so they are not included in calculation
medians_unbound = combined_unbound.groupby('molregno').agg({'Molar unbound plasma concentration': 'median', 'chembl_id': 'first', 'pref_name': 'first', 'Molar total plasma concentration': 'median'})

In [28]:
medians_unbound.rename(columns = {'Molar unbound plasma concentration': 'median Molar unbound plasma concentration', 'Molar total plasma concentration': 'median Molar total plasma concentration'}, inplace=True)

In [29]:
medians_unbound.reset_index(drop=False, inplace=True)

In [30]:
medians_unbound.columns

Index(['molregno', 'median Molar unbound plasma concentration', 'chembl_id',
       'pref_name', 'median Molar total plasma concentration'],
      dtype='object')

In [39]:
# Add back in the rows without unbound from total dataframe (which were already medians)
# Rows from total plasma without unbound data
total_only_rows = plasma_conc.loc[~plasma_conc['molregno'].isin(medians_unbound['molregno'])].sort_values(by='pref_name')
total_only_rows.head()

Unnamed: 0,molregno,pref_name,chembl_id,median Molar total plasma concentration,median pMolar total plasma concentration
449,169652,ACITRETIN,CHEMBL1131,1.531675e-07,6.814833
470,226880,ACRIVASTINE,CHEMBL1224,2.008897e-07,6.697042
103,8209,AGOMELATINE,CHEMBL10878,1.232995e-06,5.909039
673,654243,ALCURONIUM,CHEMBL1180293,4.498358e-06,5.346946
495,282661,AMIFAMPRIDINE,CHEMBL354077,9.163383e-07,6.037944


In [41]:
medians_total_unbound = pd.concat([medians_unbound, total_only_rows], sort=False)

In [49]:
medians_total_unbound.loc[medians_total_unbound['median Molar unbound plasma concentration'].isnull()]

Unnamed: 0,molregno,median Molar unbound plasma concentration,chembl_id,pref_name,median Molar total plasma concentration,median pMolar total plasma concentration
449,169652,,CHEMBL1131,ACITRETIN,1.531675e-07,6.814833
470,226880,,CHEMBL1224,ACRIVASTINE,2.008897e-07,6.697042
103,8209,,CHEMBL10878,AGOMELATINE,1.232995e-06,5.909039
673,654243,,CHEMBL1180293,ALCURONIUM,4.498358e-06,5.346946
495,282661,,CHEMBL354077,AMIFAMPRIDINE,9.163383e-07,6.037944
...,...,...,...,...,...,...
427,141484,,CHEMBL89598,VIGABATRIN,3.484051e-04,3.457916
401,114406,,CHEMBL306700,VILOXAZINE,3.371260e-05,4.472208
671,644928,,CHEMBL1165342,VINCAMINE,7.053181e-07,6.151615
612,452689,,CHEMBL517199,XIPAMIDE,5.636661e-05,4.248978


In [50]:
len(medians_total_unbound['molregno'].drop_duplicates())

765

In [51]:
# calulate -log(molar unbound concentration)
medians_total_unbound['median pMolar total plasma concentration'] = medians_total_unbound['median Molar total plasma concentration'].apply(lambda x: -np.log10(x))
medians_total_unbound['median pMolar unbound plasma concentration'] = medians_total_unbound['median Molar unbound plasma concentration'].apply(lambda x: -np.log10(x))

In [52]:
medians_total_unbound.head()

Unnamed: 0,molregno,median Molar unbound plasma concentration,chembl_id,pref_name,median Molar total plasma concentration,median pMolar total plasma concentration,median pMolar unbound plasma concentration
0,97,3.129809e-09,CHEMBL2,PRAZOSIN,5.216348e-08,7.282633,8.504482
1,115,1.756657e-07,CHEMBL3,NICOTINE,1.849112e-07,6.733037,6.755313
2,146,6.018762e-06,CHEMBL4,OFLOXACIN,8.025016e-06,5.095554,5.220493
3,147,1.033414e-05,CHEMBL5,NALIDIXIC ACID,0.0001291767,3.888816,4.985726
4,173,5.589871e-08,CHEMBL6,INDOMETHACIN,5.589871e-06,5.252598,7.252598


In [55]:
# Save to CSV
medians_total_unbound[['molregno', 'chembl_id', 'pref_name', 'median Molar total plasma concentration', 'median pMolar total plasma concentration', 'median Molar unbound plasma concentration', 'median pMolar unbound plasma concentration']].to_csv(basedir + '/results/molregno2median_plasma_total_unbound.txt', sep='\t', index=False)