In [1]:
"""There are files with bioactivity data and those with plasma conc. data. Need to integrate these files.
Write a file with integrated plasma concentration/activity"""

'There are files with bioactivity data and those with plasma conc. data. Need to integrate these files.\nWrite a file with integrated plasma concentration/activity'

In [2]:
import pandas as pd

In [6]:
basedir = '/scratch/ias41/ae_code'

In [16]:
# Open files

# Bioactivity data
median_bioact = pd.read_csv(basedir + '/bioactivities/results/bioact_medians_ae_drugs.txt', sep='\t')

# Total plasma concentration data
median_plasma = pd.read_csv(basedir + '/plasma_concentrations/results/molregno2median_plasma_total_unbound.txt', sep='\t')
median_plasma.rename(columns={'molregno': 'parent_molregno'}, inplace=True)

In [19]:
# Defining the way that plasma concentrations will be integrated

# We want the bioactivity concentration to be higher than the plasma concentration 
# To achieve that, the pMolar plasma concentration value should be higher than the pchembl_value 
# This is the case when the Ratio pIC50/pCmax > 1, in other words the Difference of pchembl-pmolar is positive (pchembl is higher than pmolar, so bioactivity concentration is lower than cmax)
# to accept upto 10-fold difference in the other direction, which is when difference of pchembl-pmolar is negative (pchembl is lower than pmolar, so bioactivity concentration higher than plasma concentration)
# Difference of pchembl-pmolar must be 1 or less (if negative), i.e. between 0 and -1

def define_integrated_activity(x, margin):
    """Integrate the plasma concentration with the bioactivity data to an 'integrated_plasma_activity', which is binary.
    The 'margin' is whether to accept a 1-log unit/10-fold margin in case the Ratio of IC50/Cmax is not above 1. (plasma concentration is lower than IC50).
    kwargs -- x, dataframe
    margin -- True or False"""
    
    if margin == True:
    
        if x['Ratio'] > 1:
            return 1
        elif ((x['Difference']<0)&(x['Difference']>-1)) == True:
            return 1
        else:
            return 0
        
    if margin == False:
        if x['Ratio'] > 1:
            return 1
        else:
            return 0

In [25]:
def integrate_plasma(bioact_df, plasma_concentrations, plasma_column_name, margin):
    """Merge bioactivity and plasma concentration files, and compute IC50/Cmax Ratio. 
    Return dataframe with integrated plasma activity (binary) per compound-target combo.
    kwargs:
    bioact_df -- bioactivity dataframe
    plasma_concentrations -- dataframe with plasma concentrations
    plasma_column_name -- name of pMolar (log units) total conc. or unbound conc. column in plasma dataframe
    margin -- True or False, whether to use 1-log unit margin for difference between ic50 and cmax
    """
    
    # extract relevant columns 
    bioact_slim = bioact_df[['parent_molregno', 'accession', 'summary']]
    bioact_numeric = bioact_slim.loc[bioact_slim['summary']!='inactive',:]
    bioact_numeric['summary'] = bioact_numeric['summary'].astype('float')
    bioact_qualitative = bioact_slim.loc[bioact_slim['summary']=='inactive',:]
    
    # merge plasma concentrations with bioactivity and compoute Ratio, Difference, and integrated_plasma_activity
    plasma_concentrations_selected = plasma_concentrations.loc[~plasma_concentrations[plasma_column_name].isnull()]
    
    merged = plasma_concentrations_selected.merge(bioact_numeric, on = 'parent_molregno')    
    merged['Ratio'] = merged['summary'] / merged[plasma_column_name]
    merged['Difference'] = merged['summary'] - merged[plasma_column_name]
    merged['integrated_plasma_activity'] = merged.apply(define_integrated_activity, axis=1, margin=margin)
    
    # for qualitative inactive data, set integrated_plasma_activity to 0
    bioact_qualitative['integrated_plasma_activity'] = 0
    
    # Concatenate numeric and inactive data again
    merged_inactive = pd.concat([merged, bioact_qualitative], sort=False)
            
    return merged_inactive

In [43]:
def restrict_min_n(integrated_df):
    """Return copies of dataframe with bioactivities with less than 5 compounds and less than 5 active compounds removed.
    kwargs: integrated_df -- dataframe with integrated bioact&plasma concentration"""
    
    # Find which targets have less than 5 compounds associated
    targets_without_5_compounds = list()
    for group in integrated_df.groupby('accession'):
        if len(group[1]['parent_molregno'].drop_duplicates()) < 5:
            targets_without_5_compounds.append(group[0])
            
    # Find which targets have less than 5 active compounds associated
    targets_without_5_active_compounds = list()    
    for group in integrated_df.groupby('accession'):
        if len(group[1].loc[group[1]['integrated_plasma_activity']==1,:]) < 5:
            targets_without_5_active_compounds.append(group[0])
               
    chembl_plasma_margin_minimum5 = integrated_df.loc[~integrated_df['accession'].isin(targets_without_5_compounds),:]
    chembl_plasma_margin_minimum5active = integrated_df.loc[~integrated_df['accession'].isin(targets_without_5_active_compounds),:]
    
    return chembl_plasma_margin_minimum5, chembl_plasma_margin_minimum5active

In [33]:
# Do integration - not restricted by minimum n

total_median_margin = integrate_plasma(bioact_df = median_bioact, plasma_concentrations = median_plasma, plasma_column_name='median pMolar total plasma concentration', margin=True)
total_median_no_margin = integrate_plasma(bioact_df = median_bioact, plasma_concentrations = median_plasma, plasma_column_name='median pMolar total plasma concentration', margin=False)

unbound_median_margin = integrate_plasma(bioact_df = median_bioact, plasma_concentrations = median_plasma, plasma_column_name='median pMolar unbound plasma concentration', margin=True)
unbound_median_no_margin = integrate_plasma(bioact_df = median_bioact, plasma_concentrations = median_plasma, plasma_column_name='median pMolar unbound plasma concentration', margin=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [40]:
# Save files not restricted by min 5 - so that can combine with target predictions and then do cut off min_n later
total_median_margin.to_csv(basedir + '/integration_bioact_plasma_conc/results/total_median_margin.txt', sep = '\t', index=False)
total_median_no_margin.to_csv(basedir + '/integration_bioact_plasma_conc/results/total_median_no_margin.txt', sep = '\t', index=False)

unbound_median_margin.to_csv(basedir + '/integration_bioact_plasma_conc/results/unbound_median_margin.txt', sep = '\t', index=False)
unbound_median_no_margin.to_csv(basedir + '/integration_bioact_plasma_conc/results/unbound_median_no_margin.txt', sep = '\t', index=False)

In [46]:
# Do integration - restricted by min n

total_median_margin_min5, total_median_margin_min5active = restrict_min_n(total_median_margin)
total_median_no_margin_min5, total_median_no_margin_min5active = restrict_min_n(total_median_no_margin)

unbound_median_margin_min5, unbound_median_margin_min5active = restrict_min_n(unbound_median_margin)
unbound_median_no_margin_min5, unbound_median_no_margin_min5active = restrict_min_n(unbound_median_no_margin)

In [47]:
# Save files min5

# Files restricted by min_n
total_median_margin_min5active.to_csv(basedir + '/integration_bioact_plasma_conc/results/total_median_margin_min5active.txt', sep = '\t', index=False)
total_median_no_margin_min5active.to_csv(basedir + '/integration_bioact_plasma_conc/results/total_median_no_margin_min5active.txt', sep = '\t', index=False)

unbound_median_margin_min5active.to_csv(basedir + '/integration_bioact_plasma_conc/results/unbound_median_margin_min5active.txt', sep = '\t', index=False)
unbound_median_no_margin_min5active.to_csv(basedir + '/integration_bioact_plasma_conc/results/unbound_median_no_margin_min5active.txt', sep = '\t', index=False)