In [1]:
"""Find overlap between FAERS and SIDER compounds with bioactivities, write report of % of datapoints from different sources within ChEMBL, and calculate % matrix sparsity.
Then add additional measurements from target prediction and do same counts."""

'Find overlap between FAERS and SIDER compounds with bioactivities, write report of % of datapoints from different sources within ChEMBL, and calculate % matrix sparsity.\nThen add additional measurements from target prediction and do same counts.'

In [2]:
import pandas as pd
import pickle
import numpy as np

In [3]:
basedir = '/scratch/ias41/ae_code'
project_dir = basedir + '/bioactivities'

In [4]:
bioact = pd.read_csv(project_dir + '/data/bioactivities_processed.txt', sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [5]:
with open(basedir + '/faers_aes/results/20200108_PSM_molregno2aes_PRR2_chi4_faers_min5drugs_all_random_controls.pkl', 'rb') as f:
    faers_aes = pickle.load(f)
with open(basedir + '/sider/results/20191215_molregno2aes_sider_min5drugs.pkl', 'rb') as f:
    sider_aes = pickle.load(f)

### Processed bioactivities overlap with AE datasets

In [6]:
drugs = faers_aes.keys() | sider_aes.keys()

In [7]:
bioact_drugs = bioact.loc[bioact['parent_molregno'].isin(drugs)]

In [8]:
len(bioact_drugs)

113710

### Check sources of bioactivity data

In [9]:
src_dict = {}
for group in bioact_drugs.groupby('src_description'):
     src_dict[group[0]] = '{:.3f}'.format((len(group[1])/len(bioact_drugs))*100)

In [10]:
with open(project_dir + '/results/bioact_sources_report.txt', 'w') as f:
    f.write("Data sources (%s of retrieved measurements before taking medians) after overlapping with AE compounds: {}".format(src_dict))

In [11]:
# Separate 'inactive' rows for a moment

bioact_drugs_inactive = bioact_drugs.loc[bioact_drugs['summary']=='inactive',:]
bioact_drugs_numeric = bioact_drugs.loc[bioact_drugs['summary']!='inactive',:]
bioact_drugs_numeric['summary'] = bioact_drugs_numeric['summary'].astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [12]:
# Take medians of numeric data
bioact_medians = bioact_drugs_numeric.groupby(['parent_molregno','accession']).agg({'parent_chembl_id': 'first', 'target_organism': 'first', 'summary': 'median'}).reset_index(drop=False)

In [13]:
# Add back 'inactive' rows but only if not numeric value available

In [14]:
measured_compound_target_pairs = [tuple(x) for x in bioact_medians[['parent_molregno','accession']].values]

In [15]:
def determine_numeric_avail(x):
    if (int(x['parent_molregno']), x['accession']) in measured_compound_target_pairs:
        return 1
    else:
        return 0

In [16]:
bioact_drugs_inactive['numeric_avail'] = bioact_drugs_inactive.apply(determine_numeric_avail, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [17]:
len(bioact_drugs_inactive)

92506

In [18]:
len(bioact_drugs_inactive.loc[bioact_drugs_inactive['numeric_avail']==1])

764

In [19]:
bioact_medians.columns

Index(['parent_molregno', 'accession', 'parent_chembl_id', 'target_organism',
       'summary'],
      dtype='object')

In [20]:
# Add back inactive rows
bioact_drugs_inactive_selection = bioact_drugs_inactive.loc[bioact_drugs_inactive['numeric_avail']==0]
bioact_drugs_inactive_selection = bioact_drugs_inactive_selection[['parent_molregno', 'accession', 'parent_chembl_id', 'target_organism','summary']].drop_duplicates()
bioact_drugs_all = pd.concat([bioact_medians, bioact_drugs_inactive_selection], sort=False)

In [21]:
len(bioact_drugs_all[['parent_molregno','accession','summary']].drop_duplicates())

56909

In [22]:
len(bioact_drugs_all)

56909

In [23]:
# Create a report about the bioactivity data

total_cells = len(bioact_drugs_all['parent_molregno'].drop_duplicates())*len(bioact_drugs_all['accession'].drop_duplicates())

frac_filled = len(bioact_drugs_all[['parent_molregno', 'accession']].drop_duplicates()) / total_cells
frac_inactive_comment = len(bioact_drugs_all.loc[bioact_drugs_all['summary']=='inactive',['parent_molregno','accession']].drop_duplicates()) / total_cells
frac_below6 = len(bioact_medians.loc[bioact_medians['summary']<6,['parent_molregno', 'accession']].drop_duplicates()) / total_cells
frac_over6 = len(bioact_medians.loc[bioact_medians['summary']>=6,['parent_molregno', 'accession']].drop_duplicates()) / total_cells
total = frac_inactive_comment + frac_below6 + frac_over6

info = [
'Number of drug-target pairs: {}'.format(len(bioact_drugs_all[['parent_molregno', 'accession']]))
, 'Number of unique drugs from AE datasets with bioactivities: {}'.format(len(bioact_drugs_all['parent_molregno'].drop_duplicates()))
, 'Number of unique Uniprot IDs: {}'.format(len(bioact_drugs_all['accession'].drop_duplicates()))
, 'Total cells: {} x {} = {}'.format(str(len(bioact_drugs_all['parent_molregno'].drop_duplicates())), len(bioact_drugs_all['accession'].drop_duplicates()), total_cells)
, 'Percentage of cells of matrix (compound-target pairs) with data (active/inactive/activity_comment): {:.3f}%'.format(frac_filled*100)
, 'Percentage of drug-target pairs with "inactive" from activity_comment: {:.3f}%'.format(frac_inactive_comment*100)
, 'Percentage of drug-target pairs with pchembl < 6: {:.3f}%'.format(frac_below6*100)
, 'Percentage of drug-target pairs with pchembl >= 6: {:.3f}%'.format(frac_over6*100)
, 'Total last 3 percentages check: {:.3f}%'.format(total*100)
]    

with open(project_dir + '/results/bioactivities_matrix_counts_report.txt', 'w') as f:
    f.write('\n'.join(info))

In [24]:
# Save bioactivity
bioact_drugs_all.to_csv(project_dir + '/results/bioact_medians_ae_drugs.txt', sep='\t', index=False)

### Open target prediction data

In [25]:
tp = pd.read_csv(project_dir + '/data/pidgin_input.smi_out_predictions_20200108-164123_ad70pr0.7.txt', sep='\t')
tp.set_index('Compound', inplace=True)

In [26]:
# Reformat dataframe

all_compound_target_combinations = [(compound,target) for compound in tp.index for target in set([i.split('_')[0] for i in tp.columns])]
conc_data = dict()
for item in all_compound_target_combinations:
    compound = item[0]
    target = item[1]
    conc_data[(compound,target)] = {'Target': target, 'Compound': compound, 7: np.nan, 6: np.nan, 5: np.nan, 4: np.nan}

pconc = {'0.1': 7, '1': 6, '10': 5, '100': 4}
for row in tp.iterrows():
    compound = row[0]
    for column, value in zip(row[1].index, row[1].values):
        target = column.split('_')[0]
        concentration = column.split('_')[1]
        conc_data[(compound,target)][pconc[concentration]] = value
tp_pivoted = pd.DataFrame(list(conc_data.values()))

In [27]:
def is_negative_prediction(x):
    if np.isnan(x):
        return False
    if x < 0.4:
        return True
    else: 
        return False
def is_positive_prediction(x):
    if np.isnan(x):
        return False
    if x > 0.6:
        return True
    else: 
        return False

In [28]:
def make_no_data_summary(x):
    if all(np.isnan(i) for i in [x[7], x[6], x[5], x[4]]):
        return 'no information'
    else:
        return np.nan
def find_negative_predictions(x):
    if any(is_negative_prediction(i) for i in [x[7], x[6], x[5], x[4]]):
        return 'negative prediction'
    else:
        return np.nan
def find_positive_predictions(x):
    if any(is_positive_prediction(i) for i in [x[7], x[6], x[5], x[4]]):
        return 'positive prediction'
    else:
        return np.nan

In [29]:
# Restrict to those rows with at least one active/inactive prediction
tp_pivoted['no information'] = tp_pivoted.apply(make_no_data_summary, axis=1)
tp_pivoted = tp_pivoted.loc[tp_pivoted['no information'].isnull()]

# Identify rows with positive and rows with negative predictions
tp_pivoted['negative prediction'] = tp_pivoted.apply(find_negative_predictions, axis=1)
tp_pivoted['positive prediction'] = tp_pivoted.apply(find_positive_predictions, axis=1)

In [30]:
# Keep only rows with at least one positive prediction or one negative prediction
tp_pivoted = tp_pivoted.loc[(tp_pivoted['positive prediction']=='positive prediction')|(tp_pivoted['negative prediction']=='negative prediction')]

In [31]:
# Identify rows that already have a measurement in above bioactivity

In [32]:
measured_pairs = set(zip(bioact_drugs_all['parent_molregno'], bioact_drugs_all['accession']))

In [33]:
def id_measurement_avail(row):
    if (row['Compound'], row['Target']) in measured_pairs:
        return 1
    else:
        return 0

In [34]:
tp_pivoted['measurement_available'] = tp_pivoted.apply(id_measurement_avail, axis=1)

In [35]:
tp_pivoted.head()

Unnamed: 0,Target,Compound,7,6,5,4,no information,negative prediction,positive prediction,measurement_available
0,Q71U36,264400,,,0.005556,0.007692,,negative prediction,,0
1,O15530,264400,0.001,0.001,0.001,,,negative prediction,,0
2,P31644,264400,0.001,,,,,negative prediction,,0
3,P22001,264400,0.015789,,0.001,,,negative prediction,,0
5,P25440,264400,0.001,0.001,0.007143,0.005,,negative prediction,,0


In [36]:
# Keep tp only where no measurements available. Keep only targets that have at least some measurements
# To prevent associations being based on only predictions

In [37]:
measured_targets = set(bioact_drugs_all['accession'])

In [38]:
# Keep only where measurement for target is available and only drugs from AE datasets
tp_pivoted_selected = tp_pivoted.loc[(tp_pivoted['measurement_available']==0)&(tp_pivoted['Target'].isin(measured_targets))&(tp_pivoted['Compound'].isin(drugs))]

In [39]:
# Check data added by target predictions

In [40]:
len(tp_pivoted_selected['Compound'].drop_duplicates()), len(bioact_drugs_all['parent_molregno'].drop_duplicates()), len(set(tp_pivoted_selected['Compound'].drop_duplicates()) & set(bioact_drugs_all['parent_molregno'].drop_duplicates()))

(1463, 1147, 1119)

In [41]:
# Additional compounds from tp:
len(set(tp_pivoted_selected['Compound'].drop_duplicates()) - set(bioact_drugs_all['parent_molregno'].drop_duplicates()))

344

In [42]:
tp_pivoted_selected.rename(columns={'Compound': 'parent_molregno', 'Target': 'accession'}, inplace=True)

combined_df = pd.concat([tp_pivoted_selected, bioact_drugs_all], sort=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [43]:
combined_df.head()

Unnamed: 0,accession,parent_molregno,7,6,5,4,no information,negative prediction,positive prediction,measurement_available,parent_chembl_id,target_organism,summary
1,O15530,264400,0.001,0.001,0.001,,,negative prediction,,0.0,,,
2,P31644,264400,0.001,,,,,negative prediction,,0.0,,,
17,P14920,264400,0.013636,0.001,0.001,,,negative prediction,,0.0,,,
24,P27361,264400,0.001,0.001,0.001,,,negative prediction,,0.0,,,
27,Q59H18,264400,,,0.001,0.001,,negative prediction,,0.0,,,


In [44]:
# Additional bioactivities from tp (total)
sum(combined_df['measurement_available'].notnull())/len(combined_df), sum(combined_df['measurement_available'].isnull())/len(combined_df)
# approx 13% from measured, 86 from predicted

(0.8624855016431471, 0.1375144983568529)

In [45]:
len(combined_df), len(combined_df[['parent_molregno', 'accession',]].drop_duplicates())

(413840, 413840)

In [46]:
combined_df_numeric = combined_df.loc[combined_df['summary']!='inactive',:]
combined_df_numeric['summary'] = combined_df_numeric['summary'].astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [47]:
len(combined_df), len(combined_df[['parent_molregno', 'accession','summary']].drop_duplicates())

(413840, 413840)

In [48]:
# Save bioactivity
tp_pivoted_selected.to_csv(project_dir + '/results/tp_ae_drugs.txt', sep='\t', index=False)
combined_df.to_csv(project_dir + '/results/bioact_medians_plus_tp_ae_drugs.txt', sep='\t', index=False)

In [49]:
# Do matrix counts on new dataframe

# Create a report about the bioactivity data

total_cells = len(combined_df['parent_molregno'].drop_duplicates())*len(combined_df['accession'].drop_duplicates())

frac_filled = len(combined_df) / total_cells
frac_inactive = len(combined_df.loc[combined_df['summary']=='inactive',['parent_molregno','accession']].drop_duplicates()) / total_cells
frac_negative = len(combined_df_numeric.loc[((combined_df_numeric['summary']<6)|(combined_df_numeric['negative prediction']=='negative prediction')),:])/total_cells
frac_positive = len(combined_df_numeric.loc[((combined_df_numeric['summary']>=6)|(combined_df_numeric['positive prediction']=='positive prediction')),:])/total_cells
frac_positive_and_negative = len(combined_df_numeric.loc[(combined_df_numeric['positive prediction']=='positive prediction')&(combined_df_numeric['negative prediction']=='negative prediction')])/total_cells
total_check = (frac_inactive + frac_negative + frac_positive) - frac_positive_and_negative

info = [
'Number of drug-target pairs: {}'.format(len(combined_df[['parent_molregno', 'accession']]))
, 'Number of drugs with bioactivity added by target prediction: {}'.format(len(set(tp_pivoted_selected['parent_molregno'].drop_duplicates()) - set(bioact_drugs_all['parent_molregno'].drop_duplicates())))    
, 'Number of unique drugs from AE datasets with measured or predicted bioactivities: {}'.format(len(combined_df['parent_molregno'].drop_duplicates()))
, 'Number of unique Uniprot IDs: {}'.format(len(combined_df['accession'].drop_duplicates()))
, 'Total cells: {} x {} = {}'.format(str(len(combined_df['parent_molregno'].drop_duplicates())), len(combined_df['accession'].drop_duplicates()), total_cells)
, 'Percentage of cells of matrix (compound-target pairs) with data (active/inactive/activity_comment): {:.3f}%'.format(frac_filled*100)
, 'Percentage of drug-target pairs with "inactive" from activity_comment: {:.3f}%'.format(frac_inactive*100)
, 'Percentage of drug-target pairs with pchembl < 6 or negative prediction: {:.3f}%'.format(frac_negative*100)
, 'Percentage of drug-target pairs with pchembl >= 6 or positive prediction: {:.3f}%'.format(frac_positive*100)
, 'Percentage of drug-target pairs with both positive and negative predictions (depending on concentration): {:.3f}%'.format(frac_positive_and_negative*100)
, 'Total above 4 perc check: {:.3f}%'.format(total_check*100)
, 'Cases with both positive and negative predictions will be processed later'
, 'Percentage of datapoints from measured bioactivities: {:.3f}%'.format((len(combined_df.loc[~combined_df['summary'].isnull()])/len(combined_df))*100)
, 'Percentage of datapoints from prediction: {:.3f}%'.format((len(combined_df.loc[combined_df['summary'].isnull()])/len(combined_df))*100)
]    

with open(project_dir + '/results/bioactivities_plus_target_prediction_matrix_counts_report.txt', 'w') as f:
    f.write('\n'.join(info))