In [1]:
"""Open results from querying ChEMBL with FAERS compounds. Process activity comments so that we can use inactive data."""

'Open results from querying ChEMBL with FAERS compounds. Process activity comments so that we can use inactive data.'

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Let text columns show up to 200 characters and tables up to 200 rows when displayed
pd.options.display.max_colwidth = 200
pd.options.display.max_rows = 200

In [4]:
# Project root directory; the input and output file paths in this notebook are built from it
basedir = "/scratch/ias41/ae_code"

In [5]:
# Load the tab-separated ChEMBL query results from <basedir>/bioactivities/data/bioactivities.txt.
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, so a column cannot end up with a different type in different chunks.
# NOTE(review): the truncated warning saved in this cell's output looks like pandas'
# DtypeWarning ("Columns ... have mixed types"), for which this flag is the documented
# remedy - confirm the warning is gone on re-run.
query_results = pd.read_csv(
    basedir + '/bioactivities/data/bioactivities.txt',
    sep='\t',
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
# Number of rows loaded from the query results file
query_results.shape[0]

3314700

In [7]:
# Preview the first five rows of the loaded table
query_results.head(n=5)

Unnamed: 0,parent_molregno,parent_chembl_id,accession,tid,target_organism,target_name,target_type,standard_type,standard_relation,pchembl_value,standard_flag,activity_comment,data_validity_comment,description,assay_chembl_id,src_id,src_description
0,123534,CHEMBL307429,O43451,1,Homo sapiens,Maltase-glucoamylase,SINGLE PROTEIN,IC50,=,7.4,1,,,Inhibitory concentration against human alpha-glucosidase,CHEMBL827835,1,Scientific Literature
1,623303,CHEMBL1089559,O43451,1,Homo sapiens,Maltase-glucoamylase,SINGLE PROTEIN,Ki,=,6.7,1,,,Inhibition of recombinant human maltase glucoamylase N-terminal catalytic domain,CHEMBL1112641,1,Scientific Literature
2,623304,CHEMBL1093264,O43451,1,Homo sapiens,Maltase-glucoamylase,SINGLE PROTEIN,Ki,=,6.72,1,,,Inhibition of recombinant human maltase glucoamylase N-terminal catalytic domain,CHEMBL1112641,1,Scientific Literature
3,623305,CHEMBL1093524,O43451,1,Homo sapiens,Maltase-glucoamylase,SINGLE PROTEIN,Ki,=,7.0,1,,,Inhibition of recombinant human maltase glucoamylase N-terminal catalytic domain,CHEMBL1112641,1,Scientific Literature
4,623306,CHEMBL1093525,O43451,1,Homo sapiens,Maltase-glucoamylase,SINGLE PROTEIN,Ki,=,6.89,1,,,Inhibition of recombinant human maltase glucoamylase N-terminal catalytic domain,CHEMBL1112641,1,Scientific Literature


### Do some filtering of results

In [8]:
# Work on an independent copy so the loaded query_results table stays untouched by the filtering below
selected_data = query_results.copy(deep=True)

In [9]:
# Discard rows that carry neither a pchembl_value nor an activity_comment
lacks_pchembl = selected_data['pchembl_value'].isnull()
lacks_comment = selected_data['activity_comment'].isnull()
selected_data.drop(labels=selected_data[lacks_pchembl & lacks_comment].index, inplace=True)

In [10]:
# Exclude measurements whose data_validity_comment is 'Outside typical range'
outside_typical_range = selected_data['data_validity_comment'] == 'Outside typical range'
selected_data.drop(labels=selected_data[outside_typical_range].index, inplace=True)

In [11]:
# Number of rows remaining after the two filters above
selected_data.shape[0]

3160229

In [12]:
# List the distinct activity comments found on rows without a pchembl_value, leaving out
# purely numeric comments, comments containing 'Original reference' and comments containing 'inhibitor ['.
# Original author's note: the 'and thus' comment captures Drugmatrix data.
comment_text = selected_data['activity_comment']
comment_present = ~comment_text.isnull()
# The comparisons with False are deliberate: str.contains yields NaN for missing comments,
# and NaN == False evaluates to False, so those rows are excluded.
not_numeric_or_reference = comment_text.str.contains('^[0-9]+$|Original reference') == False
not_inhibitor_annotation = comment_text.str.contains('inhibitor [', regex=False) == False
pchembl_missing = selected_data['pchembl_value'].isnull()
selected_data.loc[
    comment_present & not_numeric_or_reference & not_inhibitor_annotation & pchembl_missing,
    'activity_comment',
].drop_duplicates()

45                                                                             Not Active
463                                                                        Not Determined
625        Not Active (inhibition < 50% @ 10 uM and thus dose-reponse curve not measured)
914                                           Note: corresponding IC50 reported as Active
2886                                                                               Active
16879                                                                        Slow Binding
17013                                                                      Not determined
18025                                                                        Inconclusive
28926                                                               Dose-dependent effect
29558                                                                        inconclusive
29560                                                                              active
29929     

In [13]:
# Start the 'summary' column off with the same values as 'pchembl_value'
selected_data['summary'] = selected_data.loc[:, 'pchembl_value']

In [14]:
# Activity comments that are treated as meaning "inactive" when a row has no pchembl_value
inactive_comments = [
    'Not Active',
    'inactive',
    'Inactive',
    'No inhibition',
    'Not Active (inhibition < 50% @ 10 uM and thus dose-reponse curve not measured)',
]

In [15]:
def make_activity_summary(x):
    """Return the activity summary for one row of the bioactivity table.

    Parameters
    ----------
    x : mapping-like row (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must provide the keys 'pchembl_value' and 'activity_comment'.

    Returns
    -------
    'inactive' when the row has no pchembl_value and its activity_comment is one of
    the entries of the notebook-level ``inactive_comments`` list; otherwise the row's
    'pchembl_value' unchanged (which may itself be NaN).
    """
    # Test 'pchembl_value' with pd.isnull instead of calling np.isnan on 'summary'.
    # np.isnan raises TypeError for non-numeric values such as the string 'inactive',
    # so applying this function again to an already-labelled 'summary' column crashed.
    # 'summary' is initialised from 'pchembl_value', so a first run gives the same result.
    if pd.isnull(x['pchembl_value']) and x['activity_comment'] in inactive_comments:
        return 'inactive'
    return x['pchembl_value']

In [16]:
# Compute the summary row by row with make_activity_summary and store it in the 'summary' column
selected_data['summary'] = selected_data.apply(make_activity_summary, axis='columns')

In [17]:
# Check the distinct (activity_comment, summary) pairs on rows that have no pchembl_value
# but did end up with a summary
summary_from_comment = selected_data['pchembl_value'].isnull() & ~selected_data['summary'].isnull()
selected_data.loc[summary_from_comment, ['activity_comment', 'summary']].drop_duplicates()

Unnamed: 0,activity_comment,summary
45,Not Active,inactive
625,Not Active (inhibition < 50% @ 10 uM and thus dose-reponse curve not measured),inactive
29929,inactive,inactive
211909,No inhibition,inactive
222087,Inactive,inactive


In [18]:
# Remove every row whose summary is still missing
summary_missing = selected_data['summary'].isnull()
selected_data.drop(labels=selected_data[summary_missing].index, inplace=True)

In [19]:
# Distinct pchembl_value / summary pairs among rows whose standard_relation is '>'
relation_is_greater_than = selected_data['standard_relation'] == '>'
selected_data.loc[relation_is_greater_than, ['pchembl_value', 'summary']].drop_duplicates()

Unnamed: 0,pchembl_value,summary
17981,,inactive
1104986,4.3,4.3


In [20]:
# It is unclear what a '>' ("larger than") relation means on a row that has a pchembl_value,
# so those rows are removed
pchembl_present = ~selected_data['pchembl_value'].isnull()
greater_than_relation = selected_data['standard_relation'] == '>'
selected_data.drop(labels=selected_data[pchembl_present & greater_than_relation].index, inplace=True)

In [21]:
# Write the processed table as a tab-separated file, without the index column
processed_path = basedir + '/bioactivities/data/bioactivities_processed.txt'
selected_data.to_csv(processed_path, sep='\t', index=False)