In [2]:
import configargparse
import sqlite3
import pandas as pd
import numpy as np
import scipy.io
import os
import logging


In [3]:
VERSION="chembl_29"
input_args = "--sqlite input/"+VERSION+"_sqlite/"+VERSION+".db --prefix "+VERSION
input_args.split()

['--sqlite', 'input/chembl_29_sqlite/chembl_29.db', '--prefix', 'chembl_29']

In [4]:
p = configargparse.ArgParser(default_config_files=["default.ini"])
p.add('-c', '--config', required=False, is_config_file=True, help='Config file path')
p.add('--sqlite', required=True, type=str, help="ChEMBL sqlite database")
#p.add("--organism", required=True, help="Organisms for protein filtering" )
#p.add("--targettype", required=True, help="Target type for protein filtering")
p.add('--mincmpdcount', required=True, help='Minimal number of compounds required for an assays', type=int)
p.add('--thresholds', required=True, help="Thresholds for classification", type=float, action="append")
p.add('--datadir', required=True, help="Data directory to write to (append prefix)", type=str)
p.add('--prefix', required=True, help="Prefix for the current dataset", type=str)
options = p.parse_args(input_args.split())

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

outdir = options.datadir + "/" + options.prefix

In [5]:
print(options)
print(outdir)

Namespace(config=None, sqlite='input/chembl_29_sqlite/chembl_29.db', mincmpdcount=100, thresholds=[5.5, 6.5, 7.5, 8.5], datadir='./output/', prefix='chembl_29')
./output//chembl_29


### Create Results fron Chmebl database 

In [6]:

conn = sqlite3.connect("input/chembl_29_sqlite/chembl_29.db")

In [7]:
df = pd.read_sql_query("""SELECT target_dictionary.chembl_id   as target_id, 
                                 molecule_dictionary.chembl_id as cmpd_id, 
                                 activities.standard_units     as stu,
                                 CASE activities.standard_units
                                    WHEN 'nM'      THEN activities.standard_value
                                    WHEN 'ug.mL-1' THEN activities.standard_value / compound_properties.full_mwt * 1E6
                                 END                           as ic50,
                                 CASE activities.standard_relation 
                                    WHEN '<'  THEN '<'
                                    WHEN '<=' THEN '<'
                                    WHEN '='  THEN '='
                                    WHEN '>'  THEN '>'
                                    WHEN '>=' THEN '>' 
                                    ELSE 'drop' 
                                 END                           as relation
                            FROM molecule_dictionary 
                            JOIN activities ON 
                                 activities.molregno == molecule_dictionary.molregno 
                            JOIN assays ON 
                                 assays.assay_id == activities.assay_id 
                            JOIN target_dictionary ON
                                 target_dictionary.tid == assays.tid
                            JOIN compound_properties ON 
                                 compound_properties.molregno = molecule_dictionary.molregno
                           WHERE 
                                target_dictionary.organism='Homo sapiens' AND 
                                target_dictionary.target_type='SINGLE PROTEIN' AND
                                activities.standard_type = 'IC50' AND 
                                activities.standard_units IN  ('nM','ug.mL-1') AND
                                activities.standard_relation IN ('<', '<=', '=','>', '>=')  AND
                                ic50 < 10e9 AND 
                                ic50 >= 10e-5 
                                ORDER BY target_id, cmpd_id """, conn)

In [8]:
conn.close()

In [11]:
df.to_csv('10.0_sql_output.csv')

In [12]:
df = df.groupby(["target_id","cmpd_id"]).min().reset_index()
df
# df.to_csv('1_groupby_min.csv')
# df = pd.read_csv('1_groupby_min.csv')
# df.head()


Unnamed: 0,target_id,cmpd_id,stu,ic50,relation
0,CHEMBL1075092,CHEMBL1092618,nM,30000.0,=
1,CHEMBL1075092,CHEMBL1092619,nM,30000.0,=
2,CHEMBL1075092,CHEMBL1093582,nM,30000.0,=
3,CHEMBL1075092,CHEMBL1093848,nM,30000.0,=
4,CHEMBL1075092,CHEMBL2398350,nM,67000.0,=
...,...,...,...,...,...
705432,CHEMBL6195,CHEMBL4129140,nM,200000.0,>
705433,CHEMBL6195,CHEMBL4130194,nM,80000.0,>
705434,CHEMBL6195,CHEMBL4635160,nM,10000.0,>
705435,CHEMBL6195,CHEMBL4641424,nM,80000.0,>


### Read SQL results from CSV file

In [13]:
df = pd.read_csv('10.0_sql_output.csv')
df = df.groupby(["target_id","cmpd_id"]).min().reset_index()

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 705437 entries, 0 to 705436
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   target_id   705437 non-null  object 
 1   cmpd_id     705437 non-null  object 
 2   Unnamed: 0  705437 non-null  int64  
 3   stu         705437 non-null  object 
 4   ic50        705437 non-null  float64
 5   relation    705437 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 32.3+ MB


In [15]:
c  = df.groupby("target_id")["cmpd_id"].nunique()
# c.to_csv('2_groupby_nunique.csv')
c.head()
c

target_id
CHEMBL1075092       8
CHEMBL1075094       3
CHEMBL1075097     120
CHEMBL1075102      31
CHEMBL1075104    1202
                 ... 
CHEMBL6175        684
CHEMBL6177          7
CHEMBL6186          8
CHEMBL6191          1
CHEMBL6195         13
Name: cmpd_id, Length: 2483, dtype: int64

### Pickup targets with more than 100 compounds

In [16]:
i  = c[c >= 100].index
i

Index(['CHEMBL1075097', 'CHEMBL1075104', 'CHEMBL1075138', 'CHEMBL1075145',
       'CHEMBL1075165', 'CHEMBL1075315', 'CHEMBL1075317', 'CHEMBL1075319',
       'CHEMBL1075322', 'CHEMBL1163101',
       ...
       'CHEMBL6101', 'CHEMBL6115', 'CHEMBL6120', 'CHEMBL6136', 'CHEMBL6144',
       'CHEMBL6154', 'CHEMBL6164', 'CHEMBL6166', 'CHEMBL6167', 'CHEMBL6175'],
      dtype='object', name='target_id', length=888)

In [17]:
df1 = df[df.target_id.isin(i)]
df1

Unnamed: 0.1,target_id,cmpd_id,Unnamed: 0,stu,ic50,relation
11,CHEMBL1075097,CHEMBL1234777,11,nM,1360.0,=
12,CHEMBL1075097,CHEMBL1812661,12,nM,311.0,=
13,CHEMBL1075097,CHEMBL1812662,16,nM,3430.0,=
14,CHEMBL1075097,CHEMBL2326084,17,nM,550.0,=
15,CHEMBL1075097,CHEMBL2326085,18,nM,270.0,=
...,...,...,...,...,...,...
705403,CHEMBL6175,CHEMBL4646564,858256,nM,7626.0,=
705404,CHEMBL6175,CHEMBL4648732,858257,nM,11.0,=
705405,CHEMBL6175,CHEMBL4649004,858258,nM,242.0,=
705406,CHEMBL6175,CHEMBL578512,858259,nM,500.0,=


In [21]:
df1["log_ic50"] = np.log10(df1["ic50"])
df1["pic50"] = 9 - np.log10(df1["ic50"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["log_ic50"] = np.log10(df1["ic50"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["pic50"] = 9 - np.log10(df1["ic50"])


In [22]:
df1

Unnamed: 0.1,target_id,cmpd_id,Unnamed: 0,stu,ic50,relation,log_ic50,pic50
11,CHEMBL1075097,CHEMBL1234777,11,nM,1360.0,=,3.133539,5.866461
12,CHEMBL1075097,CHEMBL1812661,12,nM,311.0,=,2.492760,6.507240
13,CHEMBL1075097,CHEMBL1812662,16,nM,3430.0,=,3.535294,5.464706
14,CHEMBL1075097,CHEMBL2326084,17,nM,550.0,=,2.740363,6.259637
15,CHEMBL1075097,CHEMBL2326085,18,nM,270.0,=,2.431364,6.568636
...,...,...,...,...,...,...,...,...
705403,CHEMBL6175,CHEMBL4646564,858256,nM,7626.0,=,3.882297,5.117703
705404,CHEMBL6175,CHEMBL4648732,858257,nM,11.0,=,1.041393,7.958607
705405,CHEMBL6175,CHEMBL4649004,858258,nM,242.0,=,2.383815,6.616185
705406,CHEMBL6175,CHEMBL578512,858259,nM,500.0,=,2.698970,6.301030


In [23]:
value_vars = []
for thr in   [5.5, 6.5, 7.5, 8.5]:
    logging.info("Processing threshold : '%1.1f'" % thr)
    value_vars.append("%1.1f" % thr)
    thr_str = "%1.1f" % thr
    print("thr_str ", thr_str ," type:", type(thr_str))
    ## using +1 and -1 for actives and inactives
    df1[thr_str] = (df1["pic50"] >= thr) * 2.0 - 1.0
    df1[thr_str] = np.where(np.logical_and((df1["relation"] == '<'), (df1['pic50'] < thr)), np.nan, df1[thr_str]) 
    df1[thr_str] = np.where(np.logical_and((df1["relation"] == '>'), (df1['pic50'] > thr)), np.nan, df1[thr_str]) 

2021-11-16 15:34:45,166 - INFO - Processing threshold : '5.5'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[thr_str] = (df1["pic50"] >= thr) * 2.0 - 1.0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[thr_str] = np.where(np.logical_and((df1["relation"] == '<'), (df1['pic50'] < thr)), np.nan, df1[thr_str])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-

thr_str  5.5  type: <class 'str'>
thr_str  6.5  type: <class 'str'>
thr_str  7.5  type: <class 'str'>


2021-11-16 15:34:45,446 - INFO - Processing threshold : '8.5'


thr_str  8.5  type: <class 'str'>


In [24]:
df1.head(50)

Unnamed: 0.1,target_id,cmpd_id,Unnamed: 0,stu,ic50,relation,log_ic50,pic50,5.5,6.5,7.5,8.5
11,CHEMBL1075097,CHEMBL1234777,11,nM,1360.0,=,3.133539,5.866461,1.0,-1.0,-1.0,-1.0
12,CHEMBL1075097,CHEMBL1812661,12,nM,311.0,=,2.49276,6.50724,1.0,1.0,-1.0,-1.0
13,CHEMBL1075097,CHEMBL1812662,16,nM,3430.0,=,3.535294,5.464706,-1.0,-1.0,-1.0,-1.0
14,CHEMBL1075097,CHEMBL2326084,17,nM,550.0,=,2.740363,6.259637,1.0,-1.0,-1.0,-1.0
15,CHEMBL1075097,CHEMBL2326085,18,nM,270.0,=,2.431364,6.568636,1.0,1.0,-1.0,-1.0
16,CHEMBL1075097,CHEMBL2326086,19,nM,140.0,=,2.146128,6.853872,1.0,1.0,-1.0,-1.0
17,CHEMBL1075097,CHEMBL2326087,20,nM,100.0,=,2.0,7.0,1.0,1.0,-1.0,-1.0
18,CHEMBL1075097,CHEMBL2326088,21,nM,1450.0,=,3.161368,5.838632,1.0,-1.0,-1.0,-1.0
19,CHEMBL1075097,CHEMBL2326089,23,nM,18.0,=,1.255273,7.744727,1.0,1.0,1.0,-1.0
20,CHEMBL1075097,CHEMBL2326090,26,nM,59.7,=,1.775974,7.224026,1.0,1.0,-1.0,-1.0


###  Unpivot target_id and cmpd_id

In [25]:
logging.info("Unpivot target_id and cmpd_id ")
# Unpivot df from wide to long format
melted = pd.melt(df1, id_vars=['target_id','cmpd_id'], value_vars=value_vars).dropna()   

2021-11-16 15:34:49,050 - INFO - Unpivot target_id and cmpd_id 


In [21]:
melted.info()
melted

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2612997 entries, 0 to 2677867
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   target_id  object 
 1   cmpd_id    object 
 2   variable   object 
 3   value      float64
dtypes: float64(1), object(3)
memory usage: 99.7+ MB


Unnamed: 0,target_id,cmpd_id,variable,value
0,CHEMBL1075097,CHEMBL1234777,5.5,1.0
1,CHEMBL1075097,CHEMBL1812661,5.5,1.0
2,CHEMBL1075097,CHEMBL1812662,5.5,-1.0
3,CHEMBL1075097,CHEMBL2326084,5.5,1.0
4,CHEMBL1075097,CHEMBL2326085,5.5,1.0
...,...,...,...,...
2677863,CHEMBL6175,CHEMBL4646564,8.5,-1.0
2677864,CHEMBL6175,CHEMBL4648732,8.5,-1.0
2677865,CHEMBL6175,CHEMBL4649004,8.5,-1.0
2677866,CHEMBL6175,CHEMBL578512,8.5,-1.0


### Write results
    
    
    1.5_melted.csv
    chembl_29_thresh.csv
    chembl_29_compounds.csv
    chmebl_29_targets.csv

In [26]:
melted.to_csv('10.5_melted.csv', index=True)  
melted.to_csv('%s/10_%s_thresh.csv' % (outdir, options.prefix), index = False)

#Write unique compound IDs
np.savetxt("%s/10_%s_compounds.csv" % (outdir, options.prefix), melted["cmpd_id"].unique(), fmt="%s")
#Write unique target  IDs
np.savetxt("%s/10_%s_targets.csv"   % (outdir, options.prefix), melted["target_id"].unique(), fmt="%s")


In [27]:
"%s/%s_compounds.csv" % (outdir, options.prefix)

'./output//chembl_29/chembl_29_compounds.csv'

In [28]:
tmp = pd.read_csv('10.5_melted.csv')

In [29]:
tmp

Unnamed: 0.1,Unnamed: 0,target_id,cmpd_id,variable,value
0,0,CHEMBL1075097,CHEMBL1234777,5.5,1.0
1,1,CHEMBL1075097,CHEMBL1812661,5.5,1.0
2,2,CHEMBL1075097,CHEMBL1812662,5.5,-1.0
3,3,CHEMBL1075097,CHEMBL2326084,5.5,1.0
4,4,CHEMBL1075097,CHEMBL2326085,5.5,1.0
...,...,...,...,...,...
2612992,2677863,CHEMBL6175,CHEMBL4646564,8.5,-1.0
2612993,2677864,CHEMBL6175,CHEMBL4648732,8.5,-1.0
2612994,2677865,CHEMBL6175,CHEMBL4649004,8.5,-1.0
2612995,2677866,CHEMBL6175,CHEMBL578512,8.5,-1.0
