In [1]:
import numpy as np
import pandas as pd
from DataFilters import *
# Record on CMerModel which columns identify molecules and targets; a later
# cell reads CMerModel.target_id back as a groupby level.
# NOTE(review): CMerModel and `methods` are presumably provided by
# `from DataFilters import *` — confirm.
CMerModel.mol_id = 'MOLREGNO'
CMerModel.target_id = 'TARGET_ID'

# Input CSVs, keyed by the role each table plays below.
files = {
    'mols': 'data/chembl_17_10uM_mol_data.csv',
    'targets': 'data/chembl_17_10uM_target_data.csv',
    'map': 'data/chembl_17_10uM_target_mol.csv'
}

# Names of the steps to apply, in this order; each is looked up in `methods`.
action_list = [
    'keep_single_mapping',
    'sanitize',
    'add_position',
    'smiles_largest_frag'
]

mol_data = pd.read_csv(files['mols'])
target_data = pd.read_csv(files['targets'])
tm_map = pd.read_csv(files['map'])

# Coerce the id / measurement columns to numeric dtypes (pd.to_numeric raises
# on values it cannot parse).
mol_numeric_cols = ['MOLREGNO', 'MAX_PHASE', 'THERAPEUTIC']
map_numeric_cols = ['TARGET_ID', 'MOLREGNO', 'STANDARD_VALUE']
mol_data[mol_numeric_cols] = mol_data[mol_numeric_cols].apply(pd.to_numeric)
target_data[['TARGET_ID']] = target_data[['TARGET_ID']].apply(pd.to_numeric)
tm_map[map_numeric_cols] = tm_map[map_numeric_cols].apply(pd.to_numeric)

# set_index already names the resulting index after the column it consumes,
# so no separate index.names assignment is needed.
target_data = target_data.sort_values(['TARGET_ID']).set_index(['TARGET_ID'])
mol_data = mol_data.sort_values(['MOLREGNO']).set_index(['MOLREGNO'])

# The mapping table is keyed by the (target, molecule) pair.
tm_map = tm_map.set_index(['TARGET_ID', 'MOLREGNO'])

# Each step takes the three tables and returns their replacements.
for func in action_list:
    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print("running %s" % (str(func)))
    mol_data, target_data, tm_map = methods[func](mol_data, target_data, tm_map)

running keep_single_mapping
running sanitize
running add_position
running smiles_largest_frag


In [25]:
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show
from bokeh.palettes import Spectral6
from bokeh.layouts import row, column
from bokeh.models.ranges import Range1d, DataRange1d
from bokeh.models.axes import LinearAxis
# Configure bokeh to render show() output inline in the notebook.
output_notebook()

In [81]:
# Count the non-null STANDARD_RELATION entries per target (grouping on the
# index level named by CMerModel.target_id) as a flat 1-D array.
# groupby().count() yields that directly; the previous agg(['count']) produced
# an (n, 1) frame that had to be flattened row by row with np.concatenate,
# which also raises on an empty table.
target_mol_count = (
    tm_map['STANDARD_RELATION']
    .groupby(level=CMerModel.target_id)
    .count()
    .values
)

In [82]:
# Number of groups whose count is exactly 1, followed by the total number of groups.
# Formatting into one string keeps the output identical on Python 2 and 3.
print("%d %d" % (np.count_nonzero(target_mol_count == 1), len(target_mol_count)))

185 1806


In [86]:
# Bin target_mol_count twice with numpy's 'auto' bin rule: once as raw
# per-bin counts and once normalised as a density. Both calls see the same
# data and rule, so the two edge arrays describe the same bins.
tmc_count, tmc_c_edges = np.histogram(target_mol_count, bins='auto', density=False)
tmc_hist, tmc_h_edges = np.histogram(target_mol_count, bins='auto', density=True)

In [87]:
# Histogram of target_mol_count with two y axes: density on the left, raw
# bin counts on the right.
# tmc_hist was computed with density=True, so the left axis is a density,
# not a percentage.
# Both ranges are pinned to [0, max] so that a bar's height reads correctly
# on either axis: with equal-width bins density is proportional to count.
p1 = figure(x_axis_label='Compounds per target', y_axis_label='Density of targets',
            y_range=Range1d(start=0, end=float(max(tmc_hist))))
p1.quad(top=tmc_hist, bottom=0, left=tmc_c_edges[:-1], right=tmc_c_edges[1:],
        fill_color=Spectral6[1])
# Secondary range for the right-hand axis; no glyph is drawn against it.
p1.extra_y_ranges = {'y2': Range1d(start=0, end=int(max(tmc_count)))}
p1.add_layout(LinearAxis(y_range_name='y2', axis_label='No. of targets'), 'right')


In [None]:
# NOTE(review): bare name with no call and no recorded execution (In [None]).
# `output_file` is not among the bokeh names this notebook imports explicitly;
# looks like a leftover — confirm before keeping.
output_file

In [88]:
# Render the current p1 figure inline. notebook_handle=True asks show() to
# return a handle; it is not captured here.
show(p1, notebook_handle=True)

In [73]:
# Density histogram of target_mol_count (tmc_hist / tmc_h_edges come from the
# density=True np.histogram call). Axis labels added so the figure can be
# read on its own. Note that this rebinds p1.
p1 = figure(x_axis_label='Compounds per target', y_axis_label='Density of targets')
p1.quad(top=tmc_hist, bottom=0, left=tmc_h_edges[:-1], right=tmc_h_edges[1:],
        fill_color=Spectral6[4])

show(p1, notebook_handle=True)

In [95]:
from pymongo import MongoClient
import gridfs
# Open the `sacred` database on the MongoDB instance MongoClient() connects to
# with no arguments, and wrap it in GridFS to read stored files.
db = MongoClient().sacred
fs = gridfs.GridFS(db)

artifact_name = 'artifact://runs/24/Compound target histogram'
cth = fs.find_one({"filename": artifact_name})
if cth is None:
    # find_one returns None when nothing matches; fail here with the artifact
    # name instead of an opaque error from inside np.load.
    raise LookupError("no GridFS file named %r" % artifact_name)
# The GridFS file object is handed to np.load directly.
data = np.load(cth)

In [96]:
# List the array names stored in the loaded archive.
data.files

['count', 'density_edges', 'count_edges', 'density']

In [97]:
# Plot the 'density' histogram stored in the loaded artifact against its
# 'density_edges' bin edges.
# NOTE(review): the y label says "Percentage" while the plotted array is the
# one stored under 'density' — confirm which quantity the artifact holds.
# NOTE(review): confirm the x label matches the artifact that is loaded
# ("Compound target histogram" vs "Target compound histogram").
p1 = figure(x_axis_label='Compounds per target', y_axis_label='Percentage of targets')
p1.quad(top=data['density'], bottom=0, left=data['density_edges'][:-1], right=data['density_edges'][1:],
        fill_color=Spectral6[1])
show(p1, notebook_handle=True)

In [288]:
# List the filenames stored in GridFS (source files and run artifacts).
fs.list()

[u'/Users/sh/Code/newSeaModel/tfidf_experiment/CMerModel.py',
 u'/Users/sh/Code/newSeaModel/tfidf_experiment/cf_ingredient.py',
 u'/Users/sh/Code/newSeaModel/tfidf_experiment/dataset_ingredient.py',
 u'/Users/sh/Code/newSeaModel/tfidf_experiment/filter_ingredient.py',
 u'/Users/sh/Code/newSeaModel/tfidf_experiment/log_ingredient.py',
 u'/Users/sh/Code/newSeaModel/tfidf_experiment/similarity_measures.py',
 u'/Users/sh/Code/newSeaModel/tfidf_experiment/tfidf_experiment.py',
 u'artifact://runs/22/Compound target histogram',
 u'artifact://runs/22/Target compound histogram',
 u'artifact://runs/23/Compound target histogram',
 u'artifact://runs/23/Target compound histogram',
 u'artifact://runs/24/Compound target histogram',
 u'artifact://runs/24/Target compound histogram',
 u'artifact://runs/24/c17mols',
 u'artifact://runs/24/c17targets',
 u'artifact://runs/24/c17tm',
 u'artifact://runs/26/Compound target histogram',
 u'artifact://runs/26/Target compound histogram',
 u'artifact://runs/27/Comp

In [201]:
# Load one stored artifact into `data`. This rebinds `cth` and `data`, so the
# cells that read `data` depend on which load cell ran last.
artifact_name = 'artifact://runs/39/compound based target similarity'
cth = fs.find_one({"filename": artifact_name})
if cth is None:
    # find_one returns None when nothing matches; report the missing name.
    raise LookupError("no GridFS file named %r" % artifact_name)
data = np.load(cth)

In [241]:
# Load one stored artifact into `data`. This rebinds `cth` and `data`, so the
# cells that read `data` depend on which load cell ran last.
artifact_name = 'artifact://runs/39/chemical features based target similarity'
cth = fs.find_one({"filename": artifact_name})
if cth is None:
    # find_one returns None when nothing matches; report the missing name.
    raise LookupError("no GridFS file named %r" % artifact_name)
data = np.load(cth)

In [353]:
# Load one stored artifact into `data`. This rebinds `cth` and `data`, so the
# cells that read `data` depend on which load cell ran last.
artifact_name = 'artifact://runs/103/C20 TFIDF doc:raw_count*idf, query:double_norm*idf similarity positions'
cth = fs.find_one({"filename": artifact_name})
if cth is None:
    # find_one returns None when nothing matches; report the missing name.
    raise LookupError("no GridFS file named %r" % artifact_name)
data = np.load(cth)

In [354]:
# List the array names stored in the loaded archive.
data.files

['cosine_sim_pos', 'dice_sim_pos']

In [355]:
# Display the whole 'cosine_sim_pos' array.
# NOTE(review): this dumps every row; a slice would keep the output short.
data['cosine_sim_pos']

array([[20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0],
       [ 3,  0,  1, 22, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
        38, 38, 38, 38],
       [ 2,  1,  2, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
        38, 38, 38, 38],
       [ 9,  0,  4,  6,  9, 20, 21, 29, 31, 35, 38, 38, 38, 38, 38, 38, 38,
        38, 38, 38, 38],
       [ 2, 14, 27, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
        38, 38, 38, 38],
       [ 1, 33, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
        38, 38, 38, 38],
       [19,  0,  1,  2,  3,  4,  5,  6,  7,  8, 10, 11, 12, 13, 14, 18, 22,
        25, 27, 31, 38],
       [ 2, 14, 30, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
        38, 38, 38, 38],
       [ 2,  0, 19, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
        38, 38, 38, 38],
       [ 2, 32, 36, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,
        38, 38,

In [358]:
# Count the entries in columns 1..19 of 'dice_sim_pos' that are below 31.
# NOTE(review): the slice 1:20 leaves out the last column (index 20) of a
# 21-column row like those shown for 'cosine_sim_pos' — possible off-by-one, confirm.
# NOTE(review): 31 is an unexplained cutoff — presumably a rank threshold; confirm.
(data['dice_sim_pos'][:,1:20] < 31).sum()

204

In [359]:
# Sum of the first column of 'dice_sim_pos'.
# NOTE(review): column 0 presumably holds a per-row count of valid entries — confirm.
data['dice_sim_pos'][:,0].sum()

220

In [360]:
# Ratio of the below-cutoff entry count to the first-column total, computed
# from the array itself rather than from numbers retyped out of earlier
# outputs (204 and 220), which would go stale as soon as `data` changes.
# NOTE(review): the 1:20 slice and the cutoff 31 are copied from the cells
# that produced those two numbers — confirm both.
below_cutoff = (data['dice_sim_pos'][:, 1:20] < 31).sum()
first_col_total = data['dice_sim_pos'][:, 0].sum()
float(below_cutoff) / float(first_col_total)

0.9272727272727272

In [181]:
# Index pairs strictly above the main diagonal (k=1) of a square matrix with
# as many rows as 'dice_similarity'; other cells reuse iu1 to select those
# entries.
n_rows = data['dice_similarity'].shape[0]
iu1 = np.triu_indices(n_rows, k=1)
# Number of above-diagonal entries selected.
len(data['dice_similarity'][iu1])

3321

In [256]:
# Main diagonal of the 'cosine_similarity' matrix.
np.diagonal(data['cosine_similarity'])

array([ 0.00664786,  0.01942077,  0.00435298,  0.00059826,  0.02459053,
        0.00359271,  0.00753114,  0.02806139,  0.01041852,  0.01999435,
        0.02391898,  0.00903934,  0.02524426,  0.0010468 ,  0.14990014,
        0.01214686,  0.02721564,  0.01617415,  0.00700634,  0.01500623,
        0.01928594,  0.00887264,  0.03252033,  0.02862813,  0.01950742,
        0.03096686,  0.01316449,  0.05247587,  0.03582582,  0.03954834,
        0.03726218,  0.00625069,  0.00608051,  0.02062885,  0.03836094])

In [255]:
# Main diagonal of the 'dice_similarity' matrix.
np.diagonal(data['dice_similarity'])

array([ 0.00513887,  0.01418224,  0.00255933,  0.00033982,  0.0098182 ,
        0.00076302,  0.00443928,  0.01107934,  0.00509706,  0.00804548,
        0.01126287,  0.00438509,  0.0080587 ,  0.0002848 ,  0.06678889,
        0.00511741,  0.01555696,  0.01553298,  0.00155628,  0.00552059,
        0.01016119,  0.00513602,  0.00898034,  0.00994821,  0.01048672,
        0.00904403,  0.01029255,  0.02177278,  0.01293873,  0.00996673,
        0.00925787,  0.00229885,  0.00213676,  0.01213948,  0.00962927])

In [259]:
# Distribution of every entry of the 'cosine_similarity' matrix (diagonal and
# both triangles included), as the percentage of entries falling in each bin.
hist, edges = np.histogram(data['cosine_similarity'], density=False, bins='auto')
hist = np.true_divide(hist, hist.sum())*100
# The x axis carries the histogrammed similarity values and the y axis the
# share of matrix entries; the old labels were copied from an unrelated plot.
p1 = figure(x_axis_label='Cosine similarity', y_axis_label='Percentage of entries')
p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color=Spectral6[1])
show(p1, notebook_handle=True)

In [219]:
# Display the current `hist` array; its contents depend on which histogram
# cell ran last.
hist

array([707, 619, 351, 207, 178, 123, 118, 108, 111,  97,  88,  85,  67,
        54,  34,  30,  27,  29,  25,  24,  17,  17,  11,  13,  10,   8,
         7,   6,   8,   2,   4,   7,   3,   5,   2,   3,   2,   3,   2,
         1,   1,   0,   1,   4,   0,   1,   2,   1,   0,   2,   5,   6,
         1,   2,   1,   3,   1,   2,   0,   1,   0,   1,   0,   0,   1,
         2,   0,   2,   2,   0,   1,   0,   2,   1,   2,   3,   2,   2,
         1,   1,   0,   0,   1,   1,   0,   0,   1,   0,   0,   0,   0,
         1,   1,   1,   1,   1,   0,   1,   0,   0,   1,   2,   4,   1,
         2,   2,   1,   4,   0,   2,   2,   1,   1,   3,   1,   3,   0,
         1,   1,   4,   0,   2,   0,   0,   0,   1,   1,   0,   0,   0,
         0,   0,   1,   0,   1])

In [260]:
# Distribution of every entry of the 'dice_similarity' matrix (diagonal and
# both triangles included), as the percentage of entries falling in each bin.
hist, edges = np.histogram(data['dice_similarity'], density=False, bins='auto')
hist = np.true_divide(hist, hist.sum())*100
# The x axis carries the histogrammed similarity values and the y axis the
# share of matrix entries; the old labels were copied from an unrelated plot.
p1 = figure(x_axis_label='Dice similarity', y_axis_label='Percentage of entries')
p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color=Spectral6[1])
show(p1, notebook_handle=True)

In [239]:
# Density histogram of the 'cosine_similarity' entries selected by iu1 (the
# above-diagonal index pairs built with np.triu_indices in another cell).
hist, edges = np.histogram(data['cosine_similarity'][iu1], density=True, bins='auto')
# density=True yields a density, not a percentage, and the x axis carries the
# similarity values; the old labels were copied from an unrelated plot.
p1 = figure(x_axis_label='Cosine similarity', y_axis_label='Density')
p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color=Spectral6[1])
show(p1, notebook_handle=True)

In [240]:
# Density histogram of the 'dice_similarity' entries selected by iu1 (the
# above-diagonal index pairs built with np.triu_indices in another cell).
hist, edges = np.histogram(data['dice_similarity'][iu1], density=True, bins='auto')
# density=True yields a density, not a percentage, and the x axis carries the
# similarity values; the old labels were copied from an unrelated plot.
p1 = figure(x_axis_label='Dice similarity', y_axis_label='Density')
p1.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
        fill_color=Spectral6[1])
show(p1, notebook_handle=True)

In [103]:
# NOTE(review): this cell fails as recorded below — pickle.load needs an open
# file object, but `data` was a DataFrame when it ran. No explicit
# `import pickle` is visible in this notebook either.
# pickle.load can execute arbitrary code; only use it on trusted files.
pickle.load(data)

AttributeError: 'DataFrame' object has no attribute 'readline'