In [1]:
%reload_ext autoreload
%autoreload 2
%reload_ext sql 
%pylab inline
%matplotlib inline

import pandas as pd
import os, random

import pickle,time

# Date stamp (MM-DD-YYYY, local time) taken once when this cell runs.
# strftime() formats the current local time when no time tuple is passed.
tmstmp = time.strftime("%m-%d-%Y")


  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")


Populating the interactive namespace from numpy and matplotlib


# Initialisation

## Directory paths
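* TOP = path to the top-level project directory (the parent of `notebooks/`)
* LIB = TOP + 'lib' = path to the required Python source files
* DAT_DIR, RES_DIR = paths to the data and results directories
* FIG_DIR = path to where figures are stored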

In [2]:
# Set up the local source files
import sys  # made explicit: previously only available through a star import

# The notebook normally runs from <project>/notebooks, so the project root is
# the parent directory. Only a trailing 'notebooks' path component is removed;
# the same word appearing elsewhere in the path (e.g. in a parent directory
# name) is left alone.
_cwd = os.getcwd()
_root = os.path.dirname(_cwd) if os.path.basename(_cwd) == 'notebooks' else _cwd

# TOP keeps a trailing separator (as it had when run from notebooks/) so that
# any code concatenating TOP + 'name' keeps producing a valid path.
TOP = os.path.join(_root, '')
LIB = os.path.join(TOP, 'lib')
if LIB not in sys.path: sys.path.append(LIB)

# Directory paths end with a separator so file names can be appended directly.
DAT_DIR = os.path.join(TOP, 'data', '')
RES_DIR = os.path.join(TOP, 'results', '')
FIG_DIR = os.path.join(TOP, 'figs', '')


## Load packages
Open the `organtox_v1` MongoDB database. If this fails, check your MongoDB installation.

In [4]:
from organtox import *

# Connection settings for the organtox_v1 database.
# NOTE(review): host and credentials are hard-coded here and again in the
# engine bootstrap string; consider sourcing both from one configuration.
mongo_settings = dict(host='pb.epa.gov',
                      user='devel',
                      passwd='devel',
                      db='organtox_v1')
DB = openMongo(**mongo_settings)

# Show the available collections as a quick connectivity check.
DB.collection_names()

[u'chm_fp', u'ml_lr_v1', u'bio_fp', u'ml_run_v1', u'tox_fp']

## Parallelisation

The code uses [IPython's parallel computing](https://ipyparallel.readthedocs.io/) library. 
Make sure the parallel machine specified as the input to initParallel is running. 

In [6]:
from organtox import *

# Attach to the IPython parallel machine named 'my_parallel'; per the notes
# above it must already be running.
initParallel(parallel_machine='my_parallel')

# Display the direct view as a check that engines are attached.
# NOTE(review): d_view is never assigned in this notebook — presumably it is
# provided by organtox / initParallel; confirm.
d_view

<DirectView [0, 1, 2, 3,...]>

In [7]:
# Send the project paths to every engine so the bootstrap code below can
# locate the local source directory.
resp=d_view.push(dict(TOP=TOP,LIB=LIB))

# Bootstrap each engine: import organtox and open its own MongoDB handle.
# block=True waits for every engine and re-raises remote errors here, so a
# failed import or connection shows up in this cell rather than later as a
# NameError inside the mapped tasks. The AsyncResult is still returned and
# displayed.
# NOTE(review): the push above is presumably processed before this execute on
# each engine (same view, submission order) — confirm for your ipyparallel
# version.
# NOTE(review): host and credentials are hard-coded in the code string.
d_view.execute("""
import sys
if not LIB in sys.path: sys.path.append(LIB)
from organtox import *
DB = openMongo(host='pb.epa.gov',user='devel',passwd='devel',db='organtox_v1')
""", block=True)



<AsyncResult: finished>

# Supplemental Data

# Supervised Machine Learning

## The target organ toxicity outcomes

In [8]:
# Target-organ outcomes, grouped by the study-type prefix of their label.
# NOTE(review): CHR / MGR / SUB presumably denote study types — confirm the
# exact meaning against the data source.
_chr_outcomes = [
    u'CHR:Adrenal Gland', u'CHR:Bone Marrow', u'CHR:Brain', u'CHR:Eye',
    u'CHR:Heart', u'CHR:Kidney', u'CHR:Liver', u'CHR:Lung',
    u'CHR:Lymph Node', u'CHR:Mammary Gland', u'CHR:Pancreas',
    u'CHR:Pituitary Gland', u'CHR:Spleen', u'CHR:Stomach', u'CHR:Testes',
    u'CHR:Thymus', u'CHR:Thyroid Gland', u'CHR:Urinary Bladder',
    u'CHR:Uterus',
]
_mgr_outcomes = [
    u'MGR:Brain', u'MGR:Kidney', u'MGR:Ovary', u'MGR:Testes',
]
_sub_outcomes = [
    u'SUB:Adrenal Gland', u'SUB:Bone Marrow', u'SUB:Brain', u'SUB:Heart',
    u'SUB:Kidney', u'SUB:Liver', u'SUB:Lung', u'SUB:Spleen',
    u'SUB:Stomach', u'SUB:Testes', u'SUB:Thymus', u'SUB:Thyroid Gland',
]

# Full ordered list of outcomes to model: CHR first, then MGR, then SUB.
Outcomes = _chr_outcomes + _mgr_outcomes + _sub_outcomes
len(Outcomes)

35

## Identify all the datasets for machine learning



In [None]:
from organtox import *
import random
P_ALL = []

DB.ml_run_v1.drop()

for tox in Outcomes:
    RN0=[]
    # Get one seed for each toxicity classification
    seed = random.randint(1,1e6)      
    
    # Get the data 
    DS0 = getToxDataSet(tox,MDB=DB)

    N_p,N_n = (DS0['tox'][tox]==1).sum(),(DS0['tox'][tox]==0).sum()
    
    # For a balanced analysis 
    N_pn = N_p if N_p<N_n else N_n
    
    print "> Tox ",tox,N_p,'+', N_n,'-',time.strftime("%H:%M",time.localtime())
    sys.stdout.flush()
    sys.stderr.flush()
    
    for dt in ['bio','chm','ct','bc','bct']:
        for n_np in range(50,N_pn,5):
            RN0.append(dict(tox_class=tox,descriptor_type=dt,
                            num_negpos=n_np,rand_seed=seed))
        if n_np<N_pn:
            RN0.append(dict(tox_class=tox,descriptor_type=dt,
                            num_negpos=N_pn,rand_seed=seed))
            
    DB.ml_run_v1.insert_many(RN0)


In [8]:
# Sanity check: number of scheduled run documents and one example document.
DB.ml_run_v1.count(),DB.ml_run_v1.find_one()

(2670,
 {u'_id': ObjectId('585882cb072e60f47103a079'),
  u'descriptor_type': u'bio',
  u'num_negpos': 50,
  u'rand_seed': 819775,
  u'tox_class': u'CHR:Adrenal Gland'})

In [None]:
# Run the ML analysis for every scheduled run in ml_run_v1.
# Destructive: all previously stored results in ml_lr_v1 are dropped first.
DB.ml_lr_v1.drop()

# One work item per run document: (tox_class, descriptor_type, num_negpos, rand_seed).
WORK = [(w['tox_class'],w['descriptor_type'],w['num_negpos'],w['rand_seed']) for w in DB.ml_run_v1.find()]
# Randomise the processing order in place.
# NOTE(review): shuffle is not imported by name in this notebook — presumably
# it comes from %pylab or the organtox star import; confirm. It is unseeded.
shuffle(WORK)

print "Starting ML Analysis to assign toxicity " + time.strftime("%d/%m/%Y %H:%M",time.localtime())
print "Data sets: ", DB.ml_run_v1.count()


# Map each work item to runOrganToxML through lb_view.
# NOTE(review): lb_view is not assigned in this notebook (presumably created
# by initParallel — confirm). The lambda body, including DB and
# runOrganToxML, is presumably resolved in each engine's namespace as set up
# by the earlier d_view.execute cell — confirm.
# NOTE(review): whether this call blocks until all tasks finish depends on
# how lb_view is configured; if it does not, the "Ending" line below prints
# before the work is done — confirm.
P = lb_view.map(lambda (tox_i,dt_i,np_i,rs_i): runOrganToxML(tox_i,dt_i,np_i,rs_i,
                                                             ss_iters=20,cv_iters=10,cv_nfolds=5,
                                                             n_ds_min=5,n_ds_max=26,n_ds_step=1,
                                                             Col_ds=None,Col_lr=DB.ml_lr_v1,MDB=DB),
                    WORK)

print "Ending ML Analysis to assign activity " + time.strftime("%d/%m/%Y %H:%M",time.localtime())
#send_email(txt="Done",subj="ToxBrain Done")

Starting ML Analysis to assign toxicity 19/12/2016 20:17
Data sets:  2670


In [None]:
for i in ['pred','lr','n_ds','dt_in','dt_out','n_obs','ds_id',
          'perf_cvt.f1_mn','perf_cvt.bacc_mn']: 
    print '>',i
    DB.ml_lr_v1.create_index(i)

## Summarize ML results

In [None]:
from organtox import *

DB.ml_summary_v1.drop()
for pred in DB.ml_lr_v1.distinct('pred'):
    predPerfSummary(pred,Col_ml=DB.ml_lr_v1,Col_sum=DB.ml_summary_v1)
    print '>',pred,DB.ml_summary_v1.count()

In [None]:
DB.ml_summary_v1.drop()

print "Starting ML Summary  " + time.strftime("%d/%m/%Y %H:%M",time.localtime())

initParallel(Code="""
from organtox import *
""")

TOX = list(set(DB.ml_lr_v1.distinct('pred'))#.difference(DB.ml_summary_v1.distinct('pred')))

P = lb_view.map(lambda (tox_i): predPerfSummary(tox_i,Col_ml=DB.ml_lr_v1,Col_sum=DB.ml_summary_v1), TOX)

print "Ending ML Summary " + time.strftime("%d/%m/%Y %H:%M",time.localtime())


In [None]:
for i in ['pred','lr','n_ds','dt_in','dt_out','n_obs','f1_mn','pt','sens_mn','spec_mn','bacc_mn','acc_mn']: 
    print '>',i
    DB.ml_summary_v1.create_index(i)