In [2]:
# %load ../kiwi.py
# %load ../env.py
%load_ext autoreload
%autoreload 2
%pylab inline
%matplotlib inline
#%load_ext rpy2.ipython

import matplotlib.text as text
import pandas as pd
import numpy as np
import pylab as pl
import scipy as sp
import sys
import rpy2 
from rpy2.robjects import r, pandas2ri
import os 
from sklearn.metrics.pairwise import euclidean_distances,manhattan_distances,cosine_similarity

#Set environment variables


# Set up the local source files
#TOP = os.getcwd().replace('notebooks','')
TOP = "/home/ishah/ipynb/Camda18/"

LIB = TOP+'lib'
if not LIB in sys.path: 
    sys.path.insert(0,LIB)

os.environ['PYTHONPATH']=LIB


DAT_DIR = TOP + '/data/'
FIG_DIR = TOP + '/figs/'

CMAP_DIR= '/share/home/ishah/projects/HTTR/data/'
CMAP_DAT= '/mnt/data/CMap/CMap2.0/CEL1/'

from db.mongo import *
#DB=openMongo(db='httr_ph1',host='pb.epa.gov')
#DB1=openMongo(db='httr_v1',host='pb.epa.gov')
#MSG=openMongo(db='msigdb_v6',host='pb.epa.gov')
CMP=openMongo(db='cmap_v2',host='localhost')
#CMP = pymongo.MongoClient("mongodb://localhost/cmap_v2")['cmap_v2']
#from gexp.deseq2 import *
pd.options.display.max_colwidth = 500


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


# Goal

Use ML to predict DILI classifications. Each instance is an affymetrix array result for a single chemical treatment. 

## Data Processing
* Raw Data processing: The CEL data have to be loaded, normalized, and linked with relevant matched vehicle controls

* Probe Effect analysis: Probe level L2FC effects calcualted by comparison with controls  

* Probe filtering & FP generation: Create discrete versions of the data by z-score analysis and filtering out low perturbation probes

## DILI Prediction

* ML

* Connectivity mapping


## CAMDA

In [3]:
CAMDA0 = pd.read_excel(DAT_DIR+'CAMDA_Challange_dataset_filenames.xlsx',skiprows=2)
D_mcf7 = CAMDA0.iloc[:,:5]
D_mcf7.columns=['s_id','data_type','tox','perturbation_scan_id','vehicle_scan_ids']
D_mcf7['cell']='MCF7'
D_pc3  = CAMDA0.iloc[:,[0,1,2,5,6]]
D_pc3.columns=['s_id','data_type','tox','perturbation_scan_id','vehicle_scan_ids']
D_pc3['cell']='PC3'

CAMDA1 = pd.concat((D_mcf7,D_pc3))
CAMDA1['perturbation_scan_id']=CAMDA1.perturbation_scan_id.str.replace("'","")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


## CMap

In [4]:
#CMap2 = pd.read_excel(DAT_DIR+'cmap_instances_02.xls')
CMap2_db=pd.DataFrame(list(CMP.cmap_trt_info.find(dict(cell={'$in':['PC3','MCF7']}),
                                          dict(_id=0,pert_id=1,name=1,conc=1,timeh=1))))\
         .drop_duplicates()
            

In [5]:
CAMDA2 = CAMDA1.merge(CMap2_db,left_on='perturbation_scan_id',right_on='pert_id',how='left')

In [6]:
CAMDA2.head()

Unnamed: 0,s_id,data_type,tox,perturbation_scan_id,vehicle_scan_ids,cell,conc,name,pert_id,timeh
0,1,Training,1.0,5500024030403071907255.C05,.H01.E03.D04.B05.A06,MCF7,1.1e-05,acebutolol,5500024030403071907255.C05,6.0
1,2,Training,1.0,5500024030403071907253.A09,.H07.G08.E09.D10.B11.A12,MCF7,1.8e-05,aciclovir,5500024030403071907253.A09,6.0
2,3,Training,0.0,5500024030402071707279.B01,.H01.G02.E03.D04.B05.A06,MCF7,7e-06,amikacin,5500024030402071707279.B01,6.0
3,4,Training,1.0,5500024030403071907257.G04,.G02.E03.D04.B05.A06,MCF7,1.3e-05,amiloride,5500024030403071907257.G04,6.0
4,5,Training,0.0,5500024037496121008324.E08,.H07.G08.E09.D10.B11.A12,MCF7,3e-05,aminocaproic acid,5500024037496121008324.E08,6.0


In [7]:
CMP.cmap_dili.drop()
X = CAMDA2[['data_type','tox','cell','pert_id','name','conc','timeh']]
CMP.cmap_dili.insert_many(X.to_dict('records'))

<pymongo.results.InsertManyResult at 0x7fa47acb9200>

# DILI Set


In [None]:
RES = []
ii=0
trt_fmt = "%(cell)s-%(name)s-%(timeh)dh-%(conc)sM"
C = ['data_type','tox','cell','name','conc']
for P in CMP.cmap_dili.find({},dict(_id=0,pert_id=1,data_type=1,tox=1),
                           no_cursor_timeout=True):
    ii+=1
    X = CMP.cmap_fc.find_one(dict(pert_id=P['pert_id']),dict(_id=0))
    X.update(P)
    trt_str = trt_fmt % X
    Y = {k:X[k] for k in C}
    Y.update(X.pop('fc1'))
    Y.update(dict(trt_id=trt_str))
    
    RES.append(Y)
    if ii % 10 ==0: print ii
C.insert(0,'trt_id')
FC0 = pd.DataFrame(RES).set_index(C)
AFX=[i for i in FC0.columns if i.startswith('AFFX')]

FC0 = FC0.drop(AFX,axis=1)


# load dili chemical information

In [17]:
os.listdir(DAT_DIR)


['.ipynb_checkpoints',
 'dili-chemicals-1.xlsx',
 'ChemistryDashboard-Batch-Search_2018-04-16_05_19_23.tsv',
 'ChemistryDashboard-Batch-Search_2018-04-13_22_37_43.tsv',
 'hgu133_info.csv',
 'cmap_instances_02.xls',
 'ChemistryDashboard-Batch-Search_2018-04-16_05_03_06.tsv',
 'ChemistryDashboard-Batch-Search_2018-04-13_22_40_22.tsv',
 'cmap-db-v1a.ipynb',
 'ChemistryDashboard-Batch-Search_2018-04-13_21_54_14.tsv',
 'CAMDA_Challange_dataset_filenames.xlsx',
 'cmap_dili.csv',
 'cmap-chems-v1.tsv']

In [9]:
X1 = pd.read_csv(DAT_DIR+'ChemistryDashboard-Batch-Search_2018-04-13_22_37_43.tsv',sep='\t')
X2 = pd.read_csv(DAT_DIR+'ChemistryDashboard-Batch-Search_2018-04-13_22_40_22.tsv',sep='\t')

DSSTOX0 =pd.concat((X1,X2)).drop_duplicates()

In [10]:
DILI0 = pd.concat([pd.read_excel(DAT_DIR+'dili-chemicals-1.xlsx',sheet_name=i) for i in range(3)])

## get cmap chemical information

In [11]:
CMP.cmap_fc.create_index('name')

u'name_1'

In [39]:
X1 = pd.DataFrame(list(CMP.cmap_fc.distinct('name')))
X1.columns=['name']

In [42]:
file(DAT_DIR+'cmap-chems-v1.tsv','w').write('\n'.join(map(str,X1.name.unique())))

In [13]:
os.path.exists('/home/ishah/ipynb/Camda18/data/ChemistryDashboard-Batch-Search_2018-04-16_05_19_23.tsv')


True

In [22]:
#os.path.exists('/home/ishah/ipynb/Camda18/data/ChemistryDashboard-Batch-Search_2018-04-16_05_19_23.tsv')

X1 = pd.read_csv(DAT_DIR+'ChemistryDashboard-Batch-Search_2018-04-16_05_03_06.tsv',sep='\t')
X2 = pd.read_csv(DAT_DIR+'ChemistryDashboard-Batch-Search_2018-04-16_05_19_23.tsv',sep='\t')

CMAP0 =pd.concat((X1,X2)).drop_duplicates()\
        .rename(columns=dict(INPUT='cmap_name',
                             FOUND_BY='found_by',
                             DTXSID='dsstox_sid',
                             PREFERRED_NAME='name',
                             CASRN='casrn',
                             INCHI_KEY='inchi_key',
                             IUPAC_NAME='iupac_name',
                             SMILES='smiles',
                             QSAR_READY_SMILES='qsar_smiles'
                             ))

In [26]:
CMP.cmap_chems.insert_many(CMAP0.to_dict('records'))

<pymongo.results.InsertManyResult at 0x7fa47aba6ab8>