In [1]:
import pandas as pd
import os
import numpy as np
from collections import defaultdict
import rpy2.robjects as robjects
import time

In [2]:
robjects.r('''
getObs_ch1 <- function(ls) {
  # Reduce a raw challenge table to the three columns used for scoring.
  # The two identifier columns are coerced to character and SYNERGY_SCORE
  # is exposed under the name OBSERVATION.
  cell_line <- as.character(ls$CELL_LINE)
  combination_id <- as.character(ls$COMBINATION_ID)
  observed <- data.frame(CELL_LINE=cell_line,
                         COMBINATION_ID=combination_id,
                         OBSERVATION=ls$SYNERGY_SCORE)
  return(observed)
}

# Get the drug combinations score of Subchallenge 1
getDrugCombiScore_ch1 <- function(obs, pred, confidence=NA, topX=10) {
  # Per-combination correlation score for Subchallenge 1.
  #
  # obs, pred  : paths to CSV files; obs needs CELL_LINE, COMBINATION_ID and
  #              SYNERGY_SCORE, pred needs CELL_LINE, COMBINATION_ID and PREDICTION.
  # confidence : optional path to a CSV with COMBINATION_ID and CONFIDENCE columns.
  #              When it is NA or does not name an existing file, the plain
  #              summary over all combinations is returned.
  # topX       : percentage of combinations (ranked by confidence) kept in the
  #              confidence-weighted variant.
  #
  # Returns a named vector c(mean, ste, n) rounded to 2 digits. Note that
  # 'ste' is computed with sd(), i.e. it is a standard deviation.
  R <- c()
  obs <- read.csv(obs,stringsAsFactors = F)
  obs <- getObs_ch1(obs)
  pred <- read.csv(pred,stringsAsFactors=F)
  # Put the predictions in the same row order as the observations.
  pred <- pred[match(paste(obs$CELL_LINE,obs$COMBINATION_ID),paste(pred$CELL_LINE,pred$COMBINATION_ID)),]

  pred$COMBINATION_ID <- gsub(" ", "", pred$COMBINATION_ID)
  # One Pearson correlation per drug combination, across its cell lines.
  for (i in as.character(unique(obs$COMBINATION_ID))) {
      R <- c(R, cor(obs[obs$COMBINATION_ID == i, 'OBSERVATION'], 
                    pred[pred$COMBINATION_ID == i, 'PREDICTION']))
  }
  #Make NA's in R = 0
  R[is.na(R)] = 0
  names(R) <- as.character(unique(obs$COMBINATION_ID))
  
  # file.exists() only accepts character input, so the default confidence=NA
  # (a logical NA) has to be tested before it is handed to file.exists().
  if (is.na(confidence) || !file.exists(confidence))
    return(round(c(mean=mean(R),
             ste=sd(R),
             n=sum(!is.na(R))),2))
  
  confidence <- read.csv(confidence,stringsAsFactors=F)
  confidence <- confidence[match(unique(obs$COMBINATION_ID),confidence$COMBINATION_ID),]

  # Average the correlations found at the top nVal confidence ranks over
  # nStep orderings; sample() only breaks ties between equal confidences.
  nStep <- 1000
  nVal <- round(topX * (length(R) / 100))
  boot_R <- rep(0, nVal)
  for (i in 1:nStep) {
    idx <- order(confidence$CONFIDENCE, sample(length(R)), decreasing = T)[1:nVal]
    boot_R <- boot_R + R[idx]
  }
  
  return(round(c(mean=mean(boot_R/nStep),
                 ste=sd(boot_R/nStep),
                 n=sum(!is.na(boot_R/nStep))),2))
}

# ------------------------------------------------------------------------------------
# Get the global score of Subchallenge 1
# ------------------------------------------------------------------------------------
getGlobalScore_ch1 <- function(obs, pred) {
  # Global score: correlation between observed and predicted synergy after
  # partialling out the per-combination median (z1) and the per-cell-line
  # median (z0) of the observed synergy.
  #
  # obs, pred : paths to CSV files (same columns as for getDrugCombiScore_ch1).
  # Returns a named vector c(score=...).
  obs <- getObs_ch1(read.csv(obs, stringsAsFactors=F))
  pred <- read.csv(pred,stringsAsFactors=F)

  # Align prediction rows with observation rows on (cell line, combination).
  obs_key <- paste(obs$CELL_LINE,obs$COMBINATION_ID)
  pred_key <- paste(pred$CELL_LINE,pred$COMBINATION_ID)
  pred <- pred[match(obs_key,pred_key),]

  x = obs$OBSERVATION
  y = pred$PREDICTION

  # Median observed synergy of each cell line, expanded back to one value per row.
  cell_median <- aggregate(OBSERVATION ~ CELL_LINE, obs, median)
  z0 <- cell_median$OBSERVATION[match(obs$CELL_LINE, cell_median$CELL_LINE)]

  # Median observed synergy of each combination, expanded back to one value per row.
  combo_median <- aggregate(OBSERVATION ~ COMBINATION_ID, obs, median)
  z1 <- combo_median$OBSERVATION[match(obs$COMBINATION_ID, combo_median$COMBINATION_ID)]

  # First-order partial correlation of u and v given w.
  parCor <- function(u,v,w) {
    r_uv <- cor(u,v)
    r_uw <- cor(u,w)
    r_wv <- cor(w,v)
    numerator = r_uv - r_uw * r_wv
    denumerator = sqrt(1-r_uw^2) * sqrt(1-r_wv^2)
    return(numerator/denumerator)
  }

  r_xy <- parCor(x,y,z1)
  r_xz0 <- parCor(x,z0,z1)
  r_z0y <- parCor(z0,y,z1)

  # Second-order step: also partial out z0 from the z1-adjusted correlations.
  numerator = r_xy - r_xz0 * r_z0y
  denumerator = sqrt(1-r_xz0^2) * sqrt(1-r_z0y^2)

  return(c(score=numerator/denumerator))
}
''')

<SignatureTranslatedFunction - Python:0x000000000A9679C8 / R:0x000000000C9179B0>

In [6]:
file='challone.ipynb'
print "last modified: %s" % time.ctime(os.path.getmtime(file)) #파일 수정된 날짜
print "created: %s\n" % time.ctime(os.path.getctime(file)) #파일 생성된 날짜

cell_line_bool=1
drug_id_bool=1
drug_target_bool=1
mutation_bool=1
lasso_polynomial_feature_bool=1
lasso_polynomial_feature_leaderboard_bool=0

#feature 무엇을 썼는지 file 이름에 명시
output_file_name=''
if cell_line_bool: output_file_name+='cell_line_id'
if drug_id_bool: output_file_name+='drug_id_'
if drug_target_bool: output_file_name+='drug_target_'
if mutation_bool: output_file_name+='mutation_'
if lasso_polynomial_feature_bool: output_file_name+='lasso_polynomial_'
if lasso_polynomial_feature_leaderboard_bool: output_file_name+='lasso_polynomial_leaderboard_'
output_file_name+='challone_result'

libfm_id=0 #libfm의 id 값
meta_list=[] #libfm option중 meta id list
meta_id=0 #meta id 값

if cell_line_bool:
    cell_line_id_dict={}
    #cell_line을 보면 train data에서 한 cell line은 모든 qa가 -1이다 그러나 제외하지 않고 넣자, 결과가 더 좋음
    df=pd.read_csv('input/ch1_train_combination_and_monoTherapy.csv')
    for i,name in enumerate(set(df['CELL_LINE'])):
        cell_line_id_dict[name]=i
        meta_list.append(str(meta_id))

    print 'cell line id',len(cell_line_id_dict)
    libfm_id+=len(cell_line_id_dict)
    meta_id+=1

if drug_id_bool:
    drug_name_id_dict={}
    df=pd.read_csv('drug/Drug_info_release.csv')
    for i,name in enumerate(set(df['ChallengeName']),start=libfm_id):
        drug_name_id_dict[name]=i
        meta_list.append(str(meta_id))
    
    print 'drug id',len(drug_name_id_dict)
    libfm_id+=len(drug_name_id_dict)
    meta_id+=1

if drug_target_bool:
    df=pd.read_csv('drug/Drug_info_release_extended.csv')
    drug_target_set=set() # drug target set
    for targets in df['Target']:
        if pd.notnull(targets):
            targetSplit=targets.split(',')
            for i in range(len(targetSplit)):
                if '*' not in targetSplit[i]: #*이 들어가는 gene target은 해석이 불가능 하므로 제외
                    drug_target_set.add(targetSplit[i].strip())

    drug_target_id_dict={}
    for i,drug_target in enumerate(drug_target_set,start=libfm_id):
        drug_target_id_dict [drug_target]=i
        meta_list.append(str(meta_id))

    drug_target_dict={}
    for row in df.itertuples():
        tmp=set()
        if pd.notnull(row[2]):
            targetSplit=row[2].split(',')
            for i in range(len(targetSplit)):
                if '*' not in targetSplit[i]:
                    tmp.add(drug_target_id_dict[targetSplit[i].strip()])
        drug_target_dict[row[1]]=tmp

    print 'drug target',len(drug_target_set)
    libfm_id+=len(drug_target_set)
    meta_id+=1


if mutation_bool:
    # Mutation features: one feature id per target name listed in
    # drug/notInMut.csv; a cell line gets +1 for targets it carries a primary
    # mutation in and -1 for the remaining targets.
    df=pd.read_csv('drug/notInMut.csv')
    drug_target_set=set() # drug target set
    for targets in df['Target']:
        if pd.notnull(targets): # skip empty cells
            targetSplit=targets.split(',')
            for i in range(len(targetSplit)):
                if '*' not in targetSplit[i]:
                    drug_target_set.add(targetSplit[i].strip())

    # NOTE: this rebinds drug_target_id_dict; from here on it holds the
    # mutation feature ids, not the ids assigned in the drug target section.
    drug_target_id_dict={}
    for i,drug_target in enumerate(drug_target_set,start=libfm_id):
        drug_target_id_dict [drug_target]=i
        meta_list.append(str(meta_id))

    cell_line_mutation={}
    df=pd.read_csv('mutation/mutations.csv')
    
    #---------------------------------------------------------------------------------------
    # Only rows with Tumour.origin == 'primary' are used: metastasis mutations are not
    # turned into mutation features even when the mutated gene is a drug target.
    #---------------------------------------------------------------------------------------
    # Positional access: row[1] is read as the gene name, row[5] as the cell line key and
    # row[15] is compared with the protein change 'p.V600E'
    # (assumes this column layout of mutations.csv - TODO confirm).
    for row in df[df['Tumour.origin']=='primary'].itertuples():
        gene_name=row[1]
        if gene_name in drug_target_id_dict:
            # BRAF V600E is mapped to its own 'BRAF_V600E' feature instead of 'BRAF'.
            # NOTE(review): raises KeyError unless 'BRAF_V600E' is among the parsed targets.
            if gene_name=='BRAF' and row[15]=='p.V600E':
                if row[5] in cell_line_mutation: cell_line_mutation[row[5]].add(drug_target_id_dict['BRAF_V600E'])
                else: cell_line_mutation[row[5]]=set([drug_target_id_dict['BRAF_V600E']])
            else:
                if row[5] in cell_line_mutation: 
                    cell_line_mutation[row[5]].add(drug_target_id_dict[gene_name])
                else: cell_line_mutation[row[5]]=set([drug_target_id_dict[gene_name]])

    # Targets that are NOT mutated in a cell line get the value -1.
    # Seen over the whole target dimension: if only target #2 is mutated, the
    # cell line's values look like -1 -1 1 -1 -1 ...
    # Only cell lines that already have at least one entry in cell_line_mutation are visited.
    drug_target_not_in_cell_line=defaultdict(set)
    for cell_line_var in cell_line_mutation:
        for drug_target_id_var in drug_target_id_dict.values():
            if drug_target_id_var not in cell_line_mutation[cell_line_var]:
                drug_target_not_in_cell_line[cell_line_var].add(drug_target_id_var)

    print 'mutations',len(drug_target_id_dict)
    libfm_id+=len(drug_target_id_dict)
    meta_id+=1
    
if lasso_polynomial_feature_bool:
    lasso_poly_feature_list=[]
    feature_num=0 #feature 개수
    
    with open('lasso/df_simple_0.03_350.csv','r') as fr:
        lines=fr.readlines()
        feature_num=len(lines[0].strip().split(',')[1:])
        
        for line in lines[1:]:
            feature=line.strip().split(',')
            result=''
            for i in range(len(feature[1:])):
                if float(feature[i+1]): result+=str(i+libfm_id)+':'+feature[i+1]+' '
            lasso_poly_feature_list.append(result.strip())
            
    meta_list.extend([str(meta_id)]*feature_num)
    print 'poly feature',feature_num
    libfm_id+=feature_num
    meta_id+=1
    
if lasso_polynomial_feature_leaderboard_bool:
    lasso_poly_feature_leaderboard_list=[]
    feature_num=0 #feature 개수
    
    with open('lasso/df_simple_0.03_350.csv','r') as fr:
        lines=fr.readlines()
        feature_num=len(lines[0].strip().split(',')[1:])
        
        for line in lines[1:]:
            feature=line.strip().split(',')
            result=''
            for i in range(len(feature[1:])):
                if float(feature[i+1]): result+=str(i+libfm_id)+':'+feature[i+1]+' '
            lasso_poly_feature_leaderboard_list.append(result.strip())
            
    meta_list.extend([str(meta_id)]*feature_num)
    print 'poly feature leaderboard',feature_num
    libfm_id+=feature_num
    meta_id+=1

print '총 feature 개수: ',libfm_id

trainDir='input/chall1/ch1_train_combination_and_monoTherapy_qa1_shuffle.csv'
print 'train data:',trainDir.split('/')[-1]

#최종제출
#testDir='input/chall1/ch1_leaderBoard_monoTherapy_blankErased.csv'
#print testDir.split('/')[-1]

libfm_train='libfmInput/fmTrain.libfm' #libfm input file 이름
libfm_test='libfmInput/fmTest.libfm'
meta_file_directory='libfmInput/meta.txt' #group data file 이름

with open(meta_file_directory,'w') as fw:
    result=''
    for item in meta_list:
        result+=item+'\n'
    fw.write(result)

def makeLibfmInput(file_directory,libfm_file,score):
    """Convert a challenge CSV into a libFM input file, one line per CSV row.

    Parameters
    ----------
    file_directory : path of the CSV to read. Columns are read by position:
        1 = cell line, 2 = drug A, 3 = drug B and, when ``score`` is true,
        12 = synergy score (assumes the challenge file layout - TODO confirm).
    libfm_file : path of the libFM file to write (overwritten).
    score : when true the synergy score is written as the target value,
        otherwise the target is the placeholder '0'.

    Which feature groups are written is controlled by the module-level
    ``*_bool`` switches; the id lookups come from the module-level
    dictionaries built by the feature set-up code.

    Each line is '<target> <id>:<value> ...' with single spaces between tokens.
    """
    df=pd.read_csv(file_directory)

    with open(libfm_file,'w') as fw:
        # row_id indexes the per-row lasso feature strings
        for row_id,row in enumerate(df.itertuples()):
            cell_line_name=row[1]
            drug_a=row[2]
            drug_b=row[3]

            # target value; the score column is only touched when it is needed
            tokens=[str(row[12]) if score else '0']

            if cell_line_bool and cell_line_name in cell_line_id_dict:
                tokens.append(str(cell_line_id_dict[cell_line_name])+':1') # cell line

            if mutation_bool:
                if cell_line_name in cell_line_mutation:
                    tokens.extend(str(x)+':1' for x in cell_line_mutation[cell_line_name])
                if cell_line_name in drug_target_not_in_cell_line:
                    tokens.extend(str(x)+':-1' for x in drug_target_not_in_cell_line[cell_line_name])

            if drug_id_bool:
                tokens.append(str(drug_name_id_dict[drug_a])+':1')
                tokens.append(str(drug_name_id_dict[drug_b])+':1')

            if drug_target_bool:
                targets_a=drug_target_dict[drug_a]
                targets_b=drug_target_dict[drug_b]
                shared=targets_a.intersection(targets_b)
                # a target hit by both drugs gets twice the feature value
                tokens.extend(str(x)+':2' for x in shared)
                tokens.extend(str(x)+':1' for x in targets_a-shared)
                tokens.extend(str(x)+':1' for x in targets_b-shared)

            if lasso_polynomial_feature_bool:
                tokens.append(lasso_poly_feature_list[row_id])

            if lasso_polynomial_feature_leaderboard_bool:
                # index the per-row feature list (previously the integer switch itself was indexed)
                tokens.append(lasso_poly_feature_leaderboard_list[row_id])

            # rows without non-zero lasso features contribute an empty string; drop those
            fw.write(' '.join(token for token in tokens if token)+'\n')

makeLibfmInput(trainDir,libfm_train,True)
#makeLibfmInput(testDir,libfm_test,False) #최종 제출시
        
#train data를 cross validation set으로 분할하는 코드 (미리 train data가 shuffle 되어 있어야 한다)
#input: libfm_train
#output: cvTrain.libfm & cvTest.libfm

is_cross_validation=False
if is_cross_validation:
    feature_string=[] #feature string
    number_of_rows=0 #number of rows

    print 'cross validation libfm data:',libfm_train.split('/')[-1]
    with open(libfm_train,'r') as fr:
        for row in fr:
            feature_string.append(row.strip())
            number_of_rows+=1

    subset_size=number_of_rows/10
    for i in range(10):
        with open('cv/cv_test'+str(i)+'.libfm','w') as fw:
            if i==9:
                for item in feature_string[i*subset_size:]:
                    print>>fw, item
            else:
                for item in feature_string[i*subset_size:][:subset_size]:
                    print>>fw, item
        with open('cv/cv_train'+str(i)+'.libfm','w') as fw:
            for item in feature_string[:i*subset_size] + feature_string[(i+1)*subset_size:]:
                print>> fw, item

last modified: Wed Feb 03 14:49:11 2016
created: Tue Sep 29 22:53:21 2015

cell line id 85
drug id 119
drug target 90
mutations 68
poly feature 306
총 feature 개수:  668
train data: ch1_train_combination_and_monoTherapy_qa1_shuffle.csv


In [None]:
# Challenge 1 cross validation: write the batch files that run libFM on every fold,
# so the folds can be started side by side (10-fold cross validation takes a long time).
# Backslashes in the Windows paths are doubled because sequences such as \r would
# otherwise be read as escape characters.
#
# input: cv_train'X'.libfm, cv_test'X'.libfm, meta'X'.txt
# output: prediction/result0~9

def create_cross_validation_libFM_bat_file(dim='30',iter='1000',regular='0,0,1',method='mcmc'):
    """Write bat/train0.bat ... bat/train9.bat, one libFM command per fold.

    dim     : size of the pairwise factor dimension, appended to '1,1,' (string).
    iter    : number of iterations (string).
    regular : regularisation triple, only used when method is 'sgd'.
    method  : 'sgd' (learn rate fixed to 0.00001) or 'mcmc'; for any other
              value no solver options are written.

    NOTE(review): each command points at ..\\libfmInput\\meta<i>.txt - confirm that
    a per-fold meta file is actually produced somewhere.
    """
    for fold_index in range(10):
        fold_tag=str(fold_index)
        with open('bat/train'+fold_tag+'.bat','w') as fw:
            command="..\\libfm -task r -train ..\\cv\\cv_train"+fold_tag+".libfm -test ..\\cv\\cv_test"+fold_tag+".libfm -dim '1,1,"
            if method=='sgd':
                command+=dim+"' -iter "+iter+" -method sgd -learn_rate 0.00001 -regular "+regular
            elif method=='mcmc':
                command+=dim+"' -iter "+iter
            command+=" -meta ..\\libfmInput\\meta"+fold_tag+".txt -init_stdev 0.1 -out ..\\prediction\\result"+fold_tag
            fw.write(command)
            
# Write the ten per-fold batch files for an SGD run: 1000 iterations,
# regularisation '0,0.01,1' and the default dim of '30'.
create_cross_validation_libFM_bat_file(iter='1000',regular='0,0.01,1',method='sgd')

In [257]:
#cross validation 결과 출력 코드
#
#input: result0 ~ result9
#output: cvFileName

cvFileName='cv/cross_validation'+output_file_name+'.csv'

prTest=pd.read_csv(trainDir)
pred=prTest.loc[:,['CELL_LINE','COMBINATION_ID','SYNERGY_SCORE']]

fold=10
tmp=[]
for j in range(fold):
    with open('prediction/result'+str(j),'r') as fr:
        for r in fr:tmp.append(float(r.strip()))

pred.loc[:,'PREDICTION']=np.asarray(tmp)
pred.to_csv(cvFileName,index=False)

quiet=False #score 결과 출력
if not quiet:
    print trainDir.split('/')[-1],cvFileName
    #global score
    print robjects.r['getGlobalScore_ch1'](trainDir,cvFileName)[0]
    confidenceFile='NA'
    print str(robjects.r['getDrugCombiScore_ch1'](trainDir,cvFileName,confidenceFile)[0])+','+str(robjects.r['getDrugCombiScore_ch1'](trainDir,cvFileName,confidenceFile)[1])

ch1_train_combination_and_monoTherapy_qa1_shuffle.csv cv/1-1cv.csv
0.250985291443
0.22,0.36


#  leaderboard libFM submission code

In [None]:
#leader board bat file
def create_leaderboard_libFM_bat_file(dim='30',iter='1000',regular='0,0,1',method='mcmc'):
    """Write bat/train_leaderboard.bat with the libFM command for the leaderboard run.

    The command trains on ..\\libfmInput\\fmTrain.libfm, predicts
    ..\\libfmInput\\fmTest.libfm and writes ..\\prediction\\result_leaderboard.

    dim     : size of the pairwise factor dimension, appended to '1,1,' (string).
    iter    : number of iterations (string).
    regular : regularisation triple, only used when method is 'sgd'.
    method  : 'sgd' (learn rate fixed to 0.00001) or 'mcmc'; for any other
              value no solver options are written.
    """
    with open('bat/train_leaderboard.bat','w') as fw:
        fw.write("..\\libfm -task r -train ..\\libfmInput\\fmTrain.libfm -test ..\\libfmInput\\fmTest.libfm -dim '1,1,")
        if method=='sgd':
            fw.write(dim+"' -iter "+iter+" -method sgd -learn_rate 0.00001 -regular "+regular)
        elif method=='mcmc':
            fw.write(dim+"' -iter "+iter)
        # the leading space keeps '-meta' from fusing with the previous option value
        fw.write(" -meta ..\\libfmInput\\meta.txt -init_stdev 0.1 -out ..\\prediction\\result_leaderboard")

# Write the leaderboard batch file for an SGD run: 1000 iterations,
# regularisation '0,0.01,1' and the default dim of '30'.
create_leaderboard_libFM_bat_file(iter='1000',regular='0,0.01,1',method='sgd')

In [None]:
# Turn the libFM leaderboard predictions into a submission CSV with the
# columns CELL_LINE, COMBINATION_ID and PREDICTION.
# NOTE(review): testDir is only assigned in a commented-out line of the cells
# visible here; it has to be defined before this cell can run.
leaderboardFileName='chall1/1-2submission2.csv'
prTest=pd.read_csv(testDir)
pred=prTest.loc[:,['CELL_LINE','COMBINATION_ID']]

fold=10
# one prediction per line, in the row order of the test file
with open('prediction/result_leaderboard','r') as fr:
    tmp=[float(r.strip()) for r in fr]

pred.loc[:,'PREDICTION']=np.asarray(tmp)
pred.to_csv(leaderboardFileName,index=False)

In [None]:
if prerher2Bool:
    pr=pd.read_csv('cell/cbioportal_EMT_BREAST.csv')
    clprerher2={}
    #for row in pr[['CELL_LINE','HER2']].itertuples():
    #    tmp=''
    #    if row[2]:tmp+=str(libfmId)+':'+str(row[2])+' '
    #    #if row[3]:tmp+=str(libfmId+1)+':'+str(row[3])+' '
    #    #if row[4]:tmp+=str(libfmId+2)+':'+str(row[4])+' '
    #    clprerher2[row[1]]=tmp
    #libfmId+=1
    #groupData.extend(str(groupId))
    
    for row in pr[['CELL_LINE','PR','ER','HER2']].itertuples():
        tmp=''
        if row[2]:tmp+=str(libfmId)+':'+str(row[2])+' '
        if row[3]:tmp+=str(libfmId+1)+':'+str(row[3])+' '
        if row[4]:tmp+=str(libfmId+2)+':'+str(row[4])+' '
        clprerher2[row[1]]=tmp
    libfmId+=3
    groupData.extend([str(groupId)]*3)
    groupId+=1
    print 'PR,ER,HER2',3
    
if monoconcenBool:
    monoconcen={}
    denominator=100 #교수님의 전언 100으로 나누어라!
    print 'normalized by',denominator,
    for csvfile in os.listdir('ch1_training_combinations/'):
        pr=pd.read_csv('ch1_training_combinations/'+csvfile,index_col=0,engine='c')
        monoconcenVal=[]
        monoconcenVal.extend(pr.loc[:,'0'][1:6]) #Agent 1, Agent 2=0
        monoconcenVal.extend(pr.loc['0',:][1:6]) #Agent 2, Agent 1=0
        monoconcenVal=[float(x)/denominator for x in monoconcenVal]

        cellDrugcomboName=csvfile[:-9]
        if cellDrugcomboName in monoconcen:monoconcen[cellDrugcomboName].append(monoconcenVal)
        else:monoconcen[cellDrugcomboName]=[monoconcenVal]

    monoconcenAverage={}
    for mc in monoconcen:
        if len(monoconcen[mc])==3:averagetmp=[sum(x)/3 for x in zip(monoconcen[mc][0],monoconcen[mc][1],monoconcen[mc][2])]
        elif len(monoconcen[mc])==2:averagetmp=[sum(x)/2 for x in zip(monoconcen[mc][0],monoconcen[mc][1])]
        else:averagetmp=monoconcen[mc][0]

        tmp=''
        for i in range(len(averagetmp)):
            tmp+=str(libfmId+i)+':'+str(averagetmp[i])+' '
        monoconcenAverage[mc]=tmp
       
    groupData.extend([str(groupId)]*10)
    libfmId+=10
    groupId+=1
    print 'monotherapy concentration',10

if gseaBool:
    gseaFileName = "gsea/gsea_thre2_noMetabolism.csv"
    pr=pd.read_csv(gseaFileName,index_col=0)
    pathwayTmp=pr.columns
    pathwayId={}
    for i,name in enumerate(pathwayTmp,start=libfmId):
        pathwayId[name]=i
        groupData.append(str(groupId))
        
    clPathway=defaultdict(list)
    for col in pr.columns:
        for cellLine in pr.index:
            if pr.loc[cellLine,col]:clPathway[cellLine].append(pathwayId[col])
    
    libfmId+=len(pathwayId)
    groupId+=1
    print 'GSEA',len(pathwayId)
    
if tspairBool:
    pr=pd.read_csv("TSpair/tspair_10_negbinary(0.7).csv",index_col=0)
    tspairTmp=pr.columns
    tspairId={}
    for i,name in enumerate(tspairTmp,start=libfmId):
        tspairId[name]=i
        groupData.append(str(groupId))

    libfmId+=len(tspairId)

    clTSpair=defaultdict(list) 
    for col in pr.columns:
        for cellLine in pr.index:
            diff = pr[col].max() - pr[col].min()
            #if pr.loc[cellLine,col]:clTSpair[cellLine].append(str(tspairId[col])+":"+str(pr.loc[cellLine,col]/diff))   
            if pr.loc[cellLine,col]:clTSpair[cellLine].append(str(tspairId[col])+":"+str(pr.loc[cellLine,col]))  

    groupId+=1
    print 'TSpair 개수:',len(tspairId)
if drug_target_bool1:
    pr=pd.read_csv('drug/Drug_info_release_extended.csv')
    dtl=set() # drug target list
    for targets in pr['Target']:
        if pd.notnull(targets):
            targetSplit=targets.split(',')
            for i in range(len(targetSplit)):
                if '*' not in targetSplit[i]:
                    dtl.add(targetSplit[i].strip())

    dtId={}
    for i,d0 in enumerate(dtl,start=libfmId):
        dtId[d0]=i
        groupData.append(str(groupId))

    ddt1={}
    for row in pr.itertuples():
        tmp=set()
        if pd.notnull(row[2]):
            targetSplit=row[2].split(',')
            for i in range(len(targetSplit)):
                if '*' not in targetSplit[i]:
                    tmp.add(dtId[targetSplit[i].strip()])
        ddt1[row[1]]=tmp

    print 'drug target1',len(dtl)
    libfmId+=len(dtl)
    groupId+=1    
    
if maxconcBool:
    maxconca=libfmId
    maxconcb=libfmId+1
    libfmId+=2
    groupData.extend([str(groupId),str(groupId)])
    groupId+=1
    print 'max concentration',2

if einfBool:
    einfaId=libfmId
    einfbId=libfmId+1
    libfmId+=2
    groupData.extend([str(groupId),str(groupId)])
    groupId+=1
    print 'einf',2
if drugCombBool==1:
    dc={}
    pr=pd.read_csv('input/ch1_train_combination_and_monoTherapy.csv')
    for i,name in enumerate(set(pr['COMBINATION_ID']),start=libfmId):
        dc[name]=i
        groupData.append(str(groupId))
    libfmId+=len(dc)
    groupId+=1
    print 'drug combination id',len(dc)
if lassoPolyBool:
    df=pd.read_csv('tmp/lasso_polyfeatures.csv',index_col=0)
    idPolyFeature={}
    for _ in df.index:
        feature=''
        for poly in df.columns:
            if df.loc[_,poly]:feature+=str(int(poly)+libfmId)+':'+str(df.loc[_,poly])+' '
        idPolyFeature[_]=feature
    
    featureNum=len(df.columns)
    groupData.extend([str(groupId)]*featureNum)
    libfmId+=featureNum
    groupId+=1
    print 'lasso',featureNum
if stageBool:
    pr=pd.read_csv('cell/cell_comment_stage.csv')
    clStage={}
    maxStage=max(pr['stage'])
    #print maxStage
    for row in pr.itertuples():
        #row[1] == cell line name, row[2] == stage
        clStage[row[1]]=str(libfmId+row[2]-1)+':1'

    libfmId+=maxStage
    groupData.extend([str(groupId)]*maxStage)
    groupId+=1
    print 'cell comment stage',maxStage
    
if lipinskiBool:
    pr=pd.read_csv('drug/drug_lipinski.csv')
    drugLipinski={}
    maxLipinski=int(max(pr.loc[pr.Lipinski.notnull(),'Lipinski']))
    
    for row in pr[pr.Lipinski.notnull()].itertuples():
        drugLipinski[row[1]]=str(libfmId+int(row[2]))+':1'

    libfmId+=maxLipinski
    groupData.extend([str(groupId)]*maxLipinski)
    groupId+=1
    print 'lipinski',maxLipinski
    
if gexBool:
    dtl=[]
    with open('gex/drugTargetGex.txt','r') as fr:
        for line in fr:
            dtl.append(line.strip())

    gexId={}
    for i,d in enumerate(dtl,start=libfmId):
        gexId[d]=i
        groupData.append(str(groupId))    

    libfmId+=len(gexId)
    groupId+=1

    df=pd.read_csv('gex/gex.csv',index_col=0)
    df=df.loc[dtl,:]
    clGex={}
    for gene in df.index:
        std=df.loc[gene,:].std()
        mean=df.loc[gene,:].mean()
        for cellLine in df.columns:
            if cellLine not in clGex: clGex[cellLine]=''
            stdCoef=0 #이걸 바꿔주면서 실험해보면 어떨까, 0이면 그냥 양 음 체크
            #양의 값
            if df.loc[gene,cellLine]>=(mean+stdCoef*std):clGex[cellLine]+=str(gexId[gene])+':'+str(df.loc[gene,cellLine]/14)+' '
            #음의 값
            #if df.loc[gene,cellLine]<=(mean-stdCoef*std):clGex[cellLine]+=str(gexId[gene])+':'+str(df.loc[gene,cellLine]/14)+' '
    
    print 'gex',len(gexId)

# Row-level fragments: each block appends the tokens of one optional feature
# group to featureString for the current data row.
# NOTE(review): they read row, featureString, rowId and pr, none of which is
# assigned in this cell - presumably they belong inside a per-row loop such as
# the one in makeLibfmInput; confirm before running.
# Positional access used below: row[1] cell line, row[2] / row[3] the two drugs.
if stageBool:
    if row[1] in clStage: featureString+=clStage[row[1]]+' '
if lipinskiBool:
    if row[2] in drugLipinski: featureString+=drugLipinski[row[2]]+' '
    if row[3] in drugLipinski: featureString+=drugLipinski[row[3]]+' '

if gexBool:
    if row[1] in clGex: featureString+=clGex[row[1]]
    
if lassoPolyBool:
    # idPolyFeature is looked up with a running row counter
    featureString+=idPolyFeature[rowId]
    rowId+=1
if drug_target_bool1:
    u=ddt1[row[2]].intersection(ddt1[row[3]])
    for x in u: featureString+=str(x)+':2 ' # a target shared by both drugs gets twice the feature value
    for x in ddt1[row[2]]-u: featureString+=str(x)+':1 '
    for x in ddt1[row[3]]-u: featureString+=str(x)+':1 '

if maxconcBool:
    # row[4] / row[5] are divided by the maximum of pr's MAX_CONC_A / MAX_CONC_B column
    featureString+=str(maxconca)+':'+str(row[4]/pr['MAX_CONC_A'].max())+' '
    featureString+=str(maxconcb)+':'+str(row[5]/pr['MAX_CONC_B'].max())+' '

if einfBool:
    # row[8] / row[11] are divided by 100
    featureString+=str(einfaId)+':'+str(row[8]/100)+' '
    featureString+=str(einfbId)+':'+str(row[11]/100)+' '

if prerher2Bool:
    if row[1] in clprerher2:featureString+=clprerher2[row[1]]

if monoconcenBool:
    # monoconcenAverage is keyed by row[14]+'.'+row[1]
    if row[14]+'.'+row[1] in monoconcenAverage:featureString+=monoconcenAverage[row[14]+'.'+row[1]]

if gseaBool:
    if row[1] in clPathway:
        for x in clPathway[row[1]]:featureString+=str(x)+':1 '

if tspairBool:
    if row[1] in clTSpair: featureString+=' '.join(clTSpair[row[1]])+' '