In [31]:
from nipype import Function, Node, Workflow, IdentityInterface, JoinNode

wf = Workflow(name='wf_rf')
wf.base_dir = '/om/user/ysa'

def preproc(iteration):
    import os
    import numpy as np
    import pandas as pd
    from custom import utils
    
    txtb = '/storage/gablab001/data/genus/current/structured/genus/text_files_for_indexing'
    
    # directory to where the genus brain that is
    bd = '/storage/gablab001/data/genus/current/structured/brain/'
    
    # the header text file for the 170 columns, this will be used to subset the 
    # entire genus data
    thickness = np.genfromtxt(os.path.join(txtb, '170_columns.txt'), dtype=str)
    
    # making equivalent volume headers from the thickness headers
    volume = ' '.join(thickness).replace('thickness_D','volume_D').split(' ')
    
    # this variable is for convenience so that i dont hav to change
    # where the headers are in multiple places, just here
    colheads = thickness
    
    # loading the covariate headers that will be one hot encoded
    cvar_encode = np.genfromtxt(os.path.join(txtb, 'covars_ecn.txt'), dtype=str)
    
    # the covariate headers that wont be one hot encoded
    cvar = np.genfromtxt(os.path.join(txtb, 'covars_no_ecn.txt'), dtype=str)
    
    # GENUS brain data
    brain = pd.read_csv(os.path.join(bd, 'GENUS_FS_ATLAS_D.csv'), low_memory=False)
    
    # GENUS response variable
    response = brain[['IID','GROUP']]
    
    # here i combined all the needed data so that I can drop rows all together and 
    # make sure all parts of the data, brain regions, covariates, ID, response are
    # sorted by the same rows
    combined = pd.concat([
        brain[colheads],
        brain[cvar],
        brain[cvar_encode],
        brain[['IID','GROUP']]
    ], axis=1).dropna().drop_duplicates('IID')

    # the genus y in the classification analysis
    response = combined['GROUP'].values

    # subsetting genus data to only include the 170 features
    X_data = combined[colheads].reset_index(drop=True)
    
    # getting the covariates that wont be one hot encoded
    cvar_ne = combined[cvar].reset_index(drop=True)
    
    # and the covariates that will be one hot encoded
    cvar_e = combined[cvar_encode].reset_index(drop=True)
    
    # performing one hot encoding
    cvar_e = pd.concat([
        pd.DataFrame(utils.encoder(cvar_e[col])) for col in cvar_e.columns
    ], axis=1, ignore_index=True)

    # recombining covariates
    cvars = pd.concat([cvar_ne, cvar_e], axis=1)
    
    # response variable to be fed into classifier
    y = np.array([1 if i == 'Schizophrenia' else 0 for i in response])
    
    non_sing_cvars = utils.make_non_singular(cvars.values)
    XG = utils.proj(X_data.values, non_sing_cvars)

    data_dict = {'X':XG, 'y': y, 'XCOLS':X_data.columns.values, 'it':iteration}

    return data_dict

Preproc = Node(name='Preproc',
    interface=Function(input_names=['iteration'],
                       output_names=['data_dict'],
                       function=preproc)
)

iters = Node(IdentityInterface(fields=['iteration']), name='iters')
iters.iterables = ('iteration', range(170))
wf.connect(iters, 'iteration', Preproc, 'iteration')

def rf_func(data_dict):
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import make_scorer
    from sklearn.ensemble import RandomForestClassifier
    
    res_dict = {}
    
    for i in range(100):
    
        clf = Pipeline([
            ('scale', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=200))
        ])


        X = data_dict['X'][:, np.setdiff1d(range(170), data_dict['it'])]
        y = data_dict['y']
        cv = StratifiedShuffleSplit(n_splits=8)
        sfunc = make_scorer(roc_auc_score)
        score = np.mean(cross_val_score(clf, X=X, y=y, cv=cv, scoring=sfunc))
        res_dict[str(data_dict['it']) + '{}'.format(i)] = score
        
    return res_dict

RF = Node(name='RF',
    interface=Function(input_names=['data_dict'],
                       output_names=['res_dict'],
                       function=rf_func)
)

wf.connect(Preproc, 'data_dict', RF, 'data_dict')       
      
join_iter = JoinNode(IdentityInterface(fields=['res_dict']),
    joinsource='iters',
    joinfield=['res_dict'],
    name='join_iter'
)
        
wf.connect(RF, 'res_dict', join_iter, 'res_dict')
        
def save_result(res_dict):
    import pickle
    res = {}
    for result in res_dict:
        res.update(result)
    with open("/storage/gablab001/data/genus/pkls/rf_results.pkl", "wb") as s:
        pickle.dump(res, s, protocol=pickle.HIGHEST_PROTOCOL)
    return None
        
Save = Node(name='Save',
        interface=Function(input_names=['res_dict'],
                           output_names=[],
                           function=save_result)
)

wf.connect(join_iter, 'res_dict', Save, 'res_dict')

In [None]:
wf.run(plugin='SLURM', plugin_args={'sbatch_args':'--mem=12G -t 03:00:00'})

170414-14:36:13,48 workflow INFO:
	 ['check', 'execution', 'logging']
170414-14:36:13,526 workflow INFO:
	 Running in parallel.
170414-14:36:13,545 workflow INFO:
	 Pending[0] Submitting[170] jobs Slots[inf]
170414-14:36:13,548 workflow INFO:
	 Submitting: Preproc.a008 ID: 0
170414-14:36:13,554 workflow INFO:
	 [Job finished] jobname: Preproc.a008 jobid: 0
170414-14:36:13,558 workflow INFO:
	 Finished submitting: Preproc.a008 ID: 0
170414-14:36:13,561 workflow INFO:
	 Submitting: Preproc.a012 ID: 1
170414-14:36:13,569 workflow INFO:
	 [Job finished] jobname: Preproc.a012 jobid: 1
170414-14:36:13,572 workflow INFO:
	 Finished submitting: Preproc.a012 ID: 1
170414-14:36:13,575 workflow INFO:
	 Submitting: Preproc.a106 ID: 2
170414-14:36:13,583 workflow INFO:
	 [Job finished] jobname: Preproc.a106 jobid: 2
170414-14:36:13,587 workflow INFO:
	 Finished submitting: Preproc.a106 ID: 2
170414-14:36:13,590 workflow INFO:
	 Submitting: Preproc.a076 ID: 3
170414-14:36:13,596 workflow INFO:
	 [Jo

170414-14:36:14,203 workflow INFO:
	 Submitting: Preproc.a055 ID: 54
170414-14:36:14,208 workflow INFO:
	 [Job finished] jobname: Preproc.a055 jobid: 54
170414-14:36:14,213 workflow INFO:
	 Finished submitting: Preproc.a055 ID: 54
170414-14:36:14,218 workflow INFO:
	 Submitting: Preproc.a129 ID: 56
170414-14:36:14,223 workflow INFO:
	 [Job finished] jobname: Preproc.a129 jobid: 56
170414-14:36:14,229 workflow INFO:
	 Finished submitting: Preproc.a129 ID: 56
170414-14:36:14,231 workflow INFO:
	 Submitting: Preproc.a167 ID: 58
170414-14:36:14,240 workflow INFO:
	 [Job finished] jobname: Preproc.a167 jobid: 58
170414-14:36:14,245 workflow INFO:
	 Finished submitting: Preproc.a167 ID: 58
170414-14:36:14,247 workflow INFO:
	 Submitting: Preproc.a059 ID: 59
170414-14:36:14,265 workflow INFO:
	 [Job finished] jobname: Preproc.a059 jobid: 59
170414-14:36:14,269 workflow INFO:
	 Finished submitting: Preproc.a059 ID: 59
170414-14:36:14,271 workflow INFO:
	 Submitting: Preproc.a060 ID: 63
170414-

170414-14:36:15,232 workflow INFO:
	 Finished submitting: Preproc.a041 ID: 122
170414-14:36:15,235 workflow INFO:
	 Submitting: Preproc.a038 ID: 126
170414-14:36:15,243 workflow INFO:
	 [Job finished] jobname: Preproc.a038 jobid: 126
170414-14:36:15,246 workflow INFO:
	 Finished submitting: Preproc.a038 ID: 126
170414-14:36:15,248 workflow INFO:
	 Submitting: Preproc.a039 ID: 128
170414-14:36:15,253 workflow INFO:
	 [Job finished] jobname: Preproc.a039 jobid: 128
170414-14:36:15,257 workflow INFO:
	 Finished submitting: Preproc.a039 ID: 128
170414-14:36:15,259 workflow INFO:
	 Submitting: Preproc.a065 ID: 131
170414-14:36:15,263 workflow INFO:
	 [Job finished] jobname: Preproc.a065 jobid: 131
170414-14:36:15,267 workflow INFO:
	 Finished submitting: Preproc.a065 ID: 131
170414-14:36:15,269 workflow INFO:
	 Submitting: Preproc.a034 ID: 133
170414-14:36:15,273 workflow INFO:
	 [Job finished] jobname: Preproc.a034 jobid: 133
170414-14:36:15,278 workflow INFO:
	 Finished submitting: Prepro

170414-14:36:15,812 workflow INFO:
	 Submitting: Preproc.a025 ID: 192
170414-14:36:15,817 workflow INFO:
	 [Job finished] jobname: Preproc.a025 jobid: 192
170414-14:36:15,821 workflow INFO:
	 Finished submitting: Preproc.a025 ID: 192
170414-14:36:15,823 workflow INFO:
	 Submitting: Preproc.a104 ID: 196
170414-14:36:15,827 workflow INFO:
	 [Job finished] jobname: Preproc.a104 jobid: 196
170414-14:36:15,831 workflow INFO:
	 Finished submitting: Preproc.a104 ID: 196
170414-14:36:15,833 workflow INFO:
	 Submitting: Preproc.a162 ID: 197
170414-14:36:15,838 workflow INFO:
	 [Job finished] jobname: Preproc.a162 jobid: 197
170414-14:36:15,842 workflow INFO:
	 Finished submitting: Preproc.a162 ID: 197
170414-14:36:15,844 workflow INFO:
	 Submitting: Preproc.a067 ID: 199
170414-14:36:15,848 workflow INFO:
	 [Job finished] jobname: Preproc.a067 jobid: 199
170414-14:36:15,852 workflow INFO:
	 Finished submitting: Preproc.a067 ID: 199
170414-14:36:15,854 workflow INFO:
	 Submitting: Preproc.a114 ID

170414-14:36:16,499 workflow INFO:
	 [Job finished] jobname: Preproc.a019 jobid: 268
170414-14:36:16,503 workflow INFO:
	 Finished submitting: Preproc.a019 ID: 268
170414-14:36:16,505 workflow INFO:
	 Submitting: Preproc.a108 ID: 270
170414-14:36:16,509 workflow INFO:
	 [Job finished] jobname: Preproc.a108 jobid: 270
170414-14:36:16,513 workflow INFO:
	 Finished submitting: Preproc.a108 ID: 270
170414-14:36:16,515 workflow INFO:
	 Submitting: Preproc.a020 ID: 275
170414-14:36:16,520 workflow INFO:
	 [Job finished] jobname: Preproc.a020 jobid: 275
170414-14:36:16,524 workflow INFO:
	 Finished submitting: Preproc.a020 ID: 275
170414-14:36:16,526 workflow INFO:
	 Submitting: Preproc.a107 ID: 276
170414-14:36:16,530 workflow INFO:
	 [Job finished] jobname: Preproc.a107 jobid: 276
170414-14:36:16,534 workflow INFO:
	 Finished submitting: Preproc.a107 ID: 276
170414-14:36:16,536 workflow INFO:
	 Submitting: Preproc.a021 ID: 282
170414-14:36:16,550 workflow INFO:
	 [Job finished] jobname: Pre

170414-14:38:30,70 workflow INFO:
	 Finished submitting: RF.a077 ID: 31
170414-14:38:30,72 workflow INFO:
	 Submitting: RF.a054 ID: 39
170414-14:38:43,219 workflow INFO:
	 Finished submitting: RF.a054 ID: 39
170414-14:38:43,221 workflow INFO:
	 Submitting: RF.a083 ID: 40
170414-14:38:55,811 workflow INFO:
	 Finished submitting: RF.a083 ID: 40
170414-14:38:55,813 workflow INFO:
	 Submitting: RF.a093 ID: 44


In [12]:
import os
import numpy as np
import pandas as pd
import pickle

def read_pickle(name):
    with open(name, "rb") as data:
        data = pickle.load(data)
    return data


In [16]:
results = '/storage/gablab001/data/genus/pkls/rf_results.pkl'
res = read_pickle(results)
txtb = '/storage/gablab001/data/genus/current/structured/genus/text_files_for_indexing'
bd = '/storage/gablab001/data/genus/current/structured/brain/'
thickness = np.genfromtxt(os.path.join(txtb, '170_columns.txt'), dtype=str)

In [29]:
from collections import Counter

def get_min(res, cols):

    zipped = zip([v for v in res.values()],
                 [k for k in res.keys()],
                 cols)

    ma = 100
    for iteration in zipped:
        a, b, c = iteration
        if a < ma:
            ma = a
            mb = b
            mc = c

    return (ma, mb, mc)

In [30]:
get_min(res, thickness)

(0.56900813678030693, 120, 'rh_S_oc.temp_med_and_Lingual_thickness_D')