In [14]:
from nipype import Function, Node, Workflow, IdentityInterface, JoinNode

wf = Workflow(name='wf_rf')
wf.base_dir = '/om/user/ysa'

def preproc(iteration):
    """Load the GENUS brain table and build the inputs for the classifier.

    The function reads the feature / covariate header lists and the GENUS
    FreeSurfer CSV from fixed locations on disk, keeps only complete,
    de-duplicated subjects, and passes the feature matrix and covariates
    through the ``custom.utils`` helpers.

    Parameters
    ----------
    iteration : int
        Identifier of the workflow iteration.  It is not used in any
        computation here; it is only copied into the returned dict so that
        downstream nodes can label their results.

    Returns
    -------
    dict
        ``'X'``     : output of ``utils.proj(features, covariates)``
        ``'y'``     : 0/1 array, 1 where GROUP == 'Schizophrenia'
        ``'XCOLS'`` : names of the feature columns, in the order used for X
        ``'it'``    : the ``iteration`` argument, unchanged
    """
    # Imports live inside the function because it is wrapped in a nipype
    # Function interface and executed in isolation on the worker.
    import os
    import numpy as np
    import pandas as pd
    from custom import utils

    # directory holding the header text files used for indexing
    txtb = '/storage/gablab001/data/genus/current/structured/genus/text_files_for_indexing'

    # directory holding the GENUS brain data
    bd = '/storage/gablab001/data/genus/current/structured/brain/'

    # header text file for the 170 columns, used to subset the GENUS data
    thickness = np.genfromtxt(os.path.join(txtb, '170_columns.txt'), dtype=str)

    # Single place to choose which headers are used as features.  To run on
    # the equivalent volume columns instead, replace 'thickness_D' with
    # 'volume_D' in these names.
    colheads = thickness

    # covariate headers that will be one hot encoded
    cvar_encode = np.genfromtxt(os.path.join(txtb, 'covars_ecn.txt'), dtype=str)

    # covariate headers that will not be one hot encoded
    cvar = np.genfromtxt(os.path.join(txtb, 'covars_no_ecn.txt'), dtype=str)

    # GENUS brain data
    brain = pd.read_csv(os.path.join(bd, 'GENUS_FS_ATLAS_D.csv'), low_memory=False)

    # Combine everything that is needed into one frame so that rows with
    # missing values and duplicated IIDs are dropped from features,
    # covariates, ID and response all at once and stay row-aligned.
    combined = pd.concat([
        brain[colheads],
        brain[cvar],
        brain[cvar_encode],
        brain[['IID', 'GROUP']]
    ], axis=1).dropna().drop_duplicates('IID')

    # the GENUS y in the classification analysis
    response = combined['GROUP'].values

    # subset to the feature columns only
    X_data = combined[colheads].reset_index(drop=True)

    # covariates that are used as they are
    cvar_ne = combined[cvar].reset_index(drop=True)

    # covariates that get encoded column by column
    cvar_e = combined[cvar_encode].reset_index(drop=True)

    # NOTE(review): utils.encoder is assumed to return a one hot encoding of
    # a single column -- confirm against custom.utils.
    cvar_e = pd.concat([
        pd.DataFrame(utils.encoder(cvar_e[col])) for col in cvar_e.columns
    ], axis=1, ignore_index=True)

    # recombining covariates
    cvars = pd.concat([cvar_ne, cvar_e], axis=1)

    # response variable to be fed into the classifier
    y = np.array([1 if i == 'Schizophrenia' else 0 for i in response])

    # NOTE(review): presumably make_non_singular removes linearly dependent
    # covariate columns and proj projects the covariates out of the
    # features -- confirm against custom.utils.
    non_sing_cvars = utils.make_non_singular(cvars.values)
    XG = utils.proj(X_data.values, non_sing_cvars)

    data_dict = {'X': XG, 'y': y, 'XCOLS': X_data.columns.values, 'it': iteration}

    return data_dict

# Wrap `preproc` so it can run as a workflow node: one input
# ('iteration'), one output ('data_dict').
preproc_interface = Function(
    input_names=['iteration'],
    output_names=['data_dict'],
    function=preproc,
)
Preproc = Node(interface=preproc_interface, name='Preproc')

# `iters` only carries the iteration number; its iterables expand the
# downstream graph once per value.
iters = Node(interface=IdentityInterface(fields=['iteration']), name='iters')
iters.iterables = ('iteration', range(100))

wf.connect(iters, 'iteration', Preproc, 'iteration')

def rf_func(data_dict):
    """Score a random forest with each feature left out in turn.

    For every column ``i`` of ``data_dict['X']`` a scaler + random forest
    pipeline is cross-validated on the matrix with that column removed, and
    the mean score over the splits is stored.

    Parameters
    ----------
    data_dict : dict
        Must provide ``'X'`` (2-D array, samples x features), ``'y'``
        (labels), ``'XCOLS'`` (one name per column of X) and ``'it'``
        (iteration identifier used in the result keys).

    Returns
    -------
    dict
        Maps ``'<it>_<i>_<column name>'`` to the mean cross-validated score
        obtained without column ``i``.
    """
    # Imports live inside the function because it is wrapped in a nipype
    # Function interface and executed in isolation on the worker.
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import cross_val_score
    from sklearn.metrics import make_scorer
    from sklearn.ensemble import RandomForestClassifier

    features = data_dict['X']
    y = data_dict['y']
    colnames = data_dict['XCOLS']
    iteration = data_dict['it']

    # Take the number of features from the data instead of assuming 170.
    n_features = features.shape[1]

    # Loop-invariant objects.  No random_state is set, so every call to
    # cross_val_score still draws fresh splits.
    cv = StratifiedShuffleSplit(n_splits=8)

    # NOTE(review): make_scorer(roc_auc_score) scores the hard 0/1
    # predictions, not probabilities or decision values, so this is not a
    # threshold-free ROC AUC.  scoring='roc_auc' would use predict_proba.
    # Left unchanged so previously saved results stay comparable -- confirm
    # which one is intended.
    sfunc = make_scorer(roc_auc_score)

    res_dict = {}

    for i in range(n_features):

        # fresh, unfitted pipeline for every left-out feature
        clf = Pipeline([
            ('scale', StandardScaler()),
            ('rf', RandomForestClassifier(n_estimators=200))
        ])

        mc = colnames[i]
        # all columns except column i, in their original order
        X = np.delete(features, i, axis=1)
        score = np.mean(cross_val_score(clf, X=X, y=y, cv=cv, scoring=sfunc))
        res_dict[str(iteration) + '_{}_{}'.format(i, mc)] = score

    return res_dict

# Node running `rf_func` on the dict produced by Preproc.
rf_interface = Function(
    input_names=['data_dict'],
    output_names=['res_dict'],
    function=rf_func,
)
RF = Node(interface=rf_interface, name='RF')

wf.connect(Preproc, 'data_dict', RF, 'data_dict')

# Collect the `res_dict` of every expansion created by `iters` into one
# list-valued field.
join_iter = JoinNode(
    interface=IdentityInterface(fields=['res_dict']),
    name='join_iter',
    joinsource='iters',
    joinfield=['res_dict'],
)

wf.connect(RF, 'res_dict', join_iter, 'res_dict')
        
def save_result(res_dict):
    """Merge the joined per-iteration dicts and pickle the merged dict.

    ``res_dict`` is an iterable of dicts; later entries overwrite earlier
    ones on key collisions.  The merged dict is written to a fixed path and
    nothing is returned.
    """
    import pickle

    out_path = "/storage/gablab001/data/genus/pkls/rf_results.pkl"

    merged = {}
    for partial in res_dict:
        merged.update(partial)

    with open(out_path, "wb") as handle:
        pickle.dump(merged, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
# Final node: hands the joined list of result dicts to `save_result`,
# which declares no outputs.
save_interface = Function(
    input_names=['res_dict'],
    output_names=[],
    function=save_result,
)
Save = Node(interface=save_interface, name='Save')

wf.connect(join_iter, 'res_dict', Save, 'res_dict')

In [15]:
# Run the workflow through the SLURM plugin; the sbatch arguments request
# 12G of memory and a 3 hour time limit.
slurm_plugin_args = {'sbatch_args': '--mem=12G -t 03:00:00'}
wf.run(plugin='SLURM', plugin_args=slurm_plugin_args)

170415-17:30:24,678 workflow INFO:
	 ['check', 'execution', 'logging']
170415-17:31:02,577 workflow INFO:
	 Running in parallel.
170415-17:31:02,590 workflow INFO:
	 Pending[0] Submitting[100] jobs Slots[inf]
170415-17:31:02,591 workflow INFO:
	 Submitting: Preproc.a18 ID: 0
170415-17:31:24,178 workflow INFO:
	 [Job finished] jobname: Preproc.a18 jobid: 0
170415-17:31:24,181 workflow INFO:
	 Finished submitting: Preproc.a18 ID: 0
170415-17:31:24,182 workflow INFO:
	 Submitting: Preproc.a75 ID: 1
170415-17:31:24,763 workflow INFO:
	 [Job finished] jobname: Preproc.a75 jobid: 1
170415-17:31:24,766 workflow INFO:
	 Finished submitting: Preproc.a75 ID: 1
170415-17:31:24,767 workflow INFO:
	 Submitting: Preproc.a28 ID: 2
170415-17:31:25,155 workflow INFO:
	 [Job finished] jobname: Preproc.a28 jobid: 2
170415-17:31:25,158 workflow INFO:
	 Finished submitting: Preproc.a28 ID: 2
170415-17:31:25,160 workflow INFO:
	 Submitting: Preproc.a32 ID: 4
170415-17:31:25,412 workflow INFO:
	 [Job finishe

170415-17:31:32,62 workflow INFO:
	 [Job finished] jobname: Preproc.a10 jobid: 59
170415-17:31:32,66 workflow INFO:
	 Finished submitting: Preproc.a10 ID: 59
170415-17:31:32,68 workflow INFO:
	 Submitting: Preproc.a05 ID: 61
170415-17:31:32,246 workflow INFO:
	 [Job finished] jobname: Preproc.a05 jobid: 61
170415-17:31:32,249 workflow INFO:
	 Finished submitting: Preproc.a05 ID: 61
170415-17:31:32,252 workflow INFO:
	 Submitting: Preproc.a39 ID: 63
170415-17:31:32,362 workflow INFO:
	 [Job finished] jobname: Preproc.a39 jobid: 63
170415-17:31:32,365 workflow INFO:
	 Finished submitting: Preproc.a39 ID: 63
170415-17:31:32,367 workflow INFO:
	 Submitting: Preproc.a20 ID: 66
170415-17:31:33,971 workflow INFO:
	 [Job finished] jobname: Preproc.a20 jobid: 66
170415-17:31:33,975 workflow INFO:
	 Finished submitting: Preproc.a20 ID: 66
170415-17:31:33,978 workflow INFO:
	 Submitting: Preproc.a85 ID: 68
170415-17:31:34,192 workflow INFO:
	 [Job finished] jobname: Preproc.a85 jobid: 68
170415-1

170415-17:32:24,4 workflow INFO:
	 [Job finished] jobname: Preproc.a65 jobid: 137
170415-17:32:24,7 workflow INFO:
	 Finished submitting: Preproc.a65 ID: 137
170415-17:32:24,10 workflow INFO:
	 Submitting: Preproc.a47 ID: 139
170415-17:32:24,151 workflow INFO:
	 [Job finished] jobname: Preproc.a47 jobid: 139
170415-17:32:24,154 workflow INFO:
	 Finished submitting: Preproc.a47 ID: 139
170415-17:32:24,156 workflow INFO:
	 Submitting: Preproc.a82 ID: 140
170415-17:32:24,256 workflow INFO:
	 [Job finished] jobname: Preproc.a82 jobid: 140
170415-17:32:24,260 workflow INFO:
	 Finished submitting: Preproc.a82 ID: 140
170415-17:32:24,263 workflow INFO:
	 Submitting: Preproc.a54 ID: 142
170415-17:32:24,349 workflow INFO:
	 [Job finished] jobname: Preproc.a54 jobid: 142
170415-17:32:24,362 workflow INFO:
	 Finished submitting: Preproc.a54 ID: 142
170415-17:32:24,373 workflow INFO:
	 Submitting: Preproc.a38 ID: 145
170415-17:32:24,445 workflow INFO:
	 [Job finished] jobname: Preproc.a38 jobid: 1

170415-17:42:19,530 workflow INFO:
	 Finished submitting: RF.a80 ID: 34
170415-17:42:19,532 workflow INFO:
	 Submitting: RF.a46 ID: 36
170415-17:46:12,789 workflow INFO:
	 Finished submitting: RF.a46 ID: 36
170415-17:46:12,790 workflow INFO:
	 Submitting: RF.a24 ID: 37
170415-17:46:23,71 workflow INFO:
	 Finished submitting: RF.a24 ID: 37
170415-17:46:23,72 workflow INFO:
	 Submitting: RF.a13 ID: 40
170415-17:49:28,492 workflow INFO:
	 Finished submitting: RF.a13 ID: 40
170415-17:49:28,494 workflow INFO:
	 Submitting: RF.a88 ID: 41
170415-17:50:26,318 workflow INFO:
	 Finished submitting: RF.a88 ID: 41
170415-17:50:26,320 workflow INFO:
	 Submitting: RF.a44 ID: 45
170415-17:50:46,433 workflow INFO:
	 Finished submitting: RF.a44 ID: 45
170415-17:50:46,435 workflow INFO:
	 Submitting: RF.a12 ID: 46
170415-17:50:57,608 workflow INFO:
	 Finished submitting: RF.a12 ID: 46
170415-17:50:57,610 workflow INFO:
	 Submitting: RF.a87 ID: 50
170415-17:52:28,534 workflow INFO:
	 Finished submitting:

170415-18:15:45,507 workflow INFO:
	 Submitting: RF.a99 ID: 151
170415-18:15:59,69 workflow INFO:
	 Finished submitting: RF.a99 ID: 151
170415-18:15:59,70 workflow INFO:
	 Submitting: RF.a54 ID: 152
170415-18:16:13,282 workflow INFO:
	 Finished submitting: RF.a54 ID: 152
170415-18:16:13,284 workflow INFO:
	 Submitting: RF.a37 ID: 153
170415-18:16:26,794 workflow INFO:
	 Finished submitting: RF.a37 ID: 153
170415-18:16:26,795 workflow INFO:
	 Submitting: RF.a81 ID: 155
170415-18:16:37,663 workflow INFO:
	 Finished submitting: RF.a81 ID: 155
170415-18:16:37,665 workflow INFO:
	 Submitting: RF.a33 ID: 158
170415-18:16:48,504 workflow INFO:
	 Finished submitting: RF.a33 ID: 158
170415-18:16:48,505 workflow INFO:
	 Submitting: RF.a64 ID: 159
170415-18:17:02,40 workflow INFO:
	 Finished submitting: RF.a64 ID: 159
170415-18:17:02,43 workflow INFO:
	 Submitting: RF.a06 ID: 161
170415-18:17:13,236 workflow INFO:
	 Finished submitting: RF.a06 ID: 161
170415-18:17:13,238 workflow INFO:
	 Submitti

170415-19:39:34,52 workflow INFO:
	 [Job finished] jobname: RF.a93 jobid: 169
170415-19:40:35,408 workflow INFO:
	 [Job finished] jobname: RF.a76 jobid: 122
170415-19:40:36,865 workflow INFO:
	 [Job finished] jobname: RF.a36 jobid: 185
170415-19:41:38,342 workflow INFO:
	 [Job finished] jobname: RF.a38 jobid: 146
170415-19:41:38,644 workflow INFO:
	 [Job finished] jobname: RF.a00 jobid: 148
170415-19:41:39,37 workflow INFO:
	 [Job finished] jobname: RF.a54 jobid: 152
170415-19:41:39,409 workflow INFO:
	 [Job finished] jobname: RF.a37 jobid: 153
170415-19:41:40,83 workflow INFO:
	 [Job finished] jobname: RF.a41 jobid: 174
170415-19:42:41,277 workflow INFO:
	 [Job finished] jobname: RF.a74 jobid: 126
170415-19:42:41,897 workflow INFO:
	 [Job finished] jobname: RF.a65 jobid: 144
170415-19:42:42,213 workflow INFO:
	 [Job finished] jobname: RF.a99 jobid: 151
170415-19:42:42,620 workflow INFO:
	 [Job finished] jobname: RF.a81 jobid: 155
170415-19:42:43,8 workflow INFO:
	 [Job finished] jobna

<networkx.classes.digraph.DiGraph at 0x2aca151a0990>

In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import seaborn as sns
from collections import Counter
import nibabel as nb
% matplotlib inline

def read_pickle(name):
    """Return the object stored in the pickle file at path ``name``.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    with open(name, "rb") as handle:
        return pickle.load(handle)

In [2]:
# Fixed locations: header text files, GENUS brain data, pickled results.
txtb = '/storage/gablab001/data/genus/current/structured/genus/text_files_for_indexing'
bd = '/storage/gablab001/data/genus/current/structured/brain/'
results = '/storage/gablab001/data/genus/pkls/rf_results.pkl'

# Load the pickled results dict and the 170 column names.
res = read_pickle(results)
thickness = np.genfromtxt(
    os.path.join(txtb, '170_columns.txt'),
    dtype=str,
)

In [3]:
def make_dataframe(x):
    """Turn the results dict into a tidy DataFrame, one row per key.

    Parameters
    ----------
    x : dict
        Maps keys of the form ``'<iteration>_<feat_index>_<feature name>'``
        to a score.  The feature name may itself contain underscores.

    Returns
    -------
    pandas.DataFrame
        Columns ``iteration`` (int), ``feat_index`` (int), ``hemi``,
        ``feat`` (the full feature name) and ``auc`` (the score).
        ``hemi`` is ``'rh'`` or ``'lh'`` when that marker occurs in the first
        underscore-separated token of the feature name, otherwise missing.

    Notes
    -----
    The previous condition ``'lh' or 'rh' not in split_key`` was always
    true, so its ``else`` branch never ran and keys without an lh/rh marker
    were silently dropped.  Such keys are now kept with a missing ``hemi``.
    """
    cols = ['iteration', 'feat_index', 'hemi', 'feat', 'auc']
    rows = []
    for key, val in x.items():
        # Split off only the two numeric fields; the remainder is the
        # feature name with its own underscores intact.
        iteration, feat_index, feat = key.split('_', 2)

        # The hemisphere marker sits in the first token of the feature name
        # (e.g. 'lh' in 'lh_bankssts_thickness_D').  'rh' is tested first,
        # as before.
        token = feat.split('_', 1)[0]
        if 'rh' in token:
            hemi = 'rh'
        elif 'lh' in token:
            hemi = 'lh'
        else:
            hemi = None

        rows.append([int(iteration), int(feat_index), hemi, feat, val])
    return pd.DataFrame(rows, columns=cols)

In [4]:
results_df = make_dataframe(res)
groups = results_df.groupby('iteration')

# For each hemisphere: [region names, auc values], one entry per iteration,
# holding the feature with the lowest auc in that iteration.
smallest = {'rh': [[], []], 'lh': [[], []]}

# Walk over the iterations that are actually present (in sorted order)
# instead of assuming exactly range(100).
for _, itera in groups:
    hemis = itera.groupby('hemi')
    for hemi in ['lh', 'rh']:
        data = hemis.get_group(hemi)

        # first row holding the minimum auc for this hemisphere
        worst = data['auc'].idxmin()
        region = data.at[worst, 'feat']

        # Strip only the leading hemisphere marker and the thickness suffix.
        # Blanket .replace('rh', '') / .replace('lh', '') calls also removed
        # those letters inside region names ('entorhinal' -> 'entoinal').
        if region.startswith(hemi):
            region = region[len(hemi):].lstrip('_')
        region = region.replace('_thickness_D', '')

        smallest[hemi][0].append(region)
        smallest[hemi][1].append(data.at[worst, 'auc'])

In [5]:
# How often each left-hemisphere region had the lowest auc across iterations.
lh_region_counts = Counter(smallest['lh'][0])
cdi = dict(lh_region_counts)

In [6]:
import nibabel as nb
from surfer import Brain

ImportError: No module named surfer