In [8]:
from nipype import Function, Node, Workflow, IdentityInterface, JoinNode

# Top-level nipype workflow; every node created below is attached to it
# through wf.connect(...).
wf = Workflow(name='wf_thick')
# Cluster-specific absolute path assigned as the workflow's base_dir.
wf.base_dir = '/om/user/ysa'

In [9]:
def preproc(iteration):
    """Build the per-iteration data bundle consumed by the classifier node.

    Reads the GENUS brain table and the validation thickness / covariate CSVs
    from fixed paths, keeps only complete GENUS rows with a unique ``IID``,
    one-hot encodes the categorical covariates with ``utils.encoder`` and
    passes both feature matrices, together with their covariates, through
    ``utils.proj``.

    All imports are kept inside the function because it is wrapped in a
    nipype ``Function`` interface.

    Parameters
    ----------
    iteration
        Identifier of the current workflow iteration. It is not used in any
        computation here; it is only passed through as ``data_dict['it']``.

    Returns
    -------
    dict
        ``GENUS_X`` / ``GENUS_y``: output of ``utils.proj`` for the GENUS
        features and the binary label (1 where GROUP == 'Schizophrenia').
        ``GENUS_XCOLS`` / ``GENUS_COVCOLS``: feature and covariate column names.
        ``COBREFMRI_X`` / ``COBREFMRI_y``: output of ``utils.proj`` for the
        validation features and the validation ``diag`` column.
        ``COBREFMRI_XCOLS`` / ``COBREFMRI_COVCOLS``: validation column names.
        ``it``: the ``iteration`` argument, unchanged.
    """
    import os
    import numpy as np
    import pandas as pd
    import utils

    # ------------------------------------------------------------------
    # VALIDATION DATA
    # ------------------------------------------------------------------
    valbase = '/storage/gablab001/data/genus/current/structured/validate'

    # 183_cor_subcor_D_thickness.csv contains the 170 features for the
    # validation set
    valdatacob = pd.read_csv(
        os.path.join(valbase, '183_cor_subcor_D_thickness.csv'))

    # the file containing the covariates
    valcov = pd.read_csv(os.path.join(valbase, '183_covariates_to_use.csv'))

    # this is y in the classification analysis
    valres = valcov['diag'].values

    # NOTE(review): the subsetting below is disabled, so `valcov` is handed to
    # utils.proj further down with every column of the CSV, including 'diag'
    # (the validation label itself). If utils.proj removes covariate effects,
    # that would also remove the label signal from the validation features --
    # confirm whether this is intended before trusting validation scores.
    # Gender is already 1 or 0, so it must not be one hot encoded (that would
    # create a redundant 2 column matrix).
    #valcov = valcov[['ICV','age','gender']]

    # ------------------------------------------------------------------
    # GENUS DATA
    # ------------------------------------------------------------------
    # base directory where some text files live, they include header names
    # example: example_header.txt will have:
    # pallidum
    # cerebellum
    txtb = '/storage/gablab001/data/genus/current/structured/genus/text_files_for_indexing'

    # directory holding the GENUS brain table
    bd = '/storage/gablab001/data/genus/current/structured/brain/'

    # np.genfromtxt returns a 0-d array when a header file has a single entry;
    # np.atleast_1d keeps the column selections below working in that case and
    # is a no-op for the usual multi-entry files.

    # the header text file for the 170 columns, used to subset the GENUS data
    thickness = np.atleast_1d(
        np.genfromtxt(os.path.join(txtb, '170_columns.txt'), dtype=str))

    # single place to switch which header set is used as features
    colheads = thickness

    # covariate headers that will be one hot encoded
    cvar_encode = np.atleast_1d(
        np.genfromtxt(os.path.join(txtb, 'covars_ecn.txt'), dtype=str))

    # covariate headers that will not be one hot encoded
    cvar = np.atleast_1d(
        np.genfromtxt(os.path.join(txtb, 'covars_no_ecn.txt'), dtype=str))

    # GENUS brain data
    brain = pd.read_csv(os.path.join(bd, 'GENUS_FS_ATLAS_D.csv'), low_memory=False)

    # Combine everything that is needed so rows can be dropped jointly: brain
    # regions, covariates, ID and response then all refer to the same rows.
    combined = pd.concat([
        brain[colheads],
        brain[cvar],
        brain[cvar_encode],
        brain[['IID', 'GROUP']]
    ], axis=1).dropna().drop_duplicates('IID')

    # the GENUS y in the classification analysis
    response = combined['GROUP'].values

    # subsetting GENUS data to only include the 170 features
    X_data = combined[colheads].reset_index(drop=True)

    # covariates that will not be one hot encoded
    cvar_ne = combined[cvar].reset_index(drop=True)

    # covariates that will be one hot encoded
    cvar_e = combined[cvar_encode].reset_index(drop=True)

    # one hot encoding, column by column; ignore_index=True renumbers the
    # encoded columns 0..k-1
    cvar_e = pd.concat([
        pd.DataFrame(utils.encoder(cvar_e[col])) for col in cvar_e.columns
    ], axis=1, ignore_index=True)

    # recombining covariates
    cvars = pd.concat([cvar_ne, cvar_e], axis=1)

    # response variable to be fed into the classifier
    y = np.array([1 if i == 'Schizophrenia' else 0 for i in response])

    # remove effects of covariates
    # NOTE: this will result in extremely poor results for the validation set
    #       because the same covariates cannot be applied to the validation set
    #       it's only when the same covariates are removed from each set that we
    #       see results similar to what I show
    XG = utils.proj(X_data.values, cvars.values)
    XCO = utils.proj(valdatacob.values, valcov.values)

    # dictionary that will be passed to the classifier node
    data_dict = {'GENUS_X': XG, 'GENUS_y': y,
                 'GENUS_XCOLS': X_data.columns.values,
                 'GENUS_COVCOLS': cvars.columns.values,
                 'COBREFMRI_X': XCO, 'COBREFMRI_y': valres,
                 'COBREFMRI_XCOLS': valdatacob.columns.values,
                 'COBREFMRI_COVCOLS': valcov.columns.values,
                 'it': iteration}

    return data_dict

# Wrap the preproc function in a nipype Node so the workflow can schedule it.
Preproc = Node(
    interface=Function(
        function=preproc,
        input_names=['iteration'],
        output_names=['data_dict'],
    ),
    name='Preproc',
)

In [10]:
# Fan-out node: `iteration` is iterated over range(100), and each value is fed
# to Preproc's `iteration` input.
iters = Node(IdentityInterface(fields=['iteration']), name='iters')
iters.iterables = ('iteration', range(100))
wf.connect(iters, 'iteration', Preproc, 'iteration')

def score_func(data_dict):
    """Fit an L1 logistic regression on GENUS and score it with ROC AUC.

    Runs five stratified shuffle splits over the GENUS data. For every split a
    fresh scaler + ``LogisticRegressionCV`` pipeline is fitted on the training
    part and scored on the training part, the held-out part and the validation
    (``COBREFMRI``) set. Finally one pipeline is fitted on all GENUS data and
    scored on the validation set.

    All imports are kept inside the function because it is wrapped in a
    nipype ``Function`` interface.

    Parameters
    ----------
    data_dict : dict
        Must provide ``GENUS_X``, ``GENUS_y``, ``COBREFMRI_X``,
        ``COBREFMRI_y`` and ``it``. ``it`` is used as a suffix for every key
        of the returned dict.

    Returns
    -------
    dict
        With ``<it>`` the value of ``data_dict['it']``:
        ``coef_<it>``: list with the fitted ``coef_`` of each split.
        ``test_aucs_cv_<it>``, ``train_aucs_cv_<it>``, ``val_aucs_cv_<it>``:
        one single-element list ``[auc]`` per split.
        ``mean_test_auc_genus_<it>``, ``mean_train_auc_genus_<it>``,
        ``mean_val_auc_CO_<it>``: mean of the corresponding AUCs over all
        splits.
        ``CO_score_<it>``: validation AUC of the model fitted on all GENUS
        data.
    """
    import numpy as np
    from sklearn import linear_model
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.metrics import roc_auc_score

    it = data_dict['it']
    genus_X, genus_y = data_dict['GENUS_X'], data_dict['GENUS_y']
    val_X, val_y = data_dict['COBREFMRI_X'], data_dict['COBREFMRI_y']

    cv_in = StratifiedShuffleSplit(n_splits=5)
    cv_out = StratifiedShuffleSplit(n_splits=5)

    def make_clf():
        # Fresh, unfitted pipeline: standardise, then L1 logistic regression
        # with its regularisation strength chosen on the inner splits.
        # NOTE(review): no `scoring` is passed, so LogisticRegressionCV uses
        # its default selection metric; the removed, never-used
        # make_scorer(roc_auc_score) suggests AUC may have been intended.
        return Pipeline([
            ('scale', StandardScaler()),
            ('lg', linear_model.LogisticRegressionCV(
                penalty='l1',
                solver='liblinear',
                cv=cv_in))
        ])

    def auc(model, X, y):
        # NOTE(review): the AUC is computed from hard class predictions, as in
        # the original code; decision_function / predict_proba scores would be
        # the usual input for roc_auc_score -- confirm which one is wanted.
        return roc_auc_score(y, model.predict(X))

    # The accumulators live OUTSIDE the loop. Previously they were re-created
    # in every split, so the reported "mean" was just the last split's score.
    coefs, test_aucs, train_aucs, val_aucs = [], [], [], []

    for train, test in cv_out.split(genus_X, genus_y):
        clf = make_clf()
        clf.fit(genus_X[train], genus_y[train])

        val_aucs.append(auc(clf, val_X, val_y))
        train_aucs.append(auc(clf, genus_X[train], genus_y[train]))
        test_aucs.append(auc(clf, genus_X[test], genus_y[test]))
        coefs.append(clf.named_steps['lg'].coef_)

    # Per-split entries stay single-element lists so the layout of the saved
    # results is the same as before.
    result = {'coef_{}'.format(it): coefs,
              'test_aucs_cv_{}'.format(it): [[a] for a in test_aucs],
              'train_aucs_cv_{}'.format(it): [[a] for a in train_aucs],
              'val_aucs_cv_{}'.format(it): [[a] for a in val_aucs]}

    result['mean_test_auc_genus_{}'.format(it)] = np.mean(test_aucs)
    result['mean_train_auc_genus_{}'.format(it)] = np.mean(train_aucs)
    result['mean_val_auc_CO_{}'.format(it)] = np.mean(val_aucs)

    # Final model: fitted on all GENUS data, scored on the validation set.
    clf = make_clf()
    clf.fit(genus_X, genus_y)
    result['CO_score_{}'.format(it)] = auc(clf, val_X, val_y)

    return result

# Scoring node: receives the data_dict built by Preproc.
Score_func = Node(
    interface=Function(
        function=score_func,
        input_names=['data_dict'],
        output_names=['result'],
    ),
    name='Score_func',
)

wf.connect(Preproc, 'data_dict', Score_func, 'data_dict')

# Join node: gathers the `result` of every branch created by `iters`.
join_iter = JoinNode(
    IdentityInterface(fields=['result']),
    name='join_iter',
    joinsource='iters',
    joinfield=['result'],
)

wf.connect(Score_func, 'result', join_iter, 'result')

In [None]:
def save_result(results_list):
    """Merge the per-iteration result dicts and pickle them to a fixed path.

    ``results_list`` is iterated in order and every item is merged into one
    dict (later items win on duplicate keys). The merged dict is written with
    the highest pickle protocol. Nothing is returned.
    """
    import pickle

    merged = {}
    for partial in results_list:
        merged.update(partial)

    out_path = "/storage/gablab001/data/genus/res_thick_iters.pkl"
    with open(out_path, "wb") as sink:
        pickle.dump(merged, sink, protocol=pickle.HIGHEST_PROTOCOL)

# Final node: takes the joined list of results and writes it to disk.
Save_result = Node(
    interface=Function(
        function=save_result,
        input_names=['results_list'],
        output_names=[],
    ),
    name='Save_result',
)

wf.connect(join_iter, 'result', Save_result, 'results_list')
#wf.write_graph(graph2use='exec')
# Run the workflow through the SLURM plugin with the sbatch options below.
wf.run(plugin_args={'sbatch_args':'--mem=12G -t 03:00:00'}, plugin='SLURM')

170412-18:22:46,564 workflow INFO:
	 ['check', 'execution', 'logging']
170412-18:22:46,895 workflow INFO:
	 Running in parallel.
170412-18:22:46,907 workflow INFO:
	 Pending[0] Submitting[100] jobs Slots[inf]
170412-18:22:46,909 workflow INFO:
	 Submitting: Preproc.a29 ID: 0
170412-18:22:47,6 workflow INFO:
	 Finished submitting: Preproc.a29 ID: 0
170412-18:22:47,7 workflow INFO:
	 Submitting: Preproc.a43 ID: 1
170412-18:22:47,95 workflow INFO:
	 Finished submitting: Preproc.a43 ID: 1
170412-18:22:47,96 workflow INFO:
	 Submitting: Preproc.a98 ID: 2
170412-18:22:47,172 workflow INFO:
	 Finished submitting: Preproc.a98 ID: 2
170412-18:22:47,173 workflow INFO:
	 Submitting: Preproc.a25 ID: 3
170412-18:22:47,239 workflow INFO:
	 Finished submitting: Preproc.a25 ID: 3
170412-18:22:47,241 workflow INFO:
	 Submitting: Preproc.a57 ID: 4
170412-18:22:47,340 workflow INFO:
	 Finished submitting: Preproc.a57 ID: 4
170412-18:22:47,341 workflow INFO:
	 Submitting: Preproc.a02 ID: 5
170412-18:22:47

170412-18:22:53,660 workflow INFO:
	 Finished submitting: Preproc.a47 ID: 87
170412-18:22:53,663 workflow INFO:
	 Submitting: Preproc.a91 ID: 89
170412-18:22:53,761 workflow INFO:
	 Finished submitting: Preproc.a91 ID: 89
170412-18:22:53,763 workflow INFO:
	 Submitting: Preproc.a68 ID: 92
170412-18:22:53,839 workflow INFO:
	 Finished submitting: Preproc.a68 ID: 92
170412-18:22:53,840 workflow INFO:
	 Submitting: Preproc.a15 ID: 94
170412-18:22:53,927 workflow INFO:
	 Finished submitting: Preproc.a15 ID: 94
170412-18:22:53,929 workflow INFO:
	 Submitting: Preproc.a90 ID: 97
170412-18:22:54,27 workflow INFO:
	 Finished submitting: Preproc.a90 ID: 97
170412-18:22:54,28 workflow INFO:
	 Submitting: Preproc.a67 ID: 98
170412-18:22:54,122 workflow INFO:
	 Finished submitting: Preproc.a67 ID: 98
170412-18:22:54,123 workflow INFO:
	 Submitting: Preproc.a14 ID: 100
170412-18:22:54,199 workflow INFO:
	 Finished submitting: Preproc.a14 ID: 100
170412-18:22:54,200 workflow INFO:
	 Submitting: Prep

170412-18:24:03,450 workflow INFO:
	 [Job finished] jobname: Preproc.a69 jobid: 29
170412-18:24:03,650 workflow INFO:
	 [Job finished] jobname: Preproc.a80 jobid: 30
170412-18:24:03,876 workflow INFO:
	 [Job finished] jobname: Preproc.a21 jobid: 32
170412-18:24:04,104 workflow INFO:
	 [Job finished] jobname: Preproc.a52 jobid: 35
170412-18:24:04,309 workflow INFO:
	 [Job finished] jobname: Preproc.a97 jobid: 37
170412-18:24:04,507 workflow INFO:
	 [Job finished] jobname: Preproc.a95 jobid: 38
170412-18:24:04,733 workflow INFO:
	 [Job finished] jobname: Preproc.a04 jobid: 39
170412-18:24:04,941 workflow INFO:
	 [Job finished] jobname: Preproc.a20 jobid: 41
170412-18:24:05,176 workflow INFO:
	 [Job finished] jobname: Preproc.a71 jobid: 43
170412-18:24:05,398 workflow INFO:
	 [Job finished] jobname: Preproc.a70 jobid: 44
170412-18:24:05,629 workflow INFO:
	 [Job finished] jobname: Preproc.a96 jobid: 46
170412-18:24:05,848 workflow INFO:
	 [Job finished] jobname: Preproc.a51 jobid: 47
1704

170412-18:26:05,510 workflow INFO:
	 Finished submitting: Score_func.a55 ID: 34
170412-18:26:05,511 workflow INFO:
	 Submitting: Score_func.a52 ID: 36
170412-18:26:14,980 workflow INFO:
	 Finished submitting: Score_func.a52 ID: 36
170412-18:26:14,981 workflow INFO:
	 Submitting: Score_func.a04 ID: 40
170412-18:26:24,392 workflow INFO:
	 Finished submitting: Score_func.a04 ID: 40
170412-18:26:24,393 workflow INFO:
	 Submitting: Score_func.a20 ID: 42
170412-18:26:33,801 workflow INFO:
	 Finished submitting: Score_func.a20 ID: 42
170412-18:26:33,802 workflow INFO:
	 Submitting: Score_func.a70 ID: 45
170412-18:26:43,164 workflow INFO:
	 Finished submitting: Score_func.a70 ID: 45
170412-18:26:43,165 workflow INFO:
	 Submitting: Score_func.a51 ID: 48
170412-18:26:52,567 workflow INFO:
	 Finished submitting: Score_func.a51 ID: 48
170412-18:26:52,568 workflow INFO:
	 Submitting: Score_func.a42 ID: 50
170412-18:27:01,957 workflow INFO:
	 Finished submitting: Score_func.a42 ID: 50
170412-18:27:0

In [1]:
from collections import Counter
import pickle

def save_pickle(x, save_name):
    """Pickle ``x`` into the file ``save_name`` using the highest protocol.

    The file is opened in binary write mode, so an existing file is
    overwritten. Returns ``None``.
    """
    with open(save_name, "wb") as sink:
        pickle.dump(x, sink, protocol=pickle.HIGHEST_PROTOCOL)

def read_pickle(name):
    """Return the object stored in the pickle file ``name``.

    Only use this on files you trust: unpickling can execute arbitrary code.
    """
    source = open(name, "rb")
    try:
        return pickle.load(source)
    finally:
        # always release the file handle, also when unpickling fails
        source.close()

def count_regions(C, cols):
    """List the column names that have a non-zero coefficient in each entry of C.

    Parameters
    ----------
    C : sequence
        One entry per fitted model. Only the first row of each entry
        (``entry[0]``) is inspected.
    cols : array-like of str
        Column names, positionally aligned with the coefficients in
        ``entry[0]``. A plain list is accepted as well as a NumPy array.

    Returns
    -------
    list
        Names of the columns with non-zero coefficients, concatenated over all
        entries of ``C`` in order. A name appears once for every entry in
        which it is non-zero, so the list can be fed to ``Counter``.
    """
    # Local import: numpy is not imported at notebook level in this cell, so
    # the function previously raised NameError on a fresh kernel.
    import numpy as np

    # Fancy indexing below needs an array, not a list.
    cols = np.asarray(cols)

    col_vals = []
    for entry in C:
        non_zero_idx = np.nonzero(entry[0])[0]
        col_vals.extend(cols[non_zero_idx].tolist())
    return col_vals

#data = read_pickle("/storage/gablab001/data/genus/res_thick_iters.pkl")

In [None]:
# Prerequisites (not defined in this cell):
#   data -- presumably the dict pickled by save_result, loaded with
#           read_pickle (that call is commented out above); it must contain
#           the keys 'coef_0' ... 'coef_99'.
#   cols -- the feature column names, positionally aligned with the
#           coefficients stored under the 'coef_*' keys.
outer_column_list = []

# Collect, over all 100 iterations, the names of the columns that received a
# non-zero coefficient. A second, redundant computation of this list used to
# follow the loop; it called count_regions without `cols` and raised
# TypeError, so it has been removed.
for itera in range(100):
    vtemp = count_regions(data['coef_{}'.format(itera)], cols)
    outer_column_list.extend(vtemp)

# How often each column was selected.
cdict = dict(Counter(outer_column_list))

# Selection counts split by hemisphere, with the hemisphere prefix and the
# '_thickness_D' suffix stripped and '.' replaced by '-' in the region name.
rh = {k.replace('rh_', '').replace('.','-').replace('_thickness_D', ''):v
      for k, v in cdict.items() if 'rh_' in k}

lh = {k.replace('lh_', '').replace('.','-').replace('_thickness_D', ''):v
      for k, v in cdict.items() if 'lh_' in k}