# In [None]:
import pickle

from nipype import Function, Node, Workflow, IdentityInterface, JoinNode

# Workflow object that the nodes defined below are attached to; its base
# directory is set at construction time.
rf_wf = Workflow(name='wf_rf', base_dir='/om/user/ysa')

def preproc(iteration):
    """Load the GENUS brain table and return covariate-adjusted features.

    Reads three column-name lists and ``GENUS_FS_ATLAS_D.csv`` from fixed
    paths, keeps only rows that are complete across all selected columns
    and unique by ``IID``, regresses the covariates out of the feature
    columns, and encodes ``GROUP`` as a binary label.

    Imports are function-local on purpose: this function is wrapped in a
    nipype ``Function`` interface (the ``Preproc`` node), which executes
    the function source outside this module's namespace.

    Parameters
    ----------
    iteration
        Opaque value that is passed through unchanged under the ``'it'``
        key of the result; it does not influence the loaded data.

    Returns
    -------
    dict
        ``'X'``     : ndarray, feature matrix with covariates projected out.
        ``'y'``     : ndarray of ints, 1 where ``GROUP == 'Control'`` else 0.
        ``'XCOLS'`` : ndarray of the feature column names.
        ``'it'``    : the ``iteration`` argument.

    Raises
    ------
    numpy.linalg.LinAlgError
        If the covariate Gram matrix ``C'C`` is singular.
    """
    import os
    import numpy as np
    import pandas as pd

    txtb = '/storage/gablab001/data/genus/current/structured/genus/text_files_for_indexing'
    bd = '/storage/gablab001/data/genus/current/structured/brain/'

    # Feature columns to analyse (the names listed in 170_columns.txt).
    # To analyse the matching volume columns instead, replace 'thickness_D'
    # with 'volume_D' in these names.
    colheads = np.genfromtxt(os.path.join(txtb, '170_columns.txt'), dtype=str)
    # NOTE(review): presumably the encoded / non-encoded covariate name
    # lists -- confirm against the text files.
    cvar_encode = np.genfromtxt(os.path.join(txtb, 'covars_ecn.txt'), dtype=str)
    cvar = np.genfromtxt(os.path.join(txtb, 'covars_no_ecn.txt'), dtype=str)
    brain = pd.read_csv(os.path.join(bd, 'GENUS_FS_ATLAS_D.csv'), low_memory=False)

    # Complete cases only, one row per subject ID.
    combined = pd.concat([
        brain[colheads],
        brain[cvar],
        brain[cvar_encode],
        brain[['IID', 'GROUP']],
    ], axis=1).dropna().drop_duplicates('IID')

    X_data = combined[colheads].reset_index(drop=True)
    cvar_ne = combined[cvar].reset_index(drop=True)
    cvar_e = combined[cvar_encode].reset_index(drop=True)
    # Only SEX is taken from the cvar_encode columns as a regressor; the
    # remaining ones only take part in the complete-case filter above.
    cvars = pd.concat([cvar_ne, cvar_e['SEX']], axis=1)

    y = (combined['GROUP'] == 'Control').astype(int).values

    def proj(X, C):
        # Residual of the least-squares fit of X on C: X - C (C'C)^-1 C'X.
        # Solving the normal equations gives the same result as applying
        # I - C (C'C)^-1 C' without materialising an
        # n_samples x n_samples matrix or an explicit inverse.
        beta = np.linalg.solve(C.T.dot(C), C.T.dot(X))
        return X - C.dot(beta)

    XG = proj(X_data.values, cvars.values)

    data_dict = {'X': XG, 'y': y, 'XCOLS': X_data.columns.values, 'it': iteration}

    return data_dict

# Function node around ``preproc``: one input ('iteration'), one output
# ('data_dict').
Preproc = Node(
    interface=Function(
        function=preproc,
        input_names=['iteration'],
        output_names=['data_dict'],
    ),
    name='Preproc',
)

# Identity node whose 'iteration' field is iterated over 0..169; each value
# is forwarded to Preproc's 'iteration' input.
iters = Node(
    interface=IdentityInterface(fields=['iteration']),
    name='iters',
    iterables=('iteration', range(170)),
)
rf_wf.connect(iters, 'iteration', Preproc, 'iteration')

def rf_func(data_dict):
    """Cross-validate a random forest with one feature column left out.

    Imports are function-local on purpose: this function is wrapped in a
    nipype ``Function`` interface (the ``RF`` node), which executes the
    function source outside this module's namespace.

    Parameters
    ----------
    data_dict : dict
        Must provide ``'X'`` (2-D feature array), ``'y'`` (binary labels)
        and ``'it'`` (index of the column of ``X`` to leave out), i.e. the
        keys produced by ``preproc``.

    Returns
    -------
    dict
        ``{data_dict['it']: score}`` where ``score`` is the mean ROC AUC
        over 8 stratified shuffle splits. No random seed is set, so the
        value varies between calls.
    """
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import StratifiedShuffleSplit
    from sklearn.model_selection import cross_val_score
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    clf = Pipeline([
        ('scale', StandardScaler()),
        ('rf', RandomForestClassifier(n_estimators=200))
    ])

    left_out = data_dict['it']
    X_full = data_dict['X']
    # Take the column count from the data rather than assuming 170 features.
    keep = np.setdiff1d(np.arange(X_full.shape[1]), left_out)
    X = X_full[:, keep]
    y = data_dict['y']
    cv = StratifiedShuffleSplit(n_splits=8)
    # Use the built-in 'roc_auc' scorer: it ranks on the classifier's
    # continuous scores. make_scorer(roc_auc_score) would feed hard
    # predict() labels to the metric, which collapses the ROC curve to a
    # single operating point.
    score = np.mean(cross_val_score(clf, X=X, y=y, cv=cv, scoring='roc_auc'))
    return {left_out: score}

# Function node around ``rf_func``: one input ('data_dict'), one output
# ('res_dict').
# NOTE(review): no connection to or from this node is visible in this part
# of the file -- confirm whether it is meant to be wired to Preproc.
RF = Node(
    interface=Function(
        function=rf_func,
        input_names=['data_dict'],
        output_names=['res_dict'],
    ),
    name='RF',
)


# Serial leave-one-feature-out sweep (calls the plain functions directly
# rather than going through the nipype nodes): score the model once per
# dropped feature column, then pickle the collected scores.

# preproc() only echoes its argument back under 'it'; the loaded data is the
# same for every index, so read and preprocess it once instead of once per
# feature.
base_dict = preproc(0)
n_features = base_dict['X'].shape[1]

save_out = {}
for i in range(n_features):
    data_dict = dict(base_dict, it=i)
    # The original called `rfnode`, a name that is never defined; rf_func is
    # the function that consumes this dict and returns {index: score}.
    result = rf_func(data_dict)
    save_out.update(result)
save_out['cols'] = base_dict['XCOLS']

# NOTE(review): the original formatted the file name with `ii`, which is not
# defined anywhere before this point; presumably a run/repetition index.
# Defaulting to 0 so the script can run -- confirm the intended value.
ii = 0

with open("/storage/gablab001/data/genus/pkls/rftest_{}.pkl".format(ii), "wb") as f:
    pickle.dump(save_out, f, protocol=pickle.HIGHEST_PROTOCOL)
