In [1]:
import os
import numpy as np
import pandas as pd
from custom import utils
from collections import Counter
import inspect
import scipy.io

In [2]:
def lookup(x):
    """Print the source code of ``x`` as reported by ``inspect.getsource``.

    Nothing is returned; the function exists purely for its printed output.
    """
    source_text = inspect.getsource(x)
    print(source_text)
!ls

170_columns.txt			  covars_ecn.txt
2k_columns.txt			  covars_no_ecn.txt
bayes_reg.m			  cv_idxs.mat
bf_1				  data_prep.ipynb
bf_2				  FXVB_py.ipynb
bf_2_respaths			  genomic_N1547_P100006_matched.hdf5py
bf_3				  id_variables.txt
brain_gene.mat			  KCCA.ipynb
brain_N1547_P5927_matched.hdf5py  pyscript.m


# Data that will be used for the Bayes script; it still needs to be subset

In [3]:
# Read the two HDF tables from the working directory and report their shapes.
brain_data, genomic_data = (
    pd.read_hdf(path)
    for path in ("brain_N1547_P5927_matched.hdf5py",
                 "genomic_N1547_P100006_matched.hdf5py")
)
print("Brain data shape: {}".format(brain_data.shape))
print("Genomic data shape: {}".format(genomic_data.shape))

Brain data shape: (1547, 5927)
Genomic data shape: (1547, 100006)


# text files for subsetting some of the data, column wise

In [4]:
# Brain column names; used further down to select brain[b170].
b170 = np.genfromtxt("170_columns.txt", dtype=str)
# Covariate names; entries after the first are iterated when building
# covariate encodings further down.
cvars_descrete = np.genfromtxt("covars_ecn.txt", dtype=str)
# NOTE(review): read without dtype=str, so genfromtxt parses entries as floats
# and any non-numeric text becomes NaN -- confirm this file is numeric.
cvars_continuous = np.genfromtxt("covars_no_ecn.txt")

# paranoia check for matching IDs

In [5]:
# The two tables are masked row-by-row with the same boolean vector below, so
# their subject IDs must match element for element. np.array_equal also
# handles a length mismatch cleanly instead of failing inside the comparison.
assert np.array_equal(brain_data.IID.values, genomic_data.IID.values), \
    "brain_data and genomic_data IID columns do not match"

# further preprocessing

In [6]:
# Keep only the 'Control' and 'Schizophrenia' rows; the same boolean row mask
# is applied to both tables.
group = brain_data.GROUP.values
cnt_scz = np.logical_or(group == 'Control', group == 'Schizophrenia')
# `.ix` is deprecated (and removed in pandas >= 1.0); `.loc` takes the boolean
# mask directly and selects the same rows.
brain = brain_data.loc[cnt_scz, :]
genomic = genomic_data.loc[cnt_scz, :]
# NOTE(review): subtracting 1 presumably maps a 1/2 coding of DIAG to 0/1 -- confirm.
y_g = genomic.DIAG.values - 1
# 0 for 'Control', 1 for everything else (only 'Schizophrenia' remains after the mask).
y_b = (brain.GROUP.values != 'Control').astype(int)

# remove covariates from brain data

In [7]:
sex = genomic['SEX'].values
# Work on a float copy so the imputation below cannot write through to `brain`.
age = brain['AGE_MRI'].values.astype(float)
# There is one missing value for age; replace missing entries with the mean of
# the observed ages. Missingness is detected with isnan rather than
# "== 0 after nan_to_num", which would also overwrite a genuine age of 0.
age_missing = np.isnan(age)
age[age_missing] = age[~age_missing].mean()
ICV = brain['EstimatedTotalIntraCranialVol'].values
cvars_cont = np.hstack((sex[:, None], age[:, None], ICV[:, None]))
# NOTE(review): this cell previously built cvars_desc from every column in
# cvars_descrete[1:] and then immediately overwrote it with the STUDY-only
# encoding below, so only STUDY ever reached the projection. The dead
# assignment is dropped; reinstate it if those covariates were intended.
cvars_desc = utils.make_non_singular(utils.encoder(brain.STUDY.values))
cvars_joined = utils.make_non_singular(np.hstack((cvars_cont, cvars_desc)))
brain_170 = brain[b170]
# Pass the brain and genomic matrices through utils.proj with the joined covariates.
X = utils.proj(brain_170.values, cvars_joined)
y = y_b.copy()
# Positional column slice (skip the first column and the last five); `.iloc`
# replaces the deprecated `.ix` and matches genomic.columns[1:-5] below.
G = utils.proj(genomic.iloc[:, 1:-5].values, cvars_joined)
snp_ids = genomic.columns[1:-5].values

# this is the file that will be used in the bayes factor step

In [8]:
# Bundle the projected matrices, labels and column names into one MATLAB file.
scipy.io.savemat(
    "brain_gene.mat",
    mdict=dict(
        G=G,                # projected genomic matrix
        I=X,                # projected brain matrix
        Z=np.array([]),     # stored as an empty array
        colnames=snp_ids,   # genomic column names matching G
        y=y,                # binary labels
    ),
)

# starting the bayes factor workflow

In [18]:
import os
import numpy as np
from nipype import Function, Node, Workflow, IdentityInterface

In [19]:
# Workflow that will hold the cross-validation and Bayes-factor nodes.
wf = Workflow(name='brain_bcv')
# Directory assigned as the workflow's base_dir (absolute, cluster-specific path).
wf.base_dir = "/om/user/ysa"

In [20]:
# Identity node that only carries the two iteration indices.
Iternode = Node(
    IdentityInterface(fields=['col_idx', 'cv_idx']),
    name='Iternode',
)
# 1-based indices: 170 values for col_idx and 10 values for cv_idx.
Iternode.iterables = [
    ('col_idx', np.arange(1, 171)),
    ('cv_idx', np.arange(1, 11)),
]

# creates the cross validation

In [21]:
def cv_maker(data_path, save_path):
    """Write 10-fold stratified cross-validation indices to a MATLAB file.

    Parameters
    ----------
    data_path : str
        .mat file containing 'I' (passed as X to the splitter) and 'y' (labels).
    save_path : str
        Destination .mat file. It receives two entries, 'train' and 'test',
        each a dict keyed 'train_<k>' / 'test_<k>' for k = 1..10.

    Returns
    -------
    str
        ``save_path``, unchanged.

    Imports are local so the function is self-contained when wrapped in a
    nipype ``Function`` interface.
    """
    import scipy.io
    from sklearn.model_selection import StratifiedKFold
    # Load the file once instead of once per variable.
    data = scipy.io.loadmat(data_path)
    X = data['I']
    # Flatten the stored label array to 1-D.
    y = data['y'].ravel()
    # random_state has no effect without shuffle=True, and newer scikit-learn
    # raises ValueError for that combination. Dropping it keeps the same
    # deterministic, unshuffled folds.
    cv = StratifiedKFold(n_splits=10)
    train_idx, test_idx = {}, {}
    for idx, (train, test) in enumerate(cv.split(X, y)):
        # Indices are shifted to 1-based (presumably for the MATLAB consumer -- confirm).
        train_idx['train_{}'.format(idx + 1)] = train + 1
        test_idx['test_{}'.format(idx + 1)] = test + 1
    scipy.io.savemat(save_path, mdict={"train": train_idx, "test": test_idx})
    return save_path
 
# Wrap cv_maker in a nipype node; its single output is the path it wrote.
CV_maker = Node(
    interface=Function(
        input_names=['data_path', 'save_path'],
        output_names=['save_path'],
        function=cv_maker,
    ),
    name='CV_maker',
)

# Fixed inputs: the data file to split and where to store the fold indices.
CV_maker.inputs.data_path = "/storage/gablab001/data/genus/brain_genomic_bayes/brain_gene.mat"
CV_maker.inputs.save_path = "/storage/gablab001/data/genus/brain_genomic_bayes/cv_idxs.mat"

# bayes factor step (1)

In [22]:
def run_bayes(in_file, cv_file, cv_idx, col_idx, out_file):
    """Run the MATLAB Bayes-factor script for one (column, CV fold) pair.

    Parameters
    ----------
    in_file : str
        Path to the input .mat file; its base name (minus the last four
        characters) prefixes the result file name.
    cv_file : str
        Path to the .mat file holding the cross-validation indices.
    cv_idx : int
        1-based fold number, passed to the script and used in the file name.
    col_idx : int
        1-based index into 170_columns.txt, passed to the script and used in
        the file name.
    out_file : str
        Directory in which the result .mat file is placed.

    Returns
    -------
    str
        Path of the result file handed to the MATLAB script.

    Imports are local so the function is self-contained when wrapped in a
    nipype ``Function`` interface.
    """
    import os
    import numpy as np
    import nipype.interfaces.matlab as Matlab

    col_names = np.genfromtxt("/storage/gablab001/data/genus/brain_genomic_bayes/170_columns.txt", dtype=str)
    col_save_name = col_names[col_idx - 1] + "_{}_{}_BF".format(cv_idx, col_idx)
    # bayes_reg.m is used as a str.format template; newlines are stripped so
    # the whole script becomes a single line before formatting.
    with open("/storage/gablab001/data/genus/brain_genomic_bayes/bayes_reg.m", "r") as src:
        script = src.read().replace("\n", "")
    # Use only the base name of in_file. Joining out_file with the full
    # (absolute) in_file path made os.path.join discard out_file entirely, so
    # results always landed next to the input regardless of out_file.
    in_stem = os.path.basename(in_file[:-4])
    mat_file = os.path.join(out_file, '{}.mat'.format(in_stem + col_save_name))
    matlab = Matlab.MatlabCommand()
    matlab.inputs.paths = [
    '/storage/gablab001/data/genus/current/variational_bayes_wrap/varbvs/varbvs-MATLAB',
    '/storage/gablab001/data/genus/current/variational_bayes_wrap/varbvs',
    '/storage/gablab001/data/genus/current/variational_bayes_wrap/varbvs/varbvs-R']
    matlab.inputs.script = script.format(in_file, cv_file, cv_idx, col_idx, mat_file)
    matlab.run()
    return mat_file

# Wrap run_bayes in a nipype node; cv_file, cv_idx and col_idx are supplied
# through workflow connections, the two paths below are fixed.
Run_bayes = Node(
    interface=Function(
        input_names=['in_file', 'cv_file', 'cv_idx', 'col_idx', 'out_file'],
        output_names=['mat_file'],
        function=run_bayes,
    ),
    name='Run_bayes',
)

Run_bayes.inputs.in_file = "/storage/gablab001/data/genus/brain_genomic_bayes/brain_gene.mat"
Run_bayes.inputs.out_file = "/storage/gablab001/data/genus/brain_genomic_bayes/"

# run step 1

In [1]:
# Wire the fold-index file and the two iteration indices into Run_bayes,
# using the list form of Workflow.connect: (source, destination, [(out, in), ...]).
wf.connect([
    (CV_maker, Run_bayes, [('save_path', 'cv_file')]),
    (Iternode, Run_bayes, [('cv_idx', 'cv_idx'),
                           ('col_idx', 'col_idx')]),
])
#wf.run(plugin='SLURM', plugin_args={'sbatch_args':'--mem=12G -t 3-23:00:00', 'max_jobs': 200})

NameError: name 'wf' is not defined

In [24]:
# Brain column names, read as strings from 170_columns.txt.
col_names = np.genfromtxt("/storage/gablab001/data/genus/brain_genomic_bayes/170_columns.txt", dtype=str)
# Input .mat file; passed to write_csv below, where its base name prefixes the result file names.
inf = "/storage/gablab001/data/genus/brain_genomic_bayes/brain_gene.mat"
# Path of the bf_3 directory.
outf = "/storage/gablab001/data/genus/brain_genomic_bayes/bf_3/"

# Add-on functions for step 2 (fixed-form VB)

In [25]:
def make_name(BF_input, col_idx, cv_idx, col_names):
    """Build the result file name for one (column, CV fold) pair.

    The name is the base name of ``BF_input`` with its last four characters
    removed, followed by ``col_names[col_idx - 1]`` (``col_idx`` is 1-based)
    and the suffix ``_<cv_idx>_<col_idx>_BF.mat``.
    """
    stem = BF_input[:-4].rsplit("/", 1)[-1]
    column = col_names[col_idx - 1]
    suffix = "_{}_{}_BF.mat".format(cv_idx, col_idx)
    return stem + column + suffix

def csv(colnum, outfile):
    """Build a table mapping column numbers to result .mat file paths.

    Parameters
    ----------
    colnum : int
        Exclusive upper bound: rows are produced for 1 .. colnum - 1.
    outfile : str
        Prefix to which '<i>.mat' is appended for each row.

    Returns
    -------
    pandas.DataFrame
        Columns 'colNum' (integer) and 'matFn' (path string), indexed by the
        column number.
    """
    import pandas as pd
    # NOTE(review): range(1, colnum) stops at colnum - 1; kept as-is, but
    # confirm whether colnum itself was meant to be included.
    col_numbers = list(range(1, colnum))
    mat_paths = [outfile + '{}.mat'.format(i) for i in col_numbers]
    # Build the frame in one go instead of growing it row by row with df.loc,
    # which is quadratic and leaves the column dtypes to chance.
    df = pd.DataFrame({'colNum': col_numbers, 'matFn': mat_paths},
                      index=col_numbers, columns=['colNum', 'matFn'])
    df['colNum'] = df['colNum'].astype(int)
    return df

def write_csv(cv_idx, col_names, in_file, save_path,
              respaths_dir="/storage/gablab001/data/genus/brain_genomic_bayes/bf_3_respaths"):
    """Write the list of step-1 result files for one CV fold to a CSV.

    Parameters
    ----------
    cv_idx : int
        Fold number; used in every result file name and in the CSV name.
    col_names : sequence of str
        Column names; one CSV row is written per entry.
    in_file : str
        Input .mat path handed to ``make_name`` to derive the file-name prefix.
    save_path : str
        Directory that is joined in front of each result file name.
    respaths_dir : str, optional
        Directory that receives ``BFRESULTSLIST_<cv_idx>.csv``. Defaults to
        the previously hard-coded bf_3_respaths location and is created if
        it does not exist.

    Returns
    -------
    str
        Path of the CSV that was written.
    """
    # 1-based column numbers, one per entry of col_names.
    idx = np.arange(len(col_names)) + 1
    paths = [os.path.join(save_path, make_name(in_file, i, cv_idx, col_names)) for i in idx]
    df = pd.DataFrame({'colNum': idx, 'matFn': paths},
                      index=idx, columns=['colNum', 'matFn'])
    # to_csv does not create missing directories.
    if not os.path.isdir(respaths_dir):
        os.makedirs(respaths_dir)
    to_save = os.path.join(respaths_dir, "BFRESULTSLIST_{}.csv".format(cv_idx))
    # index=False: the CSV holds only the colNum and matFn columns.
    df.to_csv(to_save, index=False)
    return to_save

# creates text files with paths to files written in step 1

In [26]:
# One results-list CSV per cross-validation fold (folds 1 through 10).
for cv_idx in np.arange(1, 11):
    write_csv(
        cv_idx=cv_idx,
        col_names=col_names,
        in_file=inf,
        save_path="/storage/gablab001/data/genus/brain_genomic_bayes/bf_3",
    )

In [33]:
def run_fixedformVB(
        csv_file='/storage/gablab001/data/genus/brain_genomic_bayes/bf_3_respaths/BFRESULTSLIST_1.csv',
        input_mat='/storage/gablab001/data/genus/brain_genomic_bayes/brain_gene.mat',
        out_file='/om/user/ysa/ffvb_test.mat'):
    """Run the MATLAB ``deployEndoPhenVB`` 'fxvb' step through nipype.

    Parameters
    ----------
    csv_file : str, optional
        Passed to MATLAB as 'csvFile'. Defaults to the fold-1 results list.
    input_mat : str, optional
        Passed to MATLAB as 'inputMat'.
    out_file : str, optional
        Passed to MATLAB as 'outFile'.

    With the defaults the generated script is identical to the previously
    hard-coded one. The paths are placed inside single-quoted MATLAB strings,
    so they must not contain single quotes. Returns None.
    """
    # Only nipype is needed here; the unused cPickle (Python-2-only), numpy
    # and os imports were dropped.
    import nipype.interfaces.matlab as Matlab
    matlab = Matlab.MatlabCommand()
    matlab.inputs.paths = [
        '/storage/gablab001/data/genus/bayes_adni/basis/bayesianImagingGenetics/src',
        '/storage/gablab001/data/genus/bayes_adni/basis/bayesianImagingGenetics/src/Utils']
    matlab.inputs.script = """
    deployEndoPhenVB('step','fxvb',...
                    'csvFile','{csv_file}',...
                    'inputMat','{input_mat}',...
                    'outFile','{out_file}')
    """.format(csv_file=csv_file, input_mat=input_mat, out_file=out_file)
    matlab.run()
    

In [34]:
# Launch the fixed-form VB step; the recorded run below ended in a MATLAB exception.
run_fixedformVB()

RuntimeError: Command:
matlab -nodesktop -nosplash -singleCompThread -r "addpath('/storage/gablab001/data/genus/brain_genomic_bayes');pyscript;exit"
Standard output:
MATLAB is selecting SOFTWARE OPENGL rendering.

                            < M A T L A B (R) >
                  Copyright 1984-2016 The MathWorks, Inc.
                   R2016b (9.1.0.441655) 64-bit (glnxa64)
                             September 7, 2016

 
To get started, type one of these: helpwin, helpdesk, or demo.
For product information, visit www.mathworks.com.
 
Executing pyscript at 26-May-2017 19:00:00:
----------------------------------------------------------------------------------------------------
MATLAB Version: 9.1.0.441655 (R2016b)
MATLAB License Number: 650662
Operating System: Linux 3.10.0-229.el7.x86_64 #1 SMP Fri Mar 6 11:36:42 UTC 2015 x86_64
Java Version: Java 1.7.0_60-b19 with Oracle Corporation Java HotSpot(TM) 64-Bit Server VM mixed mode
----------------------------------------------------------------------------------------------------
MATLAB                                                Version 9.1         (R2016b)
Simulink                                              Version 8.8         (R2016b)
Aerospace Blockset                                    Version 3.18        (R2016b)
Aerospace Toolbox                                     Version 2.18        (R2016b)
Antenna Toolbox                                       Version 2.1         (R2016b)
Audio System Toolbox                                  Version 1.1         (R2016b)
Bioinformatics Toolbox                                Version 4.7         (R2016b)
Communications System Toolbox                         Version 6.3         (R2016b)
Computer Vision System Toolbox                        Version 7.2         (R2016b)
Control System Toolbox                                Version 10.1        (R2016b)
Curve Fitting Toolbox                                 Version 3.5.4       (R2016b)
DSP System Toolbox                                    Version 9.3         (R2016b)
Database Toolbox                                      Version 7.0         (R2016b)
Datafeed Toolbox                                      Version 5.4         (R2016b)
Econometrics Toolbox                                  Version 3.5         (R2016b)
Embedded Coder                                        Version 6.11        (R2016b)
Filter Design HDL Coder                               Version 3.1         (R2016b)
Financial Instruments Toolbox                         Version 2.4         (R2016b)
Financial Toolbox                                     Version 5.8         (R2016b)
Fixed-Point Designer                                  Version 5.3         (R2016b)
Fuzzy Logic Toolbox                                   Version 2.2.24      (R2016b)
Global Optimization Toolbox                           Version 3.4.1       (R2016b)
HDL Coder                                             Version 3.9         (R2016b)
HDL Verifier                                          Version 5.1         (R2016b)
Image Acquisition Toolbox                             Version 5.1         (R2016b)
Image Processing Toolbox                              Version 9.5         (R2016b)
Instrument Control Toolbox                            Version 3.10        (R2016b)
LTE System Toolbox                                    Version 2.3         (R2016b)
MATLAB Coder                                          Version 3.2         (R2016b)
MATLAB Compiler                                       Version 6.3         (R2016b)
MATLAB Compiler SDK                                   Version 6.3         (R2016b)
MATLAB Report Generator                               Version 5.1         (R2016b)
Mapping Toolbox                                       Version 4.4         (R2016b)
Model Predictive Control Toolbox                      Version 5.2.1       (R2016b)
Neural Network Toolbox                                Version 9.1         (R2016b)
Optimization Toolbox                                  Version 7.5         (R2016b)
Parallel Computing Toolbox                            Version 6.9         (R2016b)
Partial Differential Equation Toolbox                 Version 2.3         (R2016b)
Phased Array System Toolbox                           Version 3.3         (R2016b)
Polyspace Bug Finder                                  Version 2.2         (R2016b)
RF Toolbox                                            Version 3.1         (R2016b)
Robotics System Toolbox                               Version 1.3         (R2016b)
Robust Control Toolbox                                Version 6.2         (R2016b)
Signal Processing Toolbox                             Version 7.3         (R2016b)
SimBiology                                            Version 5.5         (R2016b)
SimEvents                                             Version 5.1         (R2016b)
SimRF                                                 Version 5.1         (R2016b)
Simscape                                              Version 4.1         (R2016b)
Simscape Driveline                                    Version 2.11        (R2016b)
Simscape Electronics                                  Version 2.10        (R2016b)
Simscape Fluids                                       Version 2.1         (R2016b)
Simscape Multibody                                    Version 4.9         (R2016b)
Simscape Power Systems                                Version 6.6         (R2016b)
Simulink 3D Animation                                 Version 7.6         (R2016b)
Simulink Coder                                        Version 8.11        (R2016b)
Simulink Control Design                               Version 4.4         (R2016b)
Simulink Design Optimization                          Version 3.1         (R2016b)
Simulink Design Verifier                              Version 3.2         (R2016b)
Simulink Report Generator                             Version 5.1         (R2016b)
Simulink Verification and Validation                  Version 3.12        (R2016b)
Stateflow                                             Version 8.8         (R2016b)
Statistics and Machine Learning Toolbox               Version 11.0        (R2016b)
Symbolic Math Toolbox                                 Version 7.1         (R2016b)
System Identification Toolbox                         Version 9.5         (R2016b)
Trading Toolbox                                       Version 3.1         (R2016b)
Vision HDL Toolbox                                    Version 1.3         (R2016b)
WLAN System Toolbox                                   Version 1.2         (R2016b)
Wavelet Toolbox                                       Version 4.17        (R2016b)
inside of the code !
Input arguments : 

args = 

  struct with fields:

        step: 'fxvb'
     csvFile: '/storage/gablab001/data/genus/brain_genomic_bayes/bf_3_resp...'
    inputMat: '/storage/gablab001/data/genus/brain_genomic_bayes/brain_gen...'
     outFile: '/om/user/ysa/ffvb_test.mat'

Running fixed-form variational Bayes ...

Standard error:
MATLAB code threw an exception:
Assignment has more non-singleton rhs dimensions than non-singleton subscripts
File:/storage/gablab001/data/genus/bayes_adni/basis/bayesianImagingGenetics/src/deployEndoPhenVB.m
Name:/storage/gablab001/data/genus/bayes_adni/basis/bayesianImagingGenetics/src/deployEndoPhenVB.m
Line:47
File:storage/gablab001/data/genus/brain_genomic_bayes/pyscript.m
Name:parseFXVBArguments
Line:100
File:eployEndoPhenVB
Name:pyscript
Line:335
File:^
Name:
Line:
Return code: 0
Interface MatlabCommand failed to run. 

In [105]:
import scipy.io

In [106]:
# List the variables stored in the input .mat file.
# NOTE(review): the recorded output shows a 'response' entry and no 'y',
# while the save cell writes key 'y' and cv_maker reads ['y'] -- confirm
# which version of brain_gene.mat is on disk.
scipy.io.loadmat("/storage/gablab001/data/genus/brain_genomic_bayes/brain_gene.mat").keys()

['Z',
 'G',
 'I',
 '__header__',
 '__globals__',
 'colnames',
 '__version__',
 'response']

In [40]:
# Load one step-1 result file (fold 1, column 168, per its file name) for inspection.
b = scipy.io.loadmat("/storage/gablab001/data/genus/brain_genomic_bayes/bf_3/brain_genelh_Thalamus.Proper_1_168_BF.mat")

In [42]:
# Print the shape of every array in the loaded file. Entries without a
# .shape attribute (presumably loadmat's '__header__'-style metadata) are
# skipped.
for key in b.keys():
    try:
        print(b[key].shape, key)
    except AttributeError:
        # Narrowed from a bare except so unrelated errors and
        # KeyboardInterrupt are no longer swallowed.
        pass

((100000, 9), 'mu')
((1, 9), 'lnZ')
((1, 1), 'lnZ_weighted')
((1, 9), 'w')
((100000, 1), 'PIP')
((100000, 9), 'alpha')


In [121]:
# Show the shapes of the main result arrays. Formatting the line explicitly
# gives the same text as the old Python 2 print statement and also runs on
# Python 3.
for k in ['mu', 'w', 'PIP', 'alpha']:
    print("{} {}".format(b[k].shape, k))

(100000, 9) mu
(1, 9) w
(100000, 1) PIP
(100000, 9) alpha


In [32]:
# Display the stored 'lnZ' array (shape (1, 9) according to the listing above).
b['lnZ']

array([[-8549.85556399, -8549.86356402, -8549.87174834, -8549.88107747,
        -8549.88980515, -8549.91771467, -8549.9586837 , -8550.03601001,
        -8550.17060025]])

0