# Taku Ito
## 12/12/2018

## Compute Ridge FC using cross-subject cross-validation

## Using Alex Huth's ridge regression implementation for faster computation, selecting the penalty (alpha) that maximizes the correlation between predicted and actual responses of the target variable

In [13]:
import numpy as np
import nibabel as nib
import os
import matplotlib.pyplot as plt
%matplotlib inline
os.sys.path.append('ridge/')
import ridge
import seaborn as sns
import h5py
os.environ['OMP_NUM_THREADS'] = str(1)
import multiprocessing as mp
import scipy.stats as stats
import time
from sklearn.linear_model import Ridge

sns.set_style("white")
plt.rcParams["font.family"] = "FreeSans"

In [3]:
# Subjects included in the analysis (084 is excluded)
subjNums = ['013','014','016','017','018','021','023','024','026','027','028','030','031','032','033',
            '034','035','037','038','039','040','041','042','043','045','046','047','048','049','050',
            '053','055','056','057','058','062','063','066','067','068','069','070','072','074','075',
            '076','077','081','085','086','087','088','090','092','093','094','095','097','098','099',
            '101','102','103','104','105','106','108','109','110','111','112','114','115','117','119',
            '120','121','122','123','124','125','126','127','128','129','130','131','132','134','135',
            '136','137','138','139','140','141']

# Project root; all data/result paths below are built from it
basedir = '/projects3/SRActFlow/'

# Using final partition: one network label per entry of the partition file
networkdef = np.loadtxt('/projects3/NetworkDiversity/data/network_partition.txt')
# Indices that sort the entries by network label, kept as a column vector
networkorder = np.asarray(sorted(range(len(networkdef)), key=lambda k: networkdef[k]))
networkorder = networkorder.reshape(len(networkorder), 1)
# network mappings for final partition set (network name -> label used in networkdef)
networkmappings = {'fpn':7, 'vis1':1, 'vis2':2, 'smn':3, 'aud':8, 'lan':6, 'dan':5, 'con':4, 'dmn':9,
                   'pmulti':10, 'none1':11, 'none2':12}
networks = networkmappings.keys()

# For every network, record the last position it occupies in the network-sorted
# ordering; xticks maps that position -> network name
reorderednetworkaffil = networkdef[networkorder]
xticks = {}
for net in networks:
    positions = np.where(reorderednetworkaffil == networkmappings[net])[0]
    xticks[np.max(positions)] = net

## General parameters/variables
nParcels = 360
nSubjs = len(subjNums)

# Glasser parcellation label file, squeezed to drop singleton dimensions
glasserfile2 = '/projects/AnalysisTools/ParcelsGlasser2016/Q1-Q6_RelatedParcellation210.LR.CorticalAreas_dil_Colors.32k_fs_RL.dlabel.nii'
glasser2 = np.squeeze(nib.load(glasserfile2).get_data())

# Network names listed in the order in which they appear along the sorted axis
sortednets = np.sort(list(xticks.keys()))
orderednetworks = [xticks[net] for net in sortednets]

# One colour per network
networkpalette = np.asarray(['royalblue','slateblue','paleturquoise','darkorchid','limegreen',
                             'lightseagreen','yellow','orchid','r','peru','orange','olivedrab'])

OrderedNetworks = ['VIS1','VIS2','SMN','CON','DAN','LAN','FPN','AUD','DMN','PMM','VMM','ORA']

# Load resting-state data

In [35]:
def loadRestActivity(subj,model='24pXaCompCorXVolterra',zscore=False):
    """
    Load one subject's nuisance-regressed resting-state data (Rest1) from HDF5.

    Parameters:
        subj   : subject ID string; the file read is '<subj>_glmOutput_data.h5'
        model  : nuisance regression model name; selects the dataset
                 'Rest1/nuisanceReg_resid_<model>'
        zscore : if True, z-score the data along axis 1

    Returns:
        data : array holding the full contents of the selected dataset
               (presumably space x time -- TODO confirm against the post-processing output)
    """
    datadir = basedir + 'data/postProcessing/hcpPostProcCiric/'
    # The context manager releases the file handle even if the read raises
    with h5py.File(datadir + subj + '_glmOutput_data.h5','r') as h5f:
        # Honour the `model` argument instead of a hard-coded dataset name;
        # [:] reads the whole dataset into memory so it outlives the file handle
        data = h5f['Rest1/nuisanceReg_resid_' + model][:]

    if zscore:
        data = stats.zscore(data,axis=1)
    return data

def loadMask(roi,dilated=True):
    """
    Read the surface mask file of a Glasser parcel.

    Parameters:
        roi     : parcel number, inserted into the mask file name
        dilated : if True read the '_dilated_10mm' version of the mask,
                  otherwise the plain parcel mask

    Returns:
        The mask data with singleton dimensions squeezed out
    """
    suffix = '_dilated_10mm.dscalar.nii' if dilated else '.dscalar.nii'
    maskfile = basedir + 'data/results/surfaceMasks/' + 'GlasserParcel' + str(roi) + suffix
    return np.squeeze(nib.load(maskfile).get_data())

def ridgeCorrWrapper(args):
    """
    Single-argument wrapper around ridge.ridge_corr so it can be used with Pool.map.

    Parameters:
        args : tuple (sourceTrain, sourceTest, targetTrain, targetTest), passed
               to ridge.ridge_corr in that order

    Returns:
        rcorr : whatever ridge.ridge_corr returns for the 20 log-spaced alphas
                between 10**0 and 10**4 (presumably one set of test correlations
                per alpha -- TODO confirm against the ridge module)
    """
    # Unpack in the body: tuple parameters in the signature are Python-2-only syntax
    sourceTrain,sourceTest,targetTrain,targetTest = args
    rcorr = ridge.ridge_corr(sourceTrain,sourceTest,targetTrain,targetTest,alphas=np.logspace(0,4,20))
    return rcorr

def ridgeWrapper(args):
    """
    Single-argument wrapper that fits an sklearn Ridge model (usable with Pool.map).

    Parameters:
        args : tuple (stim, resp, alpha) -- predictors, responses and the ridge
               penalty handed to sklearn.linear_model.Ridge

    Returns:
        wt : fitted coefficients (clf.coef_)
    """
    # Unpack in the body: tuple parameters in the signature are Python-2-only syntax
    stim,resp,alpha = args
    clf = Ridge(alpha=alpha)
    clf.fit(stim,resp)
    wt = clf.coef_

    return wt

In [None]:
# Run for LEFT motor cortex
roi = 9 # Which ROI's FC do we want to predict
zscore = True
nboots = 10 # Number of training subjects drawn per held-out subject; run in parallel

# The masks depend only on the ROI, so load them once instead of once per subject
# Vertices outside the dilated mask (source data)
source_ind = np.where(loadMask(roi,dilated=True)==0)[0]
# Vertices inside the regular mask: responses/activities we want to predict (target data)
target_ind = np.where(loadMask(roi,dilated=False))[0]

# NOTE(review): numpy is already imported at this point, so changing OMP_NUM_THREADS
# may not affect an already-initialised threading runtime -- confirm it has the intended effect
os.environ['OMP_NUM_THREADS'] = str(2)

# NOTE(review): np.random is not seeded, so the sampled training subjects differ between runs
rcorrs = []
for scount, subj in enumerate(subjNums):
    # Candidate training subjects: everyone except the held-out test subject
    newSubjs = np.delete(subjNums,scount)

    # Held-out (test) subject data
    subjData = loadRestActivity(subj,zscore=zscore)
    sourceTest = subjData[source_ind,:].copy()
    targetTest = subjData[target_ind,:].copy()

    # Now find train subjects
    print('Loading bootstrapped training data for subject %s' % subj)
    # Sample from newSubjs (not subjNums) so the test subject can never be part of
    # its own training set; no replacement
    trainSubjs = np.random.choice(newSubjs,nboots,replace=False)
    inputs = []
    for trainsubj in trainSubjs:
        subjData = loadRestActivity(trainsubj,zscore=zscore)
        sourceTrain = subjData[source_ind,:].copy()
        targetTrain = subjData[target_ind,:].copy()
        # Transposed so that the first axis is the sample axis expected by the regression
        inputs.append((sourceTrain.T,sourceTest.T,targetTrain.T,targetTest.T))

    del subjData

    print('\tBeginning regression/cross-validation')
    before = time.time()
    pool = mp.Pool(processes=nboots)
    try:
        rcorr = pool.map_async(ridgeCorrWrapper,inputs).get()
    finally:
        # Always release the worker processes, even if a worker raised
        pool.close()
        pool.join()

    del inputs, pool

    after = time.time()
    print('\tTime for regression for one subject: %s' % (after-before))

    # Average the prediction correlations across the training draws for this subject
    rcorr_avg = np.mean(np.asarray(rcorr),axis=0)
    rcorrs.append(rcorr_avg)

Loading bootstrapped training data for subject 013
	Beginning regression/cross-validation
	Time for regression for one subject: 179.479526043
Loading bootstrapped training data for subject 014
	Beginning regression/cross-validation
	Time for regression for one subject: 182.182345152
Loading bootstrapped training data for subject 016
	Beginning regression/cross-validation


# Find best alphas for each vertex, across subjects

In [29]:
# Average the per-subject results; this will be an alphas X Vertices matrix
group_average_alphas = np.mean(np.asarray(rcorrs),axis=0)
# For each vertex (column), the row index of the alpha with the highest average value
max_alphas_ind = np.argmax(group_average_alphas,axis=0)

# Translate those indices into penalty values in one step by indexing the
# alpha grid (same grid as in ridgeCorrWrapper) with the index array
alpha_by_vertex = np.logspace(0,4,20)[max_alphas_ind]

# Save the group penalty terms (alphas) for each vertex to disk

In [5]:
# ridgefcdir = '/projects3/SRActFlow/data/results/ridgeFC/'
# h5f = h5py.File(ridgefcdir + 'TargetParcel' + str(roi) + '_RidgeFC.h5','a')
# # First save out penalty alphas for each vertex
# try:
#     h5f.create_dataset('alphasPerVertex',data=alpha_by_vertex)
# except:
#     del h5f['alphasPerVertex']
#     h5f.create_dataset('alphasPerVertex',data=alpha_by_vertex)
# h5f.close()

# ridgefcdir = '/projects3/SRActFlow/data/results/ridgeFC/'
# h5f = h5py.File(ridgefcdir + 'TargetParcel' + str(roi) + '_RidgeFC.h5','r')
# alpha_by_vertex = h5f['alphasPerVertex'][:].copy()
# h5f.close()

# Now for each subject fit the regression model using the optimized penalty term (alpha), and save to disk

In [34]:
ridgefcdir = '/projects3/SRActFlow/data/results/ridgeFC/'

nproc = 20
# NOTE(review): numpy is already imported at this point, so changing OMP_NUM_THREADS
# may not affect an already-initialised threading runtime -- confirm it has the intended effect
os.environ['OMP_NUM_THREADS'] = str(nproc)

# The masks depend only on the ROI, so load them once instead of once per subject
# Vertices outside the dilated mask (source data)
source_ind = np.where(loadMask(roi,dilated=True)==0)[0]
# Vertices inside the regular mask: responses/activities we want to predict (target data)
target_ind = np.where(loadMask(roi,dilated=False))[0]

# All subjects are written into the same HDF5 file, one group per subject
outfile = ridgefcdir + 'TargetParcel' + str(roi) + '_RidgeFC.h5'

for subj in subjNums:
    print('Subject %s' % subj)
    print('\tLoading data...')
    # NOTE(review): the penalties in alpha_by_vertex were selected with zscore=True,
    # but the model is fit here on data loaded with zscore=False -- confirm this is intended
    subjData = loadRestActivity(subj,zscore=False)
    sourceData = subjData[source_ind,:].copy()
    targetData = subjData[target_ind,:].copy()

    print('\tRunning source to target ridge regression using %s processes' % nproc)

    # A single multi-target fit; alpha_by_vertex supplies one penalty per target vertex
    clf = Ridge(alpha=alpha_by_vertex)
    clf.fit(sourceData.T,targetData.T)
    sourceToTargetMappings = clf.coef_

    # All vertices to target indices (makes it easier for comparison);
    # rows that are not source vertices stay zero
    sourceToTargetMappings2 = np.zeros((len(glasser2),len(target_ind)))
    sourceToTargetMappings2[source_ind,:] = sourceToTargetMappings.T

    # Save out to file
    print('\tSaving out to disk')
    dsetname = subj + '/sourceToTargetMapping'
    # The context manager closes the file even if writing fails
    with h5py.File(outfile,'a') as h5f:
        # Replace a dataset left over from a previous run; any other error
        # (e.g. disk full, bad path) now propagates instead of being swallowed
        if dsetname in h5f:
            del h5f[dsetname]
        h5f.create_dataset(dsetname,data=sourceToTargetMappings2)

    del sourceToTargetMappings2, sourceToTargetMappings


Subject 013
	Loading data...
	Running source to target ridge regression using 20 processes
	Saving out to disk
Subject 014
	Loading data...
	Running source to target ridge regression using 20 processes
	Saving out to disk
Subject 016
	Loading data...
	Running source to target ridge regression using 20 processes
	Saving out to disk
Subject 017
	Loading data...
	Running source to target ridge regression using 20 processes
	Saving out to disk
Subject 018
	Loading data...
	Running source to target ridge regression using 20 processes
	Saving out to disk
Subject 021
	Loading data...
	Running source to target ridge regression using 20 processes
	Saving out to disk
Subject 023
	Loading data...
	Running source to target ridge regression using 20 processes
	Saving out to disk
Subject 024
	Loading data...
	Running source to target ridge regression using 20 processes
	Saving out to disk
Subject 026
	Loading data...
	Running source to target ridge regression using 20 processes
	Saving out to disk
S