# Taku Ito
## 12/12/2018

## Compute PCA FC (500 components)


In [1]:
import numpy as np
import nibabel as nib
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import h5py
os.environ['OMP_NUM_THREADS'] = str(1)
import multiprocessing as mp
import scipy.stats as stats
import time
os.sys.path.append('utils/')
from sklearn.decomposition import PCA
import statsmodels.api as sm

sns.set_style("white")
plt.rcParams["font.family"] = "FreeSans"

  from pandas.core import datetools


In [2]:
# Subjects included in the analysis (subject 084 is left out)
subjNums = ['013','014','016','017','018','021','023','024','026','027','028','030','031','032','033',
            '034','035','037','038','039','040','041','042','043','045','046','047','048','049','050',
            '053','055','056','057','058','062','063','066','067','068','069','070','072','074','075',
            '076','077','081','085','086','087','088','090','092','093','094','095','097','098','099',
            '101','102','103','104','105','106','108','109','110','111','112','114','115','117','119',
            '120','121','122','123','124','125','126','127','128','129','130','131','132','134','135',
            '136','137','138','139','140','141']

basedir = '/projects3/SRActFlow/'

## General parameters/variables
nParcels = 360
nSubjs = len(subjNums)

# Network label of every parcel (final partition)
networkdef = np.loadtxt('/projects3/NetworkDiversity/data/network_partition.txt')
# Parcel indices ordered by network label, stored as a column vector.
# A stable sort keeps parcels with the same label in their original order.
networkorder = np.argsort(networkdef, kind='mergesort')[:, np.newaxis]

# Network name -> label used in the final partition
networkmappings = {'fpn':7, 'vis1':1, 'vis2':2, 'smn':3, 'aud':8, 'lan':6, 'dan':5, 'con':4, 'dmn':9,
                   'pmulti':10, 'none1':11, 'none2':12}
networks = networkmappings.keys()

# For each network, record the last position it occupies in the network-sorted
# ordering; xticks maps that position to the network name
reorderednetworkaffil = networkdef[networkorder]
xticks = {}
for net, netNum in networkmappings.items():
    netind = np.where(reorderednetworkaffil==netNum)[0]
    tick = np.max(netind)
    xticks[tick] = net

# Network names listed in the order in which they appear in the sorted parcels
sortednets = np.sort(xticks.keys())
orderednetworks = [xticks[net] for net in sortednets]

# Parcel label of every surface vertex, read from the Glasser dlabel file
glasserfile2 = '/projects/AnalysisTools/ParcelsGlasser2016/Q1-Q6_RelatedParcellation210.LR.CorticalAreas_dil_Colors.32k_fs_RL.dlabel.nii'
glasser2 = np.squeeze(nib.load(glasserfile2).get_data())

# One color per network
networkpalette = np.asarray(['royalblue','slateblue','paleturquoise','darkorchid','limegreen',
                             'lightseagreen','yellow','orchid','r','peru','orange','olivedrab'])

OrderedNetworks = ['VIS1','VIS2','SMN','CON','DAN','LAN','FPN','AUD','DMN','PMM','VMM','ORA']

# Load resting-state data

In [40]:
def loadRestActivity(subj,model='24pXaCompCorXVolterra',zscore=False):
    """
    Load one subject's nuisance-regressed resting-state data (run 'Rest1').

    subj    - subject ID string; prefix of the subject's '<subj>_glmOutput_data.h5' file
    model   - name of the nuisance regression model; selects the dataset
              'Rest1/nuisanceReg_resid_<model>' inside the subject's file
    zscore  - if True, z-score the data along axis 1

    Returns the selected dataset as an in-memory array
    (presumably space x time, since callers index its rows with vertex indices -- TODO confirm)
    """
    datadir = basedir + 'data/postProcessing/hcpPostProcCiric/'
    # The context manager releases the file handle even if the read raises
    with h5py.File(datadir + subj + '_glmOutput_data.h5','r') as h5f:
        # Honor the requested model instead of a hard-coded dataset name;
        # the default value selects the same dataset as before
        data = h5f['Rest1/nuisanceReg_resid_' + model][:].copy()

    if zscore:
        data = stats.zscore(data,axis=1)
    return data

def loadMask(roi,dilated=True):
    """
    Read the surface mask of a single Glasser parcel from the results directory.

    roi     - parcel identifier used in the mask file name ('GlasserParcel<roi>...')
    dilated - if True read the '_dilated_10mm' version of the mask, otherwise the plain one

    Returns the mask data with singleton dimensions removed.
    """
    # Only the file-name suffix differs between the dilated and the plain mask
    suffix = '_dilated_10mm.dscalar.nii' if dilated else '.dscalar.nii'
    maskfile = basedir + 'data/results/surfaceMasks/' + 'GlasserParcel' + str(roi) + suffix
    img = nib.load(maskfile)
    return np.squeeze(img.get_data())


def pcaFC_inEigenspace(stim,resp,n_components=500,nproc=10):
    """
    Principal component regression of target activity onto source activity.

    The source data are reduced to `n_components` principal components, and all
    target columns are then regressed (OLS with an intercept) onto the component
    time series in a single call to _regression2.

    stim            - time x feature/region matrix of regressors
    resp            - time x feature/region matrix of targets (y-values)
    n_components    - number of principal components to keep
    nproc           - value written to the OMP_NUM_THREADS environment variable

    Returns (wt, components):
    wt              - coefficients returned by _regression2, one column per target;
                      row 0 is the intercept, the remaining rows are the component weights
    components      - principal axes of the fitted PCA (pca.components_)

    NOTE(review): an earlier per-target multiprocessing variant kept only betas[1:]
    (no intercept row) and zeroed weights whose sign disagreed with the pairwise
    correlation; confirm that downstream code expects the intercept row returned here.
    """
    print('\tRunning PCA')
    # NOTE(review): this changes a process-wide environment variable and never restores it;
    # whether already-loaded numerical libraries pick up the new value is not visible here
    os.environ['OMP_NUM_THREADS'] = str(nproc)
    pca = PCA(n_components)
    reduced_mat = pca.fit_transform(stim) # Time X Components
    components = pca.components_

    print('\tRunning regression')
    # One vectorized fit for all targets; _regression2 expects targets as rows (target x time)
    wt, resid = _regression2((resp.T,reduced_mat,True))

    return wt, components

def _regression2((data,regressors,constant)):
    """ 
    Hand coded OLS regression using closed form equation: betas = (X'X)^(-1) X'y
    """
    # Add 'constant' regressor
    if constant:
        regressors = sm.add_constant(regressors)
    X = regressors.copy()
    try:
#        #C_ss_inv = np.linalg.inv(np.dot(X.T,X))
        C_ss_inv = np.linalg.pinv(np.dot(X.T,X))
    except np.linalg.LinAlgError as err:
        C_ss_inv = np.linalg.pinv(np.cov(X.T))
    betas = np.dot(C_ss_inv,np.dot(X.T,data.T))
    resid = data - (betas[0] + np.dot(X[:,1:],betas[1:])).T
    return betas, resid

# Now, for each subject, fit the PCA regression model (500 components) from the source vertices to the target parcel, and save the resulting mappings to disk

In [41]:
# Parcel whose activity is to be predicted (target of the FC mapping)
roi = 8

# Parcels whose dilated masks are excluded from the sources
# NOTE(review): the lh/rh suffixes and the hemispheres named in the comments disagree -- confirm
roi_lh = 188 # Right S1
roi_rh = 8 # Left M1

pcafcdir = '/projects3/SRActFlow/data/results/pcaFC_inEigenspace/'
nproc = 20

# A vertex may serve as a source only if it lies outside both dilated parcel masks ...
dilateLH = loadMask(roi_lh,dilated=True)
dilateRH = loadMask(roi_rh,dilated=True)
combinedDilated = dilateLH + dilateRH
# ... and outside every SMN parcel
# NOTE(review): x is a 0-based index into networkdef; if the labels stored in glasser2
# are 1-based parcel numbers this comparison is off by one -- confirm against the dlabel file
smn_rois = np.nonzero(networkdef==networkmappings['smn'])[0]
for x in smn_rois:
    roi_ind = np.nonzero(glasser2==x)[0]
    combinedDilated[roi_ind]=1
source_ind = np.nonzero(combinedDilated==0)[0]

# The undilated mask of the target parcel selects the vertices to predict
mask = loadMask(roi,dilated=False)
target_ind = np.nonzero(mask)[0]

for subj in subjNums:
    print(' '.join(['Subject', subj]))
    print('\tLoading data...')
    subjData = loadRestActivity(subj,zscore=False)

    # Rows of subjData are selected by vertex index
    sourceData = subjData[source_ind,:].copy()
    targetData = subjData[target_ind,:].copy()

    print(' '.join(['\tRunning source to target PCA regression using', str(nproc), 'processes']))

    # pcaFC_inEigenspace expects time x feature matrices, hence the transposes
    sourceToTargetMappings, eigenvectors = pcaFC_inEigenspace(sourceData.T,targetData.T,n_components=500,nproc=nproc)

    # The mappings are not written to disk in this cell (the HDF5 save step is disabled here)



Subject 013
	Loading data...
	Running source to target PCA regression using 20 processes
	Running PCA
	Running regression
Subject 014
	Loading data...
	Running source to target PCA regression using 20 processes
	Running PCA
	Running regression


In [23]:
# ROI to compute FC to
roi = 188

# Parcels whose dilated masks are excluded from the sources
# NOTE(review): an earlier revision of this cell used 189 / 9 for these two parcels, and the
# lh/rh suffixes disagree with the hemispheres named in the comments -- confirm the numbering
roi_lh = 188 # Right S1
roi_rh = 8 # Left M1

dilateLH = loadMask(roi_lh,dilated=True)
dilateRH = loadMask(roi_rh,dilated=True)
combinedDilated = dilateLH + dilateRH
# Exclude all SMN regions
# NOTE(review): x is a 0-based index into networkdef; if the labels stored in glasser2
# are 1-based parcel numbers this comparison is off by one -- confirm against the dlabel file
smn_rois = np.where(networkdef==networkmappings['smn'])[0]
for x in smn_rois:
    roi_ind = np.where(glasser2==x)[0]
    combinedDilated[roi_ind]=1
# Source vertices are the ones not flagged by any of the exclusions above
source_ind = np.where(combinedDilated==0)[0]

# Now load regular mask to identify responses/activities we want to predict (target data)
mask = loadMask(roi,dilated=False)
target_ind = np.where(mask)[0]

pcafcdir = '/projects3/SRActFlow/data/results/pcaFC_inEigenspace/'

nproc = 20
for subj in subjNums:
    print('Subject %s' % subj)
    print('\tLoading data...')
    subjData = loadRestActivity(subj,zscore=False)

    sourceData = subjData[source_ind,:].copy()

    targetData = subjData[target_ind,:].copy()

    print('\tRunning source to target PCA regression using %s processes' % nproc)

    # pcaFC_inEigenspace expects time x feature matrices, hence the transposes
    sourceToTargetMappings, eigenvectors = pcaFC_inEigenspace(sourceData.T,targetData.T,n_components=500,nproc=nproc)

    # Save out to file
    print('\tSaving out to disk')
    # The context manager closes the file even if a write fails
    with h5py.File(pcafcdir + 'TargetParcel' + str(roi) + '_pcaFC_nozscore.h5','a') as h5f:
        for dsetname, dsetdata in (('sourceToTargetMapping',sourceToTargetMappings),
                                   ('eigenvectors',eigenvectors)):
            key = subj + '/' + dsetname
            # Overwrite results of a previous run explicitly; each dataset is checked on its
            # own, and any other error (disk full, bad data, interrupt) is allowed to surface
            if key in h5f:
                del h5f[key]
            h5f.create_dataset(key,data=dsetdata)

    del sourceToTargetMappings

Subject 013
	Loading data...
	Running source to target PCA regression using 20 processes
	Running PCA
	Running regression
	Saving out to disk
Subject 014
	Loading data...
	Running source to target PCA regression using 20 processes
	Running PCA
	Running regression
	Saving out to disk
Subject 016
	Loading data...
	Running source to target PCA regression using 20 processes
	Running PCA
	Running regression
	Saving out to disk
Subject 017
	Loading data...
	Running source to target PCA regression using 20 processes
	Running PCA
	Running regression
	Saving out to disk
Subject 018
	Loading data...
	Running source to target PCA regression using 20 processes
	Running PCA
	Running regression
	Saving out to disk
Subject 021
	Loading data...
	Running source to target PCA regression using 20 processes
	Running PCA
	Running regression
	Saving out to disk
Subject 023
	Loading data...
	Running source to target PCA regression using 20 processes
	Running PCA
	Running regression
	Saving out to disk
Subjec