In [None]:
import numpy as np
import pandas as pd
import os
from sklearn import svm
from sklearn import metrics
import scipy.io
from scipy import stats
import matplotlib.pyplot as plt
from scipy import stats, linalg
import warnings
from sklearn.exceptions import ConvergenceWarning
# Project root. Set the REST_THOUGHTS_DIR environment variable to point at your own copy;
# otherwise the original cluster path is used. All './data' and './results' paths are relative to it.
os.chdir(os.environ.get('REST_THOUGHTS_DIR', '/gpfs/milgram/project/chun/jk2992/rest_thoughts/'))

## Predicting 9 thought dimensions

In [2]:
def reshape_FC(fc):
    """Flatten a 3-D FC array to 2-D, putting the last axis first.

    An input of shape (a, b, c) becomes an array of shape (c, a*b): row k holds
    fc[:, :, k] unrolled in row-major order.
    (For the data loaded in this notebook the input prints as (4, 8, 35778);
    presumably runs x trials x edges -- confirm against the .mat file.)
    """
    n_first, n_second, n_last = fc.shape
    last_axis_first = np.moveaxis(fc, 2, 0)  # same axis order as transpose (2, 0, 1)
    return np.reshape(last_axis_first, (n_last, n_first * n_second))

def get_num(string):
    """Extract the single-digit integers stored at positions 2, 7, 12, ... of a string.

    Every fifth character, starting at index 2, is converted with int(); all
    other characters are ignored. Returns the integers as a list (empty when the
    string has fewer than three characters).
    Assumes the behavioral strings use a fixed 5-character stride per rating
    (e.g. "['3', '5', '2']") -- confirm against the CSV.
    """
    return [int(string[pos]) for pos in range(2, len(string), 5)]

# Load the resting-state FC profiles; 'rest' holds one array per participant.
rest_FC = scipy.io.loadmat('./data/brain/rest_fc.mat')['rest'][0]
# Drop the first run of entry 29: participant 1032, s1r1 (response box error).
# The position is 29 because there is no 1004 and no 1019 in the dataset.
rest_FC[29] = np.delete(rest_FC[29], 0, axis=0)
print(f'FC profiles shape: {len(rest_FC)}*{rest_FC[2].shape}')

# Flatten every participant's FC array to 2-D (see reshape_FC) for trial-wise use.
nsubj = len(rest_FC)
FC_bysub = [reshape_FC(rest_FC[sub_idx]) for sub_idx in range(nsubj)]

def prepare_data(this_var, max_missing_edges=1000, min_good_trials=20):
    """Build the trial-wise training set (FC features, z-scored ratings) for one rating column.

    Pipeline:
      1. Read './data/beh/all_ratings.csv' and, for each participant ID in the
         'Sub' column, parse every row of the ``this_var`` column with
         ``get_num`` and concatenate the rows into one rating vector.
      2. Z-score each participant's rating vector.
      3. Drop the ratings of two hard-coded runs (participant entries 29 and 41)
         so that ratings line up with the module-level ``FC_bysub`` arrays.
      4. Keep trials with fewer than ``max_missing_edges`` NaN FC values, and
         participants with more than ``min_good_trials`` such trials.
      5. Stack the surviving trials of all kept participants.

    Relies on the module-level names ``nsubj``, ``FC_bysub`` and ``get_num``.

    Parameters
    ----------
    this_var : str
        Name of the rating column in all_ratings.csv (e.g. 'Awake').
    max_missing_edges : int, optional
        A trial is good when its FC column has fewer than this many NaNs
        (default 1000; the original note equates this to 3 missing nodes).
    min_good_trials : int, optional
        A participant is good when they have more than this many good trials
        (default 20).

    Returns
    -------
    training_FC : numpy.ndarray
        One row per kept trial, one column per FC edge.
    training_beh : numpy.ndarray
        The z-scored rating of each kept trial, in the same order.
    """
    df = pd.read_csv('./data/beh/all_ratings.csv')
    beh_list = np.unique(df['Sub']) # sorted list of behavioral participant IDs
    print('We have '+str(len(beh_list)) + ' participants')

    # Build one rating vector per participant.
    # The rows are taken from the participant's own selection, so this does not
    # depend on the CSV being sorted by 'Sub' or on a default integer index.
    beh_bysub = []
    for sub_id in beh_list:
        sub_data = df.loc[df['Sub'] == sub_id, this_var]
        sub_vec = np.asarray([get_num(run_ratings) for run_ratings in sub_data])
        sub_vec = np.reshape(sub_vec,(sub_vec.shape[0]*sub_vec.shape[1]))
        # z-score within participant; a participant with constant ratings yields NaNs
        sub_vec = stats.zscore(sub_vec)
        beh_bysub.append(sub_vec)

    # remove 1032 s1r1, 1044 s1r2 (ratings 0-7 of entry 29, ratings 8-15 of entry 41)
    beh_bysub[29] = np.delete(beh_bysub[29],range(0,8))
    beh_bysub[41] = np.delete(beh_bysub[41],range(8,16))

    # First sanity check: every participant must have one rating per FC trial.
    mismatched = [i for i in range(nsubj) if len(beh_bysub[i]) != FC_bysub[i].shape[1]]
    if mismatched:
        warnings.warn('Behavioral and brain trial counts differ for participant indices: '
                      + str(mismatched))
    else:
        print('All behavioral and brain data match')

    # Quality control on trials and participants.
    good_sub = []
    good_trial_id_bysub = []
    for sub in range(nsubj):
        # number of missing FC values in each trial (column)
        n_missing = np.sum(np.isnan(FC_bysub[sub]), axis=0)
        good_trial_id = np.flatnonzero(n_missing < max_missing_edges)
        if len(good_trial_id) > min_good_trials:
            good_sub.append(sub)
            good_trial_id_bysub.append(good_trial_id)

    # Select the good trials of the good participants.
    nsubj_good = len(good_sub)
    FC_selected, beh_selected = [], []
    for sub, trial_ids in zip(good_sub, good_trial_id_bysub):
        FC_selected.append(FC_bysub[sub][:,trial_ids])
        beh_selected.append(beh_bysub[sub][trial_ids])

    # Second sanity check, after selection.
    mismatched = [i for i in range(nsubj_good)
                  if len(beh_selected[i]) != FC_selected[i].shape[1]]
    if mismatched:
        warnings.warn('Selected behavioral and brain trial counts differ for kept participants: '
                      + str(mismatched))
    else:
        print('All behavioral and brain data match -- 2nd check')

    # Flatten to one entry per trial: FC column and its rating.
    training_FC = []
    training_beh = []
    for sub_fc, sub_beh in zip(FC_selected, beh_selected):
        for trial in range(sub_fc.shape[1]):
            training_FC.append(sub_fc[:,trial])
            training_beh.append(sub_beh[trial])
    training_FC = np.asarray(training_FC)
    training_beh = np.asarray(training_beh)

    # one subject did not change the ratings for future and past.
    # Thus there are nan values in these two dimensions. Now removing these nans
    if this_var == 'Future' or this_var == 'Past':
        keep = ~np.isnan(training_beh)
        training_FC = training_FC[keep]
        training_beh = training_beh[keep]

    return training_FC, training_beh

nR = 268  # number of nodes; 268*267/2 = 35778 unique edges, the length of each FC vector
def load_edges(this_var, n_nodes=nR):
    """Return the FC edge indices of the network saved for one dimension.

    Reads './results/CPMs/<this_var>_features.mat' (written by
    step04_SVR_dimensions.ipynb), averages 'pos_feat' and 'neg_feat' over their
    first axis, and keeps only entries whose average is not below 1. An edge is
    selected when the sum of the two thresholded matrices equals exactly 1.

    Parameters
    ----------
    this_var : str
        File stem of the feature file (e.g. 'a_awake').
    n_nodes : int, optional
        Size of the square node-by-node block that is scanned (default 268).

    Returns
    -------
    list of int
        Positions of the selected edges in the row-major enumeration of the
        strict upper triangle (i1 < i2), i.e. indices into a flattened FC vector.
    """
    print('Loading FC features...')
    filepath = './results/CPMs/'+this_var+'_features.mat'
    features = scipy.io.loadmat(filepath)  # read the file once for both matrices
    pos_feat = np.average(features['pos_feat'],0)
    neg_feat = np.average(features['neg_feat'],0)
    for feat in (pos_feat, neg_feat):
        block = feat[:n_nodes, :n_nodes]  # view: the assignment below edits feat in place
        block[block < 1] = 0
    print(' #pos = '+str(int(np.sum(pos_feat)/2)), ', #neg = '+str(int(np.sum(neg_feat)/2)))
    all_feat = pos_feat+neg_feat
    # np.triu_indices walks the strict upper triangle row by row, which is the
    # same order as a nested loop over i1 < i2.
    rows, cols = np.triu_indices(n_nodes, k=1)
    featid = np.flatnonzero(all_feat[rows, cols] == 1).tolist()
    return featid

def modeling(fc,beh):
    """Fit an RBF-kernel support vector regressor for out-of-sample prediction.

    Parameters
    ----------
    fc : array with a ``shape`` attribute
        Training features, passed to ``fit`` as X.
    beh : array with a ``shape`` attribute
        Training targets, passed to ``fit`` as y.

    Returns
    -------
    The fitted ``svm.SVR`` estimator.

    The solver is capped at 1000 iterations, so the ConvergenceWarning that
    scikit-learn may emit during fitting is silenced on purpose.
    """
    print('SVR prediction with model learned from CAT')
    print(f'  train feature: {fc.shape}')
    print(f'  train beh: {beh.shape}')

    regressor = svm.SVR(kernel='rbf', gamma='auto', max_iter=1000)
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=ConvergenceWarning)
        regressor.fit(fc, beh)
    print('Training done!')
    return regressor

FC profiles shape: 60*(4, 8, 35778)


In [3]:
# load preprocessed resting-state FC data from HCP - averaged across rest runs for each subject
hcp = scipy.io.loadmat('./data/brain/net_hcp.mat')['NET']
print('loaded HCP FC shape: ', hcp.shape)
print('')

# Rating columns in all_ratings.csv, paired one-to-one with the file stems used
# for the saved features and predictions. (Named thought_dims rather than `vars`
# so the builtin vars() is not shadowed in the kernel.)
thought_dims = ['Awake','External','Future','Past','Other','Valence','Image','Word','Detail']
vars_tosave = ['a_awake','b_external','c_future','d_past','e_other','f_valence','g_image','h_word','i_detail']

for var, var_tosave in zip(thought_dims, vars_tosave):
    print('Now running:',var)
    # train on this dimension, restricted to the edges of its saved network
    train_fc, train_beh = prepare_data(var)
    edges = load_edges(var_tosave)
    train_fc = train_fc[:,edges]
    model = modeling(train_fc,train_beh)

    # HCP subjects x selected edges
    test_fc = hcp[:,edges]
    print('HCP data shape for model testing:',test_fc.shape)

    # one predicted value per HCP subject, computed in a single call
    predicted_beh = model.predict(test_fc)

    print('Predicted behavior shape',len(predicted_beh))
    scipy.io.savemat('./results/predicted/'+var_tosave+'.mat',{'predicted':predicted_beh})
    print('Successfully saved!')
    print('')

loaded HCP FC shape:  (908, 35778)

Now running: Awake
We have 60 participants
All behavioral and brain data match
All behavioral and brain data match -- 2nd check
Loading FC features...
 #pos = 469 , #neg = 555
SVR prediction with model learned from CAT
  train feature: (1531, 1024)
  train beh: (1531,)
Training done!
HCP data shape for model testing: (908, 1024)
Predicted behavior shape 908
Successfully saved!

Now running: External
We have 60 participants
All behavioral and brain data match
All behavioral and brain data match -- 2nd check
Loading FC features...
 #pos = 611 , #neg = 689
SVR prediction with model learned from CAT
  train feature: (1531, 1300)
  train beh: (1531,)
Training done!
HCP data shape for model testing: (908, 1300)
Predicted behavior shape 908
Successfully saved!

Now running: Future
We have 60 participants
All behavioral and brain data match
All behavioral and brain data match -- 2nd check
Loading FC features...
 #pos = 658 , #neg = 610
SVR prediction with mo

## Predicting 7 topics

In [17]:
# load FC again (the earlier cell removed a run from entry 29; here the raw runs are kept)
rest_FC = scipy.io.loadmat('./data/brain/rest_fc.mat')['rest'][0]
# aligning FCs and topics by trial:
# insert an all-NaN run after the first run of entry 41 to stand in for sub 1044 s1r2
fake_run = np.full((1,8,35778), np.nan)
rest_FC[41] = np.vstack((rest_FC[41][0,:,:].reshape(1,8,35778),fake_run,rest_FC[41][1:,:,:]))

# edges x trials, all participants side by side (single concatenation)
FC_bytrial = np.hstack([reshape_FC(sub_fc) for sub_fc in rest_FC])
n_trials = FC_bytrial.shape[1]

# QC on trials
df = pd.read_csv('./data/beh/topics.csv')
if len(df) < n_trials:
    raise ValueError('topics.csv has ' + str(len(df)) + ' rows but there are '
                     + str(n_trials) + ' FC trials')
topics = df['Topics'].iloc[:n_trials]

# A trial is dropped when it has more than 1000 missing FC values, when its topic
# is missing, or when its topic is 3 or 9 (remove the CAT movies).
n_missing = np.isnan(FC_bytrial).sum(axis=0)
topic_ok = (topics.notna() & ~topics.isin([3, 9])).to_numpy()
is_good_trial = (n_missing <= 1000) & topic_ok

good_trial_id = np.flatnonzero(is_good_trial).tolist()
good_fc = FC_bytrial[:, is_good_trial].T  # good trials x edges
good_topic = topics.to_numpy()[is_good_trial].tolist()

# further QC on FCs: keep only edges with no missing value in any good trial
is_good_edge = ~np.isnan(good_fc).any(axis=0)
good_edge = np.flatnonzero(is_good_edge).tolist()
train_fc = good_fc[:, is_good_edge].T  # good edges x good trials
train_fc.shape

(35778, 1856)

In [21]:
# Import train_test_split function
from sklearn.model_selection import train_test_split
# Import svm model
from sklearn import svm
# Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from imblearn.over_sampling import SMOTE, ADASYN

# Fixed seed so that the oversampling and the probability calibration of the
# classifier give the same result on every run.
TOPIC_SEED = 0

# Create an SVM classifier with an RBF kernel; probability=True enables predict_proba
clf = svm.SVC(kernel='rbf', probability=True, random_state=TOPIC_SEED)

# All good trials are used for training (no held-out split; testing is done on HCP data)
X_train = train_fc.T  # trials x edges
y_train = good_topic

# Oversample the minority topics so that the classes are balanced
X_resampled, y_resampled = SMOTE(random_state=TOPIC_SEED).fit_resample(X_train, y_train)

# Train the model using the resampled training set
clf.fit(X_resampled, y_resampled)

In [23]:
# Load the HCP FC matrix and keep only the edges that passed QC in training.
hcp = scipy.io.loadmat('./data/brain/net_hcp.mat')['NET']
print('loaded HCP FC shape: ', hcp.shape)

test_fc = hcp[:,good_edge].T
print('HCP data shape for model testing:',test_fc.T.shape)

# One vector of class probabilities per HCP subject (columns follow clf.classes_).
predicted_beh = list(clf.predict_proba(test_fc.T))

scipy.io.savemat('./results/predicted/topics.mat',{'predicted':predicted_beh})
print('Successfully saved!')

loaded HCP FC shape:  (908, 35778)
HCP data shape for model testing: (908, 32131)
Successfully saved!
