In [None]:
import numpy as np
import pandas as pd
import os
from sklearn import svm
from sklearn import metrics
import scipy.io
from scipy import stats
import matplotlib.pyplot as plt
from scipy import stats, linalg
import warnings
import math
import random
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.exceptions import ConvergenceWarning
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Import train_test_split function
from sklearn.model_selection import train_test_split
#Import svm model
from sklearn import svm
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

#Create a svm Classifier
# RBF (non-linear) kernel. Topic imbalance is not handled here; the training
# set is oversampled with SMOTE right before each fit.
clf = svm.SVC(kernel='rbf')
label_encoder = LabelEncoder()

# Project root that all relative './data/...' and './results/...' paths resolve against.
# Set the REST_THOUGHTS_DIR environment variable to run on another machine;
# the default is the original cluster location.
PROJECT_DIR = os.environ.get('REST_THOUGHTS_DIR',
                             '/gpfs/milgram/project/chun/jk2992/rest_thoughts/')
os.chdir(PROJECT_DIR)

In [3]:
# Thought-topic label of every trial, taken from the 'Topics' column.
df = pd.read_csv('./data/beh/topics.csv')
topic_column = df['Topics']
beh_bytrial = np.asarray(topic_column.tolist())

# load functional connectivity
def reshape_FC(fc):
    """Flatten a 3-D array into a 2-D (last-axis, first-two-axes) matrix.

    The last axis of `fc` becomes the rows; the first two axes are merged
    (first axis varying slowest) into the columns, so an input of shape
    (a, b, c) yields an output of shape (c, a*b).
    """
    n_first, n_second, n_last = fc.shape
    last_axis_first = np.moveaxis(fc, 2, 0)
    return last_axis_first.reshape(n_last, n_first * n_second)

rest_FC = scipy.io.loadmat('./data/brain/rest_fc.mat')['rest'][0]

# Entry 41 (sub 1044, per the original note) gets one all-NaN run stacked in
# front of its real runs.  The padding takes its trailing dimensions from the
# data itself, so it always matches what np.vstack requires.
fake_run = np.full((1,) + rest_FC[41].shape[1:], np.nan) # create a np.nan run for sub 1044
rest_FC[41] = np.vstack((fake_run,rest_FC[41]))

# Flatten every entry and concatenate them column-wise in a single call;
# stacking inside a loop re-copies the growing matrix on every iteration.
FC_bytrial = np.hstack([reshape_FC(rest_FC[sub]) for sub in range(len(rest_FC))])

FC_bytrial.shape

(35778, 1856)

In [4]:
# select clean data for model building

# Ratings table. In this section only its 'Sub' column is read: to list the
# participant ids and to locate each participant's rows (see get_range).
df_rating = pd.read_csv( './data/beh/all_ratings.csv')
def get_range(sub, trials_per_row=8):
    """Return the range of trial indices belonging to participant `sub`.

    The participant's rows are looked up in the module-level `df_rating`
    (column 'Sub'); the first and last matching index labels are scaled by
    `trials_per_row` to give a half-open range of trial positions.

    Parameters
    ----------
    sub : participant id as stored in df_rating['Sub'].
    trials_per_row : number of trials represented by one df_rating row
        (defaults to 8, the value previously hard-coded).

    Returns
    -------
    range covering first_row*trials_per_row .. (last_row+1)*trials_per_row - 1.

    Raises
    ------
    IndexError if `sub` does not occur in df_rating['Sub'].

    NOTE(review): this assumes df_rating has a default 0..n-1 index and that
    each participant's rows are contiguous; otherwise the range would also
    cover other participants' trials -- confirm against the csv.
    """
    idx = df_rating[df_rating['Sub']==sub].index
    if len(idx) == 0:
        raise IndexError('participant ' + str(sub) + " not found in df_rating['Sub']")
    lower = int(idx[0])
    upper = int(idx[-1]) + 1
    return range(lower*trials_per_row, upper*trials_per_row)

id_list = np.unique(df_rating['Sub'])

# Keep participants with enough usable trials.  A trial is usable when
#   * at most 1000 of its FC values are NaN, and
#   * its topic label is neither NaN nor the code 9 (treated as invalid).
# good_sub holds positions into id_list, good_sub_id the matching participant ids.
good_sub = []
good_sub_id = []
for sub in range(len(id_list)):
    trial_cols = get_range(id_list[sub])
    this_fc = FC_bytrial[:,trial_cols]
    this_beh = beh_bytrial[trial_cols]
    # qc on FC and topics (vectorised over trials; the built-in sum() over
    # each 35k-element column was the slow part of this cell)
    too_many_nan = np.isnan(this_fc).sum(axis=0) > 1000
    bad_topic = np.isnan(this_beh) | (this_beh == 9)
    count = int(np.sum(~(too_many_nan | bad_topic)))
    # select good -- at least 20 good trials
    if count > 19:
        good_sub.append(sub)
        good_sub_id.append(id_list[sub])
print('We have ' + str(len(good_sub)) + ' "good" participants')
print('They are: '+ str(good_sub_id))

We have 38 "good" participants
They are: [1003, 1005, 1006, 1007, 1009, 1012, 1013, 1014, 1016, 1017, 1018, 1022, 1023, 1025, 1026, 1027, 1028, 1031, 1032, 1034, 1036, 1038, 1039, 1042, 1043, 1045, 1046, 1048, 1050, 1051, 1052, 1055, 1056, 1057, 1059, 1060, 1061, 1062]


In [5]:
# predictive modeling - leave-one-subject-out cross-validation
#
# For every "good" participant: that participant's trials are the test set and
# all remaining trials are the training set.  Training trials are quality
# checked, FC edges containing NaN in either set are dropped, the training set
# is oversampled with SMOTE, the SVC is refit, and accuracy plus a confusion
# matrix are recorded for the held-out participant.

# SMOTE synthesises samples at random; a fixed seed makes the reported
# accuracies reproducible across runs.
SMOTE_SEED = 0

accuracy_allsub = []   # accuracy per held-out participant
c_matrix = []          # confusion matrix per held-out participant
predicted_all = []     # predicted topics per held-out participant
actual_all = []        # true topics per held-out participant (same trial order as predicted_all)
all_cols = np.arange(FC_bytrial.shape[1])
for sub in range(len(good_sub)):
    # split training and testing; labels and FC columns are selected with the
    # same index arrays so they stay aligned trial-by-trial
    test_sub = good_sub_id[sub]
    test_cols = np.asarray(get_range(test_sub))
    train_cols = np.setdiff1d(all_cols, test_cols)
    test_beh = beh_bytrial[test_cols]
    test_FC = FC_bytrial[:,test_cols]
    train_beh = beh_bytrial[train_cols]
    train_FC = FC_bytrial[:,train_cols]

    # okay now do some qc to the training set: keep trials with at most 1000
    # NaN FC values and a valid topic (not NaN, not code 9)
    train_ok = (np.isnan(train_FC).sum(axis=0) <= 1000) & ~np.isnan(train_beh) & (train_beh != 9)
    train_FC_good = np.ascontiguousarray(train_FC.T[train_ok])   # (trials, edges)
    train_beh_good = train_beh[train_ok]

    # delete bad trials (all FCs are nan) in testing set
    not_all_nan_trial = np.flatnonzero(~np.isnan(test_FC).all(axis=0))
    test_FC = test_FC[:,not_all_nan_trial]
    test_beh = test_beh[not_all_nan_trial]

    # select fc in training and testing set: an edge is kept only if it has no
    # NaN in any retained training trial and no NaN in any remaining test trial
    # NOTE(review): this runs before test trials with invalid topics are
    # removed, so NaNs in those trials can still exclude edges -- confirm intended.
    edge_has_nan = np.isnan(train_FC_good).any(axis=0) | np.isnan(test_FC).any(axis=1)
    selected_FC_id = np.flatnonzero(~edge_has_nan).tolist()
    train_FC_selected = train_FC_good[:,selected_FC_id]
    test_FC_selected = test_FC[selected_FC_id,:]

    # qc on test beh
    nonnan_in_test = np.flatnonzero(~(np.isnan(test_beh) | (test_beh == 9)))
    test_beh = test_beh[nonnan_in_test]
    test_FC_selected = test_FC_selected[:,nonnan_in_test]

    # now fit the model (fit() re-trains the shared clf from scratch each fold)
    X_resampled, y_resampled = SMOTE(random_state=SMOTE_SEED).fit_resample(train_FC_selected, train_beh_good) # oversample
    clf.fit(X_resampled, y_resampled)
    y_pred = clf.predict(test_FC_selected.T)

    # assess model performance
    accuracy = metrics.accuracy_score(test_beh, y_pred)
    print("Sub", str(sub+1) + '/' + str(len(good_sub)) ,accuracy)
    accuracy_allsub.append(accuracy)

    # confusion matrix
    # NOTE(review): without labels= the matrix only spans topics present in
    # this participant's true/predicted labels, so shapes can differ by fold.
    cm = confusion_matrix(test_beh, y_pred)
    c_matrix.append(cm)
    predicted_all.append(y_pred)
    actual_all.append(test_beh)

Sub 1/38 0.3333333333333333
Sub 2/38 0.17391304347826086
Sub 3/38 0.27586206896551724
Sub 4/38 0.03225806451612903
Sub 5/38 0.23333333333333334
Sub 6/38 0.2
Sub 7/38 0.3870967741935484
Sub 8/38 0.27586206896551724
Sub 9/38 0.1935483870967742
Sub 10/38 0.3
Sub 11/38 0.32
Sub 12/38 0.45161290322580644
Sub 13/38 0.3125
Sub 14/38 0.25
Sub 15/38 0.08695652173913043
Sub 16/38 0.13793103448275862
Sub 17/38 0.0967741935483871
Sub 18/38 0.1111111111111111
Sub 19/38 0.4782608695652174
Sub 20/38 0.625
Sub 21/38 0.25925925925925924
Sub 22/38 0.3793103448275862
Sub 23/38 0.4166666666666667
Sub 24/38 0.5384615384615384
Sub 25/38 0.1935483870967742
Sub 26/38 0.17391304347826086
Sub 27/38 0.5172413793103449
Sub 28/38 0.4642857142857143
Sub 29/38 0.21428571428571427
Sub 30/38 0.13043478260869565
Sub 31/38 0.5714285714285714
Sub 32/38 0.3103448275862069
Sub 33/38 0.4583333333333333
Sub 34/38 0.0
Sub 35/38 0.16666666666666666
Sub 36/38 0.26666666666666666
Sub 37/38 0.40625
Sub 38/38 0.07407407407407407


In [6]:
# Write the per-participant results of the SVC analysis to a .mat file.
svc_results = {
    'acc': accuracy_allsub,
    'confusion_matrix': c_matrix,
    'predicted': predicted_all,
    'actual': actual_all,
}
scipy.io.savemat('./results/CPMs/topic_SVC.mat', svc_results)

In [11]:
# statistical testing - null distribution generated by shuffling the topics within participants
#
# Each permutation shuffles every participant's true topics, scores the
# unchanged predictions against them, and stores the mean accuracy across
# participants.  The shuffle works on a copy: shuffling actual_all[sub] in
# place would leave the stored true labels permuted after this cell runs.
N_PERM = 10000
PERM_SEED = 0  # fixed seed so the null distribution (and p-value) is reproducible
perm_rng = np.random.default_rng(PERM_SEED)
progress_step = max(1, N_PERM // 100)

null_acc = []
for perm in range(N_PERM):
    if perm % progress_step == 0:
        print('Progress: ',100*perm/N_PERM,'%')
    acc = []
    for sub in range(len(good_sub)):
        shuffled_topics = perm_rng.permutation(actual_all[sub])
        acc.append(metrics.accuracy_score(predicted_all[sub],shuffled_topics))
    null_acc.append(np.mean(acc))

def onetail_p(real, null):
    """Print and return a one-tailed permutation p-value.

    Parameters
    ----------
    real : observed statistic (Python or numpy scalar).
    null : sequence or array of null statistics.

    Returns
    -------
    p = (1 + #{null >= real}) / (1 + number of null values).

    The null values are converted to an array first, so the comparison also
    works when `null` is a plain list and `real` is a plain Python float.
    """
    null = np.asarray(null, dtype=float).ravel()
    n_exceed = int(np.sum(null>=real))
    p = (1+n_exceed)/(1+null.size)
    print(str(n_exceed)+' among '+str(null.size)+' null has higher value than this actual prediction')
    print('p = ',p)
    return p

# Report the observed mean accuracy and its one-tailed p-value against the null.
observed_acc = np.mean(accuracy_allsub)
print('The topic classification accuracy of the SVC model is:', observed_acc)
print(' ')
onetail_p(observed_acc, null_acc);

Progress:  0.0 %
Progress:  1.0 %
Progress:  2.0 %
Progress:  3.0 %
Progress:  4.0 %
Progress:  5.0 %
Progress:  6.0 %
Progress:  7.0 %
Progress:  8.0 %
Progress:  9.0 %
Progress:  10.0 %
Progress:  11.0 %
Progress:  12.0 %
Progress:  13.0 %
Progress:  14.0 %
Progress:  15.0 %
Progress:  16.0 %
Progress:  17.0 %
Progress:  18.0 %
Progress:  19.0 %
Progress:  20.0 %
Progress:  21.0 %
Progress:  22.0 %
Progress:  23.0 %
Progress:  24.0 %
Progress:  25.0 %
Progress:  26.0 %
Progress:  27.0 %
Progress:  28.0 %
Progress:  29.0 %
Progress:  30.0 %
Progress:  31.0 %
Progress:  32.0 %
Progress:  33.0 %
Progress:  34.0 %
Progress:  35.0 %
Progress:  36.0 %
Progress:  37.0 %
Progress:  38.0 %
Progress:  39.0 %
Progress:  40.0 %
Progress:  41.0 %
Progress:  42.0 %
Progress:  43.0 %
Progress:  44.0 %
Progress:  45.0 %
Progress:  46.0 %
Progress:  47.0 %
Progress:  48.0 %
Progress:  49.0 %
Progress:  50.0 %
Progress:  51.0 %
Progress:  52.0 %
Progress:  53.0 %
Progress:  54.0 %
Progress:  55.0 %
Pr