In [1]:
import numpy as np 
import sys
import os
import re
def set_path(main_fold_path=None):
    """Change the working directory to the project root.

    Args:
        main_fold_path: Optional explicit project root. When given, it is used
            as-is and no platform detection happens. When omitted, a
            machine-specific default is chosen from ``sys.platform``.

    Raises:
        RuntimeError: If no path is given and the platform is neither macOS
            ('darwin') nor Linux, so no default project root is known.
        OSError: Propagated from ``os.chdir`` when the directory is missing.
    """
    if main_fold_path is None:
        if sys.platform == 'darwin':
            print("Current system is macOS")
            main_fold_path = '/Users/shanxiafeng/Documents/Project/Research/fnirs-prognosis/code/fnirs-treatment-response-prediction'
        elif sys.platform == 'linux':
            print("Current system is Ubuntu")
            main_fold_path = '/home/jy/Documents/fnirs/treatment_response/fnirs-depression-deeplearning'
        else:
            print("Current system is neither macOS nor Ubuntu")
            # Previously execution fell through to os.chdir with the variable
            # unbound, producing an UnboundLocalError that hid the real cause.
            raise RuntimeError(
                f"No default project path for platform {sys.platform!r}; "
                "pass main_fold_path explicitly."
            )
    os.chdir(main_fold_path)
    
# Switch the working directory to the project root so the relative data paths below resolve.
set_path()    


import pandas as pd 

# Locations of the sample array and its label vector, relative to the project root.
data_path = 'allData/diagnosis514/nor_hb_simple_all_1d.npy'
label_path = 'allData/diagnosis514/label.npy'

# Load both arrays; later cells pair them row by row, so their first dimensions must match.
data = np.load(data_path)
label = np.load(label_path)
# Shown as the cell result; the recorded output is ((514, 52, 375), (514,)).
data.shape, label.shape

Current system is Ubuntu


((514, 52, 375), (514,))

In [None]:
from utils.utils_mine import stratified_k_fold_cross_validation_with_holdout
import random 
import time


# Single reference seed; it also appears in good_seed below.
# NOTE(review): not referenced by any other visible cell - confirm it is still needed.
set_seed = 1720051797

# Seeds whose hold-out indices are later accumulated into `easy_subject`.
good_seed = [
    1720016193,
    1720021284,
    1720024374,
    1720026134,
    1720029377,
    1720030896,
    1720032214,
    1720040453,
    1720042018,
    1720043346,
    1720049413,
    1720049622,
    1720051797,
    1720054380,
    1720060075,
    1720062243,
    1720062474,
    1720063973,
    1720064187,
    1720066133,
    1720069130,
]

# Seeds whose hold-out indices are later accumulated into `hard_subject`.
bad_seed = [
    1720013406,
    1720013599,
    1720017965,
    1720019072,
    1720020831,
    1720023487,
    1720035688,
    1720036562,
    1720040892,
    1720041570,
    1720049203,
    1720051136,
    1720053725,
    1720057609,
    1720066567,
    1720066985,
    1720069772,
    1720072390,
    1720074997,
    1720075435,
    1720075660,
]
def shuffle_data_label(data, label, seed):
    """Shuffle samples and labels together using a seed-determined permutation.

    The permutation is drawn by ``random.shuffle`` after ``random.seed(seed)``.
    Because ``random.shuffle`` depends only on the list length, shuffling the
    index list yields the same order as shuffling zipped (row, label, index)
    tuples, without materialising every row as a Python object.

    Args:
        data: Array-like whose first axis enumerates samples.
        label: Array-like with one entry (or row) per sample.
        seed: Value passed to ``random.seed``; note this reseeds the global
            ``random`` generator, exactly as before.

    Returns:
        Tuple ``(shuffled_data, shuffled_label, indices)`` of NumPy arrays,
        where ``indices[i]`` is the original position of shuffled row ``i``.

    Raises:
        ValueError: If ``data`` and ``label`` disagree on the sample count
            (previously the longer one was silently truncated).
    """
    print(f'seed, {seed}')
    data = np.asarray(data)
    label = np.asarray(label)
    if data.shape[0] != label.shape[0]:
        raise ValueError(
            f"data has {data.shape[0]} samples but label has {label.shape[0]}"
        )

    random.seed(seed)
    indices = np.arange(data.shape[0])
    order = indices.tolist()
    random.shuffle(order)
    # Explicit dtype keeps an empty permutation integer-typed so it can index.
    indices = np.asarray(order, dtype=indices.dtype)
    return data[indices], label[indices], indices

def onehotEncode(x, num_classes=None):
    """One-hot encode a vector of non-negative integer class labels.

    Args:
        x: Array-like of class labels; values are cast to ``int`` and the
            input is flattened so each element yields one output row.
        num_classes: Optional number of output columns. When omitted it is
            inferred as ``x.max() + 1`` (0 for empty input), matching the
            previous behaviour. Pass it explicitly when a batch may lack the
            highest class, otherwise the result would be too narrow.

    Returns:
        Integer array of shape ``(x.size, num_classes)`` with a single 1 per row.

    Raises:
        ValueError: If a label is negative (it would silently wrap to the last
            columns) or does not fit in ``num_classes`` columns.
    """
    x = np.asarray(x).astype(int).ravel()
    if x.size and x.min() < 0:
        raise ValueError("onehotEncode expects non-negative class labels")

    inferred = int(x.max()) + 1 if x.size else 0
    if num_classes is None:
        num_classes = inferred
    elif num_classes < inferred:
        raise ValueError(
            f"num_classes={num_classes} is too small for label {inferred - 1}"
        )

    t = np.zeros((x.size, num_classes), dtype=int)
    t[np.arange(x.size), x] = 1
    return t

def _split_one_class(samples, sample_indices, k, num_of_k_fold, hold_out_div):
    """Cut one class into hold-out, validation-fold and training parts.

    Returns three ``(samples, original_indices)`` pairs: test, val, train.
    """
    holdout_num = samples.shape[0] // hold_out_div
    test = (samples[:holdout_num], sample_indices[:holdout_num])

    rest = samples[holdout_num:]
    rest_indices = sample_indices[holdout_num:]
    fold_size = rest.shape[0] // num_of_k_fold
    start, stop = k * fold_size, (k + 1) * fold_size

    val = (rest[start:stop], rest_indices[start:stop])
    train = (
        np.concatenate((rest[:start], rest[stop:]), axis=0),
        np.concatenate((rest_indices[:start], rest_indices[stop:]), axis=0),
    )
    return test, val, train


def _binary_targets(num_pos, num_neg, one_hot):
    """Build float32 targets: ``num_pos`` ones followed by ``num_neg`` zeros."""
    targets = np.concatenate((np.ones(num_pos), np.zeros(num_neg)), axis=0)
    if one_hot:
        # Fixed width of two columns: inferring the width from the largest
        # label present gave a 1-column array for a split without positives
        # and raised on an empty split.
        return np.eye(2, dtype='float32')[targets.astype(int)]
    return targets.astype('float32')


def stratified_k_fold_cross_validation_with_holdout(data, label, k, num_of_k_fold, adj=None, seed=42, hold_out_div=3):
    """Stratified hold-out split plus one train/validation fold of the remainder.

    Samples are shuffled with ``shuffle_data_label(data, label, seed)`` and
    separated into a positive and a negative class. Per class, the first
    ``n // hold_out_div`` shuffled samples form the hold-out test set; the rest
    is cut into folds of ``n_rest // num_of_k_fold`` samples, fold ``k`` being
    the validation set and everything else the training set.

    NOTE: this definition rebinds the name imported from ``utils.utils_mine``
    at the top of the same cell.

    Args:
        data: Sample array, first axis enumerating samples.
        label: 1-D labels (binarised as ``label > mean(label)``) or a 2-D
            one-hot matrix (binarised by ``argmax``; only classes 0 and 1 are
            kept).
        k: Index of the validation fold. It is not range-checked; samples left
            over by the integer fold size never land in folds ``0..num_of_k_fold-1``
            and therefore stay in the training set for those folds.
        num_of_k_fold: Number of folds the non-hold-out samples are cut into.
        adj: Optional array indexed by original sample position; when given,
            its train/val/test slices are returned as well.
        seed: Shuffle seed; the hold-out set depends on it but not on ``k``.
        hold_out_div: The hold-out takes ``1 / hold_out_div`` of each class.

    Returns:
        Without ``adj``: ``(X_train, Y_train, X_val, Y_val, X_test, Y_test,
        train_indices, val_indices, test_indices)``.
        With ``adj``: the same, with ``adj_train, adj_val, adj_test`` inserted
        before the index arrays. Targets are float32; they have two one-hot
        columns when ``label`` was 2-D. Index arrays refer to positions in the
        original, unshuffled input.
    """
    data, label, indices = shuffle_data_label(data, label, seed)

    one_hot = len(label.shape) > 1
    if one_hot:
        label_not_onehot = np.argmax(label, axis=1)
    else:
        label_not_onehot = (label > np.mean(label)).astype(int)

    pos_mask = label_not_onehot == 1
    neg_mask = label_not_onehot == 0

    pos_test, pos_val, pos_train = _split_one_class(
        data[pos_mask], indices[pos_mask], k, num_of_k_fold, hold_out_div)
    neg_test, neg_val, neg_train = _split_one_class(
        data[neg_mask], indices[neg_mask], k, num_of_k_fold, hold_out_div)

    # Positives always precede negatives inside every returned split.
    X_test = np.concatenate((pos_test[0], neg_test[0]), axis=0)
    test_indices = np.concatenate((pos_test[1], neg_test[1]), axis=0)
    Y_test = _binary_targets(pos_test[0].shape[0], neg_test[0].shape[0], one_hot)

    X_val = np.concatenate((pos_val[0], neg_val[0]), axis=0)
    val_indices = np.concatenate((pos_val[1], neg_val[1]), axis=0)
    Y_val = _binary_targets(pos_val[0].shape[0], neg_val[0].shape[0], one_hot)

    X_train = np.concatenate((pos_train[0], neg_train[0]), axis=0)
    train_indices = np.concatenate((pos_train[1], neg_train[1]), axis=0)
    Y_train = _binary_targets(pos_train[0].shape[0], neg_train[0].shape[0], one_hot)

    if adj is None:
        return X_train, Y_train, X_val, Y_val, X_test, Y_test, train_indices, val_indices, test_indices

    # adj is not shuffled, so it is addressed through the original indices.
    adj_train = adj[train_indices]
    adj_val = adj[val_indices]
    adj_test = adj[test_indices]
    return X_train, Y_train, X_val, Y_val, X_test, Y_test, adj_train, adj_val, adj_test, train_indices, val_indices, test_indices

def _collect_holdout_indices(data, label, seeds, num_of_k_fold=4, hold_out_div=5):
    """Concatenate the hold-out (test) indices produced by each seed."""
    collected = []
    for seed in seeds:
        # Fold 0 replaces the former fold index 4, which was outside the valid
        # 0..num_of_k_fold-1 range. Only the last element (test_indices) is
        # used, and the hold-out is cut before the fold index is applied, so
        # the collected indices are the same.
        result = stratified_k_fold_cross_validation_with_holdout(
            data, label, 0, num_of_k_fold, seed=seed, hold_out_div=hold_out_div)
        collected.extend(result[-1])
    return collected


# Original sample indices that fell into the hold-out set, pooled over all
# seeds of each group (an index appears once per seed that selected it).
easy_subject = _collect_holdout_indices(data, label, good_seed)
hard_subject = _collect_holdout_indices(data, label, bad_seed)

In [53]:
import collections
import matplotlib.pyplot as plt
import numpy as np

# A subject is reported only if it was held out for more than this many seeds.
num_count = 8

# How many seeds of each group placed a given subject in the hold-out set.
collection_easy = collections.Counter(easy_subject)
collection_hard = collections.Counter(hard_subject)

# Each entry keeps its trailing ", " so the printed text matches the
# list-literal style used before (including the dangling separator).
frequent_easy = ''.join(
    f'{subject}, '
    for subject, count in collection_easy.items()
    if count > num_count
)
print('easy_subjects = [' + frequent_easy + ']')

frequent_hard = ''.join(
    f'{subject}, '
    for subject, count in collection_hard.items()
    if count > num_count
)
# The final line is emitted without a trailing newline.
print('hard_subjects = [' + frequent_hard + ']', end='')
        

easy_subjects = [487, 121, 317, 346, 7, 170, 84, 383, ]
hard_subjects = [396, 91, 411, 334, 497, 0, 373, 201, 484, 127, ]