### Packages

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import seaborn as sns
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix, accuracy_score, f1_score

XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/INK/UOA/ENGSCI700/.ENGSCI700/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <B637898E-C0C3-3F93-8C08-800EE41A7A5B> /Users/INK/UOA/ENGSCI700/.ENGSCI700/lib/python3.13/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file)"]


# Set Up

### Demographic

In [2]:
info = pd.read_excel('../data/Brain_networks/Demographic.xlsx')
info['Participant_ID'] = info['Participant_ID'].apply(lambda x: int(re.findall(r'\d+', str(x))[0]))

# Remove duplicates
info = info.drop_duplicates(subset='Participant_ID')

# Reset index (optional)
info = info.reset_index(drop=True)

# Display the result
info.head()

Unnamed: 0,Participant_ID,ADHD/NT,Gender
0,200,NT,F
1,201,ADHD,F
2,202,NT,F
3,205,NT,F
4,207,NT,F


### Define

In [19]:
condition_map = {
    'rsfMRI_HB6': 'rest',
    'flanker_events': 'task',
    'flanker_events_twitching': 'twitching'
}

significant_networks = pd.read_excel('significant_networks.xlsx')
networks = significant_networks['Feature'].tolist()
networks

['Medial-Occipital_Visual',
 'Occipital-Lateral(L)_Visual',
 'Medial-Lateral(L)_Visual',
 'Medial-Lateral(R)_Visual',
 'ACC-RPFC(L)_Salience',
 'AInsula(L)-RPFC(R)_Salience',
 'LPFC(L)-PPC(L)_FP']

# Random Forest

## Preprocess

### Visual

In [4]:
# Visual
all_visual = pd.concat([
    pd.read_excel('../data/Brain_networks/Network_CC/Visual_2021.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/Visual_2022.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/Visual_2023.xlsx')
], ignore_index=True)
all_visual['Condition'] = all_visual['Condition'].astype('category')
all_visual.head()

Unnamed: 0,Participant_ID,Sequence_name,Condition_matfile,Condition,Medial-Occipital,Medial-Lateral(L),Medial-Lateral(R),Occipital-Lateral(L),Occipital-Lateral(R),Lateral(L)-Lateral(R),Averaged_CC
0,200,rsfMRI_HB6,Condition001.mat,rest,0.18123,0.22104,0.032202,0.68723,0.4797,0.97918,0.4301
1,201,rsfMRI_HB6,Condition001.mat,rest,0.44015,0.56515,0.42759,0.848,0.88916,1.2118,0.73031
2,202,rsfMRI_HB6,Condition001.mat,rest,0.28081,0.21935,0.58884,0.51701,0.6758,0.87309,0.52582
3,205,rsfMRI_HB6,Condition001.mat,rest,0.39404,0.88638,0.51938,0.50584,0.28219,1.0813,0.61152
4,207,rsfMRI_HB6,Condition001.mat,rest,0.44657,0.32933,0.27231,0.47209,0.3813,0.68107,0.43045


In [5]:
visual_w_info = all_visual.merge(info, on='Participant_ID', how='inner')
matrix_visual = visual_w_info.drop(columns=['Condition_matfile', 'Sequence_name'], inplace=False)
matrix_visual.head()

Unnamed: 0,Participant_ID,Condition,Medial-Occipital,Medial-Lateral(L),Medial-Lateral(R),Occipital-Lateral(L),Occipital-Lateral(R),Lateral(L)-Lateral(R),Averaged_CC,ADHD/NT,Gender
0,200,rest,0.18123,0.22104,0.032202,0.68723,0.4797,0.97918,0.4301,NT,F
1,201,rest,0.44015,0.56515,0.42759,0.848,0.88916,1.2118,0.73031,ADHD,F
2,202,rest,0.28081,0.21935,0.58884,0.51701,0.6758,0.87309,0.52582,NT,F
3,205,rest,0.39404,0.88638,0.51938,0.50584,0.28219,1.0813,0.61152,NT,F
4,207,rest,0.44657,0.32933,0.27231,0.47209,0.3813,0.68107,0.43045,NT,F


### Salience

In [6]:
# Salience
all_salience = pd.concat([
    pd.read_excel('../data/Brain_networks/Network_CC/Salience_2021.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/Salience_2022.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/Salience_2023.xlsx')
], ignore_index=True)
all_salience['Condition'] = all_salience['Condition'].astype('category')
all_salience.head()

Unnamed: 0,Participant_ID,Sequence_name,Condition_matfile,Condition,ACC-AInsula(L),ACC-AInsula(R),ACC-RPFC(L),ACC-RPFC(R),ACC-SMG(L),ACC-SMG(R),...,AInsula(R)-RPFC(R),AInsula(R)-SMG(L),AInsula(R)-SMG(R),RPFC(L)-RPFC(R),RPFC(L)-SMG(L),RPFC(L)-SMG(R),RPFC(R)-SMG(L),RPFC(R)-SMG(R),SMG(L)-SMG(R),Averaged_CC
0,200,rsfMRI_HB6,Condition001.mat,rest,0.38244,0.42946,0.47293,0.68623,0.47652,0.14783,...,0.24826,0.40205,0.16082,0.51811,0.25721,0.083836,0.32207,0.010654,0.96934,0.3448
1,201,rsfMRI_HB6,Condition001.mat,rest,0.26701,0.30673,0.26035,0.47045,0.036738,0.29623,...,0.4424,0.13552,0.71314,0.79054,0.55826,0.60852,0.21384,0.58124,0.79743,0.49922
2,202,rsfMRI_HB6,Condition001.mat,rest,0.39955,0.76758,0.18989,0.30054,0.33014,0.0646,...,0.32921,0.48753,0.26006,0.42888,0.71127,0.33762,0.58176,0.95569,0.62045,0.39151
3,205,rsfMRI_HB6,Condition001.mat,rest,0.51393,0.60648,0.83598,0.76121,0.42954,-0.027161,...,0.64889,0.65663,0.22283,1.2365,0.55846,0.1853,0.59952,0.32887,0.55385,0.45836
4,207,rsfMRI_HB6,Condition001.mat,rest,0.50032,0.54461,0.35094,0.37688,0.28651,0.24603,...,0.32564,0.47198,0.62164,0.44331,0.75381,0.44949,0.38576,0.59805,0.79248,0.42589


In [7]:
salience_w_info = all_salience.merge(info, on='Participant_ID', how='inner')
matrix_salience = salience_w_info.drop(columns=['Condition_matfile', 'Sequence_name'], inplace=False)
matrix_salience.head()

Unnamed: 0,Participant_ID,Condition,ACC-AInsula(L),ACC-AInsula(R),ACC-RPFC(L),ACC-RPFC(R),ACC-SMG(L),ACC-SMG(R),AInsula(L)-AInsula(R),AInsula(L)-RPFC(L),...,AInsula(R)-SMG(R),RPFC(L)-RPFC(R),RPFC(L)-SMG(L),RPFC(L)-SMG(R),RPFC(R)-SMG(L),RPFC(R)-SMG(R),SMG(L)-SMG(R),Averaged_CC,ADHD/NT,Gender
0,200,rest,0.38244,0.42946,0.47293,0.68623,0.47652,0.14783,0.40015,0.63142,...,0.16082,0.51811,0.25721,0.083836,0.32207,0.010654,0.96934,0.3448,NT,F
1,201,rest,0.26701,0.30673,0.26035,0.47045,0.036738,0.29623,0.59892,0.81091,...,0.71314,0.79054,0.55826,0.60852,0.21384,0.58124,0.79743,0.49922,ADHD,F
2,202,rest,0.39955,0.76758,0.18989,0.30054,0.33014,0.0646,0.22255,0.55078,...,0.26006,0.42888,0.71127,0.33762,0.58176,0.95569,0.62045,0.39151,NT,F
3,205,rest,0.51393,0.60648,0.83598,0.76121,0.42954,-0.027161,0.41983,0.29928,...,0.22283,1.2365,0.55846,0.1853,0.59952,0.32887,0.55385,0.45836,NT,F
4,207,rest,0.50032,0.54461,0.35094,0.37688,0.28651,0.24603,0.63985,0.3283,...,0.62164,0.44331,0.75381,0.44949,0.38576,0.59805,0.79248,0.42589,NT,F


### FP

In [8]:
# FrontoParietal
all_fp = pd.concat([
    pd.read_excel('../data/Brain_networks/Network_CC/FP_2021.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/FP_2022.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/FP_2023.xlsx')
], ignore_index=True)
all_fp['Condition'] = all_fp['Condition'].astype('category')
all_fp.head()

Unnamed: 0,Participant_ID,Sequence_name,Condition_matfile,Condition,LPFC(L)-PPC(L),LPFC(L)-LPFC(R),LPFC(L)-PPC(R),PPC(L)-LPFC(R),PPC(L)-PPC(R),LPFC(R)-PPC(R),Averaged_CC
0,200,rsfMRI_HB6,Condition001.mat,rest,0.74405,0.31677,0.07744,0.4263,0.42652,0.98033,0.49524
1,201,rsfMRI_HB6,Condition001.mat,rest,0.80099,0.17939,0.3125,0.07402,0.6891,0.61432,0.44505
2,202,rsfMRI_HB6,Condition001.mat,rest,1.2186,0.74083,0.52707,0.73794,0.64482,1.0955,0.82746
3,205,rsfMRI_HB6,Condition001.mat,rest,0.65107,0.79147,0.15068,0.82202,0.68807,0.5438,0.60785
4,207,rsfMRI_HB6,Condition001.mat,rest,0.75497,0.39063,0.45857,0.73926,0.70244,1.0723,0.68636


In [9]:
fp_w_info = all_fp.merge(info, on='Participant_ID', how='inner')
matrix_fp = fp_w_info.drop(columns=['Condition_matfile', 'Sequence_name'], inplace=False)
matrix_fp.head()

Unnamed: 0,Participant_ID,Condition,LPFC(L)-PPC(L),LPFC(L)-LPFC(R),LPFC(L)-PPC(R),PPC(L)-LPFC(R),PPC(L)-PPC(R),LPFC(R)-PPC(R),Averaged_CC,ADHD/NT,Gender
0,200,rest,0.74405,0.31677,0.07744,0.4263,0.42652,0.98033,0.49524,NT,F
1,201,rest,0.80099,0.17939,0.3125,0.07402,0.6891,0.61432,0.44505,ADHD,F
2,202,rest,1.2186,0.74083,0.52707,0.73794,0.64482,1.0955,0.82746,NT,F
3,205,rest,0.65107,0.79147,0.15068,0.82202,0.68807,0.5438,0.60785,NT,F
4,207,rest,0.75497,0.39063,0.45857,0.73926,0.70244,1.0723,0.68636,NT,F


### Partner's

In [10]:
# Sensorimotor
all_SenMotor = pd.concat([
    pd.read_excel('../data/Brain_networks/Network_CC/SenMotor_2021.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/SenMotor_2022.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/SenMotor_2023.xlsx')
], ignore_index=True)
all_SenMotor['Condition'] = all_SenMotor['Condition'].astype('category')

SenMotor_w_info = all_SenMotor.merge(info, on='Participant_ID', how='inner')
matrix_SenMotor = SenMotor_w_info.drop(columns=['Condition_matfile', 'Sequence_name'], inplace=False)
matrix_SenMotor.head()

# DA
all_DA = pd.concat([
    pd.read_excel('../data/Brain_networks/Network_CC/DA_2021.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/DA_2022.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/DA_2023.xlsx')
], ignore_index=True)
all_DA['Condition'] = all_DA['Condition'].astype('category')

DA_w_info = all_DA.merge(info, on='Participant_ID', how='inner')
matrix_DA = DA_w_info.drop(columns=['Condition_matfile', 'Sequence_name'], inplace=False)
matrix_DA.head()

# DMN
all_DMN = pd.concat([
    pd.read_excel('../data/Brain_networks/Network_CC/DMN_2021.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/DMN_2022.xlsx'),
    pd.read_excel('../data/Brain_networks/Network_CC/DMN_2023.xlsx')
], ignore_index=True)
all_DMN['Condition'] = all_DMN['Condition'].astype('category')

DMN_w_info = all_DMN.merge(info, on='Participant_ID', how='inner')
matrix_DMN = DMN_w_info.drop(columns=['Condition_matfile', 'Sequence_name'], inplace=False)
matrix_DMN.head()

Unnamed: 0,Participant_ID,Condition,MPFC-PCC,MPFC-LP(L),MPFC-LP(R),PCC-LP(L),PCC-LP(R),LP(L)-LP(R),Averaged_CC,ADHD/NT,Gender
0,200,rest,0.32243,0.12195,0.29818,0.52458,0.83889,0.40638,0.41874,NT,F
1,201,rest,0.54773,0.30204,0.41196,0.7593,0.85822,0.7944,0.61228,ADHD,F
2,202,rest,0.60787,0.76637,0.90228,0.88364,0.90088,1.266,0.88784,NT,F
3,205,rest,0.58095,0.54134,0.645,0.86591,0.90772,1.2412,0.79702,NT,F
4,207,rest,0.47869,0.32974,0.38599,0.48359,0.51843,0.51397,0.45173,NT,F


### combine all networks

In [11]:
# Separate rest and task conditions for each network
network_list = ['visual', 'salience', 'fp', 'DMN', 'DA', 'SenMotor']
conditions = ['rest', 'task']

# Loop through the matrices and conditions
for network in network_list:
    for condition in conditions:
        globals()[f'matrix_{network}_{condition}'] = globals()[f'matrix_{network}'][globals()[f'matrix_{network}']['Condition'] == condition]

In [12]:
def add_suffix_and_merge(matrices, suffixes):
    matrices_copy = [matrix.copy() for matrix in matrices]
    for matrix, suffix in zip(matrices_copy, suffixes):
        matrix.columns = [col + suffix if col != 'Participant_ID' else col for col in matrix.columns]

    merged_df = matrices_copy[0]
    for matrix in matrices_copy[1:]:
        merged_df = merged_df.merge(matrix, on='Participant_ID')
    
    return merged_df

In [15]:
matrix_list_rest = [matrix_visual_rest, matrix_salience_rest, matrix_fp_rest, matrix_DMN_rest, matrix_DA_rest, matrix_SenMotor_rest]
matrix_list_task = [matrix_visual_task, matrix_salience_task, matrix_fp_task, matrix_DMN_task, matrix_DA_task, matrix_SenMotor_task]
suffix_list = ['_Visual', '_Salience', '_FP', '_DMN', '_DA', '_SenMotor']

merged_rest_df = add_suffix_and_merge(matrix_list_rest, suffix_list)
merged_task_df = add_suffix_and_merge(matrix_list_task, suffix_list)

In [16]:
from collections import defaultdict

for merged_df in [merged_rest_df, merged_task_df]:
    columns = merged_df.columns

    feature_dict = defaultdict(list)

    network_suffixes = ['_DA', '_DMN', '_FP', '_Salience', '_SenMotor', '_Visual']
    duplicate_features = ['ADHD/NT', 'Gender']

    for col in columns:
        matched = False
        if any(col.startswith(prefix) for prefix in duplicate_features):
            for suffix in network_suffixes:
                if col.endswith(suffix):
                    feature = col.replace(suffix, '')
                    feature_dict[feature].append(col)
                    matched=True
                    break
            if not matched:
                feature_dict[col].append(col)

    duplicate_features = {feature: cols for feature, cols in feature_dict.items() if len(cols) > 1}

    for feature, cols in duplicate_features.items():
        print(f"{feature}: {cols}")

    def validate_duplicate_features(duplicate_features, df):
        for feature, cols in duplicate_features.items():
            first_col = df[cols[0]]
            for col in cols[1:]:
                if not first_col.equals(df[col]):
                    print(f"Discrepancy found in feature '{feature}' between columns: {cols[0]} and {col}")
                    return False
        print("All duplicate features are consistent across their columns.")
        return True

    if validate_duplicate_features(duplicate_features, merged_df):
        for feature, cols in duplicate_features.items():
            merged_df[feature] = merged_df[cols[0]]
            merged_df.drop(columns=cols, inplace=True)
    merged_df.columns

ADHD/NT: ['ADHD/NT_Visual', 'ADHD/NT_Salience', 'ADHD/NT_FP', 'ADHD/NT_DMN', 'ADHD/NT_DA', 'ADHD/NT_SenMotor']
Gender: ['Gender_Visual', 'Gender_Salience', 'Gender_FP', 'Gender_DMN', 'Gender_DA', 'Gender_SenMotor']
All duplicate features are consistent across their columns.
ADHD/NT: ['ADHD/NT_Visual', 'ADHD/NT_Salience', 'ADHD/NT_FP', 'ADHD/NT_DMN', 'ADHD/NT_DA', 'ADHD/NT_SenMotor']
Gender: ['Gender_Visual', 'Gender_Salience', 'Gender_FP', 'Gender_DMN', 'Gender_DA', 'Gender_SenMotor']
All duplicate features are consistent across their columns.


## Feature extraction

In [None]:
visual_features = ['Medial-Occipital', 'Occipital-Lateral(L)', 'Medial-Lateral(L)', 'Medial-Lateral(R)', 'Lateral(L)-Lateral(R)']
salience_features = ['ACC-AInsula(L)', 'ACC-RPFC(L)', 'ACC-RPFC(R)', 'AInsula(L)-RPFC(L)', 'AInsula(L)-RPFC(R)', 'AInsula(R)-RPFC(L)', 'RPFC(L)-SMG(L)', 'RPFC(R)-SMG(L)']
fp_features = ['LPFC(L)-PPC(L)']

DMN_features = ['MPFC-PCC', 'MPFC-LP(L)', 'MPFC-LP(R)', 'LP(L)-LP(R)']

In [None]:
X_visual = matrix_visual[visual_features]
X_salience = matrix_salience[salience_features]
X_fp = matrix_fp[fp_features]

X_dmn = matrix_DMN[DMN_features]
X_dmn

In [None]:
y_visual = matrix_visual['ADHD/NT'].map({'NT': 0, 'ADHD': 1})
y_salience = matrix_salience['ADHD/NT'].map({'NT': 0, 'ADHD': 1})
y_fp = matrix_fp['ADHD/NT'].map({'NT': 0, 'ADHD': 1})

y_dmn = matrix_DMN['ADHD/NT'].map({'NT': 0, 'ADHD': 1})

## Function

In [28]:
y_rest = merged_rest_df['ADHD/NT'].map({'NT': 0, 'ADHD': 1})
y_task = merged_task_df['ADHD/NT'].map({'NT': 0, 'ADHD': 1})
X_rest = merged_rest_df.drop(columns=['Participant_ID', 'ADHD/NT', 'Gender', 'Averaged_CC_DA', 'Averaged_CC_DMN', 'Averaged_CC_FP', 'Averaged_CC_Salience', 'Averaged_CC_SenMotor', 'Averaged_CC_Visual', 'Condition_SenMotor', 'Condition_DA', 'Condition_DMN', 'Condition_FP', 'Condition_Salience', 'Condition_Visual'])
X_task = merged_task_df.drop(columns=['Participant_ID', 'ADHD/NT', 'Gender', 'Averaged_CC_DA', 'Averaged_CC_DMN', 'Averaged_CC_FP', 'Averaged_CC_Salience', 'Averaged_CC_SenMotor', 'Averaged_CC_Visual', 'Condition_SenMotor', 'Condition_DA', 'Condition_DMN', 'Condition_FP', 'Condition_Salience', 'Condition_Visual'])
X_rest

Unnamed: 0,Medial-Occipital_Visual,Medial-Lateral(L)_Visual,Medial-Lateral(R)_Visual,Occipital-Lateral(L)_Visual,Occipital-Lateral(R)_Visual,Lateral(L)-Lateral(R)_Visual,ACC-AInsula(L)_Salience,ACC-AInsula(R)_Salience,ACC-RPFC(L)_Salience,ACC-RPFC(R)_Salience,...,LP(L)-LP(R)_DMN,FEF(L)-FEF(R)_DA,FEF(L)-IPS(L)_DA,FEF(L)-IPS(R)_DA,FEF(R)-IPS(L)_DA,FEF(R)-IPS(R)_DA,IPS(L)-IPS(R)_DA,Lateral(L)-Lateral(R)_SenMotor,Lateral(L)-Superior_SenMotor,Lateral(R)-Superior_SenMotor
0,0.18123,0.22104,0.032202,0.68723,0.4797,0.97918,0.38244,0.42946,0.47293,0.68623,...,0.40638,0.46997,0.21073,0.27023,0.39122,0.74872,0.59308,0.83207,0.58554,0.24982
1,0.44015,0.56515,0.42759,0.848,0.88916,1.2118,0.26701,0.30673,0.26035,0.47045,...,0.7944,0.20693,0.16965,0.24049,0.33709,0.23263,0.77646,1.3382,0.55084,0.5672
2,0.28081,0.21935,0.58884,0.51701,0.6758,0.87309,0.39955,0.76758,0.18989,0.30054,...,1.266,0.27424,0.51956,0.39782,0.24638,0.27299,0.65969,0.99492,0.34612,0.27247
3,0.39404,0.88638,0.51938,0.50584,0.28219,1.0813,0.51393,0.60648,0.83598,0.76121,...,1.2412,0.16379,0.27223,0.32728,0.33158,0.40066,0.95249,0.99612,0.77645,0.72975
4,0.44657,0.32933,0.27231,0.47209,0.3813,0.68107,0.50032,0.54461,0.35094,0.37688,...,0.51397,0.73404,0.48094,0.17462,0.40541,0.47736,0.66752,1.1595,0.50971,0.34342
5,0.51784,0.3864,0.25965,0.86811,0.67408,1.216,1.3184,1.0003,0.55693,0.82977,...,0.95219,0.16743,-0.04308,0.040355,0.025693,-0.055387,0.83645,1.5412,0.48552,0.44717
6,0.15324,0.33297,0.26522,1.0363,0.77898,1.4428,0.56774,0.54841,0.58323,0.52019,...,0.74009,0.53515,0.15953,0.12894,-0.25259,-0.30271,1.2792,1.3617,0.29368,0.26979
7,0.75346,0.73949,0.79315,0.84036,0.54342,1.2063,0.35685,0.73981,0.66661,0.81839,...,0.89143,0.72294,0.96609,0.87177,0.91621,0.93282,1.534,0.67625,0.8242,0.64055
8,0.19626,0.67334,0.4126,0.43223,0.31156,1.2347,0.17843,0.83809,0.34126,0.50295,...,0.59072,0.71743,-0.023664,-0.092588,-0.16923,-0.19597,0.40748,1.3463,1.0138,0.89674
9,0.009248,0.60416,0.30653,0.12321,0.35136,1.0388,0.88647,0.67109,0.86904,0.60027,...,0.62938,0.63551,0.76052,0.59723,0.36263,0.69286,0.92966,1.2048,0.23007,0.13082


### rf_classifier

In [None]:
def run_rf(X, y, condition="Condition"):
    print(f"\n=== Analyzing {condition} Condition ===")
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Check if test samples exist in training set
    overlap = X_train.merge(X_test, how='inner')
    print("Number of overlapping rows between train and test sets:", len(overlap))

    # Hyperparameter grid for tuning
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5],
        'min_samples_split': [5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['sqrt', 'log2']
    }

    # GridSearchCV with RandomForestClassifier
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42, class_weight='balanced', oob_score=True),
        param_grid=param_grid,
        cv=5,
        scoring='f1_macro',
        n_jobs=-1
    )
    

    # Fit grid search
    grid_search.fit(X_train, y_train)
    print("Best parameters:", grid_search.best_params_)

    # Best model from grid search
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')

    print(f"\nTest Set Performance:")
    print(f"Accuracy:     {accuracy:.4f}")
    print(f"F1 Macro:     {f1:.4f}")
    print(f"ROC AUC:      {roc_auc:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Out-of-bag score
    print(f"\n🌲 Out-of-Bag Score (from training): {best_model.oob_score_:.4f}")

    # Cross-validated generalization score
    cv_scores = cross_val_score(best_model, X, y, cv=5, scoring='f1_macro', n_jobs=-1)
    print(f"Cross-Validated F1 Macro: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    # Feature importances
    print("\nTop 10 Feature Importances:")
    importances = best_model.feature_importances_
    if isinstance(X, pd.DataFrame):
        feature_names = X.columns
    else:
        feature_names = [f"Feature {i}" for i in range(X.shape[1])]
        
    top_indices = np.argsort(importances)[::-1][:10]
    for i in top_indices:
        print(f"{feature_names[i]}: {importances[i]:.4f}")

In [26]:
def run_ml(X_list, y_list):
   for X, y, condition in zip(X_list, y_list, ["Rest", "Task"]):
       X_filtered = X[networks]
       run_rf(X_filtered, y, condition=condition)

run_ml([X_rest, X_task], [y_rest, y_task])


=== Analyzing Rest Condition ===
Number of overlapping rows between train and test sets: 0
Best parameters: {'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100}

Test Set Performance:
Accuracy:     0.6667
F1 Macro:     0.5556
ROC AUC:      0.5625

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.25      0.33         4
           1       0.70      0.88      0.78         8

    accuracy                           0.67        12
   macro avg       0.60      0.56      0.56        12
weighted avg       0.63      0.67      0.63        12

Confusion Matrix:
[[1 3]
 [1 7]]

🌲 Out-of-Bag Score (from training): 0.5682
Cross-Validated F1 Macro: 0.5493 ± 0.1176

Top 10 Feature Importances:
Medial-Occipital_Visual: 0.2669
Occipital-Lateral(L)_Visual: 0.2258
LPFC(L)-PPC(L)_FP: 0.1658
AInsula(L)-RPFC(R)_Salience: 0.1113
Medial-Lateral(L)_Visual: 0.0769
ACC-RPFC(L)_Salience: 0.0767
Media