In [13]:
import os
import numpy as np
import pandas as pd
import scipy.io as sio
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from xgboost import XGBClassifier

%matplotlib inline
np.random.seed(42)

In [14]:
DATA_DIR = "./abide"

raw_features = []

subject_dirs = [os.path.join(DATA_DIR, d) for d in os.listdir(DATA_DIR) 
                if os.path.isdir(os.path.join(DATA_DIR, d))]

for subject_dir in subject_dirs:
    subject_id = os.path.basename(subject_dir)
    file_list = [f for f in os.listdir(subject_dir) if "AAL116_correlation_matrix.mat" in f]
    if not file_list:
        continue
    file_path = os.path.join(subject_dir, file_list[0])
    try:
        mat_data = sio.loadmat(file_path)
        corr_matrix = mat_data['data']
        raw_vector = corr_matrix.flatten()
        raw_features.append([subject_id] + raw_vector.tolist())
    except Exception as e:
        print(f"Error processing {file_path} for subject {subject_id}: {e}")

if raw_features:
    num_raw_features = len(raw_features[0]) - 1
    raw_columns = ["Subject_ID"] + [f"raw_feat_{i}" for i in range(num_raw_features)]
    raw_df = pd.DataFrame(raw_features, columns=raw_columns)
    print("Raw features (first few rows):")
    display(raw_df.head())
else:
    print("No raw features were extracted.")

Raw features (first few rows):


Unnamed: 0,Subject_ID,raw_feat_0,raw_feat_1,raw_feat_2,raw_feat_3,raw_feat_4,raw_feat_5,raw_feat_6,raw_feat_7,raw_feat_8,...,raw_feat_13446,raw_feat_13447,raw_feat_13448,raw_feat_13449,raw_feat_13450,raw_feat_13451,raw_feat_13452,raw_feat_13453,raw_feat_13454,raw_feat_13455
0,sub-patient50746,1.0,0.296519,0.186566,0.127506,-0.160403,-0.231936,0.107875,-0.017547,-0.224591,...,-0.006368,-0.207693,-0.025804,-0.265567,-0.140235,-0.208179,-0.181679,-0.095347,-0.106705,1.0
1,sub-patient51465,1.0,0.55218,-0.197868,0.127646,-0.330161,-0.318455,-0.300559,-0.102945,-0.165011,...,0.17658,0.068962,0.37773,-0.099666,-0.193038,0.04273,0.164206,0.302463,0.674022,1.0
2,sub-patient50968,1.0,0.356604,-0.181203,0.074351,-0.173837,-0.040913,-0.118115,0.107371,-0.182411,...,0.03666,0.057355,0.081297,-0.050684,-0.134433,-0.086592,-0.146643,0.044312,0.034604,1.0
3,sub-control51363,1.0,0.4996,0.111693,-0.222331,-0.050617,-0.151886,-0.075262,-0.218519,0.002364,...,-0.003771,0.175768,-0.028122,0.038801,0.106015,0.082474,-0.042229,0.022918,0.110044,1.0
4,sub-control51133,1.0,0.589132,0.269111,0.223921,-0.096538,-0.142542,0.217353,0.243499,-0.088444,...,0.216024,0.124669,-0.123923,0.04174,0.151036,0.096975,0.127788,0.172333,0.45631,1.0


In [15]:
tda_csv_path = "tda_features_overall.csv"
tda_df = pd.read_csv(tda_csv_path)
print("TDA Features (first few rows):")
display(tda_df.head())

wasserstein_csv_path = "wasserstein_features.csv"
wasserstein_df = pd.read_csv(wasserstein_csv_path)
print("Wasserstein Features (first few rows):")
display(wasserstein_df.head())

TDA Features (first few rows):


Unnamed: 0,Subject_ID,File_Name,TDA_Feature_0,TDA_Feature_1,TDA_Feature_2
0,sub-patient50746,sub-patient50746_AAL116_correlation_matrix.mat,6.794423,6.367342,7.189431
1,sub-patient51465,sub-patient51465_AAL116_correlation_matrix.mat,6.81871,6.486397,7.011656
2,sub-patient50968,sub-patient50968_AAL116_correlation_matrix.mat,6.825804,6.521837,7.65463
3,sub-control51363,sub-control51363_AAL116_correlation_matrix.mat,6.821861,6.418258,6.740007
4,sub-control51133,sub-control51133_AAL116_correlation_matrix.mat,6.797961,6.21537,6.777266


Wasserstein Features (first few rows):


Unnamed: 0,Subject_ID,Wasserstein_Distance
0,sub-patient50746,0.0
1,sub-patient51465,2.246184
2,sub-patient50968,3.970881
3,sub-control51363,2.261754
4,sub-control51133,2.060965


In [16]:
merged_df = pd.merge(raw_df, tda_df, on="Subject_ID", how="inner")
final_df = pd.merge(merged_df, wasserstein_df, on="Subject_ID", how="inner")

print("Final merged DataFrame (first few rows):")
display(final_df.head())
print("Final merged shape:", final_df.shape)


Final merged DataFrame (first few rows):


Unnamed: 0,Subject_ID,raw_feat_0,raw_feat_1,raw_feat_2,raw_feat_3,raw_feat_4,raw_feat_5,raw_feat_6,raw_feat_7,raw_feat_8,...,raw_feat_13451,raw_feat_13452,raw_feat_13453,raw_feat_13454,raw_feat_13455,File_Name,TDA_Feature_0,TDA_Feature_1,TDA_Feature_2,Wasserstein_Distance
0,sub-patient50746,1.0,0.296519,0.186566,0.127506,-0.160403,-0.231936,0.107875,-0.017547,-0.224591,...,-0.208179,-0.181679,-0.095347,-0.106705,1.0,sub-patient50746_AAL116_correlation_matrix.mat,6.794423,6.367342,7.189431,0.0
1,sub-patient51465,1.0,0.55218,-0.197868,0.127646,-0.330161,-0.318455,-0.300559,-0.102945,-0.165011,...,0.04273,0.164206,0.302463,0.674022,1.0,sub-patient51465_AAL116_correlation_matrix.mat,6.81871,6.486397,7.011656,2.246184
2,sub-patient50968,1.0,0.356604,-0.181203,0.074351,-0.173837,-0.040913,-0.118115,0.107371,-0.182411,...,-0.086592,-0.146643,0.044312,0.034604,1.0,sub-patient50968_AAL116_correlation_matrix.mat,6.825804,6.521837,7.65463,3.970881
3,sub-control51363,1.0,0.4996,0.111693,-0.222331,-0.050617,-0.151886,-0.075262,-0.218519,0.002364,...,0.082474,-0.042229,0.022918,0.110044,1.0,sub-control51363_AAL116_correlation_matrix.mat,6.821861,6.418258,6.740007,2.261754
4,sub-control51133,1.0,0.589132,0.269111,0.223921,-0.096538,-0.142542,0.217353,0.243499,-0.088444,...,0.096975,0.127788,0.172333,0.45631,1.0,sub-control51133_AAL116_correlation_matrix.mat,6.797961,6.21537,6.777266,2.060965


Final merged shape: (1025, 13462)


In [17]:
print("Columns in final_df:")
print(final_df.columns)

print("\nColumn dtypes:")
print(final_df.dtypes)

final_df["Label"] = final_df["Subject_ID"].apply(lambda sid: 0 if "control" in sid.lower() else 1)
print("Label distribution:")
print(final_df["Label"].value_counts())

excluded_cols = ["Subject_ID", "Label", "File_Name"]

feature_cols = [col for col in final_df.columns if col not in excluded_cols]

for col in feature_cols:
    if final_df[col].dtype == object:
        print(f"Excluding column '{col}' because it's not numeric.")
        feature_cols.remove(col)

X = final_df[feature_cols].values.astype(np.float32)
y = final_df["Label"].values.astype(np.int32)

Columns in final_df:
Index(['Subject_ID', 'raw_feat_0', 'raw_feat_1', 'raw_feat_2', 'raw_feat_3',
       'raw_feat_4', 'raw_feat_5', 'raw_feat_6', 'raw_feat_7', 'raw_feat_8',
       ...
       'raw_feat_13451', 'raw_feat_13452', 'raw_feat_13453', 'raw_feat_13454',
       'raw_feat_13455', 'File_Name', 'TDA_Feature_0', 'TDA_Feature_1',
       'TDA_Feature_2', 'Wasserstein_Distance'],
      dtype='object', length=13462)

Column dtypes:
Subject_ID               object
raw_feat_0              float64
raw_feat_1              float64
raw_feat_2              float64
raw_feat_3              float64
                         ...   
File_Name                object
TDA_Feature_0           float64
TDA_Feature_1           float64
TDA_Feature_2           float64
Wasserstein_Distance    float64
Length: 13462, dtype: object
Label distribution:
Label
0    537
1    488
Name: count, dtype: int64


In [20]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

n_components = 50
pca = PCA(n_components=n_components, random_state=42)
X_pca = pca.fit_transform(X_scaled)

print("Explained variance ratio (first 10 components):")
print(pca.explained_variance_ratio_[:10])
print("Cumulative explained variance:", np.sum(pca.explained_variance_ratio_))
print(f"PCA-reduced feature matrix shape: {X_pca.shape}")


Explained variance ratio (first 10 components):
[0.04869525 0.03260824 0.0283147  0.0242481  0.01810027 0.01685982
 0.01457778 0.01385128 0.01161067 0.01059385]
Cumulative explained variance: 0.41678995
PCA-reduced feature matrix shape: (1025, 50)


In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set size:", X_train.shape[0])
print("Test set size:", X_test.shape[0])


Training set size: 820
Test set size: 205


In [22]:
xgb_model = XGBClassifier(
    n_estimators=200,
    max_depth=10,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False, # Once again, newer versions will warn you for some reason
    eval_metric='logloss' 
)

xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Test Accuracy: {accuracy:.4f}\n")
print("Classification Report:")
print(classification_report(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.



XGBoost Test Accuracy: 0.6049

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.69      0.65       107
           1       0.60      0.51      0.55        98

    accuracy                           0.60       205
   macro avg       0.60      0.60      0.60       205
weighted avg       0.60      0.60      0.60       205

