In [1]:
import os

import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Root of the dataset tree. Set the DATASET_BASE_PATH environment variable to
# run the notebook on another machine; when it is unset (or empty) the original
# local mount point is used, so existing behaviour is unchanged.
dataset_base_path = (
    os.environ.get("DATASET_BASE_PATH")
    or "/media/jfallmann/T9/University/master_thesis/dataset"
)

# MRI directories, all derived from a single MRI root.
mri_base_path = f"{dataset_base_path}/mri"
mri_raw_path = f"{mri_base_path}/raw"
mri_bids_path = f"{mri_base_path}/bids"
mri_fastsurfer_out = f"{mri_base_path}/processed"

# SNP and tabular directories.
snp_raw_path = f"{dataset_base_path}/snp/raw"
tables_path = f"{dataset_base_path}/tables"

In [3]:
# Load the four pre-computed arrays (MRI volumes, SNP genomes, demographics,
# diagnoses) in that order. Later cells delete rows by shared index, so the
# files are presumably row-aligned per subject -- TODO confirm upstream.
mri_data, snp_data, demographic_data, diagnosis_data = (
    np.load(npy_file)
    for npy_file in (
        f"{mri_base_path}/processed_volumes.npy",
        f"{dataset_base_path}/snp/processed/genomes.npy",
        f"{dataset_base_path}/tables/demographic_data.npy",
        f"{dataset_base_path}/tables/diagnosis_data.npy",
    )
)

In [4]:
# Rows are subjects, columns are MRI measurements. A subject whose MRI row is
# entirely zero is dropped, and the same row indices are removed from every
# other array so that all four stay aligned.
zero_rows = np.flatnonzero(~mri_data.any(axis=1))
mri_data, snp_data, demographic_data, diagnosis_data = (
    np.delete(table, zero_rows, axis=0)
    for table in (mri_data, snp_data, demographic_data, diagnosis_data)
)


In [5]:
# Sanity check: (subjects, MRI columns) remaining after the all-zero rows were dropped.
mri_data.shape

(407, 95)

In [6]:
# Feature matrix: MRI, SNP and demographic columns placed side by side,
# one row per subject.
feature_blocks = [mri_data, snp_data, demographic_data]
X = np.concatenate(feature_blocks, axis=1)

# Target vector: the first column of the diagnosis table.
Y = diagnosis_data[:, 0]

In [7]:
def fit_classifier(X, Y, test_size=0.2, n_estimators=50, random_state=42):
    """Train a random forest on a hold-out split and return its test accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    Y : array-like
        Class labels, one per row of ``X``.
    test_size : float, default 0.2
        Fraction of samples held out for evaluation.
    n_estimators : int, default 50
        Number of trees in the forest.
    random_state : int, default 42
        Seed used for both the train/test split and the forest, so repeated
        calls with the same data give the same result.

    Returns
    -------
    float
        Accuracy of the fitted forest on the held-out samples.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )

    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(X_train, Y_train)

    return accuracy_score(Y_test, clf.predict(X_test))

In [8]:
# Project the feature matrix onto its first `n_features` principal components.
# This is a linear projection, not a selection of original columns: each
# output column mixes all input features.
n_features = 10

# random_state pins the result when scikit-learn picks the randomized SVD
# solver (which its 'auto' policy may do for wide inputs); it is ignored by
# the exact solvers, so setting it is harmless either way.
pca = PCA(n_components=n_features, random_state=42)

# NOTE(review): PCA is fitted on every row before the cross-validation cells
# below run, so held-out folds influence the projection (mild leakage), and the
# inputs are not standardised, so high-variance columns dominate. Consider a
# Pipeline(StandardScaler, PCA, classifier) evaluated inside cross_val_score.
X = pca.fit_transform(X)

In [9]:
# 5-fold cross-validation over all diagnosis classes; the cell displays the
# mean of the per-fold scores (the classifier's default score, i.e. accuracy).
forest = RandomForestClassifier(n_estimators=50, random_state=42)
scores = cross_val_score(estimator=forest, X=X, y=Y, cv=5)
scores.mean()

0.503733815115929

In [10]:
# Keep only the samples whose diagnosis label is 1 or 3 (presumably CN and AD,
# going by the variable names -- confirm against the label encoding).
cn_ad_mask = np.isin(Y, (1, 3))
X_cn_ad = X[cn_ad_mask]
Y_cn_ad = Y[cn_ad_mask]

In [11]:
# 5-fold cross-validation restricted to the label-1 vs label-3 subset; the
# cell displays the mean of the per-fold scores.
forest = RandomForestClassifier(n_estimators=50, random_state=42)
scores = cross_val_score(estimator=forest, X=X_cn_ad, y=Y_cn_ad, cv=5)
scores.mean()

0.7421052631578947

In [12]:
# Keep only the samples whose diagnosis label is 2 or 3 (presumably MCI and AD,
# going by the variable names -- confirm against the label encoding).
mci_ad_mask = np.isin(Y, (2, 3))
X_mci_ad = X[mci_ad_mask]
Y_mci_ad = Y[mci_ad_mask]

In [13]:
# 5-fold cross-validation restricted to the label-2 vs label-3 subset; the
# cell displays the mean of the per-fold scores.
forest = RandomForestClassifier(n_estimators=50, random_state=42)
scores = cross_val_score(estimator=forest, X=X_mci_ad, y=Y_mci_ad, cv=5)
scores.mean()

0.7905050505050506