In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import numpy as np

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA
from contrastive import CPCA	# $ pip3 install contrastive
import umap.umap_ as umap

# Location of the dFC dataset (.npy file). To run on another machine, set the
# DFC_DATASET_PATH environment variable instead of editing this cell; when it is
# not set, the original author's local path is used.
import os

my_filepath = os.environ.get(
	"DFC_DATASET_PATH",
	"/Users/kinichen/Summer_dFC/Datasets/ds003465_task-Axcpt_Time-Freq.npy",
)

In [41]:
def dim_reduction_train(X, model_class, n_components, Y=None, alpha=1,
		n_pca_components=1000):
	"""
	Fit dimension reduction pipeline on train data, and return both transformed X and the reusable pipeline.

	Parameters:
		X: array-like, shape (n_samples_train, n_features)
		model_class: class (PCA, CCA, CPCA, or UMAP); the method is selected
			from model_class.__name__
		n_components: number of components to keep on outermost layer (not PCA if
			it is used as a preprocessing step)
		Y: class labels for supervised CCA, or the background dataset for CPCA
		alpha: CPCA contrastive parameter
		n_pca_components: upper bound on the number of components kept by the
			PCA preprocessing step that runs before CCA, CPCA and UMAP. The
			value actually used is capped at min(n_samples_train, n_features),
			because PCA cannot keep more components than that.

	Returns:
		X_reduced: transformed training data
		pipeline: object to reuse to transform test data. A sklearn Pipeline
			for pure PCA; otherwise a dict holding the fitted steps.

	Raises:
		ValueError: if model_class is not one of the supported methods, or if
			Y is missing for CCA or CPCA.
	"""
	method = model_class.__name__

	# Validate the request before doing any (potentially expensive) fitting.
	if method not in ('PCA', 'CCA', 'CPCA', 'UMAP'):
		raise ValueError("Only PCA, CCA, CPCA, and UMAP are supported right now.")
	if method == 'CCA' and Y is None:
		raise ValueError("CCA requires supervised labels Y.")
	if method == 'CPCA' and Y is None:
		raise ValueError("CPCA requires background dataset Y.")

	scaler_X = StandardScaler()

	if method == 'PCA':
		# The Pipeline fits the scaler itself, so X is not scaled separately here.
		pipeline = Pipeline([
			('scaler', scaler_X),
			('pca', model_class(n_components=n_components))
		])
		X_reduced = pipeline.fit_transform(X)
		return X_reduced, pipeline

	X_scaled = scaler_X.fit_transform(X)

	def fit_pre_pca():
		# PCA preprocessing shared by CCA, CPCA and UMAP; never ask for more
		# components than the scaled training data can provide.
		n_keep = min(n_pca_components, *X_scaled.shape)
		pre_pca = PCA(n_components=n_keep)
		return pre_pca, pre_pca.fit_transform(X_scaled)

	if method == 'CCA':
		# Note: it only makes sense to use supervised CCA with labels Y=y. If
		# using Y=X_rest, the reduced subspace does not enhance task-specific
		# features, but instead returns the shared subspace between task and rest.

		# Preprocess labels into "2D" array with shape (n_samples, 1), then OneHotEncoder.
		# np.asarray lets lists and pandas Series be passed as well as arrays.
		labels_2d = np.asarray(Y).reshape(-1, 1)
		y = OneHotEncoder(sparse_output=False).fit_transform(labels_2d)

		# PCA before CCA
		pca, X_pca = fit_pre_pca()

		cca = model_class(n_components=n_components)
		X_reduced, _ = cca.fit_transform(X_pca, y)

		# Store fitted pipeline parts manually into a dictionary
		pipeline = {
			'scaler': scaler_X,
			'pca': pca,
			'cca': cca,
			# one-hot encoded training labels (not an encoder object); kept
			# just for reference if needed
			'label_encoder': y
		}

	elif method == 'CPCA':
		# The background dataset gets its own scaler
		scaler_Y = StandardScaler()
		Y_scaled = scaler_Y.fit_transform(Y)

		# Note: the CPCA class intrinsically applies PCA to reduce to 1000
		# components first, so for consistency on test data, do this explicitly
		pca, X_pca = fit_pre_pca()
		Y_pca = pca.transform(Y_scaled)	# transform Y into the same PCA feature space

		cpca = model_class(n_components=n_components)
		X_reduced = cpca.fit_transform(X_pca, Y_pca,
							alpha_selection='manual', alpha_value=alpha)

		pipeline = {
			'scaler_X': scaler_X,	# Don't store scaler_Y, as it is not needed for test data
			'pca': pca,
			'cpca': cpca,
			'alpha': alpha	# reused when transforming test data
		}

	else:  # method == 'UMAP'
		pca, X_pca = fit_pre_pca()

		umap_model = model_class(n_neighbors=50, min_dist=0.1,
							n_components=n_components, random_state=25)
		X_reduced = umap_model.fit_transform(X_pca)

		pipeline = {
			'scaler': scaler_X,
			'pca': pca,
			'umap': umap_model
		}

	return X_reduced, pipeline


In [42]:
def dim_reduction_test(X, pipeline):
	"""
	Return the transformed test data using the fitted model pipeline.

	Nothing is fitted here: every step only calls transform on objects that
	were already fitted on the training data.

	Parameters:
		X: array-like, shape (n_samples_test, n_features)
		pipeline: fitted object returned by dim_reduction_train. Either a
			sklearn Pipeline (pure PCA), or a dict of fitted steps that
			contains a 'cca', 'cpca' or 'umap' entry.

	Returns:
		X_reduced: transformed test data

	Raises:
		ValueError: if pipeline is neither a Pipeline nor a dict containing
			one of the supported model keys.
	"""
	unsupported_msg = "Only PCA, CCA, CPCA and UMAP are supported right now."

	if isinstance(pipeline, dict):
		# Pick the scaler key, the model key, and any extra keyword arguments
		# the model's transform needs. All dict pipelines share the same
		# scaler -> PCA -> model sequence.
		if 'cca' in pipeline:
			scaler_key, model_key, transform_kwargs = 'scaler', 'cca', {}
		elif 'cpca' in pipeline:
			scaler_key, model_key = 'scaler_X', 'cpca'
			# Reuse the alpha chosen at training time
			transform_kwargs = {'alpha_selection': 'manual',
								'alpha_value': pipeline['alpha']}
		elif 'umap' in pipeline:
			scaler_key, model_key, transform_kwargs = 'scaler', 'umap', {}
		else:
			raise ValueError(unsupported_msg)

		X_scaled = pipeline[scaler_key].transform(X)
		X_pca = pipeline['pca'].transform(X_scaled)
		return pipeline[model_key].transform(X_pca, **transform_kwargs)

	if isinstance(pipeline, Pipeline):  # only pure PCA used a Pipeline object
		return pipeline.transform(X)

	raise ValueError(unsupported_msg)

In [43]:
def evaluate_performance(classifier, X_train, y_train, X_test, y_test):
	"""
	Print accuracy and ROC AUC of an already-fitted classifier on the train and test splits.

	Accuracy is computed from the hard (thresholded) predictions. AUC is computed
	from the predicted probability of the positive class, taken as column 1 of
	predict_proba (1 = task). With two classes, 0.5 corresponds to a random guess
	for both metrics. Each value is rounded to 4 decimals; nothing is returned.
	"""
	splits = (
		("Train accuracy:", "Train AUC:", X_train, y_train),
		("Test accuracy:", "Test AUC:", X_test, y_test),
	)

	# Run every prediction first, so nothing is printed if predicting on
	# either split fails.
	scored = []
	for accuracy_label, auc_label, features, truth in splits:
		decisions = classifier.predict(features)	# binary
		task_prob = classifier.predict_proba(features)[:, 1]	# probabilities of task
		scored.append((accuracy_label, auc_label, truth, decisions, task_prob))

	for accuracy_label, auc_label, truth, decisions, task_prob in scored:
		print(accuracy_label, round(accuracy_score(truth, decisions), 4))
		print(auc_label, round(roc_auc_score(truth, task_prob), 4))

In [2]:
# Loading dataset for 1 task paradigm assessed by 1 method for all subjects (1 run)
# NOTE(review): allow_pickle=True unpickles Python objects stored in the file, and
# unpickling can execute arbitrary code -- only load dataset files from a trusted source.
dFC = np.load(my_filepath, allow_pickle=True)
dFC_dict = dFC.item() # extract the dictionary from np array

# Unpack the entries of the stored dictionary.
X = dFC_dict["X"]	# feature matrix passed to train_test_split in the cells below
y = dFC_dict["y"]	# labels; the cells below treat 0 as rest (background) and 1 as task
subj_label = dFC_dict["subj_label"]	# presumably one subject id per sample -- TODO confirm
method = dFC_dict["measure_name"]	# presumably the name of the dFC assessment method -- TODO confirm

In [27]:
# PCA + Logistic Regression with penalty

# First, split! Stratified on y so both splits keep the same class balance;
# the fixed random_state makes every experiment cell use the identical split.
# NOTE(review): the split is per sample, not per subject. If subj_label marks
# several samples from the same subject, consider a grouped split -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Dimensionality reduction
# Note: to be comparable, all dim reduction methods should keep the same number
# of components at the end. Only 2 are kept since supervised CCA can only keep a maximum of 2.
X_train_reduced, pipeline = dim_reduction_train(X_train, PCA, n_components=2)
X_test_reduced = dim_reduction_test(X_test, pipeline)	# transform test data for later

# Classification model training
# 'saga' is a stochastic solver, so random_state is fixed to make the fit reproducible.
classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=5000,
								random_state=42)
classifier.fit(X_train_reduced, y_train)

# Evaluation metrics
evaluate_performance(classifier, X_train_reduced, y_train, X_test_reduced, y_test)

Train accuracy: 0.5874
Train AUC: 0.571
Test accuracy: 0.6014
Test AUC: 0.5824


In [44]:
# PCA + Logistic Regression with penalty WITH 100 COMPONENTS

# First, split! Same stratified split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Dimensionality reduction
# Note: to be comparable, all dim reduction methods should keep the same number of components at the end
X_train_reduced, pipeline = dim_reduction_train(X_train, PCA, n_components=100)
X_test_reduced = dim_reduction_test(X_test, pipeline)	# transform test data for later

# Classification model training
# 'saga' is a stochastic solver, so random_state is fixed to make the fit reproducible.
classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=5000,
								random_state=42)
classifier.fit(X_train_reduced, y_train)

# Evaluation metrics
evaluate_performance(classifier, X_train_reduced, y_train, X_test_reduced, y_test)

Train accuracy: 0.6371
Train AUC: 0.6684
Test accuracy: 0.621
Test AUC: 0.6378


In [28]:
# Supervised CCA + Logistic Regression with penalty

# First, split! Same stratified split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Dimensionality reduction	(after one-hot encoding the labels, max 2 canonical components)
# Only the training labels are given to CCA; the test data is projected without labels.
X_train_reduced, pipeline = dim_reduction_train(X_train, CCA, n_components=2, Y=y_train)
X_test_reduced = dim_reduction_test(X_test, pipeline)	# transform test data for later

# Classification model training
# 'saga' is a stochastic solver, so random_state is fixed to make the fit reproducible.
classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=5000,
								random_state=42)
classifier.fit(X_train_reduced, y_train)

# Evaluation metrics
evaluate_performance(classifier, X_train_reduced, y_train, X_test_reduced, y_test)

Train accuracy: 0.7468
Train AUC: 0.8271
Test accuracy: 0.6861
Test AUC: 0.7366


In [39]:
# CPCA + Logistic Regression with penalty

# First, split! Same stratified split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Dimensionality reduction
X_train_rest = X_train[y_train == 0]  # background dataset for cPCA
X_train_reduced, pipeline = dim_reduction_train(X_train, CPCA, n_components=2,
												Y=X_train_rest, alpha=0.5)
X_test_reduced = dim_reduction_test(X_test, pipeline)   # project test data on
# learned subspace, so don't provide background Y=X_test_rest or else data leakage

# Classification model training
# 'saga' is a stochastic solver, so random_state is fixed to make the fit reproducible.
classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=5000,
								random_state=42)
classifier.fit(X_train_reduced, y_train)

# Evaluation metrics
evaluate_performance(classifier, X_train_reduced, y_train, X_test_reduced, y_test)

Train accuracy: 0.5381
Train AUC: 0.5102
Test accuracy: 0.5264
Test AUC: 0.5289


In [45]:
# CPCA + Logistic Regression with penalty WITH 100 COMPONENTS

# First, split! Same stratified split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Dimensionality reduction
X_train_rest = X_train[y_train == 0]  # background dataset for cPCA
X_train_reduced, pipeline = dim_reduction_train(X_train, CPCA, n_components=100,
												Y=X_train_rest, alpha=0.5)
X_test_reduced = dim_reduction_test(X_test, pipeline)   # project test data on
# learned subspace, so don't provide background Y=X_test_rest or else data leakage

# Classification model training
# 'saga' is a stochastic solver, so random_state is fixed to make the fit reproducible.
classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=5000,
								random_state=42)
classifier.fit(X_train_reduced, y_train)

# Evaluation metrics
evaluate_performance(classifier, X_train_reduced, y_train, X_test_reduced, y_test)

Train accuracy: 0.5625
Train AUC: 0.6293
Test accuracy: 0.5657
Test AUC: 0.5891


In [38]:
# UMAP + Logistic Regression with penalty

# First, split! Same stratified split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# Dimensionality reduction
# The UMAP model is fitted on the training data only; test data is mapped into
# the learned embedding afterwards.
X_train_reduced, pipeline = dim_reduction_train(X_train, umap.UMAP, n_components=2)
X_test_reduced = dim_reduction_test(X_test, pipeline)

# Classification model training
# 'saga' is a stochastic solver, so random_state is fixed to make the fit reproducible.
classifier = LogisticRegression(penalty='l1', solver='saga', max_iter=5000,
								random_state=42)
classifier.fit(X_train_reduced, y_train)

# Evaluation metrics
evaluate_performance(classifier, X_train_reduced, y_train, X_test_reduced, y_test)

  warn(


Train accuracy: 0.6053
Train AUC: 0.5729
Test accuracy: 0.6218
Test AUC: 0.5937
