From f522ead6254c5bc8309601c9465f98ce6234df4b Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Wed, 6 Aug 2025 12:26:31 +0200 Subject: [PATCH 01/14] score outlier fit to 1 for extreme values --- src/ephysatlas/outliers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py index fcfc940..0de44ea 100644 --- a/src/ephysatlas/outliers.py +++ b/src/ephysatlas/outliers.py @@ -115,7 +115,12 @@ def kde_proba_distribution( n_above = 0 n_below = 0 - # TODO for extremely high value (e.g. >10 IQR), remove them, put outlier score to 1 + # For extremely high value (e.g. >10 IQR), remove them, put outlier score to 1 + val_replace = np.mean(train_data) # Use the mean of the train data so it's well within boundaries + idx_replace = np.where( (test_data > np.mean(train_data) + 10 * iqr(train_data)) | + (test_data < np.mean(train_data) - 10 * iqr(train_data)) + )[0] + test_data[idx_replace] = val_replace if np.max(test_data) > np.max(x_train): # Pad above @@ -150,6 +155,7 @@ def kde_proba_distribution( # The outlier probability is the inverse outp = 1 - y_test + outp[idx_replace] = 1 # Replace extreme outliers that were set to default value to outlier score 1 return outp, x_train, hist_train From 9405666d8ff19625af1d42d24163d5fc8ea421ea Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Wed, 6 Aug 2025 18:57:09 +0200 Subject: [PATCH 02/14] condition on N channel / PID in training set --- src/ephysatlas/outliers.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py index 0de44ea..3ea022f 100644 --- a/src/ephysatlas/outliers.py +++ b/src/ephysatlas/outliers.py @@ -160,7 +160,8 @@ def kde_proba_distribution( return outp, x_train, hist_train -def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999, min_ch=15): +def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999, + min_ch=15, n_pid=3, min_ch_compute=100): # Regions regions = np.unique(df_new[mapping + "_id"]).astype(int) # Store the features that are outlier per brain region in a dict @@ -181,17 +182,18 @@ def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999, min_ch df_train = select_series( df_base, features=[feature], acronym=None, id=region, mapping=mapping ) - # Get channel indices that are in region, keeping only feature values df_test = select_series( df_new, features=[feature], acronym=None, id=region, mapping=mapping ) - # For all channels at once, test if outside the distribution for the given features train_data = df_train.to_numpy() test_data = df_test.to_numpy() - score_out, _, _ = kde_proba_distribution(train_data, test_data) - # score_out, _, _ = detect_outlier_histV2(train_data, test_data) + # score_out = 0 if N pid or N channel too small in training set + if (len(df_train['pid'].unique()) >= n_pid) and (df_train.shape[0] >= min_ch_compute): + score_out, _, _ = kde_proba_distribution(train_data, test_data) + else: + score_out = np.zeros(test_data.shape) # Save into new column df_new_compute[feature + "_q"] = score_out df_new_compute[feature + "_extremes"] = 0 From 1d14b62829b7c6b2f5e8785f477e852f0f22b179 Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Tue, 12 Aug 2025 15:18:51 +0200 Subject: [PATCH 03/14] SVM --- src/ephysatlas/outliers.py | 108 ++++++++++++++++++++++++++++++++++++- 1 file changed, 106 insertions(+), 2 deletions(-) diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py index 3ea022f..35327aa 100644 --- a/src/ephysatlas/outliers.py +++ b/src/ephysatlas/outliers.py @@ -11,6 +11,11 @@ from ephysatlas.plots import select_series from ephysatlas.anatomy import ClassifierRegions, NEW_VOID +from sklearn.svm import OneClassSVM +from sklearn.cluster import KMeans +from scipy.signal import find_peaks +from sklearn.neighbors import KernelDensity + # from ephys_atlas.plots import BINS BINS = 50 @@ -177,7 +182,7 @@ def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999, listout = list() for feature in features: - print(f"{feature} [{region}]") + # print(f"{feature} [{region}]") # Load data for that regions df_train = select_series( df_base, features=[feature], acronym=None, id=region, mapping=mapping @@ -186,11 +191,16 @@ def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999, df_test = select_series( df_new, features=[feature], acronym=None, id=region, mapping=mapping ) + + df_pid = select_series( + df_base, features=['pid'], acronym=None, id=region, mapping=mapping + ) + # For all channels at once, test if outside the distribution for the given features train_data = df_train.to_numpy() test_data = df_test.to_numpy() # score_out = 0 if N pid or N channel too small in training set - if (len(df_train['pid'].unique()) >= n_pid) and (df_train.shape[0] >= min_ch_compute): + if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)): score_out, _, _ = kde_proba_distribution(train_data, test_data) else: score_out = np.zeros(test_data.shape) @@ -297,3 +307,97 @@ def df_add_channel_label(df, pid, one=None): # Merge to get the labels column df = df.merge(df_label, left_index=True, right_index=True) return df + +# =========================================================== +# ====== SVM ================================================ + +def detect_modality_auto_bandwidth(data, peak_height_frac=0.1, resolution=500): + """ + Detect unimodal or multimodal distribution using KDE with Silverman's bandwidth. + + Parameters + ---------- + data : array-like + Raw data points. + peak_height_frac : float, optional + Fraction of maximum density for minimum peak height. + resolution : int, optional + Number of points in the KDE evaluation grid. + + Returns + ------- + modality : str + "unimodal" or "multimodal" + peaks : array + Indices of detected peaks in x_grid. + x_grid : array + Grid of x values for plotting/debugging. + density : array + KDE density values on x_grid. + bandwidth : float + Bandwidth used for KDE (Silverman's rule). + """ + + data = np.asarray(data) + n = len(data) + sigma = np.std(data) + + # Silverman's rule of thumb + bandwidth = 1.06 * sigma * n ** (-1 / 5) + + # KDE fit + kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(data.reshape(-1, 1)) + + # Evaluate density + x_grid = np.linspace(data.min(), data.max(), resolution) + log_dens = kde.score_samples(x_grid[:, None]) + density = np.exp(log_dens) + + # Peak detection + peaks, _ = find_peaks(density, height=max(density) * peak_height_frac) + + modality = "unimodal" if len(peaks) <= 1 else "multimodal" + return modality, peaks, x_grid, density, bandwidth + + +def get_gamma_svm(X_train, f=0.4): + ''' + f : how many times larger you want gamma + ''' + modality, peaks, x_grid, density, bandwidth = detect_modality_auto_bandwidth(X_train) + num_peaks = len(peaks) + + if num_peaks <= 1 : + # Unimodal + sigma = ( iqr(X_train) / 1.349 ) # approximate standard deviation assuming normality + if sigma == 0.0: # Fall back in case IQR is 0 + sigma = np.std(X_train) / 16 + + sigma = sigma / np.sqrt(f) + + else: + # Multimodal + # Suppose you detected k peaks + k = num_peaks # from your detection method + + # Cluster the data into k groups + data_reshaped = X_train.reshape(-1, 1) + labels = KMeans(n_clusters=k, n_init=10).fit_predict(data_reshaped) + + # Compute per-cluster STD + cluster_stds = [np.std(X_train[labels == i]) for i in range(k)] + + # Use average STD for gamma + sigma = np.mean(cluster_stds) + + sigma = sigma / np.sqrt(f) + gamma = 1 / (2 * sigma ** 2) + return gamma + + +def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'): + gamma = get_gamma_svm(X_train, f=f) + model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu) + model.fit(X_train.reshape(-1, 1)) + pred_svm = model.predict(X_test.reshape(-1, 1)) + return pred_svm From dfca01bb571113860a4004cf3273f2b66d834553 Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Tue, 12 Aug 2025 15:59:49 +0200 Subject: [PATCH 04/14] outliers code --- src/ephysatlas/outliers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py index 35327aa..d3193f1 100644 --- a/src/ephysatlas/outliers.py +++ b/src/ephysatlas/outliers.py @@ -1,6 +1,5 @@ import numpy as np import pandas as pd -from sklearn.neighbors import KernelDensity from scipy import stats from scipy.interpolate import interp1d from scipy.stats import iqr @@ -401,3 +400,9 @@ def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'): model.fit(X_train.reshape(-1, 1)) pred_svm = model.predict(X_test.reshape(-1, 1)) return pred_svm + + +def generate_testset(X_train): + val_iqr = 3 * iqr(X_train) + X_test = np.linspace(min(X_train) - val_iqr, max(X_train) + val_iqr, 100).reshape(-1, 1) + return X_test \ No newline at end of file From 3185ab1de864fa391353c8c462c176fd4fb08794 Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Tue, 12 Aug 2025 18:23:12 +0200 Subject: [PATCH 05/14] score 1 pid avm --- src/ephysatlas/outliers.py | 78 +++++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py index d3193f1..41fe3a1 100644 --- a/src/ephysatlas/outliers.py +++ b/src/ephysatlas/outliers.py @@ -395,6 +395,11 @@ def get_gamma_svm(X_train, f=0.4): def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'): + # Step 0 : Filter the train data to remove large outliers + X_train = X_train[ + np.abs(X_train - np.median(X_train)) <= 5 * iqr(X_train) + ] + gamma = get_gamma_svm(X_train, f=f) model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu) model.fit(X_train.reshape(-1, 1)) @@ -405,4 +410,75 @@ def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'): def generate_testset(X_train): val_iqr = 3 * iqr(X_train) X_test = np.linspace(min(X_train) - val_iqr, max(X_train) + val_iqr, 100).reshape(-1, 1) - return X_test \ No newline at end of file + return X_test + + +def score_svm_1pid(df_base, df_new, features, mapping, + min_ch=14, n_pid=3, min_ch_compute=50): + # Regions + regions = np.unique(df_new[mapping + "_id"]).astype(int) + # Store the features that are outlier per brain region in a dict + dictout = dict((el, list()) for el in regions.tolist()) + dictout["mapping"] = mapping + dictout["features"] = features + + for count, region in tqdm.tqdm(enumerate(regions), total=len(regions)): + # Get channel indices that are in region, but keeping all info besides features + idx_reg = np.where(df_new[mapping + "_id"] == region) + df_new_compute = df_new.iloc[idx_reg].copy() + df_new_compute["has_outliers"] = False + + listout = list() + for feature in features: + # print(f"{feature} [{region}]") + # Load data for that regions + df_train = select_series( + df_base, features=[feature], acronym=None, id=region, mapping=mapping + ) + # Get channel indices that are in region, keeping only feature values + df_test = select_series( + df_new, features=[feature], acronym=None, id=region, mapping=mapping + ) + + df_pid = select_series( + df_base, features=['pid'], acronym=None, id=region, mapping=mapping + ) + + # For all channels at once, test if outside the distribution for the given features + train_data = df_train.to_numpy() + test_data = df_test.to_numpy() + + print(f'Train N: {len(train_data)}, Test N: {len(test_data)}') + + # score_out = 0 if N pid or N channel too small in training set + if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)): + score_out = outlier_score_svm(train_data, test_data) + else: + score_out = np.zeros(test_data.shape) + # Save into new column + df_new_compute[feature + "_extremes"] = score_out + # A region is assigned as having outliers if more than N minimum channels are outliers + has_outliers = sum(df_new_compute[feature + "_extremes"]) > np.floor(min_ch) + if has_outliers: + listout.append(feature) + if sum( + df_new_compute["has_outliers"] == 0 + ): # Reassign only if entirely False + df_new_compute["has_outliers"] = True + # Save appended list of feature in dict + dictout[region] = listout + + # Concatenate dataframes + if count == 0: + df_save = df_new_compute.copy() + else: + df_save = pd.concat([df_save, df_new_compute]) + + if df_save["has_outliers"].sum() > 0: + has_outlier = True + else: + has_outlier = False + + # Resort by channel + df_save = df_save.sort_values(by=["channel"]) + return df_save, dictout, has_outlier From 04f40b8e37cf3442dd2653dc14f672e2babc3a37 Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Thu, 14 Aug 2025 17:01:29 +0200 Subject: [PATCH 06/14] outlier SVM --- src/ephysatlas/outliers.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py index 41fe3a1..91496f3 100644 --- a/src/ephysatlas/outliers.py +++ b/src/ephysatlas/outliers.py @@ -402,7 +402,7 @@ def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'): gamma = get_gamma_svm(X_train, f=f) model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu) - model.fit(X_train.reshape(-1, 1)) + model.fit(X_train.reshape(-1, 1)) # This is slow for high N ; e.g. 29 seconds for 69k samples pred_svm = model.predict(X_test.reshape(-1, 1)) return pred_svm @@ -414,7 +414,7 @@ def generate_testset(X_train): def score_svm_1pid(df_base, df_new, features, mapping, - min_ch=14, n_pid=3, min_ch_compute=50): + min_ch_outlier=20, n_pid=3, min_ch_compute=100, max_ch_compute = 3000, p_thresh_kde=0.98): # Regions regions = np.unique(df_new[mapping + "_id"]).astype(int) # Store the features that are outlier per brain region in a dict @@ -450,15 +450,25 @@ def score_svm_1pid(df_base, df_new, features, mapping, print(f'Train N: {len(train_data)}, Test N: {len(test_data)}') + # Outlier → 1 + # Inlier → 0 + # score_out = 0 if N pid or N channel too small in training set if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)): - score_out = outlier_score_svm(train_data, test_data) + score_svm = outlier_score_svm(train_data, test_data) + # map the OneClassSVM output from {1, -1} into {0, 1} + score_out = np.where(score_svm == -1, 1, 0) + elif len(train_data) > max_ch_compute: # Apply KDE method as SVM model.fit is too slow for high N + scoreq_out, _, _ = kde_proba_distribution(train_data, test_data) # This is a score to be thresholded + scoreq_out = scoreq_out.squeeze() # TODO this should not be necessary, place in kde_proba_distribution + score_out = scoreq_out > p_thresh_kde + score_out = score_out.astype(int) else: score_out = np.zeros(test_data.shape) # Save into new column df_new_compute[feature + "_extremes"] = score_out # A region is assigned as having outliers if more than N minimum channels are outliers - has_outliers = sum(df_new_compute[feature + "_extremes"]) > np.floor(min_ch) + has_outliers = sum(df_new_compute[feature + "_extremes"]) > np.floor(min_ch_outlier) if has_outliers: listout.append(feature) if sum( From d00d9620f8dc524545a733aad3bbc3c06cf5b250 Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Thu, 14 Aug 2025 17:08:51 +0200 Subject: [PATCH 07/14] logic size --- src/ephysatlas/outliers.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py index 91496f3..871be7c 100644 --- a/src/ephysatlas/outliers.py +++ b/src/ephysatlas/outliers.py @@ -455,14 +455,17 @@ def score_svm_1pid(df_base, df_new, features, mapping, # score_out = 0 if N pid or N channel too small in training set if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)): - score_svm = outlier_score_svm(train_data, test_data) - # map the OneClassSVM output from {1, -1} into {0, 1} - score_out = np.where(score_svm == -1, 1, 0) - elif len(train_data) > max_ch_compute: # Apply KDE method as SVM model.fit is too slow for high N - scoreq_out, _, _ = kde_proba_distribution(train_data, test_data) # This is a score to be thresholded - scoreq_out = scoreq_out.squeeze() # TODO this should not be necessary, place in kde_proba_distribution - score_out = scoreq_out > p_thresh_kde - score_out = score_out.astype(int) + if len(train_data) > max_ch_compute: # Apply KDE method as SVM model.fit is too slow for high N + scoreq_out, _, _ = kde_proba_distribution(train_data, + test_data) # This is a score to be thresholded + scoreq_out = scoreq_out.squeeze() # TODO this should not be necessary, place in kde_proba_distribution + score_out = scoreq_out > p_thresh_kde + score_out = score_out.astype(int) + + else: + score_svm = outlier_score_svm(train_data, test_data) + # map the OneClassSVM output from {1, -1} into {0, 1} + score_out = np.where(score_svm == -1, 1, 0) else: score_out = np.zeros(test_data.shape) # Save into new column From 615e14ef848fa6bd5931489054ba77030c176a08 Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Fri, 15 Aug 2025 17:47:39 +0200 Subject: [PATCH 08/14] making test for log transform, ATM not passing --- tests/test_outliers.py | 111 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 tests/test_outliers.py diff --git a/tests/test_outliers.py b/tests/test_outliers.py new file mode 100644 index 0000000..1b86baa --- /dev/null +++ b/tests/test_outliers.py @@ -0,0 +1,111 @@ +from ephysatlas import outliers +import numpy as np +import unittest + + +class TestOutliersKDEProba(unittest.TestCase): + + def test_usage(self): + # Set random seed for reproducibility + np.random.seed(42) # TODO this does not seem to work + # Generate train data from a normal distribution + train_data = np.random.normal(loc=-4.5, scale=1.2, size=2000) + # Generate test data from a normal distribution + test_data = np.random.normal(loc=-3.5, scale=0.4, size=60) + # Compute score + score, x, hist = outliers.kde_proba_distribution(train_data, test_data) + assert score.shape[0] == test_data.shape[0] + + score_out = np.array([0.94909745, 0.95985259, 0.94758700, 0.95592986, 0.93989972, + 0.96315955, 0.96212788, 0.94726918, 0.96550552, 0.96921661, + 0.94203708, 0.94796957, 0.94788689, 0.94578141, 0.96682835, + 0.95866918, 0.95697261, 0.94013844, 0.96537497, 0.94521747, + 0.96241355, 0.96347127, 0.97713305, 0.94508643, 0.94181560, + 0.96301632, 0.97997165, 0.97587523, 0.96314187, 0.95152157, + 0.96633449, 0.94571803, 0.96450351, 0.96466301, 0.97192622, + 0.98534690, 0.96404361, 0.95123884, 0.96199762, 0.94007265, + 0.94884578, 0.95349810, 0.95104398, 0.96286312, 0.94720050, + 0.97326999, 0.96212302, 0.96207678, 0.95545146, 0.96287692, + 0.96673165, 0.94662449, 0.94931726, 0.95603152, 0.94501664, + 0.94724124, 0.95189154, 0.96782068, 0.96339920, 0.96306370]) + + np.testing.assert_almost_equal(score, score_out) + + def test_bimodal(self): + # Check that a sample selected in between two Gaussian distributions is detected as an outlier + # Generate train data from 2 normal distributions + train_data = np.concatenate(( + np.random.normal(loc=-4.5, scale=1.2, size=2000), + np.random.normal(loc=17.5, scale=2.2, size=1000) + )) + + # Generate test data from a normal distribution + test_data = np.random.normal(loc=-3.5, scale=0.4, size=60) + # Add some clear outliers + test_data = np.concatenate((test_data, np.array([5.4, 6.4, 0, -9.0]))) + + # Compute score + score, _, _ = outliers.kde_proba_distribution(train_data, test_data) + assert score[-1] == 1 + + def test_pad_span(self): + # If taking a test value very far out, the train histogram will have more padded sample points + # We do not want this to influence the mean result of the score. Test to check the values don't change + + # Generate train data from a normal distribution + train_data = np.random.normal(loc=-4.5, scale=1.2, size=2000) + + # Test some clear outliers + test_data1 = np.array([-9.0, 3.3, -5.2, -7.8]) + test_data2 = np.array([-9.0, 3.3, -5.2, -1000]) + + # Compute score + score1, _, _ = outliers.kde_proba_distribution(train_data, test_data1) + score2, _, _ = outliers.kde_proba_distribution(train_data, test_data2) + + for i_sample in range(0, 3): + assert score1[i_sample] == score2[i_sample] + + + def test_log_transform(self): + # We want to check that doing twice a log/exp transform onto a Gaussian give the same values + + # Gaussian + # Generate train data from a normal distribution + train_data = np.random.normal(loc=-4.5, scale=1.2, size=2000) + # Generate test data linearly around train set + val_iqr = 3 * np.std(train_data) + test_data = np.linspace(min(train_data) - val_iqr, max(train_data) + val_iqr, 100).reshape(-1, 1) + # Compute score + score, x, hist = outliers.kde_proba_distribution(train_data, test_data) + + # Log normal + train_data_log = np.exp(train_data) + test_data_log = np.exp(test_data) + + # Compute score + score_log, _, _ = outliers.kde_proba_distribution(train_data_log, test_data_log) + # TODO this is different and the below does not pass! + np.testing.assert_almost_equal(np.exp(test_data), test_data_log) + + plot_debug = True + # Plot for debug + if plot_debug: + import matplotlib.pyplot as plt + from ephysatlas.plots import plot_histogram + + fig, axs = plt.subplots(1, 3) + axs[0].plot(test_data) + axs[1].plot(np.exp(test_data)) + axs[2].plot(test_data_log) + + fig, axs = plt.subplots(1, 1) + plot_histogram(train_data, ax=axs, normalise=True) + plt.plot(test_data, -1*(score-1), 'k.-') + plt.plot(np.log(test_data_log), -1 * (score_log - 1), 'b.-') + + fig, axs = plt.subplots(1, 1) + plot_histogram(train_data_log, ax=axs, normalise=True) + plt.plot(test_data_log, -1*(score_log-1), 'k.-') + + From fd18815fa1f57fd74afb5c8397fedd824d776532 Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Fri, 15 Aug 2025 17:56:06 +0200 Subject: [PATCH 09/14] copy to enforce variable local --- src/ephysatlas/outliers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py index 871be7c..6d837e1 100644 --- a/src/ephysatlas/outliers.py +++ b/src/ephysatlas/outliers.py @@ -81,6 +81,10 @@ def kde_proba_distribution( - x_train: (E, ) numpy array, x values of the histogram formed using training dataset - hist_train: (E, ) numpy array, y values of the histogram formed using training dataset """ + # Do this to force the variables to be local to function + test_data = test_data.copy() + train_data = train_data.copy() + if len(train_data) > n_min_sample_train: # Step 0 : Filter the train data to remove large outliers train_data = train_data[ From 02e0febbe7a0c4d05fdaa2c1c4a4ec136a29b5da Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Mon, 18 Aug 2025 15:08:06 +0200 Subject: [PATCH 10/14] outliers changes for KS test metrics --- src/ephysatlas/outliers.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py index 6d837e1..e4b9c0f 100644 --- a/src/ephysatlas/outliers.py +++ b/src/ephysatlas/outliers.py @@ -206,19 +206,16 @@ def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999, if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)): score_out, _, _ = kde_proba_distribution(train_data, test_data) else: - score_out = np.zeros(test_data.shape) + score_out = -1 * np.ones(test_data.shape) # Assign value -1 when cannot compute # Save into new column df_new_compute[feature + "_q"] = score_out df_new_compute[feature + "_extremes"] = 0 df_new_compute.loc[ df_new_compute[feature + "_q"] > p_thresh, feature + "_extremes" ] = 1 - # A region is assigned as having outliers if more than half its channels are outliers - # Condition on N minimum channel. - has_outliers = sum(df_new_compute[feature + "_extremes"]) > np.floor( - len(test_data) / 2 - ) - if len(test_data) >= min_ch and has_outliers: + # A region is assigned as having outliers if N minimum channel are outliers. + has_outliers = sum(df_new_compute[feature + "_extremes"]) >= min_ch + if has_outliers: listout.append(feature) if sum( df_new_compute["has_outliers"] == 0 From efa0636ea351aac6fa1b894c2b104ab77769abc2 Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Tue, 19 Aug 2025 12:14:07 +0200 Subject: [PATCH 11/14] svm score --- src/ephysatlas/outliers.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py index e4b9c0f..30a07b5 100644 --- a/src/ephysatlas/outliers.py +++ b/src/ephysatlas/outliers.py @@ -395,7 +395,7 @@ def get_gamma_svm(X_train, f=0.4): return gamma -def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'): +def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf', output='prediction'): # Step 0 : Filter the train data to remove large outliers X_train = X_train[ np.abs(X_train - np.median(X_train)) <= 5 * iqr(X_train) @@ -404,8 +404,22 @@ def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'): gamma = get_gamma_svm(X_train, f=f) model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu) model.fit(X_train.reshape(-1, 1)) # This is slow for high N ; e.g. 29 seconds for 69k samples - pred_svm = model.predict(X_test.reshape(-1, 1)) - return pred_svm + + if output == 'prediction': + # Let the model decide what is the predicted output, score is set to 0 + pred_svm = model.predict(X_test.reshape(-1, 1)) + score_test = np.zeros(np.shape(pred_svm)) + elif output == 'score': + # Outputting a score instead of directly the SVM prediction + scores_train = model.decision_function(X_train.reshape(-1, 1)) + scores_test = model.decision_function(X_test.reshape(-1, 1)) + threshold = np.percentile(scores_train, 100 * nu) # nu fraction as outliers + pred_svm = np.where(scores_test >= threshold, 1, -1) + + # map the OneClassSVM output from {1, -1} into {0, 1} + pred_svm_01 = np.where(pred_svm == -1, 1, 0) + + return pred_svm_01, score_test def generate_testset(X_train): @@ -449,7 +463,7 @@ def score_svm_1pid(df_base, df_new, features, mapping, train_data = df_train.to_numpy() test_data = df_test.to_numpy() - print(f'Train N: {len(train_data)}, Test N: {len(test_data)}') + # print(f'Train N: {len(train_data)}, Test N: {len(test_data)}') # Outlier → 1 # Inlier → 0 @@ -457,16 +471,13 @@ def score_svm_1pid(df_base, df_new, features, mapping, # score_out = 0 if N pid or N channel too small in training set if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)): if len(train_data) > max_ch_compute: # Apply KDE method as SVM model.fit is too slow for high N - scoreq_out, _, _ = kde_proba_distribution(train_data, - test_data) # This is a score to be thresholded + scoreq_out, _, _ = kde_proba_distribution(train_data, test_data) # This is a score to be thresholded scoreq_out = scoreq_out.squeeze() # TODO this should not be necessary, place in kde_proba_distribution score_out = scoreq_out > p_thresh_kde score_out = score_out.astype(int) else: - score_svm = outlier_score_svm(train_data, test_data) - # map the OneClassSVM output from {1, -1} into {0, 1} - score_out = np.where(score_svm == -1, 1, 0) + pred_svm, _ = outlier_score_svm(train_data, test_data) else: score_out = np.zeros(test_data.shape) # Save into new column From 0fd591c91feafc24bc15cc879e83688ec5959b00 Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Tue, 26 Aug 2025 16:51:25 +0200 Subject: [PATCH 12/14] decoder wip --- src/ephysatlas/autism_decoding.py | 208 +++++++++++++++++++++++++++++ src/ephysatlas/regionclassifier.py | 2 +- 2 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 src/ephysatlas/autism_decoding.py diff --git a/src/ephysatlas/autism_decoding.py b/src/ephysatlas/autism_decoding.py new file mode 100644 index 0000000..3d307b1 --- /dev/null +++ b/src/ephysatlas/autism_decoding.py @@ -0,0 +1,208 @@ + +# Get autism data + +import numpy as np +import pandas as pd +from pathlib import Path +from one.api import ONE +import ephysatlas.data +import ephysatlas.aggregation +from ephysatlas.features import voltage_features_set +from ephysatlas.features import EphysTransformer +from ephysatlas.plots import plot_histogram, select_series +from ephysatlas.anatomy import ClassifierRegions, NEW_VOID + +BASE_PATH = Path('/Users/gaellechapuis/Documents/Work/EphysAtlas') + +def download_baseline_data(local_data_path=BASE_PATH, label='2025_W28', one=None): + if one is None: one = ONE(base_url='https://alyx.internationalbrainlab.org', mode='remote') + download_path = ephysatlas.data.download_tables(local_data_path, label=label, one=one) + return download_path + +def get_autism_baseline_data(path_autism, path_baseline, + APPLY_DENOISING = False, APPLY_TRANSFORMER = True, + strict = True, load_denoised = False, + features=None, br=None): + + # This function assumes the data is already downloaded + + if br is None: + br = ClassifierRegions() + br.add_new_region(NEW_VOID) + + if features is None: features = voltage_features_set() + if not load_denoised: strict = False + + # ---- BASELINE DATA ---- + df_voltage = ephysatlas.data.read_features_from_disk(path_baseline, + strict=strict, + load_denoised=load_denoised) + + # ---- AUTISM DATA ---- + df_autism = ephysatlas.data.read_features_from_disk(path_autism, + strict=strict, + load_denoised=load_denoised) + # Apply denoising or transform + if not load_denoised: + if APPLY_DENOISING: # This applies the transformer automatically + # Apply denoising without bad channel interpolation + df_voltage['channel_labels'] = 0 + df_voltage = ephysatlas.aggregation.denoise_raw_features_data(df_voltage) + + df_autism['channel_labels'] = 0 + df_autism = ephysatlas.aggregation.denoise_raw_features_data(df_autism) + + elif APPLY_TRANSFORMER: # Applies transformer without denoising first + df_voltage = EphysTransformer().fit_transform(df_voltage) + df_autism = EphysTransformer().fit_transform(df_autism) + + df_voltage = df_voltage.reset_index().dropna() + df_autism = df_autism.reset_index().dropna() + + # -- Check that features are in df columns + setfeature = set(df_voltage.columns).intersection(set(features)) + setfeature.discard("channel_labels") # Remove channel labels as a feature + features = sorted(list(setfeature)) + + return df_autism, df_voltage, features + + +# ==== +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import Pipeline +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import LeaveOneGroupOut +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score + +def pid_cv_region_summary(df_baseline, df_autism, feature_cols, model_type="log_reg"): + """ + PID-wise cross-validation for a single brain region, with region-level aggregation. + + Parameters + ---------- + df_baseline : pd.DataFrame + Baseline data for this region + df_autism : pd.DataFrame + Autism data for this region + feature_cols : list + Features to use + model_type : str + "log_reg" (default) or "random_forest" + + Returns + ------- + pid_metrics : pd.DataFrame + Per-PID metrics: accuracy, precision, recall, f1 + region_summary : pd.Series + Region-level mean metrics aggregated over autism PIDs + final_model : fitted sklearn Pipeline on all data + """ + # Label datasets + df_baseline = df_baseline.copy() + df_autism = df_autism.copy() + df_baseline["label"] = 0 + df_autism["label"] = 1 + + # Combine data for this region + df_region = pd.concat([df_baseline, df_autism], ignore_index=True) + X, y = get_set_classifier_xy(df_region, feature_cols) + # Select classifier + if model_type == "log_reg": + clf = LogisticRegression(max_iter=500) + elif model_type == "random_forest": + clf = RandomForestClassifier(n_estimators=200, random_state=42) + else: + raise ValueError(f"Unknown model_type: {model_type}") + + # Do cross val + autism_pids = df_autism["pid"].unique() + pid_metrics = cross_val(df_region, feature_cols, clf, cv_pids=autism_pids) + weighted_recall = aggregate_pid_scores(pid_metrics, min_channels=20) + + # Train final model on all data + final_pipe = Pipeline([ + ("scaler", StandardScaler()), + ("clf", clf) + ]) + final_pipe.fit(X, y) + + feature_importances = pd.DataFrame({ + "feature": feature_cols, + "coef": clf.coef_.flatten(), + "abs_coef": np.abs(clf.coef_.flatten()) + }).sort_values("abs_coef", ascending=False) + + return pid_metrics, weighted_recall, feature_importances + + + +def get_set_classifier_xy(df_region, feature_cols): + X = df_region[feature_cols].to_numpy() + y = df_region["label"].to_numpy() + return X, y + + + +## +def cross_val(df_region, feature_cols, clf, cv_pids=None): + X, y = get_set_classifier_xy(df_region, feature_cols) + + if cv_pids is None: + cv_pids = df_region["pid"] + + logo = LeaveOneGroupOut() + pid_rows = [] + + # Cross-val each PID + groups = df_region["pid"].to_numpy() # PID grouping + for train_idx, test_idx in logo.split(df_region, y, groups=groups): + test_pid = groups[test_idx][0] + # Only leave out the PIDs to do cross-validation on + if test_pid not in cv_pids: + continue + + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + + pipe = Pipeline([ + ("scaler", StandardScaler()), + ("clf", clf) + ]) + pipe.fit(X_train, y_train) + y_pred = pipe.predict(X_test) + + # Compute per-PID metrics + + pid_rows.append({ + "pid": test_pid, + "n_channel" : len(test_idx), + "accuracy": accuracy_score(y_test, y_pred), + # These will be set to 0 if only autism data cross-val + "precision_0": precision_score(y_test, y_pred, pos_label=0, zero_division=0), + "recall_0": recall_score(y_test, y_pred, pos_label=0, zero_division=0), + "f1_0": f1_score(y_test, y_pred, pos_label=0, zero_division=0), + # These will represent autism data as label =1 + "precision_1": precision_score(y_test, y_pred, pos_label=1, zero_division=0), + "recall_1": recall_score(y_test, y_pred, pos_label=1, zero_division=0), + "f1_1": f1_score(y_test, y_pred, pos_label=1, zero_division=0) + }) + + pid_metrics = pd.DataFrame(pid_rows) + return pid_metrics + + +def aggregate_pid_scores(pid_metrics, min_channels=20, min_pid=3): + """ + Aggregate per-PID recall scores to region-level score. + """ + # Keep only PIDs with enough channels + pid_metrics_ch = pid_metrics[pid_metrics["n_channel"] >= min_channels] + + if pid_metrics_ch.empty or len(pid_metrics_ch) Date: Tue, 9 Sep 2025 11:06:59 +0200 Subject: [PATCH 13/14] get dataframe --- src/ephysatlas/autism_decoding.py | 62 +++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 3 deletions(-) diff --git a/src/ephysatlas/autism_decoding.py b/src/ephysatlas/autism_decoding.py index 3d307b1..e5a8e05 100644 --- a/src/ephysatlas/autism_decoding.py +++ b/src/ephysatlas/autism_decoding.py @@ -14,6 +14,39 @@ BASE_PATH = Path('/Users/gaellechapuis/Documents/Work/EphysAtlas') +def remove_void_root(df_voltage, mapping, br, list_acronyms=None): + if list_acronyms is None: + list_acronyms = ['void', 'root', 'void_fluid'] # Todo remove void liquid + vect_void_root = br.acronym2id(list_acronyms) + df_voltage = df_voltage[~df_voltage[mapping + "_id"].isin(vect_void_root)] + return df_voltage + + +def set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping): + set_common = set(id_region_voltage).intersection(set(id_region_autism)) + df_voltage = df_voltage[df_voltage[mapping + "_id"].isin(set_common)] + df_autism = df_autism[df_autism[mapping + "_id"].isin(set_common)] + return df_voltage, df_autism + +def get_br_id_min_pid(df, mapping, min_pid): + df_pid = df[[mapping+'_id', 'pid']].copy() + df_pid = df_pid.drop_duplicates() + gr_reg = df_pid.groupby([mapping+'_id']).aggregate('count') + id_region = gr_reg[gr_reg['pid']>=min_pid].index + # Keeps rows with brain region col values + filtered_df = df[df[mapping+'_id'].isin(id_region)] + return filtered_df, id_region + + +def get_br_id_min_channel(df, mapping, min_channel): + df_channel = df[[mapping+'_id', 'pid', 'channel']].copy() + df_channel = df_channel.drop_duplicates() # This should not change the df + gr_reg = df_channel.groupby([mapping+'_id']).aggregate('count') + id_region = gr_reg[gr_reg['channel']>=min_channel].index + # Keeps rows with brain region col values + filtered_df = df[df[mapping+'_id'].isin(id_region)] + return filtered_df, id_region + def download_baseline_data(local_data_path=BASE_PATH, label='2025_W28', one=None): if one is None: one = ONE(base_url='https://alyx.internationalbrainlab.org', mode='remote') download_path = ephysatlas.data.download_tables(local_data_path, label=label, one=one) @@ -22,7 +55,9 @@ def download_baseline_data(local_data_path=BASE_PATH, label='2025_W28', one=None def get_autism_baseline_data(path_autism, path_baseline, APPLY_DENOISING = False, APPLY_TRANSFORMER = True, strict = True, load_denoised = False, - features=None, br=None): + features=None, br=None, + min_pid=5, min_channel=200, mapping='Beryl', + remove_voidroot=True): # This function assumes the data is already downloaded @@ -59,6 +94,27 @@ def get_autism_baseline_data(path_autism, path_baseline, df_voltage = df_voltage.reset_index().dropna() df_autism = df_autism.reset_index().dropna() + # Remove bad channels ; keep only good channels with label 0 + df_voltage = df_voltage[df_voltage["channel_labels"] == 0] + df_autism = df_autism[df_autism["channel_labels"] == 0] + + # Remove void and root + if remove_voidroot: + df_voltage = remove_void_root(df_voltage, mapping, br) + df_autism = remove_void_root(df_autism, mapping, br) + + # Check per region that there is min number of PID + df_voltage, id_region_voltage = get_br_id_min_pid(df_voltage, mapping, min_pid) + df_autism, id_region_autism = get_br_id_min_pid(df_autism, mapping, min_pid) + # Take union of region remaining in Autism / EA + df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping) + + # Check per region that there is at least number channels + df_voltage, id_region_voltage = get_br_id_min_channel(df_voltage, mapping, min_channel) + df_autism, id_region_autism = get_br_id_min_channel(df_autism, mapping, min_channel) + # Take union of region remaining in Autism / EA + df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping) + # -- Check that features are in df columns setfeature = set(df_voltage.columns).intersection(set(features)) setfeature.discard("channel_labels") # Remove channel labels as a feature @@ -137,9 +193,9 @@ def pid_cv_region_summary(df_baseline, df_autism, feature_cols, model_type="log_ -def get_set_classifier_xy(df_region, feature_cols): +def get_set_classifier_xy(df_region, feature_cols, label_col="label"): X = df_region[feature_cols].to_numpy() - y = df_region["label"].to_numpy() + y = df_region[label_col].to_numpy() return X, y From f0ca5cbf31fe271664e14c998640878e5a40ac3d Mon Sep 17 00:00:00 2001 From: GaelleChapuis Date: Thu, 11 Sep 2025 16:25:13 +0200 Subject: [PATCH 14/14] decoder binary --- src/ephysatlas/autism_decoding.py | 252 ++++++++++++++++++++++++++---- 1 file changed, 223 insertions(+), 29 deletions(-) diff --git a/src/ephysatlas/autism_decoding.py b/src/ephysatlas/autism_decoding.py index e5a8e05..8db6968 100644 --- a/src/ephysatlas/autism_decoding.py +++ b/src/ephysatlas/autism_decoding.py @@ -11,6 +11,13 @@ from ephysatlas.features import EphysTransformer from ephysatlas.plots import plot_histogram, select_series from ephysatlas.anatomy import ClassifierRegions, NEW_VOID +import seaborn as sns +import xgboost as xgb +from sklearn.metrics import classification_report +from sklearn.model_selection import StratifiedGroupKFold +import shap +from collections import Counter +import matplotlib.pyplot as plt BASE_PATH = Path('/Users/gaellechapuis/Documents/Work/EphysAtlas') @@ -52,6 +59,39 @@ def download_baseline_data(local_data_path=BASE_PATH, label='2025_W28', one=None download_path = ephysatlas.data.download_tables(local_data_path, label=label, one=one) return download_path +def clean_df(df_autism, df_voltage, remove_voidroot, mapping, br, min_pid, min_channel, features): + df_voltage = df_voltage.reset_index().dropna() + df_autism = df_autism.reset_index().dropna() + + # Remove bad channels ; keep only good channels with label 0 + df_voltage = df_voltage[df_voltage["channel_labels"] == 0] + df_autism = df_autism[df_autism["channel_labels"] == 0] + + # Remove void and root + if remove_voidroot: + df_voltage = remove_void_root(df_voltage, mapping, br) + df_autism = remove_void_root(df_autism, mapping, br) + + # Check per region that there is min number of PID + df_voltage, id_region_voltage = get_br_id_min_pid(df_voltage, mapping, min_pid) + df_autism, id_region_autism = get_br_id_min_pid(df_autism, mapping, min_pid) + # Take union of region remaining in Autism / EA + df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping) + + # Check per region that there is at least number channels + df_voltage, id_region_voltage = get_br_id_min_channel(df_voltage, mapping, min_channel) + df_autism, id_region_autism = get_br_id_min_channel(df_autism, mapping, min_channel) + # Take union of region remaining in Autism / EA + df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping) + + # -- Check that features are in df columns + setfeature = set(df_voltage.columns).intersection(set(features)) + setfeature.discard("channel_labels") # Remove channel labels as a feature + features = sorted(list(setfeature)) + + return df_autism, df_voltage, features + + def get_autism_baseline_data(path_autism, path_baseline, APPLY_DENOISING = False, APPLY_TRANSFORMER = True, strict = True, load_denoised = False, @@ -91,34 +131,7 @@ def get_autism_baseline_data(path_autism, path_baseline, df_voltage = EphysTransformer().fit_transform(df_voltage) df_autism = EphysTransformer().fit_transform(df_autism) - df_voltage = df_voltage.reset_index().dropna() - df_autism = df_autism.reset_index().dropna() - - # Remove bad channels ; keep only good channels with label 0 - df_voltage = df_voltage[df_voltage["channel_labels"] == 0] - df_autism = df_autism[df_autism["channel_labels"] == 0] - - # Remove void and root - if remove_voidroot: - df_voltage = remove_void_root(df_voltage, mapping, br) - df_autism = remove_void_root(df_autism, mapping, br) - - # Check per region that there is min number of PID - df_voltage, id_region_voltage = get_br_id_min_pid(df_voltage, mapping, min_pid) - df_autism, id_region_autism = get_br_id_min_pid(df_autism, mapping, min_pid) - # Take union of region remaining in Autism / EA - df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping) - - # Check per region that there is at least number channels - df_voltage, id_region_voltage = get_br_id_min_channel(df_voltage, mapping, min_channel) - df_autism, id_region_autism = get_br_id_min_channel(df_autism, mapping, min_channel) - # Take union of region remaining in Autism / EA - df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping) - - # -- Check that features are in df columns - setfeature = set(df_voltage.columns).intersection(set(features)) - setfeature.discard("channel_labels") # Remove channel labels as a feature - features = sorted(list(setfeature)) + df_autism, df_voltage, features = clean_df(df_autism, df_voltage, remove_voidroot, mapping, br, min_pid, min_channel, features) return df_autism, df_voltage, features @@ -261,4 +274,185 @@ def aggregate_pid_scores(pid_metrics, min_channels=20, min_pid=3): # Weighted mean by number of channels weights = pid_metrics["n_channel"] weighted_recall = np.average(pid_metrics["recall_1"], weights=weights) - return weighted_recall \ No newline at end of file + return weighted_recall + + +## ----- BINARY DECODER ------ + +N_SPLIT = 3 +RANDOM_STATE = 42 + + +def get_data_per_fold(df_classes, features, label_col="label", + n_split=N_SPLIT, shuffle=True, random_state=RANDOM_STATE): + # Get data sets + X, y = get_set_classifier_xy(df_classes, features, label_col=label_col) + + # Stratify groups to remove entire PIDs + sgkf = StratifiedGroupKFold(n_splits=n_split, shuffle=shuffle, random_state=random_state) + # Instantiate dict to save statistics of fold + fold_data = dict() + + for fold_idx, (train_idx, test_idx) in enumerate(sgkf.split(X, y, groups=df_classes["pid"]), start=1): + + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + + y_test_random = np.random.permutation(y_test) + + # Get the train/test PIDs from the split + train_pids = df_classes.iloc[train_idx]["pid"].unique() + test_pids = df_classes.iloc[test_idx]["pid"].unique() + + # Count how many unique PIDs of each label are in train/test + train_counts = df_classes[df_classes["pid"].isin(train_pids)].drop_duplicates("pid")["label"].value_counts() + test_counts = df_classes[df_classes["pid"].isin(test_pids)].drop_duplicates("pid")["label"].value_counts() + + # Compute class weight within the fold + # Compute in 2 ways, as this depends on the type of classifier used: class_weight and scale_weight + # y_train and y_test come from your split + n_train = len(y_train) + classes_train = np.unique(y_train) + K = len(classes_train) # This will always be 2 in our case + # count samples per class in y_train + counts = Counter(y_train) + # compute weights + class_weights = {cls: n_train / (K * counts[cls]) for cls in classes_train} + # Scale weight + n_pos = sum(y_train == 1) + n_neg = sum(y_train == 0) + scale_pos_weight = n_neg / n_pos + + # Check there are the right N classes in test set + classes_test = np.unique(y_test) + is_test_balanced = len(classes_test) == K + + fold_data[fold_idx] = {"fold": fold_idx, + "features": features, + "X_train": X_train, + "X_test": X_test, + "y_train": y_train, + "y_test": y_test, + "y_test_random": y_test_random, + "n_train_channels": np.bincount(y_train), + "n_test_channels": np.bincount(y_test), + "n_train_pids": train_counts, + "n_test_pids": test_counts, + "list_train_pids": train_pids, + "list_test_pids": test_pids, + "class_weights": class_weights, + "scale_pos_weight": scale_pos_weight, + "random_state": random_state, + "is_test_balanced": is_test_balanced} + + return fold_data + + +def get_weight_classes(df_baseline, df_compare): + scale_pos_weight = len(df_baseline) / len(df_compare) + return scale_pos_weight + +def train_classifer_one_fold(fold, eval_metric="logloss"): + X_train = fold["X_train"] + X_test = fold["X_test"] + y_train = fold["y_train"] + y_test = fold["y_test"] + y_test_random = fold['y_test_random'] + features = fold['features'] + scale_pos_weight = fold['scale_pos_weight'] + + report_perf = dict() + + # Train classifier + model = xgb.XGBClassifier(use_label_encoder=False, eval_metric=eval_metric, scale_pos_weight=scale_pos_weight) + model.fit(X_train, y_train) + # Check on performance + y_pred = model.predict(X_test) + + report_perf['classification_report'] = classification_report(y_test, y_pred, output_dict=True) + report_perf['classification_report_random'] = classification_report(y_test, y_test_random, output_dict=True) + + # Create explainer + explainer = shap.TreeExplainer(model) + shap_values = explainer.shap_values(X_test) + # Take mean absolute SHAP value across samples + importance = np.abs(shap_values).mean(axis=0) + + report_perf['shap_values'] = shap_values + report_perf['importance'] = importance + report_perf['features'] = features + + return report_perf + + +def print_classification_report(report): + # Convert to DataFrame + df_report = pd.DataFrame(report).transpose() + print(df_report.head()) + + +def get_sorted_list_feature_importance(features, importance): + # Put into a DataFrame for clarity + feature_importance = pd.DataFrame({ + "feature": features, + "importance": importance + }) + # Sort by importance + feature_importance = feature_importance.sort_values("importance", ascending=False).reset_index(drop=True) + list_imp = feature_importance['feature'].to_list() + return list_imp + +def plot_one_fold(fold, report_perf, label='label'): + importance = report_perf['importance'] + shap_values = report_perf['shap_values'] + features = report_perf['features'] + + report = report_perf['classification_report'] + report_random = report_perf['classification_report_random'] + print_classification_report(report) + print_classification_report(report_random) + print(fold['is_test_balanced']) + + + X_test = fold['X_test'] + X_test_df = pd.DataFrame(X_test, columns=features) + plot_df = X_test_df.copy() + plot_df[label] = fold['y_test'] + + # 1. Global feature importance (summary plot) + # This shows which features matter most overall. + shap.summary_plot(shap_values, X_test, feature_names=features) + + # 2. Cross plot of first 3 important features + list_feat = get_sorted_list_feature_importance(features, importance) + # Select first 3 for plotting + features_plot = list_feat[0:3] + # print(features_plot) + sns.pairplot(data=plot_df[features_plot + [label]], hue=label) + + # 3. Feature-level effect (dependence plot) + # To see how a single feature influences predictions: + shap.dependence_plot(features_plot[0], shap_values, X_test_df) + + +def plot_one_fold_region(fold, report_perf, mapping='Beryl'): + # Plot specific to training with region as information column + X_test = fold['X_test'] + features = report_perf['features'] + shap_values = report_perf['shap_values'] + X_test_df = pd.DataFrame(X_test, columns=features) + + # Place absolute SHAP values into dataframe + df_shap = pd.DataFrame(np.abs(shap_values), columns=features) + # Overwrite region column otherwise it contain the SHAP value for region ID + df_shap[mapping + "_id"] = X_test_df[mapping + "_id"].astype('int') + # Mean absolute SHAP per feature, stratified by region + grouped = df_shap.groupby(mapping + "_id").mean() + + plt.figure(figsize=(12,6)) + sns.heatmap(grouped, cmap="viridis", annot=False) + plt.title("Mean |SHAP| values per feature and region") + plt.ylabel(mapping + "_id (region)") + plt.xlabel("Feature") + plt.show() +