From f522ead6254c5bc8309601c9465f98ce6234df4b Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Wed, 6 Aug 2025 12:26:31 +0200
Subject: [PATCH 01/14] score outlier fit to 1 for extreme values

---
 src/ephysatlas/outliers.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py
index fcfc940..0de44ea 100644
--- a/src/ephysatlas/outliers.py
+++ b/src/ephysatlas/outliers.py
@@ -115,7 +115,12 @@ def kde_proba_distribution(
     n_above = 0
     n_below = 0
 
-    # TODO for extremely high value (e.g. >10 IQR), remove them, put outlier score to 1
+    # For extremely high value (e.g. >10 IQR), remove them, put outlier score to 1
+    val_replace = np.mean(train_data)  # Use the mean of the train data so it's well within boundaries
+    idx_replace = np.where( (test_data > np.mean(train_data) + 10 * iqr(train_data)) |
+                            (test_data < np.mean(train_data) - 10 * iqr(train_data))
+                            )[0]
+    test_data[idx_replace] = val_replace
 
     if np.max(test_data) > np.max(x_train):
         # Pad above
@@ -150,6 +155,7 @@ def kde_proba_distribution(
 
     # The outlier probability is the inverse
     outp = 1 - y_test
+    outp[idx_replace] = 1  # Replace extreme outliers that were set to default value to outlier score 1
 
     return outp, x_train, hist_train
 

From 9405666d8ff19625af1d42d24163d5fc8ea421ea Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Wed, 6 Aug 2025 18:57:09 +0200
Subject: [PATCH 02/14] condition on N channel / PID in training set

---
 src/ephysatlas/outliers.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py
index 0de44ea..3ea022f 100644
--- a/src/ephysatlas/outliers.py
+++ b/src/ephysatlas/outliers.py
@@ -160,7 +160,8 @@ def kde_proba_distribution(
     return outp, x_train, hist_train
 
 
-def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999, min_ch=15):
+def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999,
+                   min_ch=15, n_pid=3, min_ch_compute=100):
     # Regions
     regions = np.unique(df_new[mapping + "_id"]).astype(int)
     # Store the features that are outlier per brain region in a dict
@@ -181,17 +182,18 @@ def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999, min_ch
             df_train = select_series(
                 df_base, features=[feature], acronym=None, id=region, mapping=mapping
             )
-
             # Get channel indices that are in region, keeping only feature values
             df_test = select_series(
                 df_new, features=[feature], acronym=None, id=region, mapping=mapping
             )
-
             # For all channels at once, test if outside the distribution for the given features
             train_data = df_train.to_numpy()
             test_data = df_test.to_numpy()
-            score_out, _, _ = kde_proba_distribution(train_data, test_data)
-            # score_out, _, _ = detect_outlier_histV2(train_data, test_data)
+            # score_out = 0 if N pid or N channel too small in training set
+            if (len(df_train['pid'].unique()) >= n_pid) and (df_train.shape[0] >= min_ch_compute):
+                score_out, _, _ = kde_proba_distribution(train_data, test_data)
+            else:
+                score_out = np.zeros(test_data.shape)
             # Save into new column
             df_new_compute[feature + "_q"] = score_out
             df_new_compute[feature + "_extremes"] = 0

From 1d14b62829b7c6b2f5e8785f477e852f0f22b179 Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Tue, 12 Aug 2025 15:18:51 +0200
Subject: [PATCH 03/14] SVM

---
 src/ephysatlas/outliers.py | 108 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 106 insertions(+), 2 deletions(-)

diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py
index 3ea022f..35327aa 100644
--- a/src/ephysatlas/outliers.py
+++ b/src/ephysatlas/outliers.py
@@ -11,6 +11,11 @@
 from ephysatlas.plots import select_series
 from ephysatlas.anatomy import ClassifierRegions, NEW_VOID
 
+from sklearn.svm import OneClassSVM
+from sklearn.cluster import KMeans
+from scipy.signal import find_peaks
+from sklearn.neighbors import KernelDensity
+
 # from ephys_atlas.plots import BINS
 BINS = 50
 
@@ -177,7 +182,7 @@ def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999,
 
         listout = list()
         for feature in features:
-            print(f"{feature} [{region}]")
+            # print(f"{feature} [{region}]")
             # Load data for that regions
             df_train = select_series(
                 df_base, features=[feature], acronym=None, id=region, mapping=mapping
@@ -186,11 +191,16 @@ def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999,
             df_test = select_series(
                 df_new, features=[feature], acronym=None, id=region, mapping=mapping
             )
+
+            df_pid = select_series(
+                df_base, features=['pid'], acronym=None, id=region, mapping=mapping
+            )
+
             # For all channels at once, test if outside the distribution for the given features
             train_data = df_train.to_numpy()
             test_data = df_test.to_numpy()
             # score_out = 0 if N pid or N channel too small in training set
-            if (len(df_train['pid'].unique()) >= n_pid) and (df_train.shape[0] >= min_ch_compute):
+            if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)):
                 score_out, _, _ = kde_proba_distribution(train_data, test_data)
             else:
                 score_out = np.zeros(test_data.shape)
@@ -297,3 +307,97 @@ def df_add_channel_label(df, pid, one=None):
     # Merge to get the labels column
     df = df.merge(df_label, left_index=True, right_index=True)
     return df
+
+# ===========================================================
+# ====== SVM ================================================
+
+def detect_modality_auto_bandwidth(data, peak_height_frac=0.1, resolution=500):
+    """
+    Detect unimodal or multimodal distribution using KDE with Silverman's bandwidth.
+
+    Parameters
+    ----------
+    data : array-like
+        Raw data points.
+    peak_height_frac : float, optional
+        Fraction of maximum density for minimum peak height.
+    resolution : int, optional
+        Number of points in the KDE evaluation grid.
+
+    Returns
+    -------
+    modality : str
+        "unimodal" or "multimodal"
+    peaks : array
+        Indices of detected peaks in x_grid.
+    x_grid : array
+        Grid of x values for plotting/debugging.
+    density : array
+        KDE density values on x_grid.
+    bandwidth : float
+        Bandwidth used for KDE (Silverman's rule).
+    """
+
+    data = np.asarray(data)
+    n = len(data)
+    sigma = np.std(data)
+
+    # Silverman's rule of thumb
+    bandwidth = 1.06 * sigma * n ** (-1 / 5)
+
+    # KDE fit
+    kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(data.reshape(-1, 1))
+
+    # Evaluate density
+    x_grid = np.linspace(data.min(), data.max(), resolution)
+    log_dens = kde.score_samples(x_grid[:, None])
+    density = np.exp(log_dens)
+
+    # Peak detection
+    peaks, _ = find_peaks(density, height=max(density) * peak_height_frac)
+
+    modality = "unimodal" if len(peaks) <= 1 else "multimodal"
+    return modality, peaks, x_grid, density, bandwidth
+
+
+def get_gamma_svm(X_train, f=0.4):
+    '''
+    f : how many times larger you want gamma
+    '''
+    modality, peaks, x_grid, density, bandwidth = detect_modality_auto_bandwidth(X_train)
+    num_peaks = len(peaks)
+
+    if num_peaks <= 1 :
+        # Unimodal
+        sigma = ( iqr(X_train) / 1.349 )  # approximate standard deviation assuming normality
+        if sigma == 0.0:  # Fall back in case IQR is 0
+            sigma = np.std(X_train) / 16
+
+        sigma = sigma / np.sqrt(f)
+
+    else:
+        # Multimodal
+        # Suppose you detected k peaks
+        k = num_peaks  # from your detection method
+
+        # Cluster the data into k groups
+        data_reshaped = X_train.reshape(-1, 1)
+        labels = KMeans(n_clusters=k, n_init=10).fit_predict(data_reshaped)
+
+        # Compute per-cluster STD
+        cluster_stds = [np.std(X_train[labels == i]) for i in range(k)]
+
+        # Use average STD for gamma
+        sigma = np.mean(cluster_stds)
+
+    sigma = sigma / np.sqrt(f)
+    gamma = 1 / (2 * sigma ** 2)
+    return gamma
+
+
+def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'):
+    gamma = get_gamma_svm(X_train, f=f)
+    model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)
+    model.fit(X_train.reshape(-1, 1))
+    pred_svm = model.predict(X_test.reshape(-1, 1))
+    return pred_svm

From dfca01bb571113860a4004cf3273f2b66d834553 Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Tue, 12 Aug 2025 15:59:49 +0200
Subject: [PATCH 04/14] outliers code

---
 src/ephysatlas/outliers.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py
index 35327aa..d3193f1 100644
--- a/src/ephysatlas/outliers.py
+++ b/src/ephysatlas/outliers.py
@@ -1,6 +1,5 @@
 import numpy as np
 import pandas as pd
-from sklearn.neighbors import KernelDensity
 from scipy import stats
 from scipy.interpolate import interp1d
 from scipy.stats import iqr
@@ -401,3 +400,9 @@ def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'):
     model.fit(X_train.reshape(-1, 1))
     pred_svm = model.predict(X_test.reshape(-1, 1))
     return pred_svm
+
+
+def generate_testset(X_train):
+    val_iqr = 3 * iqr(X_train)
+    X_test = np.linspace(min(X_train) - val_iqr, max(X_train) + val_iqr, 100).reshape(-1, 1)
+    return X_test
\ No newline at end of file

From 3185ab1de864fa391353c8c462c176fd4fb08794 Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Tue, 12 Aug 2025 18:23:12 +0200
Subject: [PATCH 05/14] score 1 pid avm

---
 src/ephysatlas/outliers.py | 78 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 77 insertions(+), 1 deletion(-)

diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py
index d3193f1..41fe3a1 100644
--- a/src/ephysatlas/outliers.py
+++ b/src/ephysatlas/outliers.py
@@ -395,6 +395,11 @@ def get_gamma_svm(X_train, f=0.4):
 
 
 def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'):
+    # Step 0 : Filter the train data to remove large outliers
+    X_train = X_train[
+        np.abs(X_train - np.median(X_train)) <= 5 * iqr(X_train)
+        ]
+
     gamma = get_gamma_svm(X_train, f=f)
     model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)
     model.fit(X_train.reshape(-1, 1))
@@ -405,4 +410,75 @@ def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'):
 def generate_testset(X_train):
     val_iqr = 3 * iqr(X_train)
     X_test = np.linspace(min(X_train) - val_iqr, max(X_train) + val_iqr, 100).reshape(-1, 1)
-    return X_test
\ No newline at end of file
+    return X_test
+
+
+def score_svm_1pid(df_base, df_new, features, mapping,
+                   min_ch=14, n_pid=3, min_ch_compute=50):
+    # Regions
+    regions = np.unique(df_new[mapping + "_id"]).astype(int)
+    # Store the features that are outlier per brain region in a dict
+    dictout = dict((el, list()) for el in regions.tolist())
+    dictout["mapping"] = mapping
+    dictout["features"] = features
+
+    for count, region in tqdm.tqdm(enumerate(regions), total=len(regions)):
+        # Get channel indices that are in region, but keeping all info besides features
+        idx_reg = np.where(df_new[mapping + "_id"] == region)
+        df_new_compute = df_new.iloc[idx_reg].copy()
+        df_new_compute["has_outliers"] = False
+
+        listout = list()
+        for feature in features:
+            # print(f"{feature} [{region}]")
+            # Load data for that regions
+            df_train = select_series(
+                df_base, features=[feature], acronym=None, id=region, mapping=mapping
+            )
+            # Get channel indices that are in region, keeping only feature values
+            df_test = select_series(
+                df_new, features=[feature], acronym=None, id=region, mapping=mapping
+            )
+
+            df_pid = select_series(
+                df_base, features=['pid'], acronym=None, id=region, mapping=mapping
+            )
+
+            # For all channels at once, test if outside the distribution for the given features
+            train_data = df_train.to_numpy()
+            test_data = df_test.to_numpy()
+
+            print(f'Train N: {len(train_data)}, Test N: {len(test_data)}')
+
+            # score_out = 0 if N pid or N channel too small in training set
+            if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)):
+                score_out = outlier_score_svm(train_data, test_data)
+            else:
+                score_out = np.zeros(test_data.shape)
+            # Save into new column
+            df_new_compute[feature + "_extremes"] = score_out
+            # A region is assigned as having outliers if more than N minimum channels are outliers
+            has_outliers = sum(df_new_compute[feature + "_extremes"]) > np.floor(min_ch)
+            if has_outliers:
+                listout.append(feature)
+                if sum(
+                    df_new_compute["has_outliers"] == 0
+                ):  # Reassign only if entirely False
+                    df_new_compute["has_outliers"] = True
+        # Save appended list of feature in dict
+        dictout[region] = listout
+
+        # Concatenate dataframes
+        if count == 0:
+            df_save = df_new_compute.copy()
+        else:
+            df_save = pd.concat([df_save, df_new_compute])
+
+    if df_save["has_outliers"].sum() > 0:
+        has_outlier = True
+    else:
+        has_outlier = False
+
+    # Resort by channel
+    df_save = df_save.sort_values(by=["channel"])
+    return df_save, dictout, has_outlier

From 04f40b8e37cf3442dd2653dc14f672e2babc3a37 Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Thu, 14 Aug 2025 17:01:29 +0200
Subject: [PATCH 06/14] outlier SVM

---
 src/ephysatlas/outliers.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py
index 41fe3a1..91496f3 100644
--- a/src/ephysatlas/outliers.py
+++ b/src/ephysatlas/outliers.py
@@ -402,7 +402,7 @@ def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'):
 
     gamma = get_gamma_svm(X_train, f=f)
     model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)
-    model.fit(X_train.reshape(-1, 1))
+    model.fit(X_train.reshape(-1, 1))  # This is slow for high N ; e.g. 29 seconds for 69k samples
     pred_svm = model.predict(X_test.reshape(-1, 1))
     return pred_svm
 
@@ -414,7 +414,7 @@ def generate_testset(X_train):
 
 
 def score_svm_1pid(df_base, df_new, features, mapping,
-                   min_ch=14, n_pid=3, min_ch_compute=50):
+                   min_ch_outlier=20, n_pid=3, min_ch_compute=100, max_ch_compute = 3000, p_thresh_kde=0.98):
     # Regions
     regions = np.unique(df_new[mapping + "_id"]).astype(int)
     # Store the features that are outlier per brain region in a dict
@@ -450,15 +450,25 @@ def score_svm_1pid(df_base, df_new, features, mapping,
 
             print(f'Train N: {len(train_data)}, Test N: {len(test_data)}')
 
+            # Outlier → 1
+            # Inlier → 0
+
             # score_out = 0 if N pid or N channel too small in training set
             if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)):
-                score_out = outlier_score_svm(train_data, test_data)
+                score_svm = outlier_score_svm(train_data, test_data)
+                # map the OneClassSVM output from {1, -1} into {0, 1}
+                score_out = np.where(score_svm == -1, 1, 0)
+            elif len(train_data) > max_ch_compute:  # Apply KDE method as SVM model.fit is too slow for high N
+                scoreq_out, _, _ = kde_proba_distribution(train_data, test_data)  # This is a score to be thresholded
+                scoreq_out = scoreq_out.squeeze()  # TODO this should not be necessary, place in kde_proba_distribution
+                score_out = scoreq_out > p_thresh_kde
+                score_out = score_out.astype(int)
             else:
                 score_out = np.zeros(test_data.shape)
             # Save into new column
             df_new_compute[feature + "_extremes"] = score_out
             # A region is assigned as having outliers if more than N minimum channels are outliers
-            has_outliers = sum(df_new_compute[feature + "_extremes"]) > np.floor(min_ch)
+            has_outliers = sum(df_new_compute[feature + "_extremes"]) > np.floor(min_ch_outlier)
             if has_outliers:
                 listout.append(feature)
                 if sum(

From d00d9620f8dc524545a733aad3bbc3c06cf5b250 Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Thu, 14 Aug 2025 17:08:51 +0200
Subject: [PATCH 07/14] logic size

---
 src/ephysatlas/outliers.py | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py
index 91496f3..871be7c 100644
--- a/src/ephysatlas/outliers.py
+++ b/src/ephysatlas/outliers.py
@@ -455,14 +455,17 @@ def score_svm_1pid(df_base, df_new, features, mapping,
 
             # score_out = 0 if N pid or N channel too small in training set
             if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)):
-                score_svm = outlier_score_svm(train_data, test_data)
-                # map the OneClassSVM output from {1, -1} into {0, 1}
-                score_out = np.where(score_svm == -1, 1, 0)
-            elif len(train_data) > max_ch_compute:  # Apply KDE method as SVM model.fit is too slow for high N
-                scoreq_out, _, _ = kde_proba_distribution(train_data, test_data)  # This is a score to be thresholded
-                scoreq_out = scoreq_out.squeeze()  # TODO this should not be necessary, place in kde_proba_distribution
-                score_out = scoreq_out > p_thresh_kde
-                score_out = score_out.astype(int)
+                if len(train_data) > max_ch_compute:  # Apply KDE method as SVM model.fit is too slow for high N
+                    scoreq_out, _, _ = kde_proba_distribution(train_data,
+                                                              test_data)  # This is a score to be thresholded
+                    scoreq_out = scoreq_out.squeeze()  # TODO this should not be necessary, place in kde_proba_distribution
+                    score_out = scoreq_out > p_thresh_kde
+                    score_out = score_out.astype(int)
+
+                else:
+                    score_svm = outlier_score_svm(train_data, test_data)
+                    # map the OneClassSVM output from {1, -1} into {0, 1}
+                    score_out = np.where(score_svm == -1, 1, 0)
             else:
                 score_out = np.zeros(test_data.shape)
             # Save into new column

From 615e14ef848fa6bd5931489054ba77030c176a08 Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Fri, 15 Aug 2025 17:47:39 +0200
Subject: [PATCH 08/14] making test for log transform, ATM not passing

---
 tests/test_outliers.py | 111 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 tests/test_outliers.py

diff --git a/tests/test_outliers.py b/tests/test_outliers.py
new file mode 100644
index 0000000..1b86baa
--- /dev/null
+++ b/tests/test_outliers.py
@@ -0,0 +1,111 @@
+from ephysatlas import outliers
+import numpy as np
+import unittest
+
+
+class TestOutliersKDEProba(unittest.TestCase):
+
+    def test_usage(self):
+        # Set random seed for reproducibility
+        np.random.seed(42)  # TODO this does not seem to work
+        # Generate train data from a normal distribution
+        train_data = np.random.normal(loc=-4.5, scale=1.2, size=2000)
+        # Generate test data from a normal distribution
+        test_data = np.random.normal(loc=-3.5, scale=0.4, size=60)
+        # Compute score
+        score, x, hist = outliers.kde_proba_distribution(train_data, test_data)
+        assert score.shape[0] == test_data.shape[0]
+
+        score_out = np.array([0.94909745, 0.95985259, 0.94758700, 0.95592986, 0.93989972,
+                              0.96315955, 0.96212788, 0.94726918, 0.96550552, 0.96921661,
+                              0.94203708, 0.94796957, 0.94788689, 0.94578141, 0.96682835,
+                              0.95866918, 0.95697261, 0.94013844, 0.96537497, 0.94521747,
+                              0.96241355, 0.96347127, 0.97713305, 0.94508643, 0.94181560,
+                              0.96301632, 0.97997165, 0.97587523, 0.96314187, 0.95152157,
+                              0.96633449, 0.94571803, 0.96450351, 0.96466301, 0.97192622,
+                              0.98534690, 0.96404361, 0.95123884, 0.96199762, 0.94007265,
+                              0.94884578, 0.95349810, 0.95104398, 0.96286312, 0.94720050,
+                              0.97326999, 0.96212302, 0.96207678, 0.95545146, 0.96287692,
+                              0.96673165, 0.94662449, 0.94931726, 0.95603152, 0.94501664,
+                              0.94724124, 0.95189154, 0.96782068, 0.96339920, 0.96306370])
+
+        np.testing.assert_almost_equal(score, score_out)
+
+    def test_bimodal(self):
+        # Check that a sample selected in between two Gaussian distributions is detected as an outlier
+        # Generate train data from 2 normal distributions
+        train_data = np.concatenate((
+            np.random.normal(loc=-4.5, scale=1.2, size=2000),
+            np.random.normal(loc=17.5, scale=2.2, size=1000)
+        ))
+
+        # Generate test data from a normal distribution
+        test_data = np.random.normal(loc=-3.5, scale=0.4, size=60)
+        # Add some clear outliers
+        test_data = np.concatenate((test_data, np.array([5.4, 6.4, 0, -9.0])))
+
+        # Compute score
+        score, _, _ = outliers.kde_proba_distribution(train_data, test_data)
+        assert score[-1] == 1
+
+    def test_pad_span(self):
+        # If taking a test value very far out, the train histogram will have more padded sample points
+        # We do not want this to influence the mean result of the score. Test to check the values don't change
+
+        # Generate train data from a normal distribution
+        train_data = np.random.normal(loc=-4.5, scale=1.2, size=2000)
+
+        # Test some clear outliers
+        test_data1 = np.array([-9.0, 3.3, -5.2, -7.8])
+        test_data2 = np.array([-9.0, 3.3, -5.2, -1000])
+
+        # Compute score
+        score1, _, _ = outliers.kde_proba_distribution(train_data, test_data1)
+        score2, _, _ = outliers.kde_proba_distribution(train_data, test_data2)
+
+        for i_sample in range(0, 3):
+            assert score1[i_sample] == score2[i_sample]
+
+
+    def test_log_transform(self):
+        # We want to check that doing twice a log/exp transform onto a Gaussian give the same values
+
+        # Gaussian
+        # Generate train data from a normal distribution
+        train_data = np.random.normal(loc=-4.5, scale=1.2, size=2000)
+        # Generate test data linearly around train set
+        val_iqr = 3 * np.std(train_data)
+        test_data = np.linspace(min(train_data) - val_iqr, max(train_data) + val_iqr, 100).reshape(-1, 1)
+        # Compute score
+        score, x, hist = outliers.kde_proba_distribution(train_data, test_data)
+
+        # Log normal
+        train_data_log = np.exp(train_data)
+        test_data_log = np.exp(test_data)
+
+        # Compute score
+        score_log, _, _ = outliers.kde_proba_distribution(train_data_log, test_data_log)
+        # TODO this is different and the below does not pass!
+        np.testing.assert_almost_equal(np.exp(test_data), test_data_log)
+
+        plot_debug = True
+        # Plot for debug
+        if plot_debug:
+            import matplotlib.pyplot as plt
+            from ephysatlas.plots import plot_histogram
+
+            fig, axs = plt.subplots(1, 3)
+            axs[0].plot(test_data)
+            axs[1].plot(np.exp(test_data))
+            axs[2].plot(test_data_log)
+
+            fig, axs = plt.subplots(1, 1)
+            plot_histogram(train_data, ax=axs, normalise=True)
+            plt.plot(test_data, -1*(score-1), 'k.-')
+            plt.plot(np.log(test_data_log), -1 * (score_log - 1), 'b.-')
+
+            fig, axs = plt.subplots(1, 1)
+            plot_histogram(train_data_log, ax=axs, normalise=True)
+            plt.plot(test_data_log, -1*(score_log-1), 'k.-')
+
+

From fd18815fa1f57fd74afb5c8397fedd824d776532 Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Fri, 15 Aug 2025 17:56:06 +0200
Subject: [PATCH 09/14] copy to enforce variable local

---
 src/ephysatlas/outliers.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py
index 871be7c..6d837e1 100644
--- a/src/ephysatlas/outliers.py
+++ b/src/ephysatlas/outliers.py
@@ -81,6 +81,10 @@ def kde_proba_distribution(
     - x_train: (E, ) numpy array, x values of the histogram formed using training dataset
     - hist_train: (E, ) numpy array, y values of the histogram formed using training dataset
     """
+    # Do this to force the variables to be local to function
+    test_data = test_data.copy()
+    train_data = train_data.copy()
+
     if len(train_data) > n_min_sample_train:
         # Step 0 : Filter the train data to remove large outliers
         train_data = train_data[

From 02e0febbe7a0c4d05fdaa2c1c4a4ec136a29b5da Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Mon, 18 Aug 2025 15:08:06 +0200
Subject: [PATCH 10/14] outliers changes for KS test metrics

---
 src/ephysatlas/outliers.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py
index 6d837e1..e4b9c0f 100644
--- a/src/ephysatlas/outliers.py
+++ b/src/ephysatlas/outliers.py
@@ -206,19 +206,16 @@ def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999,
             if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)):
                 score_out, _, _ = kde_proba_distribution(train_data, test_data)
             else:
-                score_out = np.zeros(test_data.shape)
+                score_out = -1 * np.ones(test_data.shape)  # Assign value -1 when cannot compute
             # Save into new column
             df_new_compute[feature + "_q"] = score_out
             df_new_compute[feature + "_extremes"] = 0
             df_new_compute.loc[
                 df_new_compute[feature + "_q"] > p_thresh, feature + "_extremes"
             ] = 1
-            # A region is assigned as having outliers if more than half its channels are outliers
-            # Condition on N minimum channel.
-            has_outliers = sum(df_new_compute[feature + "_extremes"]) > np.floor(
-                len(test_data) / 2
-            )
-            if len(test_data) >= min_ch and has_outliers:
+            # A region is assigned as having outliers if N minimum channel are outliers.
+            has_outliers = sum(df_new_compute[feature + "_extremes"]) >= min_ch
+            if has_outliers:
                 listout.append(feature)
                 if sum(
                     df_new_compute["has_outliers"] == 0

From efa0636ea351aac6fa1b894c2b104ab77769abc2 Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Tue, 19 Aug 2025 12:14:07 +0200
Subject: [PATCH 11/14] svm score

---
 src/ephysatlas/outliers.py | 29 ++++++++++++++++++++---------
 1 file changed, 20 insertions(+), 9 deletions(-)

diff --git a/src/ephysatlas/outliers.py b/src/ephysatlas/outliers.py
index e4b9c0f..30a07b5 100644
--- a/src/ephysatlas/outliers.py
+++ b/src/ephysatlas/outliers.py
@@ -395,7 +395,7 @@ def get_gamma_svm(X_train, f=0.4):
     return gamma
 
 
-def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'):
+def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf', output='prediction'):
     # Step 0 : Filter the train data to remove large outliers
     X_train = X_train[
         np.abs(X_train - np.median(X_train)) <= 5 * iqr(X_train)
@@ -404,8 +404,22 @@ def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf'):
     gamma = get_gamma_svm(X_train, f=f)
     model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu)
     model.fit(X_train.reshape(-1, 1))  # This is slow for high N ; e.g. 29 seconds for 69k samples
-    pred_svm = model.predict(X_test.reshape(-1, 1))
-    return pred_svm
+
+    if output == 'prediction':
+        # Let the model decide what is the predicted output, score is set to 0
+        pred_svm = model.predict(X_test.reshape(-1, 1))
+        score_test = np.zeros(np.shape(pred_svm))
+    elif output == 'score':
+        # Outputting a score instead of directly the SVM prediction
+        scores_train = model.decision_function(X_train.reshape(-1, 1))
+        scores_test = model.decision_function(X_test.reshape(-1, 1))
+        threshold = np.percentile(scores_train, 100 * nu)  # nu fraction as outliers
+        pred_svm = np.where(scores_test >= threshold, 1, -1)
+
+    # map the OneClassSVM output from {1, -1} into {0, 1}
+    pred_svm_01 = np.where(pred_svm == -1, 1, 0)
+
+    return pred_svm_01, score_test
 
 
 def generate_testset(X_train):
@@ -449,7 +463,7 @@ def score_svm_1pid(df_base, df_new, features, mapping,
             train_data = df_train.to_numpy()
             test_data = df_test.to_numpy()
 
-            print(f'Train N: {len(train_data)}, Test N: {len(test_data)}')
+            # print(f'Train N: {len(train_data)}, Test N: {len(test_data)}')
 
             # Outlier → 1
             # Inlier → 0
@@ -457,16 +471,13 @@ def score_svm_1pid(df_base, df_new, features, mapping,
             # score_out = 0 if N pid or N channel too small in training set
             if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)):
                 if len(train_data) > max_ch_compute:  # Apply KDE method as SVM model.fit is too slow for high N
-                    scoreq_out, _, _ = kde_proba_distribution(train_data,
-                                                              test_data)  # This is a score to be thresholded
+                    scoreq_out, _, _ = kde_proba_distribution(train_data, test_data)  # This is a score to be thresholded
                     scoreq_out = scoreq_out.squeeze()  # TODO this should not be necessary, place in kde_proba_distribution
                     score_out = scoreq_out > p_thresh_kde
                     score_out = score_out.astype(int)
 
                 else:
-                    score_svm = outlier_score_svm(train_data, test_data)
-                    # map the OneClassSVM output from {1, -1} into {0, 1}
-                    score_out = np.where(score_svm == -1, 1, 0)
+                    pred_svm, _ = outlier_score_svm(train_data, test_data)
             else:
                 score_out = np.zeros(test_data.shape)
             # Save into new column

From 0fd591c91feafc24bc15cc879e83688ec5959b00 Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Tue, 26 Aug 2025 16:51:25 +0200
Subject: [PATCH 12/14] decoder wip

---
 src/ephysatlas/autism_decoding.py  | 208 +++++++++++++++++++++++++++++
 src/ephysatlas/regionclassifier.py |   2 +-
 2 files changed, 209 insertions(+), 1 deletion(-)
 create mode 100644 src/ephysatlas/autism_decoding.py

diff --git a/src/ephysatlas/autism_decoding.py b/src/ephysatlas/autism_decoding.py
new file mode 100644
index 0000000..3d307b1
--- /dev/null
+++ b/src/ephysatlas/autism_decoding.py
@@ -0,0 +1,208 @@
+
+# Get autism data
+
+import numpy as np
+import pandas as pd
+from pathlib import Path
+from one.api import ONE
+import ephysatlas.data
+import ephysatlas.aggregation
+from ephysatlas.features import voltage_features_set
+from ephysatlas.features import EphysTransformer
+from ephysatlas.plots import plot_histogram, select_series
+from ephysatlas.anatomy import ClassifierRegions, NEW_VOID
+
+BASE_PATH = Path('/Users/gaellechapuis/Documents/Work/EphysAtlas')
+
+def download_baseline_data(local_data_path=BASE_PATH, label='2025_W28', one=None):
+    if one is None: one = ONE(base_url='https://alyx.internationalbrainlab.org', mode='remote')
+    download_path = ephysatlas.data.download_tables(local_data_path, label=label, one=one)
+    return download_path
+
+def get_autism_baseline_data(path_autism, path_baseline,
+                             APPLY_DENOISING = False, APPLY_TRANSFORMER = True,
+                             strict = True, load_denoised = False,
+                             features=None, br=None):
+
+    # This function assumes the data is already downloaded
+
+    if br is None:
+        br = ClassifierRegions()
+        br.add_new_region(NEW_VOID)
+
+    if features is None: features = voltage_features_set()
+    if not load_denoised:  strict = False
+
+    # ---- BASELINE DATA ----
+    df_voltage = ephysatlas.data.read_features_from_disk(path_baseline,
+                                                         strict=strict,
+                                                         load_denoised=load_denoised)
+
+    # ---- AUTISM DATA ----
+    df_autism = ephysatlas.data.read_features_from_disk(path_autism,
+                                                         strict=strict,
+                                                         load_denoised=load_denoised)
+    # Apply denoising or transform
+    if not load_denoised:
+        if APPLY_DENOISING:  # This applies the transformer automatically
+            # Apply denoising without bad channel interpolation
+            df_voltage['channel_labels'] = 0
+            df_voltage = ephysatlas.aggregation.denoise_raw_features_data(df_voltage)
+
+            df_autism['channel_labels'] = 0
+            df_autism = ephysatlas.aggregation.denoise_raw_features_data(df_autism)
+
+        elif APPLY_TRANSFORMER:  # Applies transformer without denoising first
+            df_voltage = EphysTransformer().fit_transform(df_voltage)
+            df_autism = EphysTransformer().fit_transform(df_autism)
+
+    df_voltage = df_voltage.reset_index().dropna()
+    df_autism = df_autism.reset_index().dropna()
+
+    # -- Check that features are in df columns
+    setfeature = set(df_voltage.columns).intersection(set(features))
+    setfeature.discard("channel_labels")  # Remove channel labels as a feature
+    features = sorted(list(setfeature))
+
+    return df_autism, df_voltage, features
+
+
+# ====
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import LeaveOneGroupOut
+from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
+
+def pid_cv_region_summary(df_baseline, df_autism, feature_cols, model_type="log_reg"):
+    """
+    PID-wise cross-validation for a single brain region, with region-level aggregation.
+
+    Parameters
+    ----------
+    df_baseline : pd.DataFrame
+        Baseline data for this region
+    df_autism : pd.DataFrame
+        Autism data for this region
+    feature_cols : list
+        Features to use
+    model_type : str
+        "log_reg" (default) or "random_forest"
+
+    Returns
+    -------
+    pid_metrics : pd.DataFrame
+        Per-PID metrics: accuracy, precision, recall, f1
+    region_summary : pd.Series
+        Region-level mean metrics aggregated over autism PIDs
+    final_model : fitted sklearn Pipeline on all data
+    """
+    # Label datasets
+    df_baseline = df_baseline.copy()
+    df_autism = df_autism.copy()
+    df_baseline["label"] = 0
+    df_autism["label"] = 1
+
+    # Combine data for this region
+    df_region = pd.concat([df_baseline, df_autism], ignore_index=True)
+    X, y = get_set_classifier_xy(df_region, feature_cols)
+    # Select classifier
+    if model_type == "log_reg":
+        clf = LogisticRegression(max_iter=500)
+    elif model_type == "random_forest":
+        clf = RandomForestClassifier(n_estimators=200, random_state=42)
+    else:
+        raise ValueError(f"Unknown model_type: {model_type}")
+
+    # Do cross val
+    autism_pids = df_autism["pid"].unique()
+    pid_metrics = cross_val(df_region, feature_cols, clf, cv_pids=autism_pids)
+    weighted_recall = aggregate_pid_scores(pid_metrics, min_channels=20)
+
+    # Train final model on all data
+    final_pipe = Pipeline([
+        ("scaler", StandardScaler()),
+        ("clf", clf)
+    ])
+    final_pipe.fit(X, y)
+
+    feature_importances = pd.DataFrame({
+        "feature": feature_cols,
+        "coef": clf.coef_.flatten(),
+        "abs_coef": np.abs(clf.coef_.flatten())
+    }).sort_values("abs_coef", ascending=False)
+
+    return pid_metrics, weighted_recall, feature_importances
+
+
+
+def get_set_classifier_xy(df_region, feature_cols):
+    X = df_region[feature_cols].to_numpy()
+    y = df_region["label"].to_numpy()
+    return X, y
+
+
+
+##
+def cross_val(df_region, feature_cols, clf, cv_pids=None):
+    X, y = get_set_classifier_xy(df_region, feature_cols)
+
+    if cv_pids is None:
+        cv_pids = df_region["pid"]
+
+    logo = LeaveOneGroupOut()
+    pid_rows = []
+
+    # Cross-val each PID
+    groups = df_region["pid"].to_numpy()  # PID grouping
+    for train_idx, test_idx in logo.split(df_region, y, groups=groups):
+        test_pid = groups[test_idx][0]
+        # Only leave out the PIDs to do cross-validation on
+        if test_pid not in cv_pids:
+            continue
+
+        X_train, X_test = X[train_idx], X[test_idx]
+        y_train, y_test = y[train_idx], y[test_idx]
+
+        pipe = Pipeline([
+            ("scaler", StandardScaler()),
+            ("clf", clf)
+        ])
+        pipe.fit(X_train, y_train)
+        y_pred = pipe.predict(X_test)
+
+        # Compute per-PID metrics
+
+        pid_rows.append({
+            "pid": test_pid,
+            "n_channel" : len(test_idx),
+            "accuracy": accuracy_score(y_test, y_pred),
+            # These will be set to 0 if only autism data cross-val
+            "precision_0": precision_score(y_test, y_pred, pos_label=0, zero_division=0),
+            "recall_0": recall_score(y_test, y_pred, pos_label=0, zero_division=0),
+            "f1_0": f1_score(y_test, y_pred, pos_label=0, zero_division=0),
+            # These will represent autism data as label =1
+            "precision_1": precision_score(y_test, y_pred, pos_label=1, zero_division=0),
+            "recall_1": recall_score(y_test, y_pred, pos_label=1, zero_division=0),
+            "f1_1": f1_score(y_test, y_pred, pos_label=1, zero_division=0)
+        })
+
+    pid_metrics = pd.DataFrame(pid_rows)
+    return pid_metrics
+
+
+def aggregate_pid_scores(pid_metrics, min_channels=20, min_pid=3):
+    """
+    Aggregate per-PID recall scores to region-level score.
+    """
+    # Keep only PIDs with enough channels
+    pid_metrics_ch = pid_metrics[pid_metrics["n_channel"] >= min_channels]
+
+    if pid_metrics_ch.empty or len(pid_metrics_ch)<min_pid:
+        return np.nan
+
+    # Weighted mean by number of channels
+    weights = pid_metrics["n_channel"]
+    weighted_recall = np.average(pid_metrics["recall_1"], weights=weights)
+    return weighted_recall
\ No newline at end of file
diff --git a/src/ephysatlas/regionclassifier.py b/src/ephysatlas/regionclassifier.py
index 6b1ecbd..eda90b3 100644
--- a/src/ephysatlas/regionclassifier.py
+++ b/src/ephysatlas/regionclassifier.py
@@ -189,7 +189,7 @@ def viterbi(
 
 def infer_regions(df_inference, path_model, n_folds=5):
     for fold in range(n_folds):
-        dict_model = load_model(path_model.joinpath(f"FOLD0{fold}"))
+        _, dict_model = load_model(path_model.joinpath(f"FOLD0{fold}"))
         classifier = dict_model["classifier"]
 
         df_inference["outside"] = 0

From a638a4e4dc1e7bb9b930e408f0f4d0563d5c6ae5 Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Tue, 9 Sep 2025 11:06:59 +0200
Subject: [PATCH 13/14] get dataframe

---
 src/ephysatlas/autism_decoding.py | 62 +++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 3 deletions(-)

diff --git a/src/ephysatlas/autism_decoding.py b/src/ephysatlas/autism_decoding.py
index 3d307b1..e5a8e05 100644
--- a/src/ephysatlas/autism_decoding.py
+++ b/src/ephysatlas/autism_decoding.py
@@ -14,6 +14,39 @@
 
 BASE_PATH = Path('/Users/gaellechapuis/Documents/Work/EphysAtlas')
 
+def remove_void_root(df_voltage, mapping, br, list_acronyms=None):
+    if list_acronyms is None:
+        list_acronyms = ['void', 'root', 'void_fluid'] # Todo remove void liquid
+    vect_void_root = br.acronym2id(list_acronyms)
+    df_voltage = df_voltage[~df_voltage[mapping + "_id"].isin(vect_void_root)]
+    return df_voltage
+
+
+def set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping):
+    set_common = set(id_region_voltage).intersection(set(id_region_autism))
+    df_voltage = df_voltage[df_voltage[mapping + "_id"].isin(set_common)]
+    df_autism = df_autism[df_autism[mapping + "_id"].isin(set_common)]
+    return df_voltage, df_autism
+
+def get_br_id_min_pid(df, mapping, min_pid):
+    df_pid = df[[mapping+'_id', 'pid']].copy()
+    df_pid = df_pid.drop_duplicates()
+    gr_reg = df_pid.groupby([mapping+'_id']).aggregate('count')
+    id_region = gr_reg[gr_reg['pid']>=min_pid].index
+    # Keeps rows with brain region col values
+    filtered_df = df[df[mapping+'_id'].isin(id_region)]
+    return filtered_df, id_region
+
+
+def get_br_id_min_channel(df, mapping, min_channel):
+    df_channel = df[[mapping+'_id', 'pid', 'channel']].copy()
+    df_channel = df_channel.drop_duplicates()  # This should not change the df
+    gr_reg = df_channel.groupby([mapping+'_id']).aggregate('count')
+    id_region = gr_reg[gr_reg['channel']>=min_channel].index
+    # Keeps rows with brain region col values
+    filtered_df = df[df[mapping+'_id'].isin(id_region)]
+    return filtered_df, id_region
+
 def download_baseline_data(local_data_path=BASE_PATH, label='2025_W28', one=None):
     if one is None: one = ONE(base_url='https://alyx.internationalbrainlab.org', mode='remote')
     download_path = ephysatlas.data.download_tables(local_data_path, label=label, one=one)
@@ -22,7 +55,9 @@ def download_baseline_data(local_data_path=BASE_PATH, label='2025_W28', one=None
 def get_autism_baseline_data(path_autism, path_baseline,
                              APPLY_DENOISING = False, APPLY_TRANSFORMER = True,
                              strict = True, load_denoised = False,
-                             features=None, br=None):
+                             features=None, br=None,
+                             min_pid=5, min_channel=200, mapping='Beryl',
+                             remove_voidroot=True):
 
     # This function assumes the data is already downloaded
 
@@ -59,6 +94,27 @@ def get_autism_baseline_data(path_autism, path_baseline,
     df_voltage = df_voltage.reset_index().dropna()
     df_autism = df_autism.reset_index().dropna()
 
+    # Remove bad channels ; keep only good channels with label 0
+    df_voltage = df_voltage[df_voltage["channel_labels"] == 0]
+    df_autism = df_autism[df_autism["channel_labels"] == 0]
+
+    # Remove void and root
+    if remove_voidroot:
+        df_voltage = remove_void_root(df_voltage, mapping, br)
+        df_autism = remove_void_root(df_autism, mapping, br)
+
+    # Check per region that there is min number of PID
+    df_voltage, id_region_voltage = get_br_id_min_pid(df_voltage, mapping, min_pid)
+    df_autism, id_region_autism = get_br_id_min_pid(df_autism, mapping, min_pid)
+    # Take union of region remaining in Autism / EA
+    df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping)
+
+    # Check per region that there is at least number channels
+    df_voltage, id_region_voltage = get_br_id_min_channel(df_voltage, mapping, min_channel)
+    df_autism, id_region_autism = get_br_id_min_channel(df_autism, mapping, min_channel)
+    # Take union of region remaining in Autism / EA
+    df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping)
+
     # -- Check that features are in df columns
     setfeature = set(df_voltage.columns).intersection(set(features))
     setfeature.discard("channel_labels")  # Remove channel labels as a feature
@@ -137,9 +193,9 @@ def pid_cv_region_summary(df_baseline, df_autism, feature_cols, model_type="log_
 
 
 
-def get_set_classifier_xy(df_region, feature_cols):
+def get_set_classifier_xy(df_region, feature_cols, label_col="label"):
     X = df_region[feature_cols].to_numpy()
-    y = df_region["label"].to_numpy()
+    y = df_region[label_col].to_numpy()
     return X, y
 
 

From f0ca5cbf31fe271664e14c998640878e5a40ac3d Mon Sep 17 00:00:00 2001
From: GaelleChapuis <gaelle.chapuis.ibl@gmail.com>
Date: Thu, 11 Sep 2025 16:25:13 +0200
Subject: [PATCH 14/14] decoder binary

---
 src/ephysatlas/autism_decoding.py | 252 ++++++++++++++++++++++++++----
 1 file changed, 223 insertions(+), 29 deletions(-)

diff --git a/src/ephysatlas/autism_decoding.py b/src/ephysatlas/autism_decoding.py
index e5a8e05..8db6968 100644
--- a/src/ephysatlas/autism_decoding.py
+++ b/src/ephysatlas/autism_decoding.py
@@ -11,6 +11,13 @@
 from ephysatlas.features import EphysTransformer
 from ephysatlas.plots import plot_histogram, select_series
 from ephysatlas.anatomy import ClassifierRegions, NEW_VOID
+import seaborn as sns
+import xgboost as xgb
+from sklearn.metrics import classification_report
+from sklearn.model_selection import StratifiedGroupKFold
+import shap
+from collections import Counter
+import matplotlib.pyplot as plt
 
 BASE_PATH = Path('/Users/gaellechapuis/Documents/Work/EphysAtlas')
 
@@ -52,6 +59,39 @@ def download_baseline_data(local_data_path=BASE_PATH, label='2025_W28', one=None
     download_path = ephysatlas.data.download_tables(local_data_path, label=label, one=one)
     return download_path
 
+def clean_df(df_autism, df_voltage, remove_voidroot, mapping, br, min_pid, min_channel, features):
+    df_voltage = df_voltage.reset_index().dropna()
+    df_autism = df_autism.reset_index().dropna()
+
+    # Remove bad channels ; keep only good channels with label 0
+    df_voltage = df_voltage[df_voltage["channel_labels"] == 0]
+    df_autism = df_autism[df_autism["channel_labels"] == 0]
+
+    # Remove void and root
+    if remove_voidroot:
+        df_voltage = remove_void_root(df_voltage, mapping, br)
+        df_autism = remove_void_root(df_autism, mapping, br)
+
+    # Check per region that there is min number of PID
+    df_voltage, id_region_voltage = get_br_id_min_pid(df_voltage, mapping, min_pid)
+    df_autism, id_region_autism = get_br_id_min_pid(df_autism, mapping, min_pid)
+    # Take union of region remaining in Autism / EA
+    df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping)
+
+    # Check per region that there is at least number channels
+    df_voltage, id_region_voltage = get_br_id_min_channel(df_voltage, mapping, min_channel)
+    df_autism, id_region_autism = get_br_id_min_channel(df_autism, mapping, min_channel)
+    # Take union of region remaining in Autism / EA
+    df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping)
+
+    # -- Check that features are in df columns
+    setfeature = set(df_voltage.columns).intersection(set(features))
+    setfeature.discard("channel_labels")  # Remove channel labels as a feature
+    features = sorted(list(setfeature))
+
+    return df_autism, df_voltage, features
+
+
 def get_autism_baseline_data(path_autism, path_baseline,
                              APPLY_DENOISING = False, APPLY_TRANSFORMER = True,
                              strict = True, load_denoised = False,
@@ -91,34 +131,7 @@ def get_autism_baseline_data(path_autism, path_baseline,
             df_voltage = EphysTransformer().fit_transform(df_voltage)
             df_autism = EphysTransformer().fit_transform(df_autism)
 
-    df_voltage = df_voltage.reset_index().dropna()
-    df_autism = df_autism.reset_index().dropna()
-
-    # Remove bad channels ; keep only good channels with label 0
-    df_voltage = df_voltage[df_voltage["channel_labels"] == 0]
-    df_autism = df_autism[df_autism["channel_labels"] == 0]
-
-    # Remove void and root
-    if remove_voidroot:
-        df_voltage = remove_void_root(df_voltage, mapping, br)
-        df_autism = remove_void_root(df_autism, mapping, br)
-
-    # Check per region that there is min number of PID
-    df_voltage, id_region_voltage = get_br_id_min_pid(df_voltage, mapping, min_pid)
-    df_autism, id_region_autism = get_br_id_min_pid(df_autism, mapping, min_pid)
-    # Take union of region remaining in Autism / EA
-    df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping)
-
-    # Check per region that there is at least number channels
-    df_voltage, id_region_voltage = get_br_id_min_channel(df_voltage, mapping, min_channel)
-    df_autism, id_region_autism = get_br_id_min_channel(df_autism, mapping, min_channel)
-    # Take union of region remaining in Autism / EA
-    df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping)
-
-    # -- Check that features are in df columns
-    setfeature = set(df_voltage.columns).intersection(set(features))
-    setfeature.discard("channel_labels")  # Remove channel labels as a feature
-    features = sorted(list(setfeature))
+    df_autism, df_voltage, features = clean_df(df_autism, df_voltage, remove_voidroot, mapping, br, min_pid, min_channel, features)
 
     return df_autism, df_voltage, features
 
@@ -261,4 +274,185 @@ def aggregate_pid_scores(pid_metrics, min_channels=20, min_pid=3):
     # Weighted mean by number of channels
     weights = pid_metrics["n_channel"]
     weighted_recall = np.average(pid_metrics["recall_1"], weights=weights)
-    return weighted_recall
\ No newline at end of file
+    return weighted_recall
+
+
+## ----- BINARY DECODER ------
+
+N_SPLIT = 3
+RANDOM_STATE = 42
+
+
+def get_data_per_fold(df_classes, features, label_col="label",
+                      n_split=N_SPLIT, shuffle=True, random_state=RANDOM_STATE):
+    # Get data sets
+    X, y = get_set_classifier_xy(df_classes, features, label_col=label_col)
+
+    # Stratify groups to remove entire PIDs
+    sgkf = StratifiedGroupKFold(n_splits=n_split, shuffle=shuffle, random_state=random_state)
+    # Instantiate dict to save statistics of fold
+    fold_data = dict()
+
+    for fold_idx, (train_idx, test_idx) in enumerate(sgkf.split(X, y, groups=df_classes["pid"]), start=1):
+
+        X_train, X_test = X[train_idx], X[test_idx]
+        y_train, y_test = y[train_idx], y[test_idx]
+
+        y_test_random = np.random.permutation(y_test)
+
+        # Get the train/test PIDs from the split
+        train_pids = df_classes.iloc[train_idx]["pid"].unique()
+        test_pids = df_classes.iloc[test_idx]["pid"].unique()
+
+        # Count how many unique PIDs of each label are in train/test
+        train_counts = df_classes[df_classes["pid"].isin(train_pids)].drop_duplicates("pid")["label"].value_counts()
+        test_counts = df_classes[df_classes["pid"].isin(test_pids)].drop_duplicates("pid")["label"].value_counts()
+
+        # Compute class weight within the fold
+        # Compute in 2 ways, as this depends on the type of classifier used: class_weight and scale_weight
+        # y_train and y_test come from your split
+        n_train = len(y_train)
+        classes_train = np.unique(y_train)
+        K = len(classes_train)  # This will always be 2 in our case
+        # count samples per class in y_train
+        counts = Counter(y_train)
+        # compute weights
+        class_weights = {cls: n_train / (K * counts[cls]) for cls in classes_train}
+        # Scale weight
+        n_pos = sum(y_train == 1)
+        n_neg = sum(y_train == 0)
+        scale_pos_weight = n_neg / n_pos
+
+        # Check there are the right N classes in test set
+        classes_test = np.unique(y_test)
+        is_test_balanced = len(classes_test) == K
+
+        fold_data[fold_idx] = {"fold": fold_idx,
+                               "features": features,
+                               "X_train": X_train,
+                               "X_test": X_test,
+                               "y_train": y_train,
+                               "y_test": y_test,
+                               "y_test_random": y_test_random,
+                               "n_train_channels": np.bincount(y_train),
+                               "n_test_channels": np.bincount(y_test),
+                               "n_train_pids": train_counts,
+                               "n_test_pids": test_counts,
+                               "list_train_pids": train_pids,
+                               "list_test_pids": test_pids,
+                               "class_weights": class_weights,
+                               "scale_pos_weight": scale_pos_weight,
+                               "random_state": random_state,
+                               "is_test_balanced": is_test_balanced}
+
+    return fold_data
+
+
+def get_weight_classes(df_baseline, df_compare):
+    scale_pos_weight = len(df_baseline) / len(df_compare)
+    return scale_pos_weight
+
+def train_classifer_one_fold(fold, eval_metric="logloss"):
+    X_train = fold["X_train"]
+    X_test = fold["X_test"]
+    y_train = fold["y_train"]
+    y_test = fold["y_test"]
+    y_test_random = fold['y_test_random']
+    features = fold['features']
+    scale_pos_weight = fold['scale_pos_weight']
+
+    report_perf = dict()
+
+    # Train classifier
+    model = xgb.XGBClassifier(use_label_encoder=False, eval_metric=eval_metric, scale_pos_weight=scale_pos_weight)
+    model.fit(X_train, y_train)
+    # Check on performance
+    y_pred = model.predict(X_test)
+
+    report_perf['classification_report'] = classification_report(y_test, y_pred, output_dict=True)
+    report_perf['classification_report_random'] = classification_report(y_test, y_test_random, output_dict=True)
+
+    # Create explainer
+    explainer = shap.TreeExplainer(model)
+    shap_values = explainer.shap_values(X_test)
+    # Take mean absolute SHAP value across samples
+    importance = np.abs(shap_values).mean(axis=0)
+
+    report_perf['shap_values'] = shap_values
+    report_perf['importance'] = importance
+    report_perf['features'] = features
+
+    return report_perf
+
+
+def print_classification_report(report):
+    # Convert to DataFrame
+    df_report = pd.DataFrame(report).transpose()
+    print(df_report.head())
+
+
+def get_sorted_list_feature_importance(features, importance):
+    # Put into a DataFrame for clarity
+    feature_importance = pd.DataFrame({
+        "feature": features,
+        "importance": importance
+    })
+    # Sort by importance
+    feature_importance = feature_importance.sort_values("importance", ascending=False).reset_index(drop=True)
+    list_imp = feature_importance['feature'].to_list()
+    return list_imp
+
+def plot_one_fold(fold, report_perf, label='label'):
+    importance = report_perf['importance']
+    shap_values = report_perf['shap_values']
+    features = report_perf['features']
+
+    report = report_perf['classification_report']
+    report_random = report_perf['classification_report_random']
+    print_classification_report(report)
+    print_classification_report(report_random)
+    print(fold['is_test_balanced'])
+
+
+    X_test = fold['X_test']
+    X_test_df = pd.DataFrame(X_test, columns=features)
+    plot_df = X_test_df.copy()
+    plot_df[label] = fold['y_test']
+
+    # 1. Global feature importance (summary plot)
+    # This shows which features matter most overall.
+    shap.summary_plot(shap_values, X_test, feature_names=features)
+
+    # 2. Cross plot of first 3 important features
+    list_feat = get_sorted_list_feature_importance(features, importance)
+    # Select first 3 for plotting
+    features_plot = list_feat[0:3]
+    # print(features_plot)
+    sns.pairplot(data=plot_df[features_plot + [label]], hue=label)
+
+    # 3. Feature-level effect (dependence plot)
+    # To see how a single feature influences predictions:
+    shap.dependence_plot(features_plot[0], shap_values, X_test_df)
+
+
+def plot_one_fold_region(fold, report_perf, mapping='Beryl'):
+    # Plot specific to training with region as information column
+    X_test = fold['X_test']
+    features = report_perf['features']
+    shap_values = report_perf['shap_values']
+    X_test_df = pd.DataFrame(X_test, columns=features)
+
+    # Place absolute SHAP values into dataframe
+    df_shap = pd.DataFrame(np.abs(shap_values), columns=features)
+    # Overwrite region column otherwise it contain the SHAP value for region ID
+    df_shap[mapping + "_id"] = X_test_df[mapping + "_id"].astype('int')
+    # Mean absolute SHAP per feature, stratified by region
+    grouped = df_shap.groupby(mapping + "_id").mean()
+
+    plt.figure(figsize=(12,6))
+    sns.heatmap(grouped, cmap="viridis", annot=False)
+    plt.title("Mean |SHAP| values per feature and region")
+    plt.ylabel(mapping + "_id (region)")
+    plt.xlabel("Feature")
+    plt.show()
+