diff --git a/src/ephysatlas/autism_decoding.py b/src/ephysatlas/autism_decoding.py new file mode 100644 index 0000000..8db6968 --- /dev/null +++ b/src/ephysatlas/autism_decoding.py @@ -0,0 +1,458 @@ + +# Get autism data + +import numpy as np +import pandas as pd +from pathlib import Path +from one.api import ONE +import ephysatlas.data +import ephysatlas.aggregation +from ephysatlas.features import voltage_features_set +from ephysatlas.features import EphysTransformer +from ephysatlas.plots import plot_histogram, select_series +from ephysatlas.anatomy import ClassifierRegions, NEW_VOID +import seaborn as sns +import xgboost as xgb +from sklearn.metrics import classification_report +from sklearn.model_selection import StratifiedGroupKFold +import shap +from collections import Counter +import matplotlib.pyplot as plt + +BASE_PATH = Path('/Users/gaellechapuis/Documents/Work/EphysAtlas') + +def remove_void_root(df_voltage, mapping, br, list_acronyms=None): + if list_acronyms is None: + list_acronyms = ['void', 'root', 'void_fluid'] # Todo remove void liquid + vect_void_root = br.acronym2id(list_acronyms) + df_voltage = df_voltage[~df_voltage[mapping + "_id"].isin(vect_void_root)] + return df_voltage + + +def set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping): + set_common = set(id_region_voltage).intersection(set(id_region_autism)) + df_voltage = df_voltage[df_voltage[mapping + "_id"].isin(set_common)] + df_autism = df_autism[df_autism[mapping + "_id"].isin(set_common)] + return df_voltage, df_autism + +def get_br_id_min_pid(df, mapping, min_pid): + df_pid = df[[mapping+'_id', 'pid']].copy() + df_pid = df_pid.drop_duplicates() + gr_reg = df_pid.groupby([mapping+'_id']).aggregate('count') + id_region = gr_reg[gr_reg['pid']>=min_pid].index + # Keeps rows with brain region col values + filtered_df = df[df[mapping+'_id'].isin(id_region)] + return filtered_df, id_region + + +def get_br_id_min_channel(df, mapping, min_channel): + df_channel = df[[mapping+'_id', 'pid', 'channel']].copy() + df_channel = df_channel.drop_duplicates() # This should not change the df + gr_reg = df_channel.groupby([mapping+'_id']).aggregate('count') + id_region = gr_reg[gr_reg['channel']>=min_channel].index + # Keeps rows with brain region col values + filtered_df = df[df[mapping+'_id'].isin(id_region)] + return filtered_df, id_region + +def download_baseline_data(local_data_path=BASE_PATH, label='2025_W28', one=None): + if one is None: one = ONE(base_url='https://alyx.internationalbrainlab.org', mode='remote') + download_path = ephysatlas.data.download_tables(local_data_path, label=label, one=one) + return download_path + +def clean_df(df_autism, df_voltage, remove_voidroot, mapping, br, min_pid, min_channel, features): + df_voltage = df_voltage.reset_index().dropna() + df_autism = df_autism.reset_index().dropna() + + # Remove bad channels ; keep only good channels with label 0 + df_voltage = df_voltage[df_voltage["channel_labels"] == 0] + df_autism = df_autism[df_autism["channel_labels"] == 0] + + # Remove void and root + if remove_voidroot: + df_voltage = remove_void_root(df_voltage, mapping, br) + df_autism = remove_void_root(df_autism, mapping, br) + + # Check per region that there is min number of PID + df_voltage, id_region_voltage = get_br_id_min_pid(df_voltage, mapping, min_pid) + df_autism, id_region_autism = get_br_id_min_pid(df_autism, mapping, min_pid) + # Take union of region remaining in Autism / EA + df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping) + + # Check per region that there is at least number channels + df_voltage, id_region_voltage = get_br_id_min_channel(df_voltage, mapping, min_channel) + df_autism, id_region_autism = get_br_id_min_channel(df_autism, mapping, min_channel) + # Take union of region remaining in Autism / EA + df_voltage, df_autism = set_common_region(df_voltage, df_autism, id_region_voltage, id_region_autism, mapping) + + # -- Check that features are in df columns + setfeature = set(df_voltage.columns).intersection(set(features)) + setfeature.discard("channel_labels") # Remove channel labels as a feature + features = sorted(list(setfeature)) + + return df_autism, df_voltage, features + + +def get_autism_baseline_data(path_autism, path_baseline, + APPLY_DENOISING = False, APPLY_TRANSFORMER = True, + strict = True, load_denoised = False, + features=None, br=None, + min_pid=5, min_channel=200, mapping='Beryl', + remove_voidroot=True): + + # This function assumes the data is already downloaded + + if br is None: + br = ClassifierRegions() + br.add_new_region(NEW_VOID) + + if features is None: features = voltage_features_set() + if not load_denoised: strict = False + + # ---- BASELINE DATA ---- + df_voltage = ephysatlas.data.read_features_from_disk(path_baseline, + strict=strict, + load_denoised=load_denoised) + + # ---- AUTISM DATA ---- + df_autism = ephysatlas.data.read_features_from_disk(path_autism, + strict=strict, + load_denoised=load_denoised) + # Apply denoising or transform + if not load_denoised: + if APPLY_DENOISING: # This applies the transformer automatically + # Apply denoising without bad channel interpolation + df_voltage['channel_labels'] = 0 + df_voltage = ephysatlas.aggregation.denoise_raw_features_data(df_voltage) + + df_autism['channel_labels'] = 0 + df_autism = ephysatlas.aggregation.denoise_raw_features_data(df_autism) + + elif APPLY_TRANSFORMER: # Applies transformer without denoising first + df_voltage = EphysTransformer().fit_transform(df_voltage) + df_autism = EphysTransformer().fit_transform(df_autism) + + df_autism, df_voltage, features = clean_df(df_autism, df_voltage, remove_voidroot, mapping, br, min_pid, min_channel, features) + + return df_autism, df_voltage, features + + +# ==== +from sklearn.preprocessing import StandardScaler +from sklearn.pipeline import Pipeline +from sklearn.linear_model import LogisticRegression +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import LeaveOneGroupOut +from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score + +def pid_cv_region_summary(df_baseline, df_autism, feature_cols, model_type="log_reg"): + """ + PID-wise cross-validation for a single brain region, with region-level aggregation. + + Parameters + ---------- + df_baseline : pd.DataFrame + Baseline data for this region + df_autism : pd.DataFrame + Autism data for this region + feature_cols : list + Features to use + model_type : str + "log_reg" (default) or "random_forest" + + Returns + ------- + pid_metrics : pd.DataFrame + Per-PID metrics: accuracy, precision, recall, f1 + region_summary : pd.Series + Region-level mean metrics aggregated over autism PIDs + final_model : fitted sklearn Pipeline on all data + """ + # Label datasets + df_baseline = df_baseline.copy() + df_autism = df_autism.copy() + df_baseline["label"] = 0 + df_autism["label"] = 1 + + # Combine data for this region + df_region = pd.concat([df_baseline, df_autism], ignore_index=True) + X, y = get_set_classifier_xy(df_region, feature_cols) + # Select classifier + if model_type == "log_reg": + clf = LogisticRegression(max_iter=500) + elif model_type == "random_forest": + clf = RandomForestClassifier(n_estimators=200, random_state=42) + else: + raise ValueError(f"Unknown model_type: {model_type}") + + # Do cross val + autism_pids = df_autism["pid"].unique() + pid_metrics = cross_val(df_region, feature_cols, clf, cv_pids=autism_pids) + weighted_recall = aggregate_pid_scores(pid_metrics, min_channels=20) + + # Train final model on all data + final_pipe = Pipeline([ + ("scaler", StandardScaler()), + ("clf", clf) + ]) + final_pipe.fit(X, y) + + feature_importances = pd.DataFrame({ + "feature": feature_cols, + "coef": clf.coef_.flatten(), + "abs_coef": np.abs(clf.coef_.flatten()) + }).sort_values("abs_coef", ascending=False) + + return pid_metrics, weighted_recall, feature_importances + + + +def get_set_classifier_xy(df_region, feature_cols, label_col="label"): + X = df_region[feature_cols].to_numpy() + y = df_region[label_col].to_numpy() + return X, y + + + +## +def cross_val(df_region, feature_cols, clf, cv_pids=None): + X, y = get_set_classifier_xy(df_region, feature_cols) + + if cv_pids is None: + cv_pids = df_region["pid"] + + logo = LeaveOneGroupOut() + pid_rows = [] + + # Cross-val each PID + groups = df_region["pid"].to_numpy() # PID grouping + for train_idx, test_idx in logo.split(df_region, y, groups=groups): + test_pid = groups[test_idx][0] + # Only leave out the PIDs to do cross-validation on + if test_pid not in cv_pids: + continue + + X_train, X_test = X[train_idx], X[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + + pipe = Pipeline([ + ("scaler", StandardScaler()), + ("clf", clf) + ]) + pipe.fit(X_train, y_train) + y_pred = pipe.predict(X_test) + + # Compute per-PID metrics + + pid_rows.append({ + "pid": test_pid, + "n_channel" : len(test_idx), + "accuracy": accuracy_score(y_test, y_pred), + # These will be set to 0 if only autism data cross-val + "precision_0": precision_score(y_test, y_pred, pos_label=0, zero_division=0), + "recall_0": recall_score(y_test, y_pred, pos_label=0, zero_division=0), + "f1_0": f1_score(y_test, y_pred, pos_label=0, zero_division=0), + # These will represent autism data as label =1 + "precision_1": precision_score(y_test, y_pred, pos_label=1, zero_division=0), + "recall_1": recall_score(y_test, y_pred, pos_label=1, zero_division=0), + "f1_1": f1_score(y_test, y_pred, pos_label=1, zero_division=0) + }) + + pid_metrics = pd.DataFrame(pid_rows) + return pid_metrics + + +def aggregate_pid_scores(pid_metrics, min_channels=20, min_pid=3): + """ + Aggregate per-PID recall scores to region-level score. + """ + # Keep only PIDs with enough channels + pid_metrics_ch = pid_metrics[pid_metrics["n_channel"] >= min_channels] + + if pid_metrics_ch.empty or len(pid_metrics_ch) n_min_sample_train: # Step 0 : Filter the train data to remove large outliers train_data = train_data[ @@ -115,7 +123,12 @@ def kde_proba_distribution( n_above = 0 n_below = 0 - # TODO for extremely high value (e.g. >10 IQR), remove them, put outlier score to 1 + # For extremely high value (e.g. >10 IQR), remove them, put outlier score to 1 + val_replace = np.mean(train_data) # Use the mean of the train data so it's well within boundaries + idx_replace = np.where( (test_data > np.mean(train_data) + 10 * iqr(train_data)) | + (test_data < np.mean(train_data) - 10 * iqr(train_data)) + )[0] + test_data[idx_replace] = val_replace if np.max(test_data) > np.max(x_train): # Pad above @@ -150,11 +163,13 @@ def kde_proba_distribution( # The outlier probability is the inverse outp = 1 - y_test + outp[idx_replace] = 1 # Replace extreme outliers that were set to default value to outlier score 1 return outp, x_train, hist_train -def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999, min_ch=15): +def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999, + min_ch=15, n_pid=3, min_ch_compute=100): # Regions regions = np.unique(df_new[mapping + "_id"]).astype(int) # Store the features that are outlier per brain region in a dict @@ -170,34 +185,37 @@ def kde_proba_1pid(df_base, df_new, features, mapping, p_thresh=0.999999, min_ch listout = list() for feature in features: - print(f"{feature} [{region}]") + # print(f"{feature} [{region}]") # Load data for that regions df_train = select_series( df_base, features=[feature], acronym=None, id=region, mapping=mapping ) - # Get channel indices that are in region, keeping only feature values df_test = select_series( df_new, features=[feature], acronym=None, id=region, mapping=mapping ) + df_pid = select_series( + df_base, features=['pid'], acronym=None, id=region, mapping=mapping + ) + # For all channels at once, test if outside the distribution for the given features train_data = df_train.to_numpy() test_data = df_test.to_numpy() - score_out, _, _ = kde_proba_distribution(train_data, test_data) - # score_out, _, _ = detect_outlier_histV2(train_data, test_data) + # score_out = 0 if N pid or N channel too small in training set + if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)): + score_out, _, _ = kde_proba_distribution(train_data, test_data) + else: + score_out = -1 * np.ones(test_data.shape) # Assign value -1 when cannot compute # Save into new column df_new_compute[feature + "_q"] = score_out df_new_compute[feature + "_extremes"] = 0 df_new_compute.loc[ df_new_compute[feature + "_q"] > p_thresh, feature + "_extremes" ] = 1 - # A region is assigned as having outliers if more than half its channels are outliers - # Condition on N minimum channel. - has_outliers = sum(df_new_compute[feature + "_extremes"]) > np.floor( - len(test_data) / 2 - ) - if len(test_data) >= min_ch and has_outliers: + # A region is assigned as having outliers if N minimum channel are outliers. + has_outliers = sum(df_new_compute[feature + "_extremes"]) >= min_ch + if has_outliers: listout.append(feature) if sum( df_new_compute["has_outliers"] == 0 @@ -289,3 +307,203 @@ def df_add_channel_label(df, pid, one=None): # Merge to get the labels column df = df.merge(df_label, left_index=True, right_index=True) return df + +# =========================================================== +# ====== SVM ================================================ + +def detect_modality_auto_bandwidth(data, peak_height_frac=0.1, resolution=500): + """ + Detect unimodal or multimodal distribution using KDE with Silverman's bandwidth. + + Parameters + ---------- + data : array-like + Raw data points. + peak_height_frac : float, optional + Fraction of maximum density for minimum peak height. + resolution : int, optional + Number of points in the KDE evaluation grid. + + Returns + ------- + modality : str + "unimodal" or "multimodal" + peaks : array + Indices of detected peaks in x_grid. + x_grid : array + Grid of x values for plotting/debugging. + density : array + KDE density values on x_grid. + bandwidth : float + Bandwidth used for KDE (Silverman's rule). + """ + + data = np.asarray(data) + n = len(data) + sigma = np.std(data) + + # Silverman's rule of thumb + bandwidth = 1.06 * sigma * n ** (-1 / 5) + + # KDE fit + kde = KernelDensity(kernel='gaussian', bandwidth=bandwidth).fit(data.reshape(-1, 1)) + + # Evaluate density + x_grid = np.linspace(data.min(), data.max(), resolution) + log_dens = kde.score_samples(x_grid[:, None]) + density = np.exp(log_dens) + + # Peak detection + peaks, _ = find_peaks(density, height=max(density) * peak_height_frac) + + modality = "unimodal" if len(peaks) <= 1 else "multimodal" + return modality, peaks, x_grid, density, bandwidth + + +def get_gamma_svm(X_train, f=0.4): + ''' + f : how many times larger you want gamma + ''' + modality, peaks, x_grid, density, bandwidth = detect_modality_auto_bandwidth(X_train) + num_peaks = len(peaks) + + if num_peaks <= 1 : + # Unimodal + sigma = ( iqr(X_train) / 1.349 ) # approximate standard deviation assuming normality + if sigma == 0.0: # Fall back in case IQR is 0 + sigma = np.std(X_train) / 16 + + sigma = sigma / np.sqrt(f) + + else: + # Multimodal + # Suppose you detected k peaks + k = num_peaks # from your detection method + + # Cluster the data into k groups + data_reshaped = X_train.reshape(-1, 1) + labels = KMeans(n_clusters=k, n_init=10).fit_predict(data_reshaped) + + # Compute per-cluster STD + cluster_stds = [np.std(X_train[labels == i]) for i in range(k)] + + # Use average STD for gamma + sigma = np.mean(cluster_stds) + + sigma = sigma / np.sqrt(f) + gamma = 1 / (2 * sigma ** 2) + return gamma + + +def outlier_score_svm(X_train, X_test, nu=0.2, f=0.4, kernel='rbf', output='prediction'): + # Step 0 : Filter the train data to remove large outliers + X_train = X_train[ + np.abs(X_train - np.median(X_train)) <= 5 * iqr(X_train) + ] + + gamma = get_gamma_svm(X_train, f=f) + model = OneClassSVM(kernel=kernel, gamma=gamma, nu=nu) + model.fit(X_train.reshape(-1, 1)) # This is slow for high N ; e.g. 29 seconds for 69k samples + + if output == 'prediction': + # Let the model decide what is the predicted output, score is set to 0 + pred_svm = model.predict(X_test.reshape(-1, 1)) + score_test = np.zeros(np.shape(pred_svm)) + elif output == 'score': + # Outputting a score instead of directly the SVM prediction + scores_train = model.decision_function(X_train.reshape(-1, 1)) + scores_test = model.decision_function(X_test.reshape(-1, 1)) + threshold = np.percentile(scores_train, 100 * nu) # nu fraction as outliers + pred_svm = np.where(scores_test >= threshold, 1, -1) + + # map the OneClassSVM output from {1, -1} into {0, 1} + pred_svm_01 = np.where(pred_svm == -1, 1, 0) + + return pred_svm_01, score_test + + +def generate_testset(X_train): + val_iqr = 3 * iqr(X_train) + X_test = np.linspace(min(X_train) - val_iqr, max(X_train) + val_iqr, 100).reshape(-1, 1) + return X_test + + +def score_svm_1pid(df_base, df_new, features, mapping, + min_ch_outlier=20, n_pid=3, min_ch_compute=100, max_ch_compute = 3000, p_thresh_kde=0.98): + # Regions + regions = np.unique(df_new[mapping + "_id"]).astype(int) + # Store the features that are outlier per brain region in a dict + dictout = dict((el, list()) for el in regions.tolist()) + dictout["mapping"] = mapping + dictout["features"] = features + + for count, region in tqdm.tqdm(enumerate(regions), total=len(regions)): + # Get channel indices that are in region, but keeping all info besides features + idx_reg = np.where(df_new[mapping + "_id"] == region) + df_new_compute = df_new.iloc[idx_reg].copy() + df_new_compute["has_outliers"] = False + + listout = list() + for feature in features: + # print(f"{feature} [{region}]") + # Load data for that regions + df_train = select_series( + df_base, features=[feature], acronym=None, id=region, mapping=mapping + ) + # Get channel indices that are in region, keeping only feature values + df_test = select_series( + df_new, features=[feature], acronym=None, id=region, mapping=mapping + ) + + df_pid = select_series( + df_base, features=['pid'], acronym=None, id=region, mapping=mapping + ) + + # For all channels at once, test if outside the distribution for the given features + train_data = df_train.to_numpy() + test_data = df_test.to_numpy() + + # print(f'Train N: {len(train_data)}, Test N: {len(test_data)}') + + # Outlier → 1 + # Inlier → 0 + + # score_out = 0 if N pid or N channel too small in training set + if bool((df_pid.nunique().values[0] >= n_pid) & (df_pid.shape[0] >= min_ch_compute)): + if len(train_data) > max_ch_compute: # Apply KDE method as SVM model.fit is too slow for high N + scoreq_out, _, _ = kde_proba_distribution(train_data, test_data) # This is a score to be thresholded + scoreq_out = scoreq_out.squeeze() # TODO this should not be necessary, place in kde_proba_distribution + score_out = scoreq_out > p_thresh_kde + score_out = score_out.astype(int) + + else: + pred_svm, _ = outlier_score_svm(train_data, test_data) + else: + score_out = np.zeros(test_data.shape) + # Save into new column + df_new_compute[feature + "_extremes"] = score_out + # A region is assigned as having outliers if more than N minimum channels are outliers + has_outliers = sum(df_new_compute[feature + "_extremes"]) > np.floor(min_ch_outlier) + if has_outliers: + listout.append(feature) + if sum( + df_new_compute["has_outliers"] == 0 + ): # Reassign only if entirely False + df_new_compute["has_outliers"] = True + # Save appended list of feature in dict + dictout[region] = listout + + # Concatenate dataframes + if count == 0: + df_save = df_new_compute.copy() + else: + df_save = pd.concat([df_save, df_new_compute]) + + if df_save["has_outliers"].sum() > 0: + has_outlier = True + else: + has_outlier = False + + # Resort by channel + df_save = df_save.sort_values(by=["channel"]) + return df_save, dictout, has_outlier diff --git a/src/ephysatlas/regionclassifier.py b/src/ephysatlas/regionclassifier.py index 6b1ecbd..eda90b3 100644 --- a/src/ephysatlas/regionclassifier.py +++ b/src/ephysatlas/regionclassifier.py @@ -189,7 +189,7 @@ def viterbi( def infer_regions(df_inference, path_model, n_folds=5): for fold in range(n_folds): - dict_model = load_model(path_model.joinpath(f"FOLD0{fold}")) + _, dict_model = load_model(path_model.joinpath(f"FOLD0{fold}")) classifier = dict_model["classifier"] df_inference["outside"] = 0 diff --git a/tests/test_outliers.py b/tests/test_outliers.py new file mode 100644 index 0000000..1b86baa --- /dev/null +++ b/tests/test_outliers.py @@ -0,0 +1,111 @@ +from ephysatlas import outliers +import numpy as np +import unittest + + +class TestOutliersKDEProba(unittest.TestCase): + + def test_usage(self): + # Set random seed for reproducibility + np.random.seed(42) # TODO this does not seem to work + # Generate train data from a normal distribution + train_data = np.random.normal(loc=-4.5, scale=1.2, size=2000) + # Generate test data from a normal distribution + test_data = np.random.normal(loc=-3.5, scale=0.4, size=60) + # Compute score + score, x, hist = outliers.kde_proba_distribution(train_data, test_data) + assert score.shape[0] == test_data.shape[0] + + score_out = np.array([0.94909745, 0.95985259, 0.94758700, 0.95592986, 0.93989972, + 0.96315955, 0.96212788, 0.94726918, 0.96550552, 0.96921661, + 0.94203708, 0.94796957, 0.94788689, 0.94578141, 0.96682835, + 0.95866918, 0.95697261, 0.94013844, 0.96537497, 0.94521747, + 0.96241355, 0.96347127, 0.97713305, 0.94508643, 0.94181560, + 0.96301632, 0.97997165, 0.97587523, 0.96314187, 0.95152157, + 0.96633449, 0.94571803, 0.96450351, 0.96466301, 0.97192622, + 0.98534690, 0.96404361, 0.95123884, 0.96199762, 0.94007265, + 0.94884578, 0.95349810, 0.95104398, 0.96286312, 0.94720050, + 0.97326999, 0.96212302, 0.96207678, 0.95545146, 0.96287692, + 0.96673165, 0.94662449, 0.94931726, 0.95603152, 0.94501664, + 0.94724124, 0.95189154, 0.96782068, 0.96339920, 0.96306370]) + + np.testing.assert_almost_equal(score, score_out) + + def test_bimodal(self): + # Check that a sample selected in between two Gaussian distributions is detected as an outlier + # Generate train data from 2 normal distributions + train_data = np.concatenate(( + np.random.normal(loc=-4.5, scale=1.2, size=2000), + np.random.normal(loc=17.5, scale=2.2, size=1000) + )) + + # Generate test data from a normal distribution + test_data = np.random.normal(loc=-3.5, scale=0.4, size=60) + # Add some clear outliers + test_data = np.concatenate((test_data, np.array([5.4, 6.4, 0, -9.0]))) + + # Compute score + score, _, _ = outliers.kde_proba_distribution(train_data, test_data) + assert score[-1] == 1 + + def test_pad_span(self): + # If taking a test value very far out, the train histogram will have more padded sample points + # We do not want this to influence the mean result of the score. Test to check the values don't change + + # Generate train data from a normal distribution + train_data = np.random.normal(loc=-4.5, scale=1.2, size=2000) + + # Test some clear outliers + test_data1 = np.array([-9.0, 3.3, -5.2, -7.8]) + test_data2 = np.array([-9.0, 3.3, -5.2, -1000]) + + # Compute score + score1, _, _ = outliers.kde_proba_distribution(train_data, test_data1) + score2, _, _ = outliers.kde_proba_distribution(train_data, test_data2) + + for i_sample in range(0, 3): + assert score1[i_sample] == score2[i_sample] + + + def test_log_transform(self): + # We want to check that doing twice a log/exp transform onto a Gaussian give the same values + + # Gaussian + # Generate train data from a normal distribution + train_data = np.random.normal(loc=-4.5, scale=1.2, size=2000) + # Generate test data linearly around train set + val_iqr = 3 * np.std(train_data) + test_data = np.linspace(min(train_data) - val_iqr, max(train_data) + val_iqr, 100).reshape(-1, 1) + # Compute score + score, x, hist = outliers.kde_proba_distribution(train_data, test_data) + + # Log normal + train_data_log = np.exp(train_data) + test_data_log = np.exp(test_data) + + # Compute score + score_log, _, _ = outliers.kde_proba_distribution(train_data_log, test_data_log) + # TODO this is different and the below does not pass! + np.testing.assert_almost_equal(np.exp(test_data), test_data_log) + + plot_debug = True + # Plot for debug + if plot_debug: + import matplotlib.pyplot as plt + from ephysatlas.plots import plot_histogram + + fig, axs = plt.subplots(1, 3) + axs[0].plot(test_data) + axs[1].plot(np.exp(test_data)) + axs[2].plot(test_data_log) + + fig, axs = plt.subplots(1, 1) + plot_histogram(train_data, ax=axs, normalise=True) + plt.plot(test_data, -1*(score-1), 'k.-') + plt.plot(np.log(test_data_log), -1 * (score_log - 1), 'b.-') + + fig, axs = plt.subplots(1, 1) + plot_histogram(train_data_log, ax=axs, normalise=True) + plt.plot(test_data_log, -1*(score_log-1), 'k.-') + +