# Leakage exploration

In [None]:
# Import modules
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifierCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split


In [None]:
# Configuration shared by the cells below
random_state = 50  # seed for the hold-out split (also passed to the decision trees)
test_size = 0.3    # fraction of the samples held out for testing

# Iris data: feature matrix, class labels and feature names
iris = load_iris()
X, y = iris.data, iris.target
feature_names = iris.feature_names

# Shuffled, seeded hold-out split into train and test partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    shuffle=True,
    random_state=random_state,
)

In [6]:
# Report the dimensions of the full data and of each partition of the split
shape_report = (
    ("X shape", X),
    ("y shape", y),
    ("X_train shape", X_train),
    ("X_test shape", X_test),
    ("y_train shape", y_train),
    ("y_test shape", y_test),
)
for label, array in shape_report:
    print(label, array.shape)



X shape (150, 4)
y shape (150,)
X_train shape (105, 4)
X_test shape (45, 4)
y_train shape (105,)
y_test shape (45,)


#  Leakage example 1:
### Train on whole data:

In [7]:
# Leakage on purpose (Fig. 2 in Sasse et al., 2025): the tree is fitted on the
# whole data, i.e. including the samples that are later used as the test set.
dt_raw_lkg1 = DecisionTreeClassifier(max_depth=10, random_state=random_state)
dt_raw_lkg1 = dt_raw_lkg1.fit(X, y)  # fit() returns the fitted estimator itself

# The "train" score is computed on the whole data, the "test" score on X_test
whole_data_acc = dt_raw_lkg1.score(X, y)
leaky_test_acc = dt_raw_lkg1.score(X_test, y_test)

print("Raw Data - Train accuracy:", whole_data_acc)
print("Raw Data - Test accuracy:", leaky_test_acc)


Raw Data - Train accuracy: 1.0
Raw Data - Test accuracy: 1.0


### Correct procedure:

In [8]:
# Correct procedure: fit on the training partition only, so that the test
# partition stays unseen until evaluation.
dt_raw = DecisionTreeClassifier(max_depth=10, random_state=random_state)
dt_raw = dt_raw.fit(X_train, y_train)  # fit() returns the fitted estimator itself

train_acc = dt_raw.score(X_train, y_train)
test_acc = dt_raw.score(X_test, y_test)

print("Raw Data - Train accuracy:", train_acc)
print("Raw Data - Test accuracy:", test_acc)

Raw Data - Train accuracy: 1.0
Raw Data - Test accuracy: 0.9555555555555556


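When the model was trained on the training set only and evaluated on the held-out test set, the test accuracy dropped (from 1.0 to about 0.96).
When the model was trained on the whole dataset, it scored perfectly on both the training data and the test data.
This is because the test samples were part of the training data, so the model had already seen them during training; the resulting test accuracy is therefore an overly optimistic estimate of performance on unseen data.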

# Leakage example 2:
### Feature selection on whole dataset:

This example uses the AOMIC dataset. The features are structural measures of grey matter, and the target is sex.

In [9]:
# Load the participant metadata and the brain features, both indexed by subject
df_meta = pd.read_csv("./participants.tsv", sep="\t", index_col=0).rename_axis("subject")
df_gmd = pd.read_csv("./aomic_brain_data.csv", sep=",", index_col=0).rename_axis("subject")

# Keep only subjects present in both tables and without any missing value
df_full = df_gmd.join(df_meta, how="inner").dropna(axis=0, how="any")

# Features: every brain-data column. Target: sex, encoded as 1 for 'male'
# and 0 for any other value.
X = df_full[df_gmd.columns].values
y = np.where(df_full['sex'].values == 'male', 1, 0)

# Reproducible cross-validation; the splits are materialised once so that the
# leaky and the leakage-free evaluation use exactly the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
cv_splits = list(cv.split(X, y))

# ========== Case 1: Data Leakage ==========
# Feature selection is fitted on the entire dataset, so the labels of the
# samples that later act as test folds influence which features are kept.
selector = SelectKBest(score_func=f_classif, k=5)
X_selected_leak = selector.fit_transform(X, y)

# Scaling is also fitted on the entire dataset (leakage): the mean and
# standard deviation include the samples of every test fold.
scaler = StandardScaler()
X_scaled_leak = scaler.fit_transform(X_selected_leak)

# Evaluate on the fixed folds; only the classifier is refitted per fold
model = RidgeClassifierCV()
scores_leakage = []
for train_idx, test_idx in cv_splits:
    model.fit(X_scaled_leak[train_idx], y[train_idx])
    fold_pred = model.predict(X_scaled_leak[test_idx])
    scores_leakage.append(accuracy_score(y[test_idx], fold_pred))
    

### Correct procedure:

In [10]:
# ========== Case 2: No Leakage ==========
# Selection, scaling and classification are bundled in one pipeline, so every
# step is refitted on the training part of each fold only.
pipeline = Pipeline(
    steps=[
        ('select', SelectKBest(score_func=f_classif, k=5)),
        ('scaler', StandardScaler()),
        ('clf', RidgeClassifierCV()),
    ]
)

# One accuracy per fixed fold: fit on the training indices, score on the test indices
scores_no_leakage = [
    accuracy_score(
        y[test_idx],
        pipeline.fit(X[train_idx], y[train_idx]).predict(X[test_idx]),
    )
    for train_idx, test_idx in cv_splits
]

In [11]:
# ========== Compare Results ==========
# Per-fold comparison of the leaky and the leakage-free evaluation.
# The fold ids are derived from the number of scores instead of being
# hard-coded to 1..5, so the table still builds if n_splits is changed.
fold_ids = np.arange(1, len(scores_leakage) + 1)

results_df = pd.DataFrame({
    'Fold': fold_ids,
    'Accuracy with Leakage': scores_leakage,
    'Accuracy without Leakage': scores_no_leakage,
    # Positive values mean the leaky procedure scored higher on that fold
    'Difference': np.array(scores_leakage) - np.array(scores_no_leakage)
})

print(results_df)
print("\nMean Accuracy with Leakage: ", np.mean(scores_leakage))
print("Mean Accuracy without Leakage: ", np.mean(scores_no_leakage))
print("Mean Difference: ", np.mean(results_df['Difference']))

   Fold  Accuracy with Leakage  Accuracy without Leakage  Difference
0     1               0.818182                  0.727273    0.090909
1     2               0.681818                  0.772727   -0.090909
2     3               0.727273                  0.772727   -0.045455
3     4               0.666667                  0.571429    0.095238
4     5               0.666667                  0.619048    0.047619

Mean Accuracy with Leakage:  0.7121212121212122
Mean Accuracy without Leakage:  0.6926406926406925
Mean Difference:  0.019480519480519477


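On average, the approach causing leakage yielded better performance than the correct approach (mean accuracy of about 0.712 vs. 0.693), although in two of the five folds the correct approach scored higher. Even though the effect of leakage is small in this example, in larger and more complex datasets its effect can be much more severe.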

It is important to note that the results, and the size of the leakage effect, may change with different models, seeds, samples, features, etc.
Also, leakage is complex, and it is often unclear where it might or might not show up. However, it is always important to avoid it in order to obtain valid estimates of model performance.
