# Avoid Preprocessing Leakage

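- Preprocessing leakage occurs when a preprocessing step (e.g. scaling, imputation, or feature selection) is fitted on data that includes the test set. It leads to biased performance estimates and overfitting, resulting in inflated accuracy scores during testing.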
- To avoid preprocessing leakage, use separate datasets for training, validation, and testing, and fit every preprocessing step on the training data only; then apply the already-fitted transformations (without refitting) to the validation and test data.

In [18]:
# Instead of this (feature selection is fitted before the train/test split)
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Synthetic data: labels are drawn independently of the features, so there is
# no real signal to learn and honest held-out accuracy should be about 0.5.
n_samples = 200
n_features = 10000
n_classes = 2
rng = np.random.RandomState(42)
X = rng.standard_normal(size=(n_samples, n_features))
y = rng.choice(n_classes, size=n_samples)

# LEAK (intentional, for demonstration): the selector is fitted on every row,
# including the rows that end up in the test split below, so the 25 retained
# features were chosen with knowledge of the test labels.
X_selected = SelectKBest(k=25).fit(X, y).transform(X)

# Splitting only after feature selection is too late to keep the test rows unseen.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, random_state=42
)

gbc = GradientBoostingClassifier(random_state=1).fit(X_train, y_train)
y_pred = gbc.predict(X_test)

# expected accuracy ~0.5; reported accuracy 0.76
accuracy_score(y_test, y_pred)

0.76

In [19]:
# Do this
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

In [13]:
# Build a purely random classification problem: a Gaussian feature matrix and
# labels sampled independently of it, both from one seeded generator so the
# notebook is reproducible.
n_samples = 200
n_features = 10000
n_classes = 2
rng = np.random.RandomState(42)
X = rng.standard_normal(size=(n_samples, n_features))  # draw features first
y = rng.choice(n_classes, size=n_samples)              # then the labels

In [14]:
# Hold out the test rows first, before any preprocessing step is fitted.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [15]:
# Fit the feature selector on the training split only ...
selector = SelectKBest(k=25).fit(X_train, y_train)

# ... then apply that same fitted selection to both splits (no refit on test).
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

In [16]:
# Fit the classifier on the selected training features.
gbc = GradientBoostingClassifier(random_state=1).fit(X_train_selected, y_train)

# Accuracy on the rows the model was trained on.
y_train_pred = gbc.predict(X_train_selected)
train_accuracy = accuracy_score(y_train, y_train_pred)

# Accuracy on the held-out rows, which neither the selector nor the model saw.
y_test_pred = gbc.predict(X_test_selected)
test_accuracy = accuracy_score(y_test, y_test_pred)

In [20]:
# Show both scores, rounded to two decimals, one per line.
train_report = "Training accuracy: {:.2f}".format(train_accuracy)
test_report = "Test accuracy: {:.2f}".format(test_accuracy)
print(train_report, test_report, sep="\n")

Training accuracy: 1.00
Test accuracy: 0.46
