# Avoid Including Test Data in the Training Data

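- Keeping the test set separate from the training set does not by itself prevent overfitting, but it makes overfitting detectable and gives an unbiased estimate of how well the model generalizes.
- It is crucial for obtaining trustworthy accuracy estimates and for building reliable ML-enabled systems.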

In [5]:
# Instead of this
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge

def load_data():
    """Read ``data.csv`` and return ``(features, target)``: all columns but 'target', and the 'target' column."""
    frame = pd.read_csv('data.csv')
    features, target = frame.drop(columns='target'), frame['target']
    return features, target

# Pull the features and the target via load_data()
X, y = load_data()

# Hold out 30% of the rows, then (the anti-pattern this cell demonstrates) append them back onto the training rows
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)
X_train = pd.concat([X_train, X_test], axis=0)
y_train = pd.concat([y_train, y_test], axis=0)

# Fit on the contaminated training set; fit() returns the estimator itself
lr = LinearRegression().fit(X_train, y_train)

# Score on the "test" rows -- which the model has already seen while fitting,
# so this number is not a genuine held-out estimate
lr_score = lr.score(X_test, y_test)
print(f"Model score: {lr_score}")

Model score: -0.01289618176228724


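Lines 17 and 18 of the cell above (the two `pd.concat` calls) show the problem: the test rows are appended back onto the training data, so the model is then scored on rows it was fitted on. The resulting score (about -0.013) is therefore optimistically biased compared with the clean evaluation below (about -0.145).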

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

def load_data(path='data.csv', target='target'):
    """Load a CSV file and split it into a feature frame and a target column.

    Parameters
    ----------
    path : str or path-like, default 'data.csv'
        Location of the CSV file to read.
    target : str, default 'target'
        Name of the column holding the values to predict.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except ``target``.
    y : pandas.Series
        The ``target`` column.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the file.
    """
    data = pd.read_csv(path)
    # Fail with a message that names the missing column and the file, rather
    # than the bare KeyError pandas would raise from drop().
    if target not in data.columns:
        raise KeyError(
            f"column {target!r} not found in {path!r}; "
            f"available columns: {list(data.columns)}"
        )
    X = data.drop(columns=target)
    y = data[target]
    return X, y


# Pull the features and the target via load_data()
X, y = load_data()

# Reserve 30% of the rows as a held-out test set (fixed seed so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

# Fit on the training portion only; fit() returns the estimator itself
model = LinearRegression().fit(X_train, y_train)

# Score on the test portion, which took no part in fitting
score = model.score(X_test, y_test)
print(f"Model score: {score}")

Model score: -0.14471621073493934
