<a href="https://colab.research.google.com/github/krishdb38/All_DL/blob/master/How_to_Avoid_Data_Leakage_When_Performing_Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import numpy as np

## Data Preparation with Train and Test Sets

In [2]:
# Build a synthetic classification dataset to experiment with.
from sklearn.datasets import make_classification

# 1000 samples x 20 features (15 informative, 5 redundant); fixed seed for reproducibility.
x, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

# Report the shapes of the feature matrix and the label vector.
print(x.shape, y.shape)

(1000, 20) (1000,)


In [6]:
# The previous cell generated a synthetic dataset with 1000 rows and 20 columns.
# Preview the first five rows as a DataFrame.
pd.DataFrame(x).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.292995,-4.212231,-1.288332,-2.178498,-0.645277,2.580977,0.284224,-7.182793,-1.912111,2.737295,0.813957,3.969737,-2.669398,3.346923,4.197918,0.99991,-0.302019,-4.431706,-2.826467,0.449168
1,-0.068399,5.518841,11.238977,-5.0397,-2.086784,2.149685,0.559734,15.113777,-3.071834,-2.574584,3.324576,2.067542,-5.249258,-2.1545,4.931091,1.296735,-3.186133,-3.089948,1.190299,1.620256
2,0.731616,-0.684686,-0.981742,-2.552465,-5.270308,-1.561498,-1.169269,-2.104087,-1.131139,4.654775,-2.786596,-2.034761,2.149657,-0.134154,-1.198231,-2.720604,-0.123961,5.654297,-0.646599,-3.15653
3,2.309107,-0.320548,-6.591664,1.070525,-4.418769,1.134274,2.340813,-5.983425,0.675917,-1.007879,-0.761441,6.866297,1.44227,1.768678,5.173661,-1.070164,-2.447064,-1.109038,-2.997035,1.993212
4,-0.488406,-3.213065,1.100805,-1.356223,5.325086,0.729179,-0.25704,-1.035284,0.478013,-0.010764,-0.227408,2.551456,0.951594,-2.91491,-2.186843,-1.089129,1.406454,3.082424,0.925835,-2.326362


In [None]:
# Next, we evaluate a model on the scaled dataset, starting with the naive (incorrect) approach.

### Train-Test Evaluation with Naive Data Preparation

In [7]:
# The naive approach applies the data preparation method first, then splits the data, and finally evaluates the model.
# We can normalize the input variables using the MinMaxScaler class,
# which is first defined with the default configuration, scaling the data to the range 0-1.
# Then fit_transform() is called to fit the scaler on the dataset and apply it to the dataset in a single step.


# The result is a normalized version of the input variables, where each column in the array is normalized separately (i.e., has its own minimum and maximum calculated).

In [9]:
# Normalize every feature to the 0-1 range using min/max computed over the WHOLE dataset.
# (Naive approach: the scaler also sees the rows that later end up in the test set.)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x = scaler.fit_transform(x)  # fit and transform in a single step

In [10]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1,
)

In [13]:
# Shapes of the train/test feature matrices followed by the train/test label vectors.
print(*(part.shape for part in (x_train, x_test, y_train, y_test)))

(700, 20) (300, 20) (700,) (300,)


## Fit the Model

In [14]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier with default settings on the training split.
model = LogisticRegression().fit(x_train, y_train)
model  # last expression, so the fitted estimator is displayed as the cell output

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [17]:

from sklearn.metrics import accuracy_score

# Predict labels for the held-out rows, then score them against the true labels.
yhat = model.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=yhat)

# Report accuracy as a percentage.
print("Accuracy of the Model ", accuracy * 100)

Accuracy of the Model  84.33333333333334


### All of the above steps in a single cell

In [21]:
# Naive approach, end to end: normalize the full dataset first, then split and evaluate.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Generate the synthetic dataset.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

# Normalize using min/max computed over ALL rows (this is where the leak happens).
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Hold out 33% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=1
)

# Fit the classifier on the training split.
model = LogisticRegression().fit(X_train, y_train)

# Predict on the held-out rows and report accuracy as a percentage.
yhat = model.predict(X_test)
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.3f' % (accuracy*100))

Accuracy: 84.848


## Train-Test Evaluation With Correct Data Preparation
Fit the preparation on the training set, then apply the transformation to the train and test sets

In [22]:
# Start again from the RAW features: `x` was overwritten in the naive section with
# data normalized on the full dataset, so splitting it here would carry the leak along.
from sklearn.datasets import make_classification

x, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)

# Split first; the scaler is fitted afterwards on the training portion only.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=1)

In [23]:
# Learn each feature's minimum and maximum from the training rows only ...
scaler = MinMaxScaler().fit(x_train)

# ... then apply that same fitted transform to both the train and test sets.
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)


In [24]:
# This avoids data leakage because the minimum and maximum value of each
# input variable are computed using only the training dataset (x_train) instead of the entire dataset (x).

# The model can then be evaluated as before.

In [None]:
# Try all of these steps together
from sklearn.datasets import  make_classification
from sklearn.model_selection import  train_test_split
from sklearn.preprocessing import  MinMaxScaler
from sklearn.linear_model import LogisticRegression
