In [2]:
# naive approach to normalizing the data before splitting the data and evaluating the model
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [5]:
# define dataset
x, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5, random_state=7)

# normalize the dataset (MinMaxScaler rescales features to a range; it does not standardize)
# NOTE: this is the deliberately naive step -- the scaler is fit on ALL rows before the
# split, so information from the rows that later become the test set leaks into training.
scaler = MinMaxScaler()
x = scaler.fit_transform(x)


# Split into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=1)

# fit the model
model = LogisticRegression()
model.fit(x_train, y_train)

# evaluate model
yhat = model.predict(x_test)

# evaluate prediction
accuracy = accuracy_score(y_test, yhat)
# use the percent format spec: round(accuracy, 3)*100 can print float noise
# such as 56.89999999999999
print(f'Accuracy: {accuracy:.1%}')

Accuracy: 84.8%



### Train-Test Evaluation with Correct Data Preparation

In [9]:
# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5,
                           random_state=7)

# split into train and test sets
# Split the freshly generated, unscaled X. The lowercase `x` used previously was a
# leftover from an earlier cell (already scaled on the full dataset), which both
# reintroduced the leakage and made this cell fail on a fresh kernel.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# define the scaler
scaler = MinMaxScaler()

# fit on the training dataset only, so the test rows do not influence the scaling
scaler.fit(x_train)

# scale the train and test sets with the parameters learned from the training set
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# fit the model
model = LogisticRegression()
model.fit(x_train, y_train)

# Evaluate the model
yhat = model.predict(x_test)

# Evaluate predictions
score = accuracy_score(y_test, yhat)
# use the percent format spec: round(score, 3) * 100 can print float noise
print(f'Accuracy: {score:.1%}')

Accuracy: 85.5%


### Data Preparation with k-fold Cross-Validation

In [10]:
# naive data preparation for model evaluation with k-fold cross-validation
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [13]:
# define dataset
X, y = make_classification(n_samples=1000, n_features=20, n_informative=15, n_redundant=5,
                           random_state=7)
# normalize the dataset (MinMaxScaler rescales features to a range; it does not standardize)
# NOTE: deliberately naive -- the scaler is fit on the whole dataset before
# cross-validation, so every fold's held-out rows have influenced the scaling.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# define the model
model = LogisticRegression()

# define the evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)

# evaluate the model using cross validation
# Pass the X that was just scaled above; the lowercase `x` used previously only
# existed as a leftover from an earlier cell and is undefined on a fresh kernel.
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=1)
print(f"Accuracy: {np.round(np.mean(scores), 2)*100}")

Accuracy: 85.0


In [14]:
# correct data preparation for model evaluation with k-fold cross-validation
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [None]:
# build the synthetic classification dataset
x, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=15,
    n_redundant=5,
    random_state=7,
)

# chain the scaler and the classifier into a single estimator
steps = [
    ('scaler', MinMaxScaler()),
    ('model', LogisticRegression()),
]
pipeline = Pipeline(steps=steps)

