# Feature scaling

In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

## What is Feature Scaling?

**Feature scaling is a method used to normalize the range of independent variables or features of data.**

## When to scale your data?

- **`Gradient Descent Based Algorithms`**

<img width="500" src="images/scaling_fig01.webp">

In [None]:
import timeit
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression

# Load the scikit-learn diabetes data set as (features, target) arrays
diabetes_X, diabetes_y = load_diabetes(return_X_y=True)

# Keep a single feature (column 2) as an (n_samples, 1) column so it can be
# passed straight to `fit`, then rescale it linearly onto [-1, 1].
raw = diabetes_X[:, None, 2]
# Use the vectorised ndarray reductions: the built-in max()/min() would iterate
# over the rows in Python and hand back a 1-element array instead of a scalar.
max_raw = raw.max()
min_raw = raw.min()
scaled = (2*raw - max_raw - min_raw)/(max_raw - min_raw)

def train_raw_data():
    """Fit a fresh LinearRegression on the unscaled feature."""
    LinearRegression().fit(raw, diabetes_y)

def train_scaled_data():
    """Fit a fresh LinearRegression on the [-1, 1]-scaled feature."""
    LinearRegression().fit(scaled, diabetes_y)

# Time 1000 fits of each variant; timeit returns the total elapsed seconds
raw_time = timeit.timeit(train_raw_data, number=1000)
scaled_time = timeit.timeit(train_scaled_data, number=1000)

# Print the time taken to train the model with raw data and scaled data
print(f"Raw data: {raw_time}s")
print(f"Scaled data: {scaled_time}s")


- **`Distance-Based Algorithms (very sensitive to the relative magnitudes of the different features)`**
    - Distance-based algorithms like KNN, K-means, and SVM are most affected by the range of features. 

- **`For feature engineering using PCA`**

- **`Linear regression`**

- **`L1 or L2 regularization`**

## When is scaling your data not necessary?

- **`Tree-based algorithms`**

## Different types of feature scaling

In [None]:
from helpers import plot_scaling

# Call the plotting helper imported from the `helpers` package.
# NOTE(review): presumably draws a side-by-side comparison of the scalers listed
# below — the helper's source is not part of this notebook, so verify.
plot_scaling.plot_scaling()

- Use `MinMaxScaler` as your default
- Use `RobustScaler` if you have outliers and can handle a larger range
- Use `StandardScaler` if you need standardized features (zero mean, unit variance)
- Use `Normalizer` sparingly - it normalizes rows, not columns

In [None]:
# Toy data set: four fruits, each described by a weight and a price
df = pd.DataFrame(
    data={
        'WEIGHT': [15, 18, 12, 10],
        'PRICE': [1, 3, 2, 5],
    },
    index=['Orange', 'Apple', 'Banana', 'Grape'],
)
df

### Normalization

- Also known as **min-max scaling** or **min-max normalization**, it is the simplest method and consists of **rescaling each feature so that its values fall in the range [0, 1]**. 


The general formula for normalization is given as:

<img width="150" src="images/formula_img01.png">

> We can also normalize to a different interval, e.g. choosing to have the variable lying in any [a, b] interval, a and b being real numbers.

In [None]:
# default range: MinMaxScaler rescales every column onto [0, 1]
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Re-attach the labels of `df` to the scaled values instead of repeating the
# column and index names by hand.
df1 = pd.DataFrame(scaler.fit_transform(df),
                   columns=df.columns,
                   index=df.index)

# Stars: the original values, one colour per fruit.
ax = df.plot.scatter(x='WEIGHT', y='PRICE', color=['red','green','blue','yellow'],
                     marker='*', s=80, label='BEFORE SCALING')

# Circles: the scaled values, drawn on the same axes for direct comparison.
df1.plot.scatter(x='WEIGHT', y='PRICE', color=['red','green','blue','yellow'],
                 marker='o', s=60, label='AFTER SCALING', ax=ax)

# Faint reference lines through the origin.
ax.axhline(0, color='red', alpha=0.2)
ax.axvline(0, color='red', alpha=0.2)
plt.show()

In [None]:
# [-1,1] range: same scaler, but with an explicit target interval
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1,1))

# Re-attach the labels of `df` to the scaled values instead of repeating the
# column and index names by hand.
df1 = pd.DataFrame(scaler.fit_transform(df),
                   columns=df.columns,
                   index=df.index)

# Stars: the original values, one colour per fruit.
ax = df.plot.scatter(x='WEIGHT', y='PRICE', color=['red','green','blue','yellow'],
                     marker='*', s=80, label='BEFORE SCALING')

# Circles: the scaled values, drawn on the same axes for direct comparison.
df1.plot.scatter(x='WEIGHT', y='PRICE', color=['red','green','blue','yellow'],
                 marker='o', s=60, label='AFTER SCALING', ax=ax)

# Faint reference lines through the origin.
ax.axhline(0, color='red', alpha=0.2)
ax.axvline(0, color='red', alpha=0.2)
plt.show()

### Standardization

- **Z-score normalization**, also known as **Z-score standardization** or **mean-variance scaling**.
- Feature standardization makes the values of each feature in the data **have zero mean and a standard deviation of one (unit variance)**. 


The general method of calculation is to determine the distribution mean and standard deviation for each feature and calculate the new data point by the following formula:

<img width="150" src="images/formula_img02.png">

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of the toy fruit data
scaler = StandardScaler()

# Re-attach the labels of `df` to the scaled values instead of repeating the
# column and index names by hand.
df2 = pd.DataFrame(scaler.fit_transform(df),
                   columns=df.columns,
                   index=df.index)

# Stars: the original values, one colour per fruit.
ax = df.plot.scatter(x='WEIGHT', y='PRICE', color=['red','green','blue','yellow'],
                     marker='*', s=80, label='BEFORE SCALING')

# Circles: the standardized values, drawn on the same axes for direct comparison.
df2.plot.scatter(x='WEIGHT', y='PRICE', color=['red','green','blue','yellow'],
                 marker='o', s=60, label='AFTER SCALING', ax=ax)

# Faint reference lines through the origin.
ax.axhline(0, color='red', alpha=0.2)
ax.axvline(0, color='red', alpha=0.2)
plt.show()

Practical example:

In [None]:
# Baseline: fit a Perceptron on the raw (unscaled) iris measurements
from sklearn import datasets
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

iris = datasets.load_iris()

# keep only two columns: sepal length (0) and petal length (2)
X, y = iris.data[:, [0, 2]], iris.target

# hold out 30% of the samples; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# construct and train in one expression (fit returns the estimator itself)
prcptrn = Perceptron(eta0=0.1, random_state=1).fit(X_train, y_train)

# evaluate on the held-out samples
y_predict = prcptrn.predict(X_test)
print(f"Misclassified examples {(y_test != y_predict).sum()}")
print(f"Accuracy Score {accuracy_score(y_test, y_predict):.3f}")

In [None]:
# Same Perceptron experiment, this time on standardized features
from sklearn import datasets
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

iris = datasets.load_iris()

# keep only two columns: sepal length (0) and petal length (2)
X, y = iris.data[:, [0, 2]], iris.target

# hold out 30% of the samples; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# fit the scaler on the training split only, then apply it to both splits
sc = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = sc.transform(X_train), sc.transform(X_test)

# construct and train in one expression (fit returns the estimator itself)
prcptrn = Perceptron(eta0=0.1, random_state=1).fit(X_train_scaled, y_train)

# evaluate on the scaled held-out samples
y_predict = prcptrn.predict(X_test_scaled)
print(f"Misclassified examples {(y_test != y_predict).sum()}")
print(f"Accuracy Score {accuracy_score(y_test, y_predict):.3f}")

### Robust Scaler

- Robust scaling is one of the **best scaling techniques when we have outliers present in our dataset**. It scales the data according to the interquartile range (IQR = 75th percentile − 25th percentile).

In [None]:
from sklearn.preprocessing import RobustScaler

# Apply robust scaling to every column of the toy fruit data
scaler = RobustScaler()

# Re-attach the labels of `df` to the scaled values instead of repeating the
# column and index names by hand.
df3 = pd.DataFrame(scaler.fit_transform(df),
                   columns=df.columns,
                   index=df.index)

# Stars: the original values, one colour per fruit.
ax = df.plot.scatter(x='WEIGHT', y='PRICE', color=['red','green','blue','yellow'],
                     marker='*', s=80, label='BEFORE SCALING')

# Circles: the robust-scaled values, drawn on the same axes for direct comparison.
df3.plot.scatter(x='WEIGHT', y='PRICE', color=['red','green','blue','yellow'],
                 marker='o', s=60, label='AFTER SCALING', ax=ax)

# Faint reference lines through the origin.
ax.axhline(0, color='red', alpha=0.2)
ax.axvline(0, color='red', alpha=0.2)
plt.show()

Effect of scaling using Standard Scaler and Robust Scaler:

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

# Same fruit data plus one extreme point ('Jackfruit') acting as an outlier
dfr = pd.DataFrame({'WEIGHT': [15, 18, 12,10,50],
                   'PRICE': [1,3,2,5,20]},
                   index = ['Orange','Apple','Banana','Grape','Jackfruit'])


scaler = StandardScaler()

# Re-attach the labels of `dfr` to the scaled values instead of repeating the
# column and index names by hand.
df21 = pd.DataFrame(scaler.fit_transform(dfr),
                    columns=dfr.columns,
                    index=dfr.index)

# Stars: the original values, one colour per fruit.
ax = dfr.plot.scatter(x='WEIGHT', y='PRICE', color=['red','green','blue','yellow','black'],
                      marker='*', s=80, label='BEFORE SCALING')

# Circles: StandardScaler output on the same axes.
df21.plot.scatter(x='WEIGHT', y='PRICE', color=['red','green','blue','yellow','black'],
                  marker='o', s=60, label='STANDARD', ax=ax, figsize=(12,6))

scaler = RobustScaler()

df31 = pd.DataFrame(scaler.fit_transform(dfr),
                    columns=dfr.columns,
                    index=dfr.index)

# Triangles: RobustScaler output on the same axes.
df31.plot.scatter(x='WEIGHT', y='PRICE', color=['red','green','blue','yellow','black'],
                  marker='v', s=60, label='ROBUST', ax=ax, figsize=(12,6))

# Faint reference lines through the origin.
ax.axhline(0, color='red', alpha=0.2)
ax.axvline(0, color='red', alpha=0.2)
plt.show()

### Scaling to unit length

**Normalizer works on the rows, not the columns.**

- It scales each data point such that the **feature vector has a Euclidean length of 1**.

This usually means dividing each component by the Euclidean length of the vector:

<img width="350" src="images/formula_img03.png">

## Applying Scaling Transformations

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# Load the breast cancer data set; its `.data` and `.target` attributes are
# what the following cells split and scale.
cancer = load_breast_cancer()

In [None]:
# Split features and labels into training and test parts. No test_size is
# given, so train_test_split's default proportion applies; random_state=1
# fixes the shuffle so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=1)

In [None]:
# Show the array shape of each split (training first, then test)
print(X_train.shape)
print(X_test.shape)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Create an unfitted MinMaxScaler with its default settings
scaler = MinMaxScaler()

In [None]:
# Fit the scaler on the training data only; the test data is never shown to fit
scaler.fit(X_train)

In [None]:
# transform the training data with the scaler fitted above
X_train_scaled = scaler.transform(X_train)

In [None]:
# print dataset properties before and after scaling:
# the shape of the scaled array, then the per-column (axis=0) minimum and
# maximum of the training data before and after the transformation
print(f"transformed shape: {X_train_scaled.shape}")
print(f"per-feature minimum before scaling:\n {X_train.min(axis=0)}")
print(f"per-feature maximum before scaling:\n {X_train.max(axis=0)}")
print(f"per-feature minimum after scaling:\n {X_train_scaled.min(axis=0)}")
print(f"per-feature maximum after scaling:\n {X_train_scaled.max(axis=0)}")

In [None]:
# transform test data with the scaler that was fitted on the training data
# (the scaler is not re-fitted here)
X_test_scaled = scaler.transform(X_test)

# print test data properties after scaling: per-column (axis=0) minimum and maximum
print(f"per-feature minimum after scaling:\n{X_test_scaled.min(axis=0)}")
print(f"per-feature maximum after scaling:\n{X_test_scaled.max(axis=0)}")

**MinMaxScaler (and all the other scalers) always
apply exactly the same transformation to the training and the test set**, so the scaled test set's per-feature minimum and maximum are generally not exactly 0 and 1.

## Scaling Training and Test Data the Same Way

**It is important to apply exactly the same transformation to the training set and the test set for the supervised model to work on the test set.**

In [None]:
from sklearn.datasets import make_blobs

In [None]:
# make synthetic data: 50 samples drawn around 5 cluster centers; the second
# return value of make_blobs is discarded (`_`), only the points are kept
X, _ = make_blobs(n_samples=50, centers=5, random_state=4, cluster_std=2)

# split it into training and test sets, holding out 10% of the points for testing
X_train, X_test = train_test_split(X, random_state=5, test_size=.1)

In [None]:
# Three panels side by side: original data, correctly scaled data, and
# data where the test set was (wrongly) scaled on its own
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(13, 4))

# panel 0: raw training and test points
axes[0].scatter(X_train[:, 0], X_train[:, 1], c="blue", label="Training set", s=60)
axes[0].scatter(X_test[:, 0], X_test[:, 1], marker='^', c="red", label="Test set", s=60)
axes[0].set_title("Original Data")
axes[0].legend(loc='upper left')

# fit a MinMaxScaler on the training set and apply it to both sets
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# panel 1: both sets transformed by the same fitted scaler
axes[1].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1], c="blue", label="Training set", s=60)
axes[1].scatter(X_test_scaled[:, 0], X_test_scaled[:, 1], marker='^',c="red", label="Test set", s=60)
axes[1].set_title("Scaled Data")

# DO NOT DO THIS! For illustration purposes only.
# A second scaler is fitted on the test set alone, which forces the test set
# min to 0 and the test set max to 1.
test_scaler = MinMaxScaler()
X_test_scaled_badly = test_scaler.fit_transform(X_test)

# panel 2: training set scaled correctly, test set scaled by its own scaler
axes[2].scatter(X_train_scaled[:, 0], X_train_scaled[:, 1], c="blue", label="training set", s=60)
axes[2].scatter(X_test_scaled_badly[:, 0], X_test_scaled_badly[:, 1], marker='^', c="red", label="test set", s=60)
axes[2].set_title("Improperly Scaled Data")

# identical axis labels on every panel
for ax in axes:
    ax.set(xlabel="Feature 0", ylabel="Feature 1")

plt.show()

In [None]:
from sklearn.preprocessing import StandardScaler
# One scaler instance, used to show two equivalent ways of scaling X
scaler = StandardScaler()

# calling fit and transform in sequence (using method chaining):
# fit() returns the fitted scaler, so transform() can be called on its result
X_scaled = scaler.fit(X).transform(X)

# same result, but more efficient computation: a single call fits and transforms
X_scaled_d = scaler.fit_transform(X)

## The Effect of Preprocessing on Supervised Learning

In [None]:
from sklearn.svm import SVC

# Baseline: an SVM trained directly on the unscaled breast cancer features
cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0
)

# construct and train in one expression (fit returns the estimator itself)
svm = SVC(C=100).fit(X_train, y_train)

print(f"Test set accuracy: {svm.score(X_test, y_test):.2f}")

In [None]:
# preprocessing using 0-1 scaling: fit on the training split, apply to both splits
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# train a new SVM on the scaled training data
svm = SVC(C=100).fit(X_train_scaled, y_train)

# evaluate on the scaled test set
print(f"Scaled test set accuracy: {svm.score(X_test_scaled, y_test):.2f}")

In [None]:
# preprocessing using zero mean and unit variance scaling
from sklearn.preprocessing import StandardScaler

# fit on the training split only, then apply the same transformation to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# learning an SVM on the scaled training data; build a fresh estimator with the
# same C as the earlier SVM cells rather than re-fitting whatever object a
# previous cell left bound to `svm`
svm = SVC(C=100)
svm.fit(X_train_scaled, y_train)

# scoring on the scaled test set
print(f"Test set accuracy: {svm.score(X_test_scaled, y_test):.2f}")

In [None]:
from sklearn.preprocessing import RobustScaler

# preprocessing using robust scaling: fit on the training split only, then
# apply the same transformation to both splits
scaler = RobustScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# learning an SVM on the scaled training data; build a fresh estimator with the
# same C as the earlier SVM cells rather than re-fitting whatever object a
# previous cell left bound to `svm`
svm = SVC(C=100)
svm.fit(X_train_scaled, y_train)

# scoring on the scaled test set
print(f"Test set accuracy: {svm.score(X_test_scaled, y_test):.2f}")

## Tips

<img src="images/scaling_fig02.png">