# Impact of Feature Scaling in Machine Learning Algorithms

An overview of the impact of feature scaling in MLs algorithms

In [1]:
import pandas as pd
import numpy as np
# import the datasets from UCI
from ucimlrepo import fetch_ucirepo   
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)
breast_cancer_wisconsin_original = fetch_ucirepo(id=15)
dry_bean_dataset = fetch_ucirepo(id=602)
glass_identification = fetch_ucirepo(id=42) 
heart_disease = fetch_ucirepo(id=45)
hepatitis = fetch_ucirepo(id=46)
iris = fetch_ucirepo(id=53)
letter_recognition = fetch_ucirepo(id=59) 
lung_cancer = fetch_ucirepo(id=62)
magic_gamma_telescope = fetch_ucirepo(id=159) 
parkinsons = fetch_ucirepo(id=174) 
rice_cammeo_and_osmancik = fetch_ucirepo(id=545)
wine = fetch_ucirepo(id=109) 

In [2]:
dataset_list = [
    'breast_cancer_wisconsin_diagnostic',
    'breast_cancer_wisconsin_original',
    'dry_bean_dataset',
    'glass_identification',
    'heart_disease',
    'hepatitis',
    'iris',
    'letter_recognition',
    'magic_gamma_telescope',
    'parkinsons',
    'rice_cammeo_and_osmancik',
    'wine'
    ]

In [3]:
# convert into dataframe for some analysis and other conversion
df_breast_cancer_wisconsin_diagnostic = breast_cancer_wisconsin_diagnostic.data.features.join(breast_cancer_wisconsin_diagnostic.data.targets)
df_breast_cancer_wisconsin_original = breast_cancer_wisconsin_original.data.features.join(breast_cancer_wisconsin_original.data.targets)
df_dry_bean_dataset = dry_bean_dataset.data.features.join(dry_bean_dataset.data.targets)
df_glass_identification = glass_identification.data.features.join(glass_identification.data.targets)
df_heart_disease = heart_disease.data.features.join(heart_disease.data.targets)
df_hepatitis = hepatitis.data.features.join(hepatitis.data.targets)
df_iris = iris.data.features.join(iris.data.targets)
df_letter_recognition = letter_recognition.data.features.join(letter_recognition.data.targets)
df_lung_cancer = lung_cancer.data.features.join(lung_cancer.data.targets)
df_magic_gamma_telescope = magic_gamma_telescope.data.features.join(magic_gamma_telescope.data.targets)
df_parkinsons = parkinsons.data.features.join(parkinsons.data.targets)
df_rice_cammeo_and_osmancik = rice_cammeo_and_osmancik.data.features.join(rice_cammeo_and_osmancik.data.targets)
df_wine = wine.data.features.join(wine.data.targets)

## Analysis if all the feature are non-null and numeric type

In [4]:
df_breast_cancer_wisconsin_diagnostic.info()
df_breast_cancer_wisconsin_original.info()
df_dry_bean_dataset.info()
df_glass_identification.info()
df_heart_disease.info()
df_hepatitis.info()
df_iris.info()
df_letter_recognition.info()
df_lung_cancer.info()
df_magic_gamma_telescope.info()
df_parkinsons.info()
df_rice_cammeo_and_osmancik.info()
df_wine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   radius1             569 non-null    float64
 1   texture1            569 non-null    float64
 2   perimeter1          569 non-null    float64
 3   area1               569 non-null    float64
 4   smoothness1         569 non-null    float64
 5   compactness1        569 non-null    float64
 6   concavity1          569 non-null    float64
 7   concave_points1     569 non-null    float64
 8   symmetry1           569 non-null    float64
 9   fractal_dimension1  569 non-null    float64
 10  radius2             569 non-null    float64
 11  texture2            569 non-null    float64
 12  perimeter2          569 non-null    float64
 13  area2               569 non-null    float64
 14  smoothness2         569 non-null    float64
 15  compactness2        569 non-null    float64
 16  concavit

In [5]:
df_breast_cancer_wisconsin_original = df_breast_cancer_wisconsin_original.fillna(0)
df_lung_cancer = df_lung_cancer.fillna(0)

## Train and test split

In [6]:
from sklearn.model_selection import train_test_split
for i in dataset_list:
    exec('X_train_{}, X_test_{}, y_train_{}, y_test_{} = train_test_split({}.data.features, {}.data.targets, test_size=0.3, random_state=0)'.format(i,i,i,i,i,i))

## Scaling Methods

In [7]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
scaling_list = [
    'MinMaxScaler',
    'MaxAbsScaler',
    'StandardScaler',
    'ParetoScaling',
    'StandardStabilityScaling',
    'MeanCentered',
    'MedianAbsScaler',
    'RobustScaler',
    'PowerTransformer',
    'QuantileTransformer',
    'DecimalScaling',
    'TanhTransformer',
    'LogisticSigmoidTransformer',
    'HyperbolicTangentTransformer'
]

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
from typing import List

class MeanCentered(BaseEstimator, TransformerMixin):
    def __init__(self, cols: List[str]):
        self.cols = cols

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        self.means_ = {col: np.mean(X[col]) for col in self.cols}
        self.stds_ = {col: np.std(X[col]) for col in self.cols}
        return self

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        for col in self.cols:
            X[col] = (X[col] - self.means_[col])
        return X
    
class MedianAbsScaler(BaseEstimator, TransformerMixin):
    def __init__(self, cols: List[str]):
        self.cols = cols

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        self.median_ = {col: np.median(X[col]) for col in self.cols}
        self.mad_ = {col: np.median(np.absolute(X[col]-np.median(X[col]))) for col in self.cols}
        return self

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        for col in self.cols:
            X[col] = (X[col] - self.median_[col]) / self.mad_[col]
        return X
    
class StandardStabilityScaling(BaseEstimator, TransformerMixin):
    def __init__(self, cols: List[str]):
        self.cols = cols

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        self.means_ = {col: np.mean(X[col]) for col in self.cols}
        self.stds_ = {col: np.std(X[col]) for col in self.cols}
        return self

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        for col in self.cols:
            X[col] = ((X[col] - self.means_[col]) / self.stds_[col])*(self.means_[col]/self.stds_[col])
        return X
    
class ParetoScaling(BaseEstimator, TransformerMixin):
    def __init__(self, cols: List[str]):
        self.cols = cols

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        self.means_ = {col: np.mean(X[col]) for col in self.cols}
        self.stds_ = {col: np.std(X[col]) for col in self.cols}
        return self

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        for col in self.cols:
            X[col] = (X[col] - self.means_[col]) / np.sqrt(self.stds_[col])
        return X
    
class DecimalScaling(BaseEstimator, TransformerMixin):
    def __init__(self, cols: List[str]):
        self.cols = cols

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        self.len_ = {col: len(str(abs(np.max((X[col]))))) for col in self.cols}
        return self

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        for col in self.cols:
            X[col] = X[col] / 10**self.len_[col]
        return X

class TanhTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, cols: List[str]):
        self.cols = cols

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        self.means_ = {col: np.mean(X[col]) for col in self.cols}
        self.stds_ = {col: np.std(X[col]) for col in self.cols}
        return self

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        for col in self.cols:
            X[col] = 0.5*(np.tanh(0.01*((X[col] - self.means_[col])/ self.stds_[col])) + 1)
        return X

class LogisticSigmoidTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, cols: List[str]):
        self.cols = cols

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        self.means_ = {col: np.mean(X[col]) for col in self.cols}
        self.stds_ = {col: np.std(X[col]) for col in self.cols}
        return self

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        for col in self.cols:
            X[col] = 1 /(1 + np.exp(-(X[col] - self.means_[col]) / self.stds_[col]))
        return X
    
class HyperbolicTangentTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, cols: List[str]):
        self.cols = cols

    def fit(self, X: pd.DataFrame, y: pd.Series = None):
        self.means_ = {col: np.mean(X[col]) for col in self.cols}
        self.stds_ = {col: np.std(X[col]) for col in self.cols}
        return self

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        for col in self.cols:
            X[col] = ((1 - np.exp(-(X[col] - self.means_[col]) / self.stds_[col]))) / (1 + np.exp(-(X[col] - self.means_[col]) / self.stds_[col]))
        return X

In [9]:
from sklearn.model_selection import train_test_split
for i in dataset_list:
    exec('X_train_{}, X_test_{}, y_train_{}, y_test_{} = train_test_split({}.data.features, {}.data.targets, test_size=0.3, random_state=0)'.format(i,i,i,i,i,i))
    exec('X_train_{} = X_train_{}.fillna(0)'.format(i,i))
    exec('X_test_{} = X_test_{}.fillna(0)'.format(i,i))
    exec('y_train_{} = y_train_{}.fillna(0)'.format(i,i))
    exec('y_test_{} = y_test_{}.fillna(0)'.format(i,i))

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [11]:
# prepare models
models = []
# models.append(('LR', LogisticRegression(random_state=0)))
models.append(('SVM', SVC(kernel='linear')))
models.append(('MLP', MLPClassifier()))
models.append(('RF', RandomForestClassifier()))
models.append(('NB', GaussianNB()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('Ada', AdaBoostClassifier()))
models.append(('XGB', XGBClassifier()))
models.append(('CatBoost', CatBoostClassifier()))
models.append(('LGBM', LGBMClassifier()))

In [14]:
from sklearn.metrics import accuracy_score
results = []
for name, model in models:
    for scaling in scaling_list:
        for i in dataset_list:
            if scaling in ['MinMaxScaler','MaxAbsScaler','StandardScaler','RobustScaler','PowerTransformer','QuantileTransformer']:
                exec('scaler = {}()'.format(scaling))
                exec('scaler.fit(X_train_{})'.format(i))
                exec('scaler.transform(X_test_{})'.format(i))
                exec('clf = {}'.format(model))
                exec('clf.fit(X_train_{}, y_train_{}.values.ravel())'.format(i,i))
                exec('y_pred=clf.predict(X_test_{})'.format(i))
                exec('accuracy=accuracy_score(y_pred, y_test_{}.values.ravel())'.format(i))
                a = scaling
                b = i
                exec('results.append([accuracy,{},a,b])'.format(model))
            else:
                exec('scaler = {}(cols=X_train_{}.columns)'.format(scaling,i))
                exec('scaler.fit_transform(X_train_{})'.format(i))
                exec('scaler.transform(X_test_{})'.format(i))
                exec('clf = {}'.format(model))
                exec('clf.fit(X_train_{}, y_train_{}.values.ravel())'.format(i,i))
                exec('y_pred=clf.predict(X_test_{})'.format(i))
                exec('accuracy=accuracy_score(y_pred, y_test_{}.values.ravel())'.format(i))
                
                # i didin't find any other way to save scaling method and name of data inste into results
                a = scaling
                b = i
                exec('results.append([accuracy,{},a,b])'.format(model))
        

  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  x = um.multiply(x, x, out=x)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


  x = um.multiply(x, x, out=x)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  x = um.multiply(x, x, out=x)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


  x = um.multiply(x, x, out=x)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  x = um.multiply(x, x, out=x)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  x = um.multiply(x, x, out=x)


  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)
  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


SyntaxError: positional argument follows keyword argument (<string>, line 11)

In [15]:
results

[[0.9590643274853801,
  SVC(kernel='linear'),
  'MinMaxScaler',
  'breast_cancer_wisconsin_diagnostic'],
 [0.9523809523809523,
  SVC(kernel='linear'),
  'MinMaxScaler',
  'breast_cancer_wisconsin_original'],
 [0.9169931439764937,
  SVC(kernel='linear'),
  'MinMaxScaler',
  'dry_bean_dataset'],
 [0.5692307692307692,
  SVC(kernel='linear'),
  'MinMaxScaler',
  'glass_identification'],
 [0.5714285714285714, SVC(kernel='linear'), 'MinMaxScaler', 'heart_disease'],
 [0.7872340425531915, SVC(kernel='linear'), 'MinMaxScaler', 'hepatitis'],
 [0.9777777777777777, SVC(kernel='linear'), 'MinMaxScaler', 'iris'],
 [0.8611666666666666,
  SVC(kernel='linear'),
  'MinMaxScaler',
  'letter_recognition'],
 [0.7919733613739923,
  SVC(kernel='linear'),
  'MinMaxScaler',
  'magic_gamma_telescope'],
 [0.9322033898305084, SVC(kernel='linear'), 'MinMaxScaler', 'parkinsons'],
 [0.9378827646544182,
  SVC(kernel='linear'),
  'MinMaxScaler',
  'rice_cammeo_and_osmancik'],
 [0.9814814814814815, SVC(kernel='linear')