<a href="https://colab.research.google.com/github/kyook17/UIUC_BADM/blob/main/BADM576_DS/576_Feature_Scaling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

**Standardization**
$z = \frac{x - x_{mean}}{x_{std}}$

* centers the mean at 0
* scales the variance to 1
* preserves the shape of the original distribution
* the minimum and maximum values of the different variables may vary
* preserves outliers



In [None]:
from sklearn.preprocessing import StandardScaler

**MinMaxScaling**

$X_{scaled} = \frac{X - X_{min}}{X_{max} - X_{min}}$

* does not center the mean at 0
* variance varies across variables
* may not preserve the shape of the original distribution
* the minimum and maximum values are 0 and 1.
* sensitive to outliers

In [None]:
from sklearn.preprocessing import MinMaxScaler

There are several others. However, the two most common ones are **StandardScaler** and **MinMaxScaler**

# Example of the Impact of Scaling on Model Performance


In [None]:
!pip install dmba

In [None]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
import matplotlib.pyplot as plt
from dmba import regressionSummary


In [None]:
# Fetch the scikit-learn wine dataset as pandas objects (as_frame=True)
data = load_wine(as_frame=True)

# Feature matrix and target column used by every model below
X, y = data.data, data.target



In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



# Comparing Linear Regression Models without scaling, with standard scaling, and with min-max scaling.

In [None]:
# Linear regression estimator, created with default settings.
# The same object is refit in each of the three scaling experiments that follow.
linear_model = LinearRegression()



In [None]:
# Baseline: fit on the raw (unscaled) features, then predict the held-out rows
predictions = linear_model.fit(X_train, y_train).predict(X_test)

# Print ME / RMSE / MAE for the unscaled model
regressionSummary(y_test, predictions)





Regression statistics

               Mean Error (ME) : -0.0081
Root Mean Squared Error (RMSE) : 0.2618
     Mean Absolute Error (MAE) : 0.2030


In [None]:
# Standard scaling: the scaler is fitted on the training split only and the
# same fitted transform is then applied to both splits.
scaler_standard = StandardScaler().fit(X_train)
X_train_scaled_standard = scaler_standard.transform(X_train)
X_test_scaled_standard = scaler_standard.transform(X_test)

# Refit the linear model on the standardized features and predict the test rows
predictions_standard = linear_model.fit(X_train_scaled_standard, y_train).predict(
    X_test_scaled_standard
)

# Print ME / RMSE / MAE for the standardized model
regressionSummary(y_test, predictions_standard)



Regression statistics

               Mean Error (ME) : -0.0081
Root Mean Squared Error (RMSE) : 0.2618
     Mean Absolute Error (MAE) : 0.2030


In [None]:
# Min-max scaling: the scaler is fitted on the training split only and the
# same fitted transform is then applied to both splits.
scaler_min_max = MinMaxScaler().fit(X_train)
X_train_scaled_min_max = scaler_min_max.transform(X_train)
X_test_scaled_min_max = scaler_min_max.transform(X_test)

# Refit the linear model on the min-max scaled features and predict the test rows
predictions_min_max = linear_model.fit(X_train_scaled_min_max, y_train).predict(
    X_test_scaled_min_max
)

# Print ME / RMSE / MAE for the min-max scaled model
regressionSummary(y_test, predictions_min_max)



Regression statistics

               Mean Error (ME) : -0.0081
Root Mean Squared Error (RMSE) : 0.2618
     Mean Absolute Error (MAE) : 0.2030


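## We don't see any difference in the model performance.

Ordinary least-squares linear regression is unaffected by rescaling the features: the fitted coefficients absorb the change of scale, so the predictions (and therefore ME, RMSE, and MAE) are identical in all three runs.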

# Comparing K-Nearest Neighbor Models without scaling, with standard scaling, and with min-max scaling.

In [None]:
# K-nearest-neighbours regressor, created with scikit-learn's default settings.
# The same object is refit in each of the three scaling experiments that follow.
from sklearn.neighbors import KNeighborsRegressor

knn_model = KNeighborsRegressor()

In [None]:
# Baseline KNN: fit on the raw (unscaled) features, then predict the held-out rows
predictions = knn_model.fit(X_train, y_train).predict(X_test)

# Print ME / RMSE / MAE for the unscaled KNN model
regressionSummary(y_test, predictions)





Regression statistics

               Mean Error (ME) : -0.0056
Root Mean Squared Error (RMSE) : 0.5385
     Mean Absolute Error (MAE) : 0.3167


In [None]:
# Standard scaling: the scaler is fitted on the training split only and the
# same fitted transform is then applied to both splits.
scaler_standard = StandardScaler().fit(X_train)
X_train_scaled_standard = scaler_standard.transform(X_train)
X_test_scaled_standard = scaler_standard.transform(X_test)

# Refit KNN on the standardized features and predict the test rows
predictions_standard = knn_model.fit(X_train_scaled_standard, y_train).predict(
    X_test_scaled_standard
)

# Print ME / RMSE / MAE for the standardized KNN model
regressionSummary(y_test, predictions_standard)



Regression statistics

               Mean Error (ME) : -0.0111
Root Mean Squared Error (RMSE) : 0.1764
     Mean Absolute Error (MAE) : 0.0556


In [None]:
# Min-max scaling: the scaler is fitted on the training split only and the
# same fitted transform is then applied to both splits.
scaler_min_max = MinMaxScaler().fit(X_train)
X_train_scaled_min_max = scaler_min_max.transform(X_train)
X_test_scaled_min_max = scaler_min_max.transform(X_test)

# Refit KNN on the min-max scaled features and predict the test rows
predictions_min_max = knn_model.fit(X_train_scaled_min_max, y_train).predict(
    X_test_scaled_min_max
)

# Print ME / RMSE / MAE for the min-max scaled KNN model
regressionSummary(y_test, predictions_min_max)



Regression statistics

               Mean Error (ME) : -0.0056
Root Mean Squared Error (RMSE) : 0.1732
     Mean Absolute Error (MAE) : 0.0500


In [None]:
# Toy example: three people, each described by an age and an income
df = pd.DataFrame(
    dict(
        Person=['A', 'B', 'C'],
        Age=[40, 70, 42],
        Income=[70000, 69000, 68000],
    )
)

# Display the frame
df

Unnamed: 0,Person,Age,Income
0,A,40,70000
1,B,70,69000
2,C,42,68000


## Who is more similar to A: B or C?

In [None]:
# Euclidean distance from person A to persons B and C on the raw (unscaled) features

# Pull each person's [Age, Income] vector out of the frame
features_a, features_b, features_c = (
    df.loc[df['Person'] == person, ['Age', 'Income']].values[0]
    for person in ('A', 'B', 'C')
)

# Square root of the summed squared differences, rounded to 2 decimals
distance_a_b = np.sqrt(np.square(features_a - features_b).sum()).round(2)
distance_a_c = np.sqrt(np.square(features_a - features_c).sum()).round(2)

distance_a_b, distance_a_c

(1000.45, 2000.0)

In [None]:
# Euclidean distance from person A to persons B and C after min-max scaling

# Only the numeric columns are scaled
features = df[['Age', 'Income']]

# Fit the scaler on the three rows, then rescale them
min_max_scaler = MinMaxScaler().fit(features)
features_min_max_scaled = min_max_scaler.transform(features)

# Rows are addressed by position: 0 -> A, 1 -> B, 2 -> C (the row order of df)
distance_min_max_a_b = np.sqrt(
    np.square(features_min_max_scaled[0] - features_min_max_scaled[1]).sum()
).round(2)
distance_min_max_a_c = np.sqrt(
    np.square(features_min_max_scaled[0] - features_min_max_scaled[2]).sum()
).round(2)

distance_min_max_a_b, distance_min_max_a_c



(1.12, 1.0)

In [None]:
# Euclidean distance from person A to persons B and C after standard scaling

# Only the numeric columns are scaled
features = df[['Age', 'Income']]

# Fit the scaler on the three rows, then standardize them
standard_scaler = StandardScaler().fit(features)
features_standard_scaled = standard_scaler.transform(features)

# Rows are addressed by position: 0 -> A, 1 -> B, 2 -> C (the row order of df)
distance_standard_a_b = np.sqrt(
    np.square(features_standard_scaled[0] - features_standard_scaled[1]).sum()
).round(2)
distance_standard_a_c = np.sqrt(
    np.square(features_standard_scaled[0] - features_standard_scaled[2]).sum()
).round(2)

distance_standard_a_b, distance_standard_a_c

(2.51, 2.45)