In [5]:
# Data normalization rescales numerical features into a smaller, common range (usually 0 to 1)
# so that features with large magnitudes do not dominate features with small ones.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

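1. Min-Max Normalization (Most Common)

Each feature is rescaled with $x' = \frac{x - x_{min}}{x_{max} - x_{min}}$, so its range in the training data becomes 0 to 1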

In [6]:
# Toy dataset: three numeric features on very different scales.
# Note that Salary contains one value (100000) far larger than the rest.
data = dict(
    Age=[20, 22, 21, 23, 24],
    Salary=[20000, 22000, 25000, 100000, 27000],
    Marks=[85, 90, 88, 92, 86],
)

df = pd.DataFrame(data)

# One print call with a newline separator emits exactly the same text as
# printing the header, the frame and the trailing "\n" one after another.
print("Original Dataset:\n", df, "\n", sep="\n")

Original Dataset:

   Age  Salary  Marks
0   20   20000     85
1   22   22000     90
2   21   25000     88
3   23  100000     92
4   24   27000     86




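TRAIN-TEST SPLIT BEFORE SCALING

Split first, then fit each scaler on the training data only, so that no information from the test set leaks into the scaling parameters.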

In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test = train_test_split(df, random_state=42, test_size=0.2)

# Each print emits: header, blank line, frame, then two blank lines.
print("Training Data:\n", X_train, "\n", sep="\n")
print("Testing Data:\n", X_test, "\n", sep="\n")

Training Data:

   Age  Salary  Marks
4   24   27000     86
2   21   25000     88
0   20   20000     85
3   23  100000     92


Testing Data:

   Age  Salary  Marks
1   22   22000     90




In [8]:
# Learn each column's minimum and maximum from the training rows only, then
# apply that same mapping to the held-out rows.
minmax = MinMaxScaler()
X_train_minmax = minmax.fit_transform(X_train)
X_test_minmax = minmax.transform(X_test)

# The scaled results are wrapped in DataFrames to restore the column labels.
print(
    "MinMax Scaled Training Data:\n",
    pd.DataFrame(X_train_minmax, columns=df.columns),
    "\n",
    sep="\n",
)
print(
    "MinMax Scaled Testing Data:\n",
    pd.DataFrame(X_test_minmax, columns=df.columns),
    "\n",
    sep="\n",
)

MinMax Scaled Training Data:

    Age  Salary     Marks
0  1.00  0.0875  0.142857
1  0.25  0.0625  0.428571
2  0.00  0.0000  0.000000
3  0.75  1.0000  1.000000


MinMax Scaled Testing Data:

   Age  Salary     Marks
0  0.5   0.025  0.714286




2. Standard Scaling (Z-Score Scaling)

Mean becomes 0

Std becomes 1

Values can be negative

Outliers still affect the mean and std

In [9]:
# Estimate each column's mean and standard deviation from the training rows
# only, then standardize the held-out rows with those same statistics.
standard = StandardScaler()
X_train_standard = standard.fit_transform(X_train)
X_test_standard = standard.transform(X_test)

# The scaled results are wrapped in DataFrames to restore the column labels.
print(
    "Standard Scaled Training Data:\n",
    pd.DataFrame(X_train_standard, columns=df.columns),
    "\n",
    sep="\n",
)
print(
    "Standard Scaled Testing Data:\n",
    pd.DataFrame(X_test_standard, columns=df.columns),
    "\n",
    sep="\n",
)

Standard Scaled Training Data:

        Age    Salary     Marks
0  1.264911 -0.484737 -0.652753
1 -0.632456 -0.545329  0.093250
2 -1.264911 -0.696810 -1.025755
3  0.632456  1.726876  1.585258


Standard Scaled Testing Data:

   Age    Salary     Marks
0  0.0 -0.636218  0.839254




3. Robust Scaling (Best for Outliers)

$$x_{scaled} = \frac{x - \text{Median}}{\text{IQR}}$$

Where:

Median = middle value

IQR = Q3 − Q1

Uses median instead of mean

Outlier impact reduced

Normal values remain more stable

In [10]:

# Estimate each column's median and interquartile range from the training
# rows only, then scale the held-out rows with those same statistics.
robust = RobustScaler()
X_train_robust = robust.fit_transform(X_train)
X_test_robust = robust.transform(X_test)

# The scaled results are wrapped in DataFrames to restore the column labels.
print(
    "Robust Scaled Training Data:\n",
    pd.DataFrame(X_train_robust, columns=df.columns),
    "\n",
    sep="\n",
)
print(
    "Robust Scaled Testing Data:\n",
    pd.DataFrame(X_test_robust, columns=df.columns),
    "\n",
    sep="\n",
)

Robust Scaled Training Data:

   Age    Salary     Marks
0  0.8  0.046512 -0.307692
1 -0.4 -0.046512  0.307692
2 -0.8 -0.279070 -0.615385
3  0.4  3.441860  1.538462


Robust Scaled Testing Data:

   Age    Salary     Marks
0  0.0 -0.186047  0.923077




| Method   | Uses         | Outlier Effect |
| -------- | ------------ | -------------- |
| MinMax   | Min & Max    | Very High      |
| Standard | Mean & Std   | High           |
| Robust   | Median & IQR | Low            |
