In [51]:
# Feature scaling example: standardization (z-score scaling).
# Standardization is recommended for algorithms that are sensitive to feature scale,
# such as K-means, KNN, PCA, artificial neural networks, and gradient-descent-based models.

import pandas as pd     # Read the dataset and data processing
import numpy as np      # Linear Algebra
import seaborn as sns   # Visualization and plotting
import matplotlib.pyplot as plt

In [21]:
# Load the salary dataset from 'salary.csv' (path is relative to the notebook's working directory).
df = pd.read_csv('salary.csv')
# Preview the first rows to confirm the expected columns were read.
df.head()

Unnamed: 0,YearsExperience,Salary
0,1.1,39343
1,1.3,46205
2,1.5,37731
3,2.0,43525
4,2.2,39891


In [55]:
# Show 5 randomly chosen rows. No random_state is passed, so the rows displayed
# will differ from run to run.
df.sample(5)

Unnamed: 0,YearsExperience,Salary
16,5.1,66029
6,3.0,60150
34,13.5,139465
30,11.2,127345
33,12.9,135675


In [34]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the target (here that leaves only YearsExperience).
X = df.drop('Salary', axis=1)
# Target vector: the salary to be predicted. It must NOT be one of the feature
# columns, otherwise the target is just a copy of the input.
y = df['Salary']

# Hold out 30% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Checking the shapes of the train and test sets
X_train.shape, X_test.shape

((24, 1), (11, 1))

**Standard Scaler**

In [37]:
from sklearn.preprocessing import StandardScaler

# Fit on the training set only: the scaler learns each column's mean and
# standard deviation from X_train and keeps them for later transforms.
scaler = StandardScaler().fit(X_train)

# Apply the learned parameters to both splits: z = (x - mean) / std.
# The test set is transformed with the training statistics, never re-fitted.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [39]:
scaler.mean_    # mean learned from the training set for the single feature, YearsExperience

array([6.1375])

In [41]:
X_train  # original (unscaled) training features, before standardization

Unnamed: 0,YearsExperience
34,13.5
32,12.3
26,9.5
30,11.2
8,3.2
13,4.1
5,2.9
17,5.3
14,4.5
31,11.5


In [43]:
X_train_scaled  # training features after standardization, i.e. z = (x - mean) / std

array([[ 1.9212254 ],
       [ 1.6080885 ],
       [ 0.87743571],
       [ 1.32104633],
       [-0.76653306],
       [-0.53168037],
       [-0.84481728],
       [-0.21854347],
       [-0.42730141],
       [ 1.39933056],
       [ 0.66867777],
       [-1.26233316],
       [-0.55777512],
       [-0.81872254],
       [ 0.53820406],
       [-1.02748048],
       [-0.06197501],
       [ 0.25116189],
       [-0.03588027],
       [-0.63605934],
       [-0.76653306],
       [ 1.76465695],
       [-1.07966996],
       [-1.31452264]])

In [45]:
# Wrap the scaled values back into DataFrames. Column names AND the original row
# index are carried over so the scaled rows stay aligned with X_train / X_test
# (and therefore with y_train / y_test). np.asarray makes the cell safe to re-run:
# it always starts from the raw values, whether the input is still an ndarray or
# already a DataFrame.
X_train_scaled = pd.DataFrame(
    np.asarray(X_train_scaled), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    np.asarray(X_test_scaled), columns=X_test.columns, index=X_test.index
)

In [47]:
# Summary statistics (count, mean, std, quartiles, ...) of the original, unscaled
# training features, rounded to one decimal place.
X_train.describe().round(1)

Unnamed: 0,YearsExperience
count,24.0
mean,6.1
std,3.9
min,1.1
25%,3.2
50%,4.9
75%,8.9
max,13.5


In [49]:
# Summary statistics of the standardized training features, rounded to one
# decimal place: after scaling the mean is 0 and the standard deviation is 1.
X_train_scaled.describe().round(1)

Unnamed: 0,YearsExperience
count,24.0
mean,0.0
std,1.0
min,-1.3
25%,-0.8
50%,-0.3
75%,0.7
max,1.9
