## Feature Scaling (Standardization) and Feature Selection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression

# Apply seaborn's default theme to subsequent matplotlib plots.
sns.set()

In [2]:
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
DATA_CSV = '/home/jason/learning/data-science/resources/udemy/simpleLinearRegression/1.02.+Multiple+linear+regression.csv'

# Load the raw dataset and preview its first rows.
data = pd.read_csv(DATA_CSV)
data.head()

Unnamed: 0,SAT,"Rand 1,2,3",GPA
0,1714,1,2.4
1,1664,3,2.52
2,1760,3,2.54
3,1685,3,2.74
4,1693,2,2.83


In [3]:
# Split the frame into the feature matrix (x) and the regression target (y).
feature_cols = ['SAT', 'Rand 1,2,3']
x = data[feature_cols]
y = data['GPA']

# Preview both pieces together; the cell displays them as a tuple.
(x.head(), y.head())

(    SAT  Rand 1,2,3
 0  1714           1
 1  1664           3
 2  1760           3
 3  1685           3
 4  1693           2,
 0    2.40
 1    2.52
 2    2.54
 3    2.74
 4    2.83
 Name: GPA, dtype: float64)

### Standardization

In [4]:
from sklearn.preprocessing import StandardScaler

In [5]:
# The scaler standardizes each feature: it subtracts the feature's mean and
# divides by its standard deviation.
scaler = StandardScaler()

In [6]:
# `fit` computes the mean and the standard deviation of each feature in `x`
# and stores them on the scaler object; nothing is transformed yet.
scaler.fit(x)

StandardScaler()

In [7]:
# Standardize the features using the mean and standard deviation learned by `fit`.
x_scaled = scaler.transform(x)

In [8]:
# Display the standardized feature matrix (one column per feature in `x`).
x_scaled

array([[-1.26338288, -1.24637147],
       [-1.74458431,  1.10632974],
       [-0.82067757,  1.10632974],
       [-1.54247971,  1.10632974],
       [-1.46548748, -0.07002087],
       [-1.68684014, -1.24637147],
       [-0.78218146, -0.07002087],
       [-0.78218146, -1.24637147],
       [-0.51270866, -0.07002087],
       [ 0.04548499,  1.10632974],
       [-1.06127829,  1.10632974],
       [-0.67631715, -0.07002087],
       [-1.06127829, -1.24637147],
       [-1.28263094,  1.10632974],
       [-0.6955652 , -0.07002087],
       [ 0.25721362, -0.07002087],
       [-0.86879772,  1.10632974],
       [-1.64834403, -0.07002087],
       [-0.03150724,  1.10632974],
       [-0.57045283,  1.10632974],
       [-0.81105355,  1.10632974],
       [-1.18639066,  1.10632974],
       [-1.75420834,  1.10632974],
       [-1.52323165, -1.24637147],
       [ 1.23886453, -1.24637147],
       [-0.18549169, -1.24637147],
       [-0.5608288 , -1.24637147],
       [-0.23361183,  1.10632974],
       [ 1.68156984,

In [9]:
# Fit a linear regression on the standardized features; the fitted estimator
# returned by `fit` is what the cell displays.
lr = LinearRegression()
lr.fit(X=x_scaled, y=y)

LinearRegression()

In [10]:
# Weights (coefficients) learned for the standardized features, in the column order of `x`.
lr.coef_

array([ 0.17181389, -0.00703007])

In [11]:
# Bias (intercept) of the fitted model.
lr.intercept_

3.330238095238095

### Summary Table

In [12]:
# Summary table pairing every model term with its learned weight.
#
# 1. `weights` is the machine learning term for `coefficients`
#     ==> the bigger the `weight`, the bigger the impact of the feature on the regression
#     ==> the closer the `weight` is to 0 (zero), the smaller the impact
# 2. `bias` is the machine learning term for `intercept`
#
# Feature labels are taken from the columns of `x` (the frame the model was
# trained on) and weights from `lr.coef_`, so labels and weights stay aligned
# and the table works for any number of features.
lr_summary = pd.DataFrame({
    'features': ['bias', *x.columns],
    'weights': [lr.intercept_, *lr.coef_],
})
lr_summary

Unnamed: 0,features,weights
0,bias,3.330238
1,SAT,0.171814
2,"Rand1,2,3",-0.00703


### Making predictions with weights (standardized coefficients)

In [13]:
# Two new observations to score, using the same column names as the training features.
test_data = pd.DataFrame({
    'SAT': [1700, 1800],
    'Rand 1,2,3': [2, 1],
})
test_data

Unnamed: 0,SAT,"Rand 1,2,3"
0,1700,2
1,1800,1


#### NOTE: The test data must be standardized with the same scaler object that was fitted on the training data.
#### Transform the test data with that scaler (do not fit a new one), so it is scaled with the training mean and standard deviation.

In [15]:
# Scale the new observations with the scaler that was fitted on `x`, so they
# are standardized with the same mean and standard deviation as the training features.
test_data_scaled = scaler.transform(test_data)
test_data_scaled

array([[-1.39811928, -0.07002087],
       [-0.43571643, -1.24637147]])

In [16]:
# Predict GPA for the standardized test observations with the two-feature model.
lr.predict(test_data_scaled)

array([3.09051403, 3.26413803])

### Does removing Rand 1,2,3 make any difference?

In [25]:
# Refit using only the standardized SAT column (column 0 of `x_scaled`).
# The slice `[:, :1]` keeps the data two-dimensional with a single column.
x_sat = x_scaled[:, :1]
new_lr = LinearRegression()
new_lr.fit(x_sat, y)

LinearRegression()

In [26]:
# Predict with the SAT-only model, passing the scaled SAT values as a single 2D column.
new_lr.predict(test_data_scaled[:, :1])

array([3.08970998, 3.25527879])