# Application of standard scaler in a regression model

Based on the approach described in the following Stack Overflow question: https://stackoverflow.com/questions/31029340/how-to-adjust-scaled-scikit-learn-logicistic-regression-coeffs-to-score-a-non-sc

In [1]:
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the Boston house-prices dataset bundled with scikit-learn; the
# returned object exposes `data`, `target`, `feature_names` and `DESCR`.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2 -- confirm the installed scikit-learn version before re-running.
boston = load_boston()

In [6]:
# List the fields available on the loaded dataset object.
boston.keys()

['data', 'feature_names', 'DESCR', 'target']

In [10]:
# Show the description text shipped with the dataset (attribute list and
# which attribute is usually treated as the target).
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [12]:
# The dataset description identifies the median home value as the usual
# target, so it is appended to the feature matrix as a final 'MVAL' column
# to give one table holding both predictors and response.
data = pd.DataFrame(
    np.column_stack((boston.data, boston.target)),
    columns=list(boston.feature_names) + ['MVAL'],
)
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MVAL
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [13]:
# Feature matrix (X) and regression target (y) used by every fit below.
X, y = boston.data, boston.target

In [14]:
# Baseline model: linear regression fitted on the raw, unscaled data.
# The fit call is left as the last expression so the estimator is displayed.
lr = LinearRegression()
lr.fit(X,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [15]:
# Keep the raw-data coefficients for later comparison: `lr` is rebound to
# a new model once the data has been standardised.
orig_coefs = lr.coef_

In [16]:
# Tabulate each feature name next to its coefficient from the raw-data fit.
coefs1 = pd.DataFrame(dict(feature=boston.feature_names, orig_coef=orig_coefs))
coefs1

Unnamed: 0,feature,orig_coef
0,CRIM,-0.107171
1,ZN,0.046395
2,INDUS,0.02086
3,CHAS,2.688561
4,NOX,-17.795759
5,RM,3.804752
6,AGE,0.000751
7,DIS,-1.475759
8,RAD,0.305655
9,TAX,-0.012329


In [17]:
# Fit one standardiser for the features and one for the target.
# The 1-D target is given a trailing axis so the scaler sees a single
# column (passing it as a flat vector triggers warnings).
scalerX = StandardScaler().fit(X)
scalery = StandardScaler().fit(y[:, np.newaxis])

In [18]:
# Apply the fitted scalers. As when fitting, the target is presented as a
# single column to avoid warnings about 1-D input.
normed_X = scalerX.transform(X)
normed_y = scalery.transform(y[:, np.newaxis])

In [22]:
# Flatten the scaled target from a single column back into a plain vector.
normed_y = np.ravel(normed_y)

In [25]:
# Check the standardisation worked for BOTH the features and the target.
# (The mean check previously printed the feature means twice and never
# looked at normed_y.)
# A single pre-formatted argument to print(...) renders identically on
# Python 2 and Python 3, keeping the "features target" one-line layout.
print("%s %s" % (np.mean(normed_X, axis=0), np.mean(normed_y, axis=0)))  # Should be ~0s
print("%s %s" % (np.std(normed_X, axis=0), np.std(normed_y, axis=0)))    # Should be 1s

[  6.34099712e-17  -6.34319123e-16  -2.68291099e-15   4.70199198e-16
   2.49032240e-15  -1.14523016e-14  -1.40785495e-15   9.21090169e-16
   5.44140929e-16  -8.86861950e-16  -9.20563581e-15   8.16310129e-15
  -3.37016317e-16] [  6.34099712e-17  -6.34319123e-16  -2.68291099e-15   4.70199198e-16
   2.49032240e-15  -1.14523016e-14  -1.40785495e-15   9.21090169e-16
   5.44140929e-16  -8.86861950e-16  -9.20563581e-15   8.16310129e-15
  -3.37016317e-16]
[ 1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.] 1.0


In [26]:
# Now we redo our regression, this time on the standardised features and
# target. This rebinds `lr`, so from here on `lr.coef_` refers to the
# standardised-data model; the raw-data coefficients live in `orig_coefs`.
lr = LinearRegression()
lr.fit(normed_X, normed_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [27]:
# Per feature, line up the raw-data coefficient, the standardised-data
# coefficient and the scale factors of the feature and of the target.
# The target scale is a single number and is repeated on every row.
coefs2 = pd.DataFrame(
    dict(
        feature=boston.feature_names,
        orig_coef=orig_coefs,
        norm_coef=lr.coef_,
        scaleX=scalerX.scale_,
        scaley=scalery.scale_[0],
    ),
    columns=['feature', 'orig_coef', 'norm_coef', 'scaleX', 'scaley'],
)
coefs2

Unnamed: 0,feature,orig_coef,norm_coef,scaleX,scaley
0,CRIM,-0.107171,-0.100175,8.588284,9.188012
1,ZN,0.046395,0.117651,23.299396,9.188012
2,INDUS,0.02086,0.01556,6.853571,9.188012
3,CHAS,2.688561,0.074249,0.253743,9.188012
4,NOX,-17.795759,-0.224215,0.115763,9.188012
5,RM,3.804752,0.290666,0.701923,9.188012
6,AGE,0.000751,0.002299,28.121033,9.188012
7,DIS,-1.475759,-0.33788,2.103628,9.188012
8,RAD,0.305655,0.289376,8.698651,9.188012
9,TAX,-0.012329,-0.225936,168.370495,9.188012


In [28]:
# Undo the standardisation on the coefficients: divide each one by its
# feature's scale (scaleX), then multiply by the target's scale (scaley).
# The result should match the orig_coef column.
coefs2['rescaled_coef'] = coefs2['norm_coef'].div(coefs2['scaleX']).mul(coefs2['scaley'])
coefs2

Unnamed: 0,feature,orig_coef,norm_coef,scaleX,scaley,rescaled_coef
0,CRIM,-0.107171,-0.100175,8.588284,9.188012,-0.107171
1,ZN,0.046395,0.117651,23.299396,9.188012,0.046395
2,INDUS,0.02086,0.01556,6.853571,9.188012,0.02086
3,CHAS,2.688561,0.074249,0.253743,9.188012,2.688561
4,NOX,-17.795759,-0.224215,0.115763,9.188012,-17.795759
5,RM,3.804752,0.290666,0.701923,9.188012,3.804752
6,AGE,0.000751,0.002299,28.121033,9.188012,0.000751
7,DIS,-1.475759,-0.33788,2.103628,9.188012,-1.475759
8,RAD,0.305655,0.289376,8.698651,9.188012,0.305655
9,TAX,-0.012329,-0.225936,168.370495,9.188012,-0.012329
