In [44]:
## import module 
import numpy
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression

In [20]:
## import boston housing data
a = numpy.load('./data/boston_housing.npz')
X, y = a['x'], a['y']
print(X.shape, y.shape)

(506, 13) (506,)


## Boston Housing data
### features: 
CRIM: per capita crime rate by town <br>
ZN: proportion of residential land zoned for lots over 25,000 sq.ft. <br>
INDUS: proportion of non-retail business acres per town <br>
CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) <br>
NOX: nitric oxides concentration (parts per 10 million) <br>
RM: average number of rooms per dwelling <br>
AGE: proportion of owner-occupied units built prior to 1940 <br>
DIS: weighted distances to five Boston employment centres <br>
RAD: index of accessibility to radial highways <br>
TAX: full-value property-tax rate per $10,000 <br>
PTRATIO: pupil-teacher ratio by town <br>
B: 1000(Bk - 0.63)^2 where Bk is the proportion of b. by town <br>
LSTAT: % lower status of the population

### Targets: 
MEDV, Median value of owner-occupied homes in $1000's

In [None]:
print(a.feature_names)

In [16]:
## train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
### Feature variables
print(X_train.shape)
X_train[:1]

(404, 13)


array([[1.50234e+01, 0.00000e+00, 1.81000e+01, 0.00000e+00, 6.14000e-01,
        5.30400e+00, 9.73000e+01, 2.10070e+00, 2.40000e+01, 6.66000e+02,
        2.02000e+01, 3.49480e+02, 2.49100e+01]])

In [25]:
## Target
print(y_train.shape)
print(y_train[1])

(404,)
19.9


## Train the model

In [31]:
## Train the model
lm = LinearRegression()
lm.fit(X_train, y_train)

## Parpameter of trained model
print("Coefficient of traind model:", lm.coef_)
print("Intercept of traind model:", lm.intercept_ )

Coefficient of traind model: [-1.13055924e-01  3.01104641e-02  4.03807204e-02  2.78443820e+00
 -1.72026334e+01  4.43883520e+00 -6.29636221e-03 -1.44786537e+00
  2.62429736e-01 -1.06467863e-02 -9.15456240e-01  1.23513347e-02
 -5.08571424e-01]
Intercept of traind model: 30.246750993923946


## Validate the trained model

In [34]:
## the coefficient of determination R^2 of the prediction.
print("R^2 of train data: ", lm.score(X_train, y_train))
print("R^2 of test data: ", lm.score(X_test, y_test))

R^2 of train data:  0.7508856358979673
R^2 of test data:  0.6687594935356329


## Predict with the trained model

In [57]:
y_pred = lm.predict(X_test)
mse = sklearn.metrics.mean_squared_error(y_test, y_pred)
print("MSE between predicted and test data : ", mse)

i = 1
y_pred1 = lm.predict([X_test[i]]) 
print("Predicted value : ", y_pred1)
print("Actual value: ", y_test[i])

MSE between predicted and test data :  24.291119474973456
Predicted value :  [36.02556534]
Actual value:  32.4


## Feature Engineering

In [43]:
from sklearn.preprocessing import StandardScaler

In [46]:
## Transform data
Scaler = StandardScaler().fit(X_train)
X_train2 = pd.DataFrame(Scaler.transform(X_train))
X_test2 = pd.DataFrame(Scaler.transform(X_test))

In [48]:
## Transform data
Scaler = StandardScaler().fit(X_train)
X_train2 = Scaler.transform(X_train)
X_test2 = Scaler.transform(X_test)

In [52]:
X_train[:5]

array([[1.50234e+01, 0.00000e+00, 1.81000e+01, 0.00000e+00, 6.14000e-01,
        5.30400e+00, 9.73000e+01, 2.10070e+00, 2.40000e+01, 6.66000e+02,
        2.02000e+01, 3.49480e+02, 2.49100e+01],
       [6.27390e-01, 0.00000e+00, 8.14000e+00, 0.00000e+00, 5.38000e-01,
        5.83400e+00, 5.65000e+01, 4.49860e+00, 4.00000e+00, 3.07000e+02,
        2.10000e+01, 3.95620e+02, 8.47000e+00],
       [3.46600e-02, 3.50000e+01, 6.06000e+00, 0.00000e+00, 4.37900e-01,
        6.03100e+00, 2.33000e+01, 6.64070e+00, 1.00000e+00, 3.04000e+02,
        1.69000e+01, 3.62250e+02, 7.83000e+00],
       [7.05042e+00, 0.00000e+00, 1.81000e+01, 0.00000e+00, 6.14000e-01,
        6.10300e+00, 8.51000e+01, 2.02180e+00, 2.40000e+01, 6.66000e+02,
        2.02000e+01, 2.52000e+00, 2.32900e+01],
       [7.25800e-01, 0.00000e+00, 8.14000e+00, 0.00000e+00, 5.38000e-01,
        5.72700e+00, 6.95000e+01, 3.79650e+00, 4.00000e+00, 3.07000e+02,
        2.10000e+01, 3.90950e+02, 1.12800e+01]])

In [49]:
X_train2[:5]

array([[ 1.28770177, -0.50032012,  1.03323679, -0.27808871,  0.48925206,
        -1.42806858,  1.02801516, -0.80217296,  1.70689143,  1.57843444,
         0.84534281, -0.07433689,  1.75350503],
       [-0.33638447, -0.50032012, -0.41315956, -0.27808871, -0.15723342,
        -0.68008655, -0.43119908,  0.32434893, -0.62435988, -0.58464788,
         1.20474139,  0.4301838 , -0.5614742 ],
       [-0.40325332,  1.01327135, -0.71521823, -0.27808871, -1.00872286,
        -0.40206304, -1.6185989 ,  1.3306972 , -0.97404758, -0.60272378,
        -0.63717631,  0.06529747, -0.65159505],
       [ 0.38822983, -0.50032012,  1.03323679, -0.27808871,  0.48925206,
        -0.30045039,  0.59168149, -0.8392398 ,  1.70689143,  1.57843444,
         0.84534281, -3.86819251,  1.52538664],
       [-0.32528234, -0.50032012, -0.41315956, -0.27808871, -0.15723342,
        -0.83109424,  0.03374663, -0.00549428, -0.62435988, -0.58464788,
         1.20474139,  0.3791194 , -0.16578736]])

In [51]:
## Train the model
lm2 = LinearRegression()
lm2.fit(X_train2, y_train)

print("R^2 of train data: ", lm2.score(X_train2, y_train))
print("R^2 of test data: ", lm2.score(X_test2, y_test))

y_pred2 = lm2.predict(X_test2)
mse2 = sklearn.metrics.mean_squared_error(y_test, y_pred2)
print("MSE between predicted and test data : ", mse2)

R^2 of train data:  0.7508856358979672
R^2 of test data:  0.668759493535632
MSE between predicted and test data :  24.291119474973517
