In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
from sklearn.datasets import load_boston

In [3]:
boston = load_boston()

In [4]:
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [5]:
boston.data.shape

(506, 13)

In [6]:
boston.target.shape

(506,)

In [7]:
boston_df = DataFrame(boston.data)

In [8]:
boston_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [9]:
boston_df.columns = boston.feature_names

In [10]:
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [11]:
boston_df['target'] = boston.target

In [12]:
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [13]:
X = boston.data

In [14]:
Y = boston.target

In [15]:
m,d = X.shape


In [16]:
# Append a constant column of ones so the last weight acts as the intercept.
# The design matrix is rebuilt from boston.data rather than from X itself, so
# re-running this cell cannot stack a second bias column onto X.
one_arr = np.ones((m,1))
X = np.hstack((boston.data,one_arr))


In [17]:
X_sym = X.T.dot(X)

In [18]:
X_sym.shape

(14, 14)

In [19]:
np.linalg.inv(np.array([[1,0,0],
                    [0,1,0],
                    [0,0,1]]))

array([[ 1.,  0.,  0.],
       [ 0.,  1.,  0.],
       [ 0.,  0.,  1.]])

In [20]:
X_sym_inv = np.linalg.inv(X_sym)

In [21]:
X_sym_inv.shape

(14, 14)

In [22]:
Y.shape

(506,)

In [23]:
X_trn_Y = X.T.dot(Y)

In [24]:
# Solve the normal equations (X^T X) W = X^T Y as a linear system instead of
# multiplying by the explicit inverse X_sym_inv: same solution, but with less
# floating-point error when X_sym is poorly conditioned.
W = np.linalg.solve(X_sym, X_trn_Y)

In [25]:
W.shape

(14,)

In [57]:
W

array([ -1.07170557e-01,   4.63952195e-02,   2.08602395e-02,
         2.68856140e+00,  -1.77957587e+01,   3.80475246e+00,
         7.51061703e-04,  -1.47575880e+00,   3.05655038e-01,
        -1.23293463e-02,  -9.53463555e-01,   9.39251272e-03,
        -5.25466633e-01,   3.64911033e+01])

In [59]:
err_vec = X.dot(W)-Y

In [62]:
rmse = np.sqrt(np.mean(err_vec**2))

In [63]:
rmse

4.6795063006355164

In [27]:
t = np.arange(100)+1

In [28]:
# Draw 10 distinct *positions* in t (0 .. len(t)-1) rather than 10 values of t.
# t holds 1..100, so sampling from t itself can yield 100, which is out of
# bounds when ind is later used as an index (t[ind]).
ind = np.random.choice(len(t),10,replace=False)
ind

array([99, 47, 16, 40, 37, 50, 52, 78, 30, 82])

In [29]:
t[ind]

array([100,  48,  17,  41,  38,  51,  53,  79,  31,  83])

In [30]:
np.random.permutation(t)

array([ 27,  60,  83,  70,  99,  14,  16,  10,  54,  19,  87,  47,  46,
        28,  90,  55,  77,  45,  38,  29,  97,  13,  51,  37,   2,  59,
        82,  84,  78,  53, 100,   5,  39,  31,  35,  17,   7,  72,  22,
        40,  43,  34,  67,  15,   3,  30,  56,  68,   9,  25,  48,  33,
        74,  66,  71,  62,  49,  94,  80,  86,  32,  58,  50,  79,  61,
        92,  93,   8,  21,   4,  64,  11,  69,  95,  91,  24,  98,  26,
        75,   1,  41,  89,  42,  96,  57,  12,  23,  73,  52,  20,  88,
        18,   6,  44,  63,  36,  85,  76,  81,  65])

In [31]:
t

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [32]:
# NOTE(review): this shuffles the rows of X only; Y is not permuted with the
# same ordering, so the rows of X_rand no longer line up with Y. X_rand is not
# used by any of the cells visible below.
X_rand = np.random.permutation(X)

In [33]:
trn_factor = 0.8

### Linear Regression with Test and Train split

In [43]:
 from sklearn import cross_validation

In [44]:
X_trn,X_tst,Y_trn,Y_tst = cross_validation.train_test_split(X,Y,test_size = 0.2)

In [46]:
X_trn.shape

(404, 14)

In [47]:
Y_tst.shape

(102,)

In [69]:
X.shape

(506, 14)

In [229]:
def fit_reg_model(X,Y,C):
    """Fit an L2-regularised (ridge) least-squares model in closed form.

    Solves ``(X.T X + C*I) W = X.T Y`` for ``W`` and reports the RMSE of the
    fitted model on the same data it was fitted to.

    Parameters
    ----------
    X : ndarray
        2-D design matrix, one row per sample. Every column is penalised
        equally, including a constant/bias column if one is present.
    Y : ndarray
        Targets aligned with the rows of ``X``.
    C : float
        Regularisation strength; it scales the identity matrix added to
        ``X.T X``.

    Returns
    -------
    list
        ``[W, rmse]`` -- the fitted weights and the training RMSE.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X.T X + C*I`` is singular.
    """
    gram = X.T.dot(X)
    penalty = C*np.eye(gram.shape[0])
    # Solve the linear system directly rather than forming the explicit
    # inverse: it yields the same weights with less round-off error when the
    # Gram matrix is poorly conditioned, and does less work.
    W = np.linalg.solve(gram+penalty, X.T.dot(Y))
    err_vec = X.dot(W)-Y
    rmse = np.sqrt(np.mean(err_vec**2))
    return [W,rmse]

In [117]:
def get_test_error(W,X,Y):
    """Return the root-mean-square error of the linear predictions ``X.dot(W)`` measured against ``Y``."""
    predictions = X.dot(W)
    residuals = predictions - Y
    # Mean of the squared residuals, then the square root -> RMSE.
    mean_squared = np.mean(np.square(residuals))
    return np.sqrt(mean_squared)

In [215]:
C = 0.1

In [216]:
X_trn_sym = X_trn.T.dot(X_trn)

In [217]:
X_trn_sym.shape


(14, 14)

In [218]:
CI = C*np.eye(X_trn_sym.shape[0])

In [219]:
CI.shape

(14, 14)

In [220]:
# Ridge normal equations: (X_trn^T X_trn + C*I) W = X_trn^T Y_trn. Solved as a
# linear system instead of through an explicit matrix inverse, which gives the
# same weights with less floating-point error.
W = np.linalg.solve(X_trn_sym+CI, X_trn.T.dot(Y_trn))

In [221]:
W

array([ -1.21299539e-01,   5.11715085e-02,   1.28469968e-02,
         2.50438782e+00,  -1.37508909e+01,   3.92391907e+00,
         1.30824920e-04,  -1.38544631e+00,   3.01820326e-01,
        -1.28078052e-02,  -8.56932646e-01,   9.68375806e-03,
        -4.99912548e-01,   3.14819980e+01])

In [222]:
trn_err_vec = X_trn.dot(W)-Y_trn

In [223]:
rmse_trn = np.sqrt(np.mean(trn_err_vec**2))

In [224]:
rmse_trn

4.7620994794794385

In [235]:
W,rmse_trn = fit_reg_model(X_trn,Y_trn,0.1)

In [236]:
W

array([ -1.21299539e-01,   5.11715085e-02,   1.28469968e-02,
         2.50438782e+00,  -1.37508909e+01,   3.92391907e+00,
         1.30824920e-04,  -1.38544631e+00,   3.01820326e-01,
        -1.28078052e-02,  -8.56932646e-01,   9.68375806e-03,
        -4.99912548e-01,   3.14819980e+01])

In [237]:
rmse_trn

4.7620994794794385

In [238]:
get_test_error(W,X_tst,Y_tst)

4.4153051205469334

In [242]:
W,rmse_trn = fit_reg_model(X_trn,Y_trn,1)

In [243]:
W

array([ -1.18904388e-01,   5.38721763e-02,  -1.36335662e-02,
         2.38881828e+00,  -4.66965026e+00,   4.77801341e+00,
        -4.63599979e-03,  -1.14884177e+00,   2.39279805e-01,
        -1.17910544e-02,  -5.74919707e-01,   1.20844890e-02,
        -4.59450543e-01,   1.43415721e+01])

In [244]:
rmse_trn

4.8524873632812016

In [245]:
get_test_error(W,X_tst,Y_tst)

4.5154193682598969

In [256]:
W,rmse_trn = fit_reg_model(X_trn,Y_trn,0.01)

In [257]:
W

array([ -1.22062626e-01,   5.04832604e-02,   2.08892317e-02,
         2.51924239e+00,  -1.63403591e+01,   3.70302363e+00,
         1.66234130e-03,  -1.44908818e+00,   3.18585753e-01,
        -1.30510355e-02,  -9.31250285e-01,   9.08137117e-03,
        -5.10052630e-01,   3.60261710e+01])

In [258]:
rmse_trn

4.7569367899338415

In [259]:
get_test_error(W,X_tst,Y_tst)

4.4054735565792411