# load data

In [6]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably requires an older scikit-learn release -- confirm the pinned version.
from sklearn.datasets import load_boston

# Load the Boston house prices dataset and show the shape of its feature matrix.
boston = load_boston()
print(boston.data.shape)

(506, 13)


In [8]:
# Show the feature names followed by the dataset's bundled description,
# each starting on its own line.
print(boston.feature_names, boston.DESCR, sep='\n')

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial hig

In [9]:
import numpy as np

# Summarise the target values: largest, smallest and mean.
print(boston.target.max(), boston.target.min(), boston.target.mean())

50.0 5.0 22.532806324110677


# preprocess data

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    boston.data,
    boston.target,
    test_size=0.25,
    random_state=33,
)

In [16]:
# Peek at the first training sample (feature row and target) before any scaling is applied.
print(X_train[0], y_train[0])

[  0.54011  20.        3.97      0.        0.647     7.203    81.8
   2.1121    5.      264.       13.      392.8       9.59   ] 33.8


In [18]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler learns its statistics from the training
# split only and the same transformation is then applied to the test split.
scalerX = StandardScaler()
X_train = scalerX.fit_transform(X_train)
X_test = scalerX.transform(X_test)

In [19]:
# Same sample as before, now with standardised features (the target is still unscaled here).
print(X_train[0], y_train[0])

[-0.32076092  0.35553428 -1.00966618 -0.28784917  0.87716097  1.28834383
  0.4759489  -0.83034371 -0.47659648 -0.81061061 -2.49222645  0.35062335
 -0.39859013] 33.8


In [25]:
# Turn each target vector into a single-column 2-D array, the layout used by
# the target scaler fitted in the next step.
y_train, y_test = (targets.reshape(-1, 1) for targets in (y_train, y_test))
print(y_train.shape)

(379, 1)


In [26]:
# Standardise the targets with statistics learned from the training split only,
# then apply the same transformation to the test targets.
scalery = StandardScaler()
y_train = scalery.fit_transform(y_train)
y_test = scalery.transform(y_test)

# learn data

In [33]:
# Linear models trained with stochastic gradient descent: one without
# regularisation and one with an L2 penalty. The "_12" suffix on the second
# name is kept as-is because later cells refer to the variable by this name.
# NOTE(review): newer scikit-learn releases renamed 'squared_loss' to
# 'squared_error' -- confirm against the installed version.
from sklearn import linear_model

clf_sgd = linear_model.SGDRegressor(loss='squared_loss', penalty=None, random_state=42)
clf_sgd_12 = linear_model.SGDRegressor(loss='squared_loss', penalty='l2', random_state=42)

# Support vector regression with linear, polynomial and RBF kernels.
from sklearn.svm import SVR

clf_svr = SVR(kernel='linear')
clf_svr_poly = SVR(kernel='poly')
clf_svr_rbf = SVR(kernel='rbf')

# Extremely randomised trees (an ensemble related to, but not the same as, a
# random forest). Seeded like the SGD models above so that repeated runs of the
# notebook produce the same scores; the recorded outputs will change once.
from sklearn import ensemble

clf_et = ensemble.ExtraTreesRegressor(random_state=42)

In [43]:
from sklearn.model_selection import cross_val_score

# Candidate models, in the order their scores are printed below.
clfs = [clf_sgd, clf_sgd_12, clf_svr, clf_svr_poly, clf_svr_rbf, clf_et]
for clf in clfs:
    # y_train was reshaped into a single column for scaling, but these
    # estimators expect a flat 1-D target and warn on a column vector, so
    # flatten it for fitting.
    scores = cross_val_score(clf, X_train, y_train.ravel(), cv=5)
    # Mean of the five per-fold scores (each estimator's default scorer).
    print(np.mean(scores))

0.7098518257344566
0.7098561728261388
0.7017180425033174
0.7866677623279259
0.8178910469741936
0.8842333819989465


# predict data

In [44]:
# cross_val_score only fits clones of the estimator, so clf_et itself has to be
# fitted here; otherwise this cell relies on leftover kernel state and fails
# when the notebook is run top to bottom on a fresh kernel.
clf_et.fit(X_train, y_train.ravel())
# Score of the fitted Extra Trees model on the held-out test split.
clf_et.score(X_test, y_test)

0.8064103758449693

In [45]:
# Fit every candidate on the full training split and print its score on the
# held-out test split, in the same order as the list of models.
for clf in clfs:
    # The training target is stored as a single column; flatten it because the
    # estimators expect a 1-D target and warn on a column vector.
    clf.fit(X_train, y_train.ravel())
    print(clf.score(X_test, y_test))

0.6636381795948472
0.6636463830153829
0.650659546421538
0.403650651025512
0.7559887416340947
0.8087278485626916
