In [4]:
from sklearn.datasets import load_boston
from pandas import DataFrame
import numpy as np

In [6]:
# Load the Boston house-prices dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older releases — confirm the pinned version.
boston = load_boston()

In [10]:
# Show the dataset's bundled description text.
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [14]:
# Build the feature table: one column per predictor, labelled with the dataset's feature names.
df = DataFrame(boston.data, columns=boston.feature_names)

In [17]:
# Append the regression target as a PRICE column (this mutates df in place).
df['PRICE'] = np.array(boston.target)

In [25]:
# Preview the first 10 rows of the feature table together with PRICE.
df.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21,28.7
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43,22.9
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15,27.1
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93,16.5
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1,18.9


In [24]:
# Peek at the RM column (average number of rooms per dwelling).
df['RM'].head()

0    6.575
1    6.421
2    7.185
3    6.998
4    7.147
Name: RM, dtype: float64

In [None]:
# =====================
# Simple linear regression (one explanatory variable)
# =====================

In [36]:
# Explanatory variable: RM only, kept 2-D (one column) as the estimator expects
data = df[['RM']].to_numpy()

In [40]:
# First five rows of the explanatory variable
data[:5]

array([[6.575],
       [6.421],
       [7.185],
       [6.998],
       [7.147]])

In [41]:
# Target variable: PRICE as a 1-D array
target = df['PRICE'].to_numpy()

In [44]:
# First five target values
target[:5]

array([24. , 21.6, 34.7, 33.4, 36.2])

In [45]:
from sklearn.linear_model import LinearRegression

In [48]:
# Create the model and list its hyperparameters
model = LinearRegression()
model.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': False,
 'positive': False}

In [49]:
# Estimate the model parameters with fit
model.fit(data,target)

LinearRegression()

In [53]:
# Predict for a dwelling with 1 room. A negative price is nonsensical: per the author's
# note, the training data contains no 1-room dwellings, so the model is extrapolating
# outside the range it was fitted on.
model.predict([[1]])

array([-25.5685118])

In [None]:
# =====================
# Multiple linear regression (2 variables)
# =====================

In [58]:
# Explanatory variables for the two-feature model: CRIM and RM.
# Kept as a DataFrame (no .values), so the column order CRIM, RM is the
# feature order that fit/predict will use.
data2 = df.loc[:, ['CRIM', 'RM']]

In [59]:
# Display the two-feature table (the rendered output elides the middle rows).
data2

Unnamed: 0,CRIM,RM
0,0.00632,6.575
1,0.02731,6.421
2,0.02729,7.185
3,0.03237,6.998
4,0.06905,7.147
...,...,...
501,0.06263,6.593
502,0.04527,6.120
503,0.06076,6.976
504,0.10959,6.794


In [61]:
# Target variable for the two-feature model: PRICE as a 1-D array
target2 = df['PRICE'].to_numpy()

In [62]:
# Separate estimator for the two-feature regression, created with default settings.
model2 = LinearRegression()

In [63]:
# Fit PRICE against the two explanatory variables held in data2.
model2.fit(data2,target2)

LinearRegression()

In [66]:
# Exercise: predict the price for 4 rooms and a crime rate of 0.3.
# model2 was fitted on a DataFrame with columns CRIM, RM, so the query is passed
# as a one-row DataFrame with the same column names. This makes the feature
# order explicit (crime rate first, rooms second) and, on scikit-learn >= 1.0,
# avoids the "X does not have valid feature names" warning that a bare nested
# list triggers. The predicted value is unchanged.
model2.predict(DataFrame([[0.3, 4]], columns=['CRIM', 'RM']))

array([4.24007956])

【考察】
オリジナルのデータを完全に眺めたわけではないが、犯罪率より部屋数のほうが重要度が高いように見える結果となった。
説明変数をさらに追加することで精度を上げられるかどうかは、検証する必要がある。
また、先生はデータを眺めて、PRICEが上限50で打ち切られている（カットされている）可能性に気づかれていた。
改めて特徴量の理解とデータを俯瞰して見ることが重要だと感じた。