In [1]:
from sklearn.datasets import load_boston
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

## Data Load

In [4]:
boston = load_boston()
x_data = boston.data
y_data = boston.target.reshape(boston.target.size, 1)

x_data[:3], y_data

(array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
         6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
         1.5300e+01, 3.9690e+02, 4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
         6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
         1.7800e+01, 3.9690e+02, 9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
         7.1850e+00, 6.1100e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
         1.7800e+01, 3.9283e+02, 4.0300e+00]]), array([[24. ],
        [21.6],
        [34.7],
        [33.4],
        [36.2],
        [28.7],
        [22.9],
        [27.1],
        [16.5],
        [18.9],
        [15. ],
        [18.9],
        [21.7],
        [20.4],
        [18.2],
        [19.9],
        [23.1],
        [17.5],
        [20.2],
        [18.2],
        [13.6],
        [19.6],
        [15.2],
        [14.5],
        [15.6],
        [13.9],
        [16.6],
        [14.8],

In [7]:
boston["DESCR"]

".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate per $10,000

In [10]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

## Data Scaling

In [14]:
from sklearn import preprocessing

In [19]:
std_scale = preprocessing.StandardScaler().fit(x_data) # std, minmax 등 scaler 원하는 것 선택
minmax_scale = preprocessing.MinMaxScaler().fit(x_data)
x_scaled_data = minmax_scale.transform(x_data)
x_scaled_data[:3]

array([[0.00000000e+00, 1.80000000e-01, 6.78152493e-02, 0.00000000e+00,
        3.14814815e-01, 5.77505269e-01, 6.41606591e-01, 2.69203139e-01,
        0.00000000e+00, 2.08015267e-01, 2.87234043e-01, 1.00000000e+00,
        8.96799117e-02],
       [2.35922539e-04, 0.00000000e+00, 2.42302053e-01, 0.00000000e+00,
        1.72839506e-01, 5.47997701e-01, 7.82698249e-01, 3.48961980e-01,
        4.34782609e-02, 1.04961832e-01, 5.53191489e-01, 1.00000000e+00,
        2.04470199e-01],
       [2.35697744e-04, 0.00000000e+00, 2.42302053e-01, 0.00000000e+00,
        1.72839506e-01, 6.94385898e-01, 5.99382080e-01, 3.48961980e-01,
        4.34782609e-02, 1.04961832e-01, 5.53191489e-01, 9.89737254e-01,
        6.34657837e-02]])

## Data Split

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(x_scaled_data, y_data, test_size=0.33)

In [23]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((339, 13), (167, 13), (339, 1), (167, 1))

## Linear Regression Fitting

In [25]:
from sklearn import linear_model

In [26]:
regr = linear_model.LinearRegression(fit_intercept=True, normalize=False, copy_X=True, n_jobs=1)
# n_jobs 몇개의 컴퓨터(코어)를 쓸거냐. (data가 많을때)
regr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [28]:
print('Coeff.:', regr.coef_)
print('intercept:', regr.intercept_)

Coeff.: [[-10.70191764   7.07045751   0.35606448   3.35542289 -10.46943089
   13.756193     2.74460079 -19.41569992   9.11373921  -8.46045831
   -9.26746628   3.6401107  -23.3681474 ]]
intercept: [30.72216769]


## Prediction

In [33]:
regr.predict(x_data[:5])

array([[-965.49014142],
       [-726.51435637],
       [-660.26232229],
       [-525.763947  ],
       [-548.63899494]])

## Measure Metric

In [35]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [37]:
y_hat = regr.predict(X_test)
y_hat

array([[31.52249898],
       [21.28159549],
       [19.8874996 ],
       [22.48328904],
       [29.81440949],
       [34.20919751],
       [30.32414623],
       [26.85624372],
       [20.95169333],
       [22.19388138],
       [19.18665291],
       [16.85064856],
       [18.2108852 ],
       [ 7.192175  ],
       [10.02255616],
       [30.37187033],
       [26.88984297],
       [18.50674204],
       [19.82051769],
       [35.53085073],
       [23.55952908],
       [18.39008778],
       [23.19520475],
       [19.70402454],
       [27.78436574],
       [22.87977262],
       [10.67496311],
       [36.57127937],
       [31.04100945],
       [38.68779497],
       [ 8.0345997 ],
       [19.919334  ],
       [26.10446937],
       [21.62790218],
       [19.81574712],
       [28.64445863],
       [20.59620617],
       [20.15989742],
       [13.65765779],
       [13.89055718],
       [10.08632417],
       [14.27652686],
       [19.38619198],
       [29.73293629],
       [23.93905552],
       [15

In [39]:
print(r2_score(y_test, y_hat))
print(mean_absolute_error(y_test, y_hat))
print(mean_squared_error(y_test, y_hat))

0.7410664065370167
3.411581695640409
21.803631502012976
