In [24]:
import pandas as pd
from sklearn import datasets
# Load the Boston house-prices dataset that ships with scikit-learn.
# NOTE(review): datasets.load_boston was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this cell only runs on older releases — confirm the
# pinned scikit-learn version before re-running the notebook.
boston_data = datasets.load_boston()

In [25]:
# List the fields exposed by the loaded dataset object.
print(boston_data.keys())

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])


In [26]:
# Dimensions of the feature matrix as (rows, columns).
print(boston_data.data.shape)

(506, 13)


In [27]:
# Names of the feature columns, in the same order as the columns of boston_data.data.
print(boston_data.feature_names)

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [28]:
# Print the dataset's bundled description text (attribute meanings, sizes, etc.).
print(boston_data.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [29]:
# Wrap the raw feature matrix in a DataFrame; columns are still positional
# integers at this point and are named in a later cell.
house = pd.DataFrame(data=boston_data.data)
house.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [34]:
# Label the columns with the dataset's feature names. These are the same names
# reported by boston_data.feature_names, written out explicitly here.
house.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]
house.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [40]:
# Append the regression target (median home value) as the MEDV column,
# then preview the first five values of that column.
house['MEDV'] = boston_data.target
house['MEDV'].head(n=5)

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: MEDV, dtype: float64

In [42]:
# Preview the frame now that the MEDV target column has been added.
house.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [43]:
# Confirm the frame dimensions (features plus the MEDV column).
house.shape

(506, 14)

In [44]:
# Pairwise scatter plots of selected features against MEDV.
# Observation: LSTAT and INDUS show an exponentially decreasing relationship
# with MEDV, while RM shows a roughly linear relationship.

import plotly.express as px

fig = px.scatter_matrix(
    house,
    dimensions=['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV'],
    title='Scatter Matrix',
)
fig.show()

In [45]:
# Pairwise scatter plots for the second group of features against MEDV.
fig = px.scatter_matrix(
    house,
    dimensions=['ZN', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'MEDV'],
    title='Scatter Matrix',
)
fig.show()

In [47]:
# Correlation-coefficient matrices between the dependent variable MEDV
# (house price) and the explanatory variables.

import numpy as np

# col1 / col2 were referenced here without being defined anywhere in the
# notebook, so the cell raised NameError on a fresh kernel. They are defined
# in-cell so it runs under Restart & Run All.
# NOTE(review): the column groups are assumed to mirror the two scatter
# matrices plotted earlier — confirm against the author's intent.
col1 = ['LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV']
col2 = ['ZN', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'MEDV']

# np.corrcoef treats each row as one variable, hence the transpose of the
# (samples x variables) array.
cm1 = np.corrcoef(house[col1].values.T)
cm2 = np.corrcoef(house[col2].values.T)
print(cm1)
print(cm2)

In [48]:
# Log-transform selected variables.
#
# LSTAT and INDUS appear to have an exponential relationship with y, so
# log-transformed versions of those features are added as new columns.

import numpy as np

for source_col, log_col in (('LSTAT', 'LLSTAT'), ('INDUS', 'LINDUS')):
    house[log_col] = np.log(house[source_col])
house.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV,LLSTAT,LINDUS
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0,1.60543,0.837248
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6,2.21266,1.95586
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7,1.393766,1.95586
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4,1.07841,0.779325
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2,1.673351,0.779325


In [49]:
# Define the target y and the feature matrix X. Later cells split them into
# train/test sets and fit a LinearRegression model, whose coefficients and
# intercept are printed through its coef_ and intercept_ attributes.
#
# The raw LSTAT and INDUS columns are dropped because their log-transformed
# counterparts (LLSTAT, LINDUS) are used instead; MEDV is dropped because it
# is the target.
# NOTE: `house` is rebound to the reduced frame, so re-running this cell
# without re-running the earlier ones fails (MEDV is no longer present).

y = house['MEDV'].values
house = house.drop(columns=['LSTAT', 'INDUS', 'MEDV'])
X = house.values

In [53]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# Ordinary least-squares multiple linear regression on the training split.
mlr = LinearRegression()
mlr.fit(X_train, y_train)

LinearRegression()

In [54]:
# Inspect the estimates - coefficients (one per column of X, in column order).

print('Slope:', mlr.coef_)

Slope: [-1.36676828e-01  3.13177997e-02  2.52393199e+00 -1.70295629e+01
  1.23977704e+00  3.06818458e-02 -1.28840466e+00  2.61968148e-01
 -6.58141653e-03 -8.27862485e-01  4.90558897e-03 -9.97211822e+00
 -6.04522425e-01]


In [55]:
# Inspect the estimates - intercept of the fitted regression.

print('Intercept:', mlr.intercept_)

Intercept: 65.69779117501659


In [56]:
# Residual plot.
#
# Apply the fitted regression model to the training and test data and plot
# the residuals (prediction minus actual value) against the predicted values.
# Original note: blue = training residuals, red = test residuals. No colours
# are set explicitly here, so this presumably refers to Plotly's default
# colours for the first and second trace.

y_train_pred = mlr.predict(X_train)
y_test_pred = mlr.predict(X_test)

import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_train_pred, y=y_train_pred-y_train, mode='markers', name='Training data'))
fig.add_trace(go.Scatter(x=y_test_pred, y=y_test_pred-y_test, mode='markers', name='Test data'))
fig.update_layout(width=600, height=400, title_text='Residual Plots versus predicted values', title_x=0.5)
# The x axis carries the predicted values and the y axis the residuals; the
# two axis titles were previously swapped.
fig.update_xaxes(title_text='predicted')
fig.update_yaxes(title_text='residuals')
fig.show()

In [57]:
# Model MSE (mean squared error) on the training and test splits.

from sklearn.metrics import mean_squared_error

mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print('MSE train: %.3f, test: %.3f' % (mse_train, mse_test))

MSE train: 18.139, test: 17.416


In [58]:
# Model R^2 (coefficient of determination) on the training and test splits.

from sklearn.metrics import r2_score

r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print('R^2 train: %.3f, test: %.3f' % (r2_train, r2_test))

R^2 train: 0.777, test: 0.810


In [59]:
# Robust regression with RANSAC.

from sklearn.linear_model import RANSACRegressor

# The loss is left at its default, which is the absolute error. The original
# passed loss='absolute_loss', a spelling scikit-learn renamed to
# 'absolute_error' in 1.0 and removed in 1.2. The default is that same loss
# under either name, so omitting the argument keeps the behaviour and avoids
# the version-specific string.
rans = RANSACRegressor(max_trials=100, min_samples=45, residual_threshold=5.0, random_state=1)

# NOTE(review): the model is fitted on the full X, y (not only the training
# split), so inlier_mask_ covers every row of the dataset. As a consequence
# the predictions on X_test below are not out-of-sample — confirm this is
# intended before using them for evaluation.
rans.fit(X, y)

# These reuse the names that held the linear-regression predictions.
y_train_pred = rans.predict(X_train)
y_test_pred = rans.predict(X_test)

In [62]:
# Boolean mask of the samples RANSAC classified as inliers, and its
# element-wise negation marking the outliers.
inlier_mask = rans.inlier_mask_
outlier_mask = ~inlier_mask

In [63]:
# Show which samples RANSAC kept as inliers (True) versus rejected (False).
print(inlier_mask)

[False  True  True  True False  True  True False  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True False False False  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True False  True  True False  True  True  True  True
  True  True False  True False  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
 False  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True False  True  True  True  True  True  True  True
  True False  True  True  True False False False  T

In [64]:
# Show the complementary mask: True marks samples flagged as outliers.
print(outlier_mask)

[ True False False False  True False False  True False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False  True False False False False False False False
 False False False False False False False False False  True False False
 False False False False  True  True  True False False False False False
 False False False False False False False False False False False False
 False False False False  True False False  True False False False False
 False False  True False  True False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
  True False False False False False False False False  True False False
 False False False False  True False False False False False False False
 False  True False False False  True  True  True Fa

In [65]:
# Support Vector Regression: one model with a linear kernel (svl) and one
# with an RBF kernel (svr), both fitted on the training split.

from sklearn.svm import SVR

# Linear-kernel model: fit, then predict on both splits.
svl = SVR(kernel='linear', C=1.0, epsilon=0.1)
svl.fit(X_train, y_train)
y_train_predsvl = svl.predict(X_train)
y_test_predsvl = svl.predict(X_test)

# RBF-kernel model: fit, then predict on both splits.
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1, gamma='scale')
svr.fit(X_train, y_train)
y_train_predsvr = svr.predict(X_train)
y_test_predsvr = svr.predict(X_test)

In [67]:
# Model MSE for the two SVR models.
# Suffix _l = linear kernel, _n = RBF (non-linear) kernel.

from sklearn.metrics import mean_squared_error

mse_l_train = mean_squared_error(y_train, y_train_predsvl)
mse_l_test = mean_squared_error(y_test, y_test_predsvl)
mse_n_train = mean_squared_error(y_train, y_train_predsvr)
mse_n_test = mean_squared_error(y_test, y_test_predsvr)

# Printed in the order: linear train, linear test, RBF train, RBF test.
for mse_value in (mse_l_train, mse_l_test, mse_n_train, mse_n_test):
    print(mse_value)

22.269701769971398
16.382339448697692
66.44380086215858
75.25569380095273


In [69]:
# Model R^2 for the two SVR models, computed on the training split only.
# R2_l = linear kernel, R2_n = RBF kernel.

from sklearn.metrics import r2_score

R2_l = r2_score(y_train, y_train_predsvl)
R2_n = r2_score(y_train, y_train_predsvr)

for r2_value in (R2_l, R2_n):
    print(r2_value)

0.7257056445901271
0.1816163630425145
