# 다중 선형회귀분석

## Boston Housing Dataset

![title](img/house_001.png)

In [1]:
# 필요 라이브러리 로드
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the Boston housing dataset through scikit-learn.
# NOTE(review): load_boston is deprecated and missing from newer scikit-learn
# releases — confirm the installed version still provides it.
from sklearn.datasets import load_boston

boston_dataset = load_boston()
# Wrap the feature matrix in a DataFrame, using the dataset's feature names as column labels.
boston = pd.DataFrame(boston_dataset.data, columns=boston_dataset.feature_names)

## 데이터 준비

In [3]:
# Preview the first five rows of the feature table.
boston.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [4]:
# Check the basic summary statistics of every feature column.
boston.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [5]:
# Select the two predictor columns (LSTAT, RM) as an (n, 2) array and the
# dataset's target values as an (n, 1) column vector.
X = boston[['LSTAT', 'RM']].values.reshape([-1, 2]) # independent variables
y = boston_dataset.target.reshape([-1, 1]) # dependent variable

In [6]:
from sklearn.model_selection import train_test_split
# Split into training and test data: 33% is held out for testing, and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [7]:
# Report the shape of each split; one (message template, array) pair per line of output.
for template, split in (
    ('X_train shape: {}', X_train),
    ('y_train shape: {}', y_train),
    ('X_test shape: {}', X_test),
    ('y_test shape: {}', y_test),
):
    print(template.format(split.shape))

X_train shape: (339, 2)
y_train shape: (339, 1)
X_test shape: (167, 2)
y_test shape: (167, 1)


In [8]:
def weight_variable(shape):
    """Create a zero-initialised TensorFlow variable named 'weight'.

    Args:
        shape: Shape passed to ``tf.zeros`` to build the initial value.

    Returns:
        A ``tf.Variable`` whose initial value is a zero tensor of ``shape``.
    """
    return tf.Variable(tf.zeros(shape), name='weight')

def bias_variable(shape):
    """Create a zero-initialised TensorFlow variable named 'bias'.

    Args:
        shape: Shape passed to ``tf.zeros`` to build the initial value.

    Returns:
        A ``tf.Variable`` whose initial value is a zero tensor of ``shape``.
    """
    return tf.Variable(tf.zeros(shape), name='bias')

위의 경우에서의 행렬연산은 아래와 같습니다

$$
X = \begin{pmatrix}LSTAT_{1} & RM_{1}\\LSTAT_{2} & RM_{2}\\. & .\\. & .\\. & .\\LSTAT_{339} & RM_{339}\end{pmatrix}, 
Y = \begin{pmatrix}y_{1}\\y_{2}\\.\\.\\.\\y_{339}\end{pmatrix}, 
W = \begin{pmatrix}w_{1}&w_{2}\end{pmatrix}, 
W^T = \begin{pmatrix}w_{1}\\w_{2}\end{pmatrix}, 
b = b
$$

$$
Y = XW^{T} + b
$$

$$
\begin{pmatrix}y_{1}\\y_{2}\\.\\.\\.\\y_{339}\end{pmatrix} = \begin{pmatrix}LSTAT_{1} & RM_{1}\\LSTAT_{2} & RM_{2}\\. & .\\. & .\\. & .\\LSTAT_{339} & RM_{339}\end{pmatrix}\begin{pmatrix}w_{1}\\w_{2}\end{pmatrix} + b = \begin{pmatrix}LSTAT_{1}w_{1} + RM_{1}w_{2} + b\\LSTAT_{2}w_{1} + RM_{2}w_{2} + b\\.\\.\\.\\LSTAT_{339}w_{1} + RM_{339}w_{2} + b\end{pmatrix}
$$

위의 연산식에 따라 변수 행렬 모양을 정의합니다

In [9]:
# Build the model graph: parameters, input placeholders and the prediction op.
with tf.name_scope('MultiLinearRegression'):
    # w is a 1x2 row of weights and b a single bias, both created by the helper functions.
    w = weight_variable([1, 2])
    b = bias_variable([1])

    # Placeholders for a batch of two-feature inputs and the matching one-column targets.
    x = tf.placeholder(tf.float32, shape=[None, 2])
    y_t = tf.placeholder(tf.float32, shape=[None, 1])
    # Transpose w with tf.transpose() so that x (n x 2) times w^T (2 x 1) gives an n x 1 prediction.
    # NOTE: this rebinds `y`, which the data-preparation cell used for the target array.
    y = tf.matmul(x, tf.transpose(w)) + b

손실함수(loss function)는 평균제곱오차(MSE, Mean Squared Error)를 활용합니다.

$$MSE = \frac{1}{N} \sum_{n=1}^{N} (t_{n} - y_{n})^2$$

위 손실함수를 최소로 하는 $w$와 $b$를 찾아내는 것이 학습의 과정입니다

$$\underset{w, b}{\operatorname{argmin}}{MSE}$$

In [10]:
# Define the training objective and the optimisation step.
with tf.name_scope('loss'):
    # Mean of the squared differences between the targets and the predictions.
    loss = tf.reduce_mean(tf.square(y_t - y))
    tf.summary.scalar('MSE', loss) # value to inspect in TensorBoard
    # Gradient-descent op that minimises the loss with a learning rate of 0.001.
    train_step = tf.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss)

In [11]:
# Training setup: number of full-batch gradient-descent steps and the
# op that initialises every variable in the graph.
epochs = 10000
init = tf.global_variables_initializer()

with tf.Session() as sess:
    # Initialise the variables inside this session.
    sess.run(init)

    # Separate event writers so the train and test curves are logged under
    # different sub-directories of ./board.
    train_writer = tf.summary.FileWriter('./board/train', sess.graph)
    test_writer = tf.summary.FileWriter('./board/test', sess.graph)

    # try/finally guarantees both writers are closed even when a step raises
    # or the cell is interrupted part-way through the loop.
    try:
        merged_summary = tf.summary.merge_all()

        for epoch in range(epochs):
            # Train: run one optimisation step on the training data and log its summary.
            train_summary, _ = sess.run(
                [merged_summary, train_step],
                feed_dict={x: X_train, y_t: y_train},
            )
            train_writer.add_summary(train_summary, epoch)

            # Test: only the summary is evaluated (train_step is not run),
            # so the test data never updates the parameters.
            test_summary = sess.run(
                merged_summary,
                feed_dict={x: X_test, y_t: y_test},
            )
            test_writer.add_summary(test_summary, epoch)
    finally:
        train_writer.close()
        test_writer.close()

In [12]:
# Launch TensorBoard on the ./board log directory; the command keeps running
# until it is interrupted (CTRL+C).
!tensorboard --logdir=./board

TensorBoard 1.14.0 at http://gwonhyeogmin-ui-MacBookPro.local:6006/ (Press CTRL+C to quit)
^C


In [13]:
# Delete the TensorBoard log files under ./board
!rm -rf ./board/*

출처
* https://towardsdatascience.com/linear-regression-on-boston-housing-dataset-f409b7e4a155