# Lab: Linear Regression

In [None]:
!pip install scikit-learn pandas

In [None]:
from IPython.display import Image

# 주택 데이터셋 탐색

## 데이터프레임으로 주택 데이터셋 읽기

The dataset description below was previously available from the UCI Machine Learning Repository at: [https://archive.ics.uci.edu/ml/datasets/Housing](https://archive.ics.uci.edu/ml/datasets/Housing)

Attributes:
    
<pre>
1. CRIM      per capita crime rate by town
2. ZN        proportion of residential land zoned for lots over 
                 25,000 sq.ft.
3. INDUS     proportion of non-retail business acres per town
4. CHAS      Charles River dummy variable (= 1 if tract bounds 
                 river; 0 otherwise)
5. NOX       nitric oxides concentration (parts per 10 million)
6. RM        average number of rooms per dwelling
7. AGE       proportion of owner-occupied units built prior to 1940
8. DIS       weighted distances to five Boston employment centres
9. RAD       index of accessibility to radial highways
10. TAX      full-value property-tax rate per $10,000
11. PTRATIO  pupil-teacher ratio by town
12. B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks 
                 by town
13. LSTAT    % lower status of the population
14. MEDV     Median value of owner-occupied homes in $1000s
</pre>

In [None]:
import pandas as pd

# The file has no header row and its fields are separated by runs of
# whitespace. The separator is a regular expression, so it is written as a raw
# string: '\s' in a normal string literal is an invalid escape sequence and
# triggers a warning on recent Python versions.
df = pd.read_csv('https://raw.githubusercontent.com/rasbt/'
                 'python-machine-learning-book-3rd-edition/'
                 'master/ch10/housing.data.txt',
                 header=None,
                 sep=r'\s+')

# Name the 14 columns in the order given by the attribute list above.
df.columns = ['CRIM', 'ZN', 'INDUS', 'CHAS',
              'NOX', 'RM', 'AGE', 'DIS', 'RAD',
              'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df.head()

<br>
<br>

In [None]:
# Show the last rows of the frame, complementing the df.head() preview above.
df.tail()

## 데이터셋의 중요 특징 시각화

`mlxtend`를 설치합니다.

In [None]:
!pip install --upgrade mlxtend

In [None]:
import matplotlib.pyplot as plt
from mlxtend.plotting import scatterplotmatrix

In [None]:
# Exercise: draw a scatterplot matrix using the variables
# 'CRIM', 'LSTAT', 'INDUS', 'NOX', 'RM', 'MEDV'.

# TODO: assign a list with the names of the columns to analyse. The list is
# used below both to select columns from df and as the axis names, and the
# correlation-heatmap cell that follows reuses it.
cols = 

scatterplotmatrix(df[cols].values, figsize=(10, 8), 
                  names=cols, alpha=0.5)
plt.tight_layout()
# plt.savefig('images/10_03.png', dpi=300)
plt.show()

In [None]:
import numpy as np
from mlxtend.plotting import heatmap

# np.corrcoef treats each row as one variable, so the (samples x columns)
# selection is transposed before the correlation matrix is computed.
variables_as_rows = df[cols].values.T
cm = np.corrcoef(variables_as_rows)

# Render the matrix with the column names on both axes.
hm = heatmap(cm,
             row_names=cols,
             column_names=cols)

# plt.savefig('images/10_04.png', dpi=300)
plt.show()

<br>
<br>

## Simple Linear Regression

In [None]:
# Choose the two variables for the simple linear regression.
# X = input
# y = output (house price)
# NOTE(review): the axis labels in the plotting cells further down suggest
# RM as the input and MEDV as the output — confirm.
# TODO: fill in both assignments.

X = 
y = 

In [None]:
from sklearn.preprocessing import StandardScaler


# One scaler per variable, so the input and the output are standardized
# independently.
sc_x = StandardScaler()
sc_y = StandardScaler()

# TODO: use fit_transform to produce the standardized versions of X and y.
# NOTE(review): presumably sc_x is meant for X and sc_y for y, and y may need
# to be 2-D before it can be scaled — verify.
X_std = 
y_std = 

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
slr = LinearRegression()

# TODO: fit the model here.
# NOTE(review): a later plotting cell passes X, y and slr together, so this
# model is presumably fitted on the unscaled X and y — confirm.

# TODO: store the model's predictions.
y_pred = 

# Print the first fitted coefficient (label: slope) and the intercept.
print('기울기: %.3f' % slr.coef_[0])
print('절편: %.3f' % slr.intercept_)

In [None]:
slr_std = LinearRegression()

# TODO: fit the model here.
# NOTE(review): a later plotting cell passes X_std, y_std and slr_std
# together, so this model is presumably fitted on the standardized data —
# confirm.

# TODO: store the model's predictions.
y_pred_std = 

# Print the first fitted coefficient (label: slope) and the intercept.
print('기울기: %.3f' % slr_std.coef_[0])
print('절편: %.3f' % slr_std.intercept_)

In [None]:
# Preview only the first rows instead of dumping the whole array.
X[:5]

In [None]:
# Preview only the first rows instead of dumping the whole array.
X_std[:5]

In [None]:
def lin_regplot(X, y, model):
    """Plot the observations and the model's predictions with pyplot.

    Parameters
    ----------
    X : values used as x coordinates and passed to ``model.predict``.
    y : observed values plotted against ``X`` as scatter points.
    model : object exposing ``predict(X)``; its output is drawn as a
        black line over the scatter points.

    Returns
    -------
    None
    """
    # Observations first, as blue markers with a white outline.
    plt.scatter(X, y, c='steelblue', edgecolor='white', s=70)

    # Then the model's predictions for the same X values, as a line.
    fitted = model.predict(X)
    plt.plot(X, fitted, color='black', lw=2)

In [None]:
# Scatter of X against y with the predictions of slr drawn on top.
lin_regplot(X, y, slr)

# Label the axes through the current Axes object.
ax = plt.gca()
ax.set_xlabel('Average number of rooms [RM]')
ax.set_ylabel('Price in $1000s [MEDV]')

# plt.savefig('images/10_07.png', dpi=300)
plt.show()

In [None]:
# Same plot for the standardized variables and the model fitted on them.
# The axes no longer show rooms or $1000s but standardized values, so the
# labels say so explicitly.
lin_regplot(X_std, y_std, slr_std)
plt.xlabel('Average number of rooms [RM] (standardized)')
plt.ylabel('Price in $1000s [MEDV] (standardized)')

# plt.savefig('images/10_07.png', dpi=300)
plt.show()

## Multiple Linear Regression

In [None]:
from sklearn.model_selection import train_test_split

# NOTE: this cell rebinds X and y, which the simple-regression cells above
# also use; re-running those cells after this one will pick up the new values.

# TODO: set X to all of the explanatory variables.
X = 
# TODO: set y to the house price.
y = 

# TODO: split into training and test data.
# NOTE(review): consider passing a fixed random_state so the split (and the
# metrics computed later) are reproducible — confirm with the lab author.
X_train, X_test, y_train, y_test = 

In [None]:
# NOTE: rebinds slr, replacing the simple-regression model created earlier.
slr = LinearRegression()

# TODO: fit slr, then store its predictions for each split.
# NOTE(review): presumably the model is fitted on the training split only and
# the two predictions are for X_train and X_test respectively — confirm.

y_train_pred = 
y_test_pred = 

In [None]:
# Residual plot: predicted value on the x axis, (prediction - actual) on the
# y axis, drawn once for the training split and once for the test split.
residual_series = (
    (y_train_pred, y_train, 'steelblue', 'o', 'Training data'),
    (y_test_pred, y_test, 'limegreen', 's', 'Test data'),
)
for predicted, actual, colour, shape, legend_text in residual_series:
    plt.scatter(predicted, predicted - actual,
                c=colour, marker=shape, edgecolor='white',
                label=legend_text)

plt.xlabel('Predicted values')
plt.ylabel('Residuals')
plt.legend(loc='upper left')

# Zero-residual reference line spanning the fixed x range of the plot.
x_low, x_high = -10, 50
plt.hlines(y=0, xmin=x_low, xmax=x_high, color='black', lw=2)
plt.xlim([x_low, x_high])
plt.tight_layout()

# plt.savefig('images/10_09.png', dpi=300)
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Mean squared error on each split (printed labels: train MSE / test MSE).
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print('훈련 MSE: %.3f, 테스트 MSE: %.3f' % (mse_train, mse_test))

# Coefficient of determination on each split (labels: train R^2 / test R^2).
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print('훈련 R^2: %.3f, 테스트 R^2: %.3f' % (r2_train, r2_test))