# 선형 회귀식의 계수를 찾는 법 - OLS VS. SGD
- 보스턴 집값 데이터 활용(RM VS Price)

### 필요한 모듈 import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# 필요한 라이브러리 import 
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor # 확률적 배치 경사하강법
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# 1. LinearRegression 모델을 사용한 경우

In [2]:
# Ordinary least squares baseline: fit price against the single RM feature
# and report the fitted line plus hold-out error metrics.
#
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs scikit-learn < 1.2 to run as written.
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the data set and wrap the feature matrix in a labelled DataFrame.
boston = load_boston()
df = pd.DataFrame(boston.data, columns=boston.feature_names)

# Keep RM as a one-column DataFrame (estimators expect 2-D input);
# the target array is used as-is.
X = df[['RM']]
y = boston.target

# Hold out 30% of the rows for evaluation and train on the remaining 70%
# (test_size=0.3). random_state pins the split so results are repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

# Create the model and fit it on the training split.
reg = LinearRegression()
reg.fit(X_train, y_train)

# Slope and intercept: attributes with a trailing underscore are the ones
# determined by fitting.
print(reg.coef_, reg.intercept_)

# Fitted regression equation. "{:2f}" is a minimum-width spec (not ".2f"),
# so the slope is printed with the default six decimal places.
print("y = {:2f}X + {:.3f}".format(reg.coef_[0], reg.intercept_))

# Predict on the held-out rows.
y_pred = reg.predict(X_test)

# MSE, RMSE and R^2 on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE:", np.round(mse, 3))
print("RMSE: ", np.round(rmse, 3))
print("R2: ", np.round(r2, 3))

[8.46109164] -30.571032410898336
y = 8.461092X + -30.571
MSE: 36.517
RMSE:  6.043
R2:  0.602


# 2. SGDRegressor

In [5]:
# SGD baseline: same data as the OLS cell, default hyperparameters.
# Relies on X_train / X_test / y_train / y_test and np from earlier cells.
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# No random_state is given, so the fitted numbers can differ between runs.
# fit() returns the estimator itself, so construction and training chain.
reg = SGDRegressor().fit(X_train, y_train)

# Slope and intercept: attributes with a trailing underscore are the ones
# determined by fitting.
print(reg.coef_, reg.intercept_)

# Fitted regression equation. SGDRegressor stores the intercept in a
# one-element array (not a scalar), hence intercept_[0].
print("y = {:2f}X + {:.3f}".format(reg.coef_[0], reg.intercept_[0]))

# Predict on the held-out rows and score the predictions.
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report MSE, RMSE and R^2 rounded to three decimals.
for label, score in (("MSE:", mse), ("RMSE: ", rmse), ("R2: ", r2)):
    print(label, np.round(score, 3))

[4.14637016] [-3.12681598]
y = 4.146370X + -3.127
MSE: 55.396
RMSE:  7.443
R2:  0.396


# 2-1. SGDRegressor with Hyperparameters

In [6]:
# SGD with explicit hyperparameters: a constant learning rate of 0.001 and
# at most 1000 passes over the training data.
# Relies on X_train / X_test / y_train / y_test and np from earlier cells.
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# fit() returns the estimator itself, so construction and training chain.
reg = SGDRegressor(
    max_iter=1000, eta0=0.001, learning_rate='constant'
).fit(X_train, y_train)

# Slope and intercept: attributes with a trailing underscore are the ones
# determined by fitting.
print(reg.coef_, reg.intercept_)

# Fitted regression equation. SGDRegressor stores the intercept in a
# one-element array (not a scalar), hence intercept_[0].
print("y = {:2f}X + {:.3f}".format(reg.coef_[0], reg.intercept_[0]))

# Predict on the held-out rows and score the predictions.
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report MSE, RMSE and R^2 rounded to three decimals.
for label, score in (("MSE:", mse), ("RMSE: ", rmse), ("R2: ", r2)):
    print(label, np.round(score, 3))

[4.0602922] [-2.15608891]
y = 4.060292X + -2.156
MSE: 56.36
RMSE:  7.507
R2:  0.385


# 3. SGDRegressor with Scaling

In [10]:
# SGD on a manually standardized feature.
# Relies on X_train / X_test / y_train / y_test and np from earlier cells.
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Standardization statistics come from the training split only and are
# reused for the test split.
train_mean = np.mean(X_train, axis=0)
train_std = np.std(X_train, axis=0)

# NOTE: X_train and X_test are overwritten here, so every cell executed
# after this one sees the standardized values rather than the raw feature.
X_train, X_test = (
    (X_train - train_mean) / train_std,
    (X_test - train_mean) / train_std,
)

# Same hyperparameters as the unscaled run; fit() returns the estimator.
reg = SGDRegressor(
    max_iter=1000, eta0=0.001, learning_rate='constant'
).fit(X_train, y_train)

# Slope and intercept: attributes with a trailing underscore are the ones
# determined by fitting.
print(reg.coef_, reg.intercept_)

# Fitted regression equation. SGDRegressor stores the intercept in a
# one-element array (not a scalar), hence intercept_[0].
print("y = {:2f}X + {:.3f}".format(reg.coef_[0], reg.intercept_[0]))

# Predict on the held-out rows and score the predictions.
y_pred = reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Report MSE, RMSE and R^2 rounded to three decimals.
for label, score in (("MSE:", mse), ("RMSE: ", rmse), ("R2: ", r2)):
    print(label, np.round(score, 3))

[5.81599757] [22.36961368]
y = 5.815998X + 22.370
MSE: 36.643
RMSE:  6.053
R2:  0.6


# 4. SGDRegressor with StandardScaler()

In [12]:
# SGD on a feature standardized with scikit-learn's StandardScaler.
# Relies on X_train / X_test / y_train / y_test and np from earlier cells.
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Standardize: fit the scaler on the training split, then apply the same
# fitted statistics to the test split (transform only, no second fit).
# The result must be bound to X_train; the previous misspelled target
# ("X_trian") left the model training on data this scaler never touched.
# NOTE(review): if the manual-scaling cell above has already run, X_train
# and X_test are already standardized and this step changes them very little.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Constant learning rate of 0.001 with a much larger iteration cap and an
# explicit stopping tolerance.
reg = SGDRegressor(max_iter=1000000, eta0=0.001, tol=0.0001, learning_rate='constant')

# Train the model.
reg.fit(X_train, y_train)

# Slope and intercept: attributes with a trailing underscore are the ones
# determined by fitting.
print(reg.coef_, reg.intercept_)

# Fitted regression equation. SGDRegressor stores the intercept in a
# one-element array (not a scalar), hence intercept_[0].
print("y = {:2f}X + {:.3f}".format(reg.coef_[0], reg.intercept_[0]))

# Predict on the held-out rows.
y_pred = reg.predict(X_test)

# MSE, RMSE and R^2 on the held-out rows.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("MSE:", np.round(mse, 3))
print("RMSE: ", np.round(rmse, 3))
print("R2: ", np.round(r2, 3))

[5.87979881] [22.34287955]
y = 5.879799X + 22.343
MSE: 36.437
RMSE:  6.036
R2:  0.602
