# 2. Multiple Linear Regression

### 원-핫 인코딩

In [3]:
import pandas as pd

In [4]:
# Read the CSV once, then split it column-wise: every column except the
# last is a feature, and the last column is the value to predict.
dataset = pd.read_csv('MultipleLinearRegressionData.csv')
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X = feature_frame.values  # independent variables
y = target_series.values  # dependent variable

In [5]:
# Show the raw feature matrix; the last column still holds category strings.
X

array([[0.5, 3, 'Home'],
       [1.2, 4, 'Library'],
       [1.8, 2, 'Cafe'],
       [2.4, 0, 'Cafe'],
       [2.6, 2, 'Home'],
       [3.2, 0, 'Home'],
       [3.9, 0, 'Library'],
       [4.4, 0, 'Library'],
       [4.5, 5, 'Home'],
       [5.0, 1, 'Cafe'],
       [5.3, 2, 'Cafe'],
       [5.8, 0, 'Cafe'],
       [6.0, 3, 'Library'],
       [6.1, 1, 'Cafe'],
       [6.2, 1, 'Library'],
       [6.9, 4, 'Home'],
       [7.2, 2, 'Cafe'],
       [8.4, 1, 'Home'],
       [8.6, 1, 'Library'],
       [10.0, 0, 'Library']], dtype=object)

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical column at index 2 and pass the remaining
# columns through unchanged (the encoded columns come first in the result).
# drop='first' removes the first category in sorted order (Cafe), which
# becomes the all-zero baseline and avoids perfectly collinear dummy columns:
#   1 0 : Home
#   0 1 : Library
#   0 0 : Cafe
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'), [2])], remainder='passthrough')

# Encode from the original `dataset` rather than from X itself: with
# `X = ct.fit_transform(X)`, running this cell a second time would one-hot
# encode column 2 of the already-encoded array (a numeric column) and
# silently corrupt the features. This form is safe to re-run.
X = ct.fit_transform(dataset.iloc[:, :-1].values)
X

array([[1.0, 0.0, 0.5, 3],
       [0.0, 1.0, 1.2, 4],
       [0.0, 0.0, 1.8, 2],
       [0.0, 0.0, 2.4, 0],
       [1.0, 0.0, 2.6, 2],
       [1.0, 0.0, 3.2, 0],
       [0.0, 1.0, 3.9, 0],
       [0.0, 1.0, 4.4, 0],
       [1.0, 0.0, 4.5, 5],
       [0.0, 0.0, 5.0, 1],
       [0.0, 0.0, 5.3, 2],
       [0.0, 0.0, 5.8, 0],
       [0.0, 1.0, 6.0, 3],
       [0.0, 0.0, 6.1, 1],
       [0.0, 1.0, 6.2, 1],
       [1.0, 0.0, 6.9, 4],
       [0.0, 0.0, 7.2, 2],
       [1.0, 0.0, 8.4, 1],
       [0.0, 1.0, 8.6, 1],
       [0.0, 1.0, 10.0, 0]], dtype=object)

### 데이터 세트 분리

In [7]:
from sklearn.model_selection import train_test_split

# Study analogy for the four arrays produced by the split:
#   X_train - practice problems seen while studying
#   y_train - answer key for those practice problems
#   X_test  - exam problems attempted after studying is finished
#   y_test  - the true answers to the exam problems
# test_size=0.2 holds out 20% of the rows; random_state=0 makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts

### 학습 (다중 선형 회귀)

In [8]:
from sklearn.linear_model import LinearRegression

# Create the estimator and train it in one step; fit() returns the
# estimator itself, so `reg` is the fitted model.
reg = LinearRegression().fit(X_train, y_train)
reg  # display the fitted estimator

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


### 예측 값과 실제 값 비교 (테스트 세트)

In [9]:
# Predict the target for the held-out test rows with the fitted model.
y_pred = reg.predict(X_test)
y_pred # the model's predicted values

array([ 92.15457859,  10.23753043, 108.36245302,  38.14675204])

In [10]:
y_test # the actual answers (true target values of the test set)

array([ 90,   8, 100,  38])

In [11]:
# One coefficient per column of the encoded X, in the same order as a row
# such as [1.0, 0.0, 0.5, 3]: the two one-hot columns (Home, Library) first,
# then the two pass-through numeric columns.
reg.coef_

array([-5.82712824, -1.04450647, 10.40419528, -1.64200104])

In [12]:
# Intercept term: the model's prediction when every feature is 0, which
# includes the all-zero one-hot baseline (Cafe).
reg.intercept_

np.float64(5.365006706544804)

### 모델 평가

In [13]:
reg.score(X_train, y_train) # training set (score() returns R2 for a regression model)

0.9623352565265527

In [14]:
reg.score(X_test, y_test) # test set (score() returns R2 for a regression model)

0.9859956178877447

### 다양한 평가 지표 (회귀 모델)

1. MAE (Mean Absolute Error) : (실제 값과 예측 값) 차이의 절대값의 평균
1. MSE (Mean Squared Error) : 차이의 제곱의 평균
1. RMSE (Root Mean Squared Error) : 차이의 제곱의 평균에 루트 (MSE의 제곱근)
1. R2 : 결정 계수

> R2 는 1에 가까울수록, 나머지는 0에 가까울수록 좋음

In [15]:
from sklearn.metrics import mean_absolute_error

# MAE: mean of |actual - predicted| over the test set.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

3.2253285188287757

In [16]:
from sklearn.metrics import mean_squared_error

# MSE: mean of (actual - predicted)^2 over the test set.
mean_squared_error(y_true=y_test, y_pred=y_pred)

19.900226981514795

In [19]:
from sklearn.metrics import root_mean_squared_error

# RMSE: square root of the MSE, expressed in the same units as the target.
root_mean_squared_error(y_true=y_test, y_pred=y_pred)

4.460967045553553

In [22]:
from sklearn.metrics import r2_score

# R2 (coefficient of determination). In scikit-learn, a regression model's
# .score() method computes R2 by default, so this matches reg.score(X_test, y_test).
r2_score(y_true=y_test, y_pred=y_pred)

0.9859956178877447