In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

plt.rc('font',family='Malgun Gothic')
plt.rc('axes', unicode_minus=False)

Target Data

-1978 보스턴 주택 가격
-506개 타운의 주택 가격 중앙값 (단위 1,000 달러)

Feature Data

- CRIM: 범죄율
- INDUS: 비소매상업지역 면적 비율
- NOX: 일산화질소 농도
- RM: 주택당 방 수
- LSTAT: 인구 중 하위 계층 비율
- B: 인구 중 흑인 비율
- PTRATIO: 학생/교사 비율
- ZN: 25,000 평방피트를 초과 거주지역 비율
- CHAS: 찰스강의 경계에 위치한 경우는 1, 아니면 0
- AGE: 1940년 이전에 건축된 주택의 비율
- RAD: 방사형 고속도로까지의 거리
- DIS: 직업센터의 거리
- TAX: 재산세율

In [7]:
df = pd.read_csv('dataset/Boston_house.csv')
df.head()

Unnamed: 0,AGE,B,RM,CRIM,DIS,INDUS,LSTAT,NOX,PTRATIO,RAD,ZN,TAX,CHAS,Target
0,65.2,396.9,6.575,0.00632,4.09,2.31,4.98,0.538,15.3,1,18.0,296,0,24.0
1,78.9,396.9,6.421,0.02731,4.9671,7.07,9.14,0.469,17.8,2,0.0,242,0,21.6
2,61.1,392.83,7.185,0.02729,4.9671,7.07,4.03,0.469,17.8,2,0.0,242,0,34.7
3,45.8,394.63,6.998,0.03237,6.0622,2.18,2.94,0.458,18.7,3,0.0,222,0,33.4
4,54.2,396.9,7.147,0.06905,6.0622,2.18,5.33,0.458,18.7,3,0.0,222,0,36.2


In [12]:
boston_data = df.drop(columns='Target')

target = df[['Target']]
x_data = df[['CRIM','RM','LSTAT']]
x_data.head()

Unnamed: 0,CRIM,RM,LSTAT
0,0.00632,6.575,4.98
1,0.02731,6.421,9.14
2,0.02729,7.185,4.03
3,0.03237,6.998,2.94
4,0.06905,7.147,5.33


상수항 결합

회귀분석모형 수식을 간단하게 만들기 위해 다음과 같이 상수항을 독립변수 데이터에 추가하는 것을 상수항 결합(bias augmentation)작업이라고 한다.


상수항 결합을 하게 되면 모든 원소가 1인 벡터가 입력 데이터 행렬에 추가된다.

이렇게 되면 전체 수식이 다음과 같이 상수항이 추가된 가중치 벡터 w와 상수항이 추가된 입력 데이터 벡터 x의 내적으로 간단히 표시된다.

일반적으로 선형회귀모형은 항상 상수항 결합을 하기 때문에 특별히 벡터 기호를 xa 또는 wa라고 표시하지 않아도 상수항 결합이 되어있는 것으로 볼 수 있다.

statsmodels 패키지는 상수항 결합을 위한 add_constant 함수를 제공한다.

In [15]:
# 회귀분석을 하기 위한 B_0, 상수항 추가
x_data1 = sm.add_constant(x_data,has_constant='add')

In [16]:
x_data1

Unnamed: 0,const,CRIM,RM,LSTAT
0,1.0,0.00632,6.575,4.98
1,1.0,0.02731,6.421,9.14
2,1.0,0.02729,7.185,4.03
3,1.0,0.03237,6.998,2.94
4,1.0,0.06905,7.147,5.33
...,...,...,...,...
501,1.0,0.06263,6.593,9.67
502,1.0,0.04527,6.120,9.08
503,1.0,0.06076,6.976,5.64
504,1.0,0.10959,6.794,6.48


In [20]:
# 회귀모델 적합
multi_model = sm.OLS(target,x_data1)
fitted_multi_model = multi_model.fit()

In [21]:
# summary 함수를 통해 OLS 결과 출력
fitted_multi_model.summary()

0,1,2,3
Dep. Variable:,Target,R-squared:,0.646
Model:,OLS,Adj. R-squared:,0.644
Method:,Least Squares,F-statistic:,305.2
Date:,"Tue, 17 Nov 2020",Prob (F-statistic):,1.01e-112
Time:,21:04:12,Log-Likelihood:,-1577.6
No. Observations:,506,AIC:,3163.0
Df Residuals:,502,BIC:,3180.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-2.5623,3.166,-0.809,0.419,-8.783,3.658
CRIM,-0.1029,0.032,-3.215,0.001,-0.166,-0.040
RM,5.2170,0.442,11.802,0.000,4.348,6.085
LSTAT,-0.5785,0.048,-12.135,0.000,-0.672,-0.485

0,1,2,3
Omnibus:,171.754,Durbin-Watson:,0.822
Prob(Omnibus):,0.0,Jarque-Bera (JB):,628.308
Skew:,1.535,Prob(JB):,3.67e-137
Kurtosis:,7.514,Cond. No.,216.0


 위의 sm.OLS(target, x_data1).fit().summary()를 통해 나온 결과는 다중공선성, 오차항의 정규성, 등분산성 등의 기본가정의 문제가 위배된 경우를 처리하여 분석해주지 않는다. 이는 분석가의 역량이며, 다음에 다중공선성을 확인하는 과정을 진행하겠다.

In [25]:
result1 = smf.ols(formula='target ~ CRIM + RM + LSTAT',data=df).fit()

In [26]:
result1.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.646
Model:,OLS,Adj. R-squared:,0.644
Method:,Least Squares,F-statistic:,305.2
Date:,"Tue, 17 Nov 2020",Prob (F-statistic):,1.01e-112
Time:,21:23:14,Log-Likelihood:,-1577.6
No. Observations:,506,AIC:,3163.0
Df Residuals:,502,BIC:,3180.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-2.5623,3.166,-0.809,0.419,-8.783,3.658
CRIM,-0.1029,0.032,-3.215,0.001,-0.166,-0.040
RM,5.2170,0.442,11.802,0.000,4.348,6.085
LSTAT,-0.5785,0.048,-12.135,0.000,-0.672,-0.485

0,1,2,3
Omnibus:,171.754,Durbin-Watson:,0.822
Prob(Omnibus):,0.0,Jarque-Bera (JB):,628.308
Skew:,1.535,Prob(JB):,3.67e-137
Kurtosis:,7.514,Cond. No.,216.0


In [27]:
result2 = smf.ols(formula='target ~ CRIM + ZN + CHAS + NOX + RM + AGE + DIS + RAD + TAX + PTRATIO + B + LSTAT',
                  data=df).fit()

In [28]:
result2.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,117.3
Date:,"Tue, 17 Nov 2020",Prob (F-statistic):,6.42e-136
Time:,21:24:10,Log-Likelihood:,-1498.9
No. Observations:,506,AIC:,3024.0
Df Residuals:,493,BIC:,3079.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,36.3639,5.091,7.143,0.000,26.361,46.366
CRIM,-0.1084,0.033,-3.304,0.001,-0.173,-0.044
ZN,0.0459,0.014,3.368,0.001,0.019,0.073
CHAS,2.7164,0.856,3.173,0.002,1.034,4.399
NOX,-17.4295,3.681,-4.735,0.000,-24.662,-10.197
RM,3.7970,0.416,9.132,0.000,2.980,4.614
AGE,0.0007,0.013,0.053,0.958,-0.025,0.027
DIS,-1.4896,0.195,-7.648,0.000,-1.872,-1.107
RAD,0.2999,0.064,4.710,0.000,0.175,0.425

0,1,2,3
Omnibus:,178.124,Durbin-Watson:,1.079
Prob(Omnibus):,0.0,Jarque-Bera (JB):,784.481
Skew:,1.521,Prob(JB):,4.49e-171
Kurtosis:,8.287,Cond. No.,15000.0


- R-squred, Adj.R-squred : 보통 설명력이라고 말하는 값인데 주어진 데이터를 현재 모형이 얼마나 잘 설명하고 있는지를 나타내는 지수입니다. 단 R-squared는 독립변수가 추가될 수 록 증가하는 값이라 Adj.R-squared 값을 더 많이 봅니다. 업이나 분석자에 따라서 판단하는 기준이 다른데 교육교재나 도서에서나 0.9이상의 값을 볼 수 있고 현실적으로는 0.7 이상인 경우 설명력이 높다고 봅니다. 걔중에는 0.6까지 기준을 낮추는 경우가 있는데 ... 분석자의 판단이겠죠.
- Prob(F-statistics) : 모형에 대한 p-value 로 통상 0.05이하인 경우 통계적으로 유의하다고 판단합니다.
- P>[t] : 각 독립변수의 계수에 대한 p-value로 해당 독립변수가 유의미한지 판단합니다.


Adj.R-squared는 0.734, 모형의 P-value는 0.05이하로 통계적으로 유의미합니다만 AGE 변수에 대한 P-value가 0.954로 유의미하지 않습니다. 즉 AGE 변수는 Target에 영향을 주는 변수라고 볼 수 없습니다. 이 경우 AGE 변수를 제외하고 다시 회귀분석을 수행합니다.

In [29]:
result3 = smf.ols(formula='target ~ CRIM + ZN + CHAS + NOX + RM + DIS + RAD + TAX + PTRATIO + B + LSTAT',
                  data=df).fit()

In [30]:
result3.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.741
Model:,OLS,Adj. R-squared:,0.735
Method:,Least Squares,F-statistic:,128.2
Date:,"Tue, 17 Nov 2020",Prob (F-statistic):,5.54e-137
Time:,21:25:30,Log-Likelihood:,-1498.9
No. Observations:,506,AIC:,3022.0
Df Residuals:,494,BIC:,3072.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,36.3411,5.067,7.171,0.000,26.385,46.298
CRIM,-0.1084,0.033,-3.307,0.001,-0.173,-0.044
ZN,0.0458,0.014,3.390,0.001,0.019,0.072
CHAS,2.7187,0.854,3.183,0.002,1.040,4.397
NOX,-17.3760,3.535,-4.915,0.000,-24.322,-10.430
RM,3.8016,0.406,9.356,0.000,3.003,4.600
DIS,-1.4927,0.186,-8.037,0.000,-1.858,-1.128
RAD,0.2996,0.063,4.726,0.000,0.175,0.424
TAX,-0.0118,0.003,-3.493,0.001,-0.018,-0.005

0,1,2,3
Omnibus:,178.43,Durbin-Watson:,1.078
Prob(Omnibus):,0.0,Jarque-Bera (JB):,787.785
Skew:,1.523,Prob(JB):,8.6e-172
Kurtosis:,8.3,Cond. No.,14700.0


Adj.R-squared는 0.735로 더 좋아졌습니다.. 
모형의 P-value는 0.05이하이고 모든 변수가 P-value 0.05 이하로 유의미한 결과를 보여줍니다. 변수들 중 질소산화물농도(NOX)가 1 증가할 때 주택가격의 값이 17이 감소하고 RM은 3.7, CHAS는 2.7 정도 집값 상승에 영향을 줍니다.


# 추가 고려 사항
- 회귀분석을 진행함에 있어서 확인하고 넘어가야할 부분들이 총 4가지가 있습니다.

- 정규성 : 잔차의 분포가 정규분포를 따르는지
- 등분산성 : 잔차의 분포가 등분산성, 즉 고르게 분포하는지
- 선형성 : 종속변수와 독립변수가 선형적 관계를 가지는지
