# 지도학습 (5) - 회귀분석 결과보고

논문등의 보고서에서 결과 보고를 위해 회귀계수 표를 구성하기 위한 값을 구하는 과정

## #01. 작업 준비

### 패키지

In [1]:
import sys

sys.path.append("../../")
import helper

import numpy as np
from scipy import stats
from pandas import read_excel, DataFrame
from statsmodels.stats.outliers_influence import variance_inflation_factor

### 데이터

In [2]:
origin = read_excel("https://data.hossam.kr/F02/fish2.xlsx")
origin.head()

Unnamed: 0,길이,높이,두께,무게
0,8.4,2.11,1.41,5.9
1,13.7,3.53,2.0,32.0
2,15.0,3.82,2.43,40.0
3,16.2,4.59,2.63,51.5
4,17.4,4.59,2.94,70.0


## #02. 머신러닝에 의한 회귀분석 수행

In [3]:
xnames = ['길이','높이','두께']
yname = '무게'

In [4]:
# train/test 데이터로 분할하지 않음 - 이전 코드의 결과 값과 비교하기 위함
result = helper.ml_ols(origin, xnames, yname, degree=1, test_size=0)

print("계수 :", result.coef)
print("절편 :", result.intercept)

계수 : [0.07547791 0.55646807 0.34344464]
절편 : -2.0459587650727864e-16


## #03. 결과보고에 필요한 값 구하기

`회귀식 = 길이 * 계수1 + 높이 * 계수2 + 두께 * 계수3 + 절편 * 1(상수항)`

### 1) 절편과 계수를 하나의 배열로 결합

In [5]:
params = np.append(result.intercept, result.coef)
params

array([-2.04595877e-16,  7.54779081e-02,  5.56468069e-01,  3.43444640e-01])

### 2) 상수항 추가

In [6]:
# 독립변수 추출
x = origin.filter(xnames)

# 종속변수 추출
y = origin[yname]   #Series 객체 1차원

# # 종속변수의 DF형태 추출 2차원
# y = origin.filter([yname])

# 상수항 추가
designX = x.copy()
designX.insert(0, '상수', 1)
designX.head()

Unnamed: 0,상수,길이,높이,두께
0,1,8.4,2.11,1.41
1,1,13.7,3.53,2.0
2,1,15.0,3.82,2.43
3,1,16.2,4.59,2.63
4,1,17.4,4.59,2.94


### 3) `행렬곱` 구하기

In [8]:
dot = np.dot(designX.T,designX) #dot은 행렬의 곱을 표현 할 때 사용
dot

array([[   56.    ,  1562.    ,   440.28  ,   265.75  ],
       [ 1562.    , 48045.12  , 13688.339 ,  8270.876 ],
       [  440.28  , 13688.339 ,  3917.2114,  2365.5425],
       [  265.75  ,  8270.876 ,  2365.5425,  1434.4117]])

### 4) 행렬곱에 대한 `역행렬`

In [9]:
inv = np.linalg.inv(dot)    #역행렬
inv

array([[ 0.25997581, -0.02937614,  0.05587393,  0.02907514],
       [-0.02937614,  0.00811062, -0.0207489 , -0.00710593],
       [ 0.05587393, -0.0207489 ,  0.11758923, -0.08463348],
       [ 0.02907514, -0.00710593, -0.08463348,  0.17585582]])

### 5) `역행렬의 대각선 값` 반환

In [10]:
dia = inv.diagonal()
dia

array([0.25997581, 0.00811062, 0.11758923, 0.17585582])

### 6) 평균제곱오차(MSE) 구하기

상수항이 적용된 형태이므로 API를 통한 값이 아닌 직접 구한 값이 필요하다

In [11]:
predictions = result.fit.predict(x) #1차원
MSE = (sum((y-predictions)**2)) / (len(designX)-len(designX.iloc[0]))
MSE

276505.26113936875

### 7) 표준오차

In [12]:
se_b = np.sqrt(MSE * dia)
se_b

array([268.1131833 ,  47.35640562, 180.31649795, 220.51090366])

### 8) t-value 구하기

In [13]:
ts_b = params/se_b
ts_b

array([-7.63095175e-19,  1.59382679e-03,  3.08606298e-03,  1.55749505e-03])

### 9) p-value 구하기

`자유도`를 위해 전체 행에서 1을 빼고 계산한다

In [14]:
# 누적분포함수 cdf
p_values = [2*(1-stats.t.cdf(np.abs(i),(len(designX)-len(designX.iloc[0])))) for i in ts_b]
p_values

[1.0, 0.9987344096000763, 0.9975494909065263, 0.9987632590778108]

### 10) VIF 구하기

In [15]:
vif = []

for i, v in enumerate(xnames):
    j = list(origin.columns).index(v)
    vif.append(variance_inflation_factor(origin, j))
    
vif

[338.76030542544714, 500.757055790855, 263.01505845905143]

### 11) 결과표 구성하기

In [16]:
resultDf = DataFrame({
    "종속변수": [yname] * len(xnames),
    "독립변수": xnames,
    "B": result.coef,
    "표준오차": se_b[1:],
    "β": 0,
    "t": ts_b[1:],
    "유의확률": p_values[1:],
    "VIF": vif,
})

resultDf    #DF 형식
# helper.prettyPrint(resultDf)

Unnamed: 0,종속변수,독립변수,B,표준오차,β,t,유의확률,VIF
0,무게,길이,0.075478,47.356406,0,0.001594,0.998734,338.760305
1,무게,높이,0.556468,180.316498,0,0.003086,0.997549,500.757056
2,무게,두께,0.343445,220.510904,0,0.001557,0.998763,263.015058


### 12) statsmodels 패키지의 결과와 비교하기

In [17]:
result = helper.myOls(origin, x=['길이','높이','두께'], y='무게')
result.table
# helper.prettyPrint(result.table)

Unnamed: 0_level_0,Unnamed: 1_level_0,B,표준오차,β,t,유의확률,VIF
종속변수,독립변수,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
무게,길이,2.9083,7.734,0,0.376*,0.708,338.760305
무게,높이,67.2047,29.447,0,2.282*,0.027,500.757056
무게,두께,67.2603,36.011,0,1.868*,0.067,263.015058


### 13) 모듈에 추가된 기능 확인하기

결과보고에 필요한 값 구하기 1~11을 모듈로

In [18]:
ols_result = helper.ml_ols(origin, xnames="길이,높이,두께", yname="무게", degree=1, test_size=0)
ols_result.table

Unnamed: 0,종속변수,독립변수,B,표준오차,β,t,유의확률,VIF
0,무게,길이,0.075478,0.200711,0,0.376052,0.708408,36.405744
1,무게,높이,0.556468,0.243828,0,2.282213,0.026598,58.948458
2,무게,두께,0.343445,0.183881,0,1.867759,0.067436,32.517638


## 참고자료(Reference)

1) https://m.blog.naver.com/pmw9440/222482746927

2) https://stackoverflow.com/questions/27928275/find-p-value-significance-in-scikit-learn-linearregression

3) https://stats.stackexchange.com/questions/85943/how-to-derive-the-standard-error-of-linear-regression-coefficient

4) https://calcworkshop.com/linear-regression/t-test/

5) https://www.google.com/url?sa=i&url=http%3A%2F%2Fwww.few.vu.nl%2F~wvanwie%2FCourses%2FHighdimensionalDataAnalysis%2FWNvanWieringen_HDDA_Lecture234_RidgeRegression_20182019.pdf&psig=AOvVaw31slQKGfkCNf2PAa3VEIGC&ust=1630070470635000&source=images&cd=vfe&ved=0CAsQjRxqFwoTCIjJworkzvICFQAAAAAdAAAAABAR