# 지도학습(5) - 회귀분석 결과보고

## #01. 작업준비

### 패키지 가져오기 


In [1]:
import pandas as pd 
from sklearn.linear_model import LinearRegression
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
import helper
from scipy import stats
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import mean_squared_error, r2_score, mean_squared_log_error, mean_absolute_error

import numpy as np

## #01. 데이터 가져오기

In [2]:
origin= pd.read_excel('https://data.hossam.kr/F02/fish2.xlsx')

origin.head()

Unnamed: 0,길이,높이,두께,무게
0,8.4,2.11,1.41,5.9
1,13.7,3.53,2.0,32.0
2,15.0,3.82,2.43,40.0
3,16.2,4.59,2.63,51.5
4,17.4,4.59,2.94,70.0


## #02. 머신러닝에 의한 회귀분석 수행

In [3]:
xnames = ['길이','높이','두께']
yname = '무게'


In [4]:
result = helper.ml_ols(origin,xnames,yname,degree=1,test_size=0)

print("계수: ", result.coef)
print("절편: ", result.intercept)

계수:  [ 2.9082713  67.20469902 67.26029602]
절편:  -546.4397914448656


## #03. 결과보고에 필요한 값 구하기

### 1) 절편과 계수를 하나의 배열로 결합

In [5]:
params = np.append(result.intercept,result.coef)
params

array([-546.43979144,    2.9082713 ,   67.20469902,   67.26029602])

### 2) 상수항 추가하기

In [6]:
# 독립변수 
x=origin.filter(xnames)

# 종속변수
y= origin[yname]

# 상수항 추가

designx=x.copy()
designx.insert(0,'상수',1)
designx.head()
x

Unnamed: 0,길이,높이,두께
0,8.4,2.11,1.41
1,13.7,3.53,2.0
2,15.0,3.82,2.43
3,16.2,4.59,2.63
4,17.4,4.59,2.94
5,18.0,5.22,3.32
6,18.7,5.2,3.12
7,19.0,5.64,3.05
8,19.6,5.14,3.04
9,20.0,5.08,2.77


In [7]:
y.head()

0     5.9
1    32.0
2    40.0
3    51.5
4    70.0
Name: 무게, dtype: float64

### 3) 행렬곱 구하기

In [8]:
dot =np.dot(designx.T,designx)
dot

array([[   56.    ,  1562.    ,   440.28  ,   265.75  ],
       [ 1562.    , 48045.12  , 13688.339 ,  8270.876 ],
       [  440.28  , 13688.339 ,  3917.2114,  2365.5425],
       [  265.75  ,  8270.876 ,  2365.5425,  1434.4117]])

### 4) 행렬곱의 역행렬

In [9]:
inv= np.linalg.inv(dot)
inv

array([[ 0.25997581, -0.02937614,  0.05587393,  0.02907514],
       [-0.02937614,  0.00811062, -0.0207489 , -0.00710593],
       [ 0.05587393, -0.0207489 ,  0.11758923, -0.08463348],
       [ 0.02907514, -0.00710593, -0.08463348,  0.17585582]])

### 5) 역행렬의 대각선 반환

In [10]:
dia = inv.diagonal()
dia

array([0.25997581, 0.00811062, 0.11758923, 0.17585582])

### 6) 평균 제곱 오차 구하기
상수항이 적용된 형태이므로 직접구한 값 필요

In [11]:
y.head()

0     5.9
1    32.0
2    40.0
3    51.5
4    70.0
Name: 무게, dtype: float64

In [12]:
predictions = result.fit.predict(x)


MSE = (sum((y-predictions)**2)) / (len(designx)-len(designx.iloc[0]))
MSE

7374.273394715794

### 7) 표준오차


In [13]:
se_b = np.sqrt(MSE*dia)
se_b

array([43.78507388,  7.73368804, 29.44715768, 36.0112326 ])

### 8) t-value 구하기


In [14]:
ts_b = params/se_b
ts_b

array([-12.48004726,   0.37605232,   2.28221344,   1.86775878])

### 9) p-value 구하기

In [15]:
p_values = [2*(1-stats.t.cdf(np.abs(i),(len(designx)-len(designx.iloc[0])))) for i in ts_b]
p_values

[0.0, 0.7084079152880327, 0.026597717787692154, 0.06743585337091651]

### VIF 값 

In [16]:
vif = []

for i,v in enumerate(xnames):
    j=list(origin.columns).index(v)
    vif.append(variance_inflation_factor(origin, j))

vif

[338.76030542544714, 500.757055790855, 263.01505845905143]

### 11) 결과표 구성하기

In [17]:
result.coef[0]

2.9082713001342935

In [18]:
resultDf = pd.DataFrame({
    "종속변수": [yname] * len(xnames),
    "독립변수": xnames,
    "B": result.coef[0],
    "표준오차": se_b[1:],
    "β": 0,
    "t": ts_b[1:],
    "유의확률": p_values[1:],
    "VIF": vif,
})

resultDf

Unnamed: 0,종속변수,독립변수,B,표준오차,β,t,유의확률,VIF
0,무게,길이,2.908271,7.733688,0,0.376052,0.708408,338.760305
1,무게,높이,2.908271,29.447158,0,2.282213,0.026598,500.757056
2,무게,두께,2.908271,36.011233,0,1.867759,0.067436,263.015058


### 12) 모듈에 추가된 기능 확인하기


In [19]:
x=['길이','높이','두께']
y=['무게']

In [20]:
ols_result= helper.ml_ols(data=origin,xnames=x,yname=y,degree=1,test_size=0)


In [22]:
ols_result.table

Unnamed: 0,종속변수,독립변수,B,표준오차,β,t,유의확률,VIF
0,[무게],길이,2.908271,7.733688,0,0.376052,0.708408,338.760305
1,[무게],높이,67.204699,29.447158,0,2.282213,0.026598,500.757056
2,[무게],두께,67.260296,36.011233,0,1.867759,0.067436,263.015058
