In [None]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.stats as sms
import scipy.stats as stats
from scipy.stats.mstats import zscore
from sklearn.metrics import mean_squared_error
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.linear_model import LinearRegression

In [None]:
iris = pd.read_excel("/SBA/iris.xlsx",
                     sheet_name = 0,
                     header     = 0)

#### 2. 다중선형 회귀분석(Multiple Linear Regression Analysis)

In [14]:
# Y : 양적 자료 : 1개
# X : 양적 자료 : 2개 이상

In [15]:
# 예제 데이터 : iris
# Y : petal_length
# X : sepal_length, sepal_width, petal_width

In [16]:
Y = iris.petal_length
X = iris.loc[:, ["sepal_length", "sepal_width", "petal_width"]]

In [17]:
iris_model = sm.OLS(endog = Y,
                    exog  = X).fit()
iris_model.summary()

0,1,2,3
Dep. Variable:,petal_length,R-squared (uncentered):,0.994
Model:,OLS,Adj. R-squared (uncentered):,0.994
Method:,Least Squares,F-statistic:,8426.0
Date:,"Thu, 27 Aug 2020",Prob (F-statistic):,3.14e-164
Time:,11:16:51,Log-Likelihood:,-39.808
No. Observations:,150,AIC:,85.62
Df Residuals:,147,BIC:,94.65
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
sepal_length,0.6942,0.043,16.205,0.000,0.610,0.779
sepal_width,-0.6729,0.061,-10.969,0.000,-0.794,-0.552
petal_width,1.4680,0.063,23.247,0.000,1.343,1.593

0,1,2,3
Omnibus:,2.614,Durbin-Watson:,1.784
Prob(Omnibus):,0.271,Jarque-Bera (JB):,2.448
Skew:,0.105,Prob(JB):,0.294
Kurtosis:,3.59,Cond. No.,24.0


In [18]:
# 회귀분석 결과의 해석

In [19]:
# 1단계 : 회귀모형의 타당성 검정
# 귀무가설 : 회귀모형은 타당하지 않다.
# 대립가설 : 회귀모형은 타당하다.

# F-statistic: 8426.
# Prob (F-statistic): 3.14e-164

# 결론
# 유의확률이 0.000이므로 유의수준 0.05에서 
# 회귀모형은 통계적으로 유의하게 타당한 것으로 나타났다.

In [20]:
# 2단계 : X들의 유의성 검정
# 귀무가설 : X는 Y에게 영향을 주지 않는다(beta1 == 0 or beta2 == 0)
# 대립가설 : X는 Y에게 영향을 준다(beta1 != 0 or beta2 != 0)

#               coef    std err    t    P>|t|     [0.025   0.975]
# sepal_length 0.6942   0.043   16.205   0.000   0.610   0.779
# sepal_width -0.6729   0.061   -10.969  0.000   -0.794  -0.552
# petal_width  1.4680   0.063   23.247   0.000   1.343   1.593

# sepal_length : 유의확률 = 0.000 : 유의한 영향을 준다.
# sepal_width  : 유의확률 = 0.000 : 유의한 영향을 준다.
# petal_width  : 유의확률 = 0.000 : 유의한 영향을 준다.

In [21]:
# 3단계 : X(들)는 Y에게 어떠한 영향을 주는가?

#               coef
# sepal_length  0.6942
# sepal_width  -0.6729
# petal_width   1.4680

# 반올림
# sepal_length : 0.694
# sepal_width  : -0.673
# petal_width  : 1.468

# 꽃받침의 길이는 꽃받침의 너비와 꽃잎의 너비가 고정되어 있을 때에
# 꽃받침의 길이가 1cm 증가하면 꽃잎의 길이를 약 0.694cm 정도 증가시키고,

# 꽃받침의 너비는 꽃받침의 길이와 꽃잎의 너비가 고정되어 있을 때에
# 꽃받침의 너비가 1cm 증가하면 꽃잎의 길이를 약 0.673cm 정도 감소시키고,

# 꽃잎의 너비는 꽃받침의 길이와 꽃받침의 너비가 고정되어 있을 때에
# 꽃잎의 너비가 1cm 증가하면 꽃잎의 길이를 약 1.468cm 정도 증가시키는 것으로
# 나타났다.

In [23]:
# 4단계 : 회귀모형의 설명력, X들의 설명력

# R-squared (uncentered)     : 0.994
# Adj. R-squared (uncentered): 0.994

# 결정계수는 X가 증가하면 결정계수는 무조건 증가됨
# Adj. R-squared : Adjusted R-Square : 수정된 결정계수
# 수정된 결정계수는 Y에게 유의한 X가 모형에 들어올 때는 결정계수가 증가하고
# Y에게 유의하지 않은 X가 모형에 들어올 때는 결정계수가 증가하지 않도록 조치를 한 것임

# Adjusted R-Square = 0.994
# 회귀모형이 Y의 변동을 약 99.4% 정도 설명하고 있다.
# 꽃받침의 길이, 꽃받침의 너비, 꽃잎의 너비가 
# 꽃잎의 길이의 변동을 약 99.4% 정도 설명하고 있다.

In [24]:
# 5단계 : 예측
iris_predict = iris_model.predict(exog = X)
iris_predict

0      1.479089
1      1.676675
2      1.403261
3      1.401125
4      1.342382
         ...   
145    6.009103
146    5.480638
147    5.429854
148    5.392854
149    4.719722
Length: 150, dtype: float64

In [25]:
# MSE : Mean Square Error
# MSE는 작을수록 모형이 좋다고 판단함.
mean_squared_error(y_true = iris.petal_length,
                   y_pred = iris_predict)

0.0995488606907594

In [29]:
X.columns

Index(['sepal_length', 'sepal_width', 'petal_width'], dtype='object')

In [32]:
# 모형 Check
# (1) 다중공선성(Multicolinearity)
# X들 간에 상관관계가 있는지를 알려줌
# VIF(Variance Inflation Factor)

# statsmodels.stats.outliers_influence.variance_inflation_factor
from statsmodels.stats.outliers_influence import variance_inflation_factor
VIF             = pd.DataFrame()
VIF["VIF"]      = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
VIF["Variable"] = X.columns

In [33]:
VIF

Unnamed: 0,VIF,Variable
0,94.373039,sepal_length
1,52.984682,sepal_width
2,11.868708,petal_width


In [34]:
Y = iris.petal_length
X = iris.loc[:, ["sepal_width", "petal_width"]]

In [36]:
iris_model1 = sm.OLS(endog = Y,
                     exog  = X).fit()
iris_model1.summary()

0,1,2,3
Dep. Variable:,petal_length,R-squared (uncentered):,0.984
Model:,OLS,Adj. R-squared (uncentered):,0.984
Method:,Least Squares,F-statistic:,4519.0
Date:,"Thu, 27 Aug 2020",Prob (F-statistic):,2.12e-133
Time:,13:11:54,Log-Likelihood:,-116.67
No. Observations:,150,AIC:,237.3
Df Residuals:,148,BIC:,243.4
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
sepal_width,0.2937,0.024,12.319,0.000,0.247,0.341
petal_width,2.3580,0.052,45.472,0.000,2.256,2.460

0,1,2,3
Omnibus:,0.706,Durbin-Watson:,1.365
Prob(Omnibus):,0.703,Jarque-Bera (JB):,0.374
Skew:,-0.076,Prob(JB):,0.829
Kurtosis:,3.192,Cond. No.,4.25


In [37]:
VIF             = pd.DataFrame()
VIF["VIF"]      = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
VIF["Variable"] = X.columns
VIF

Unnamed: 0,VIF,Variable
0,2.891774,sepal_width
1,2.891774,petal_width


In [40]:
# (2) 표준화된 회귀계수
# scipy.stats.mstats.zscore
iris_model_zscore = sm.OLS(endog = zscore(Y),
                           exog  = zscore(X)).fit()
iris_model_zscore.summary()

0,1,2,3
Dep. Variable:,y,R-squared (uncentered):,0.934
Model:,OLS,Adj. R-squared (uncentered):,0.933
Method:,Least Squares,F-statistic:,1043.0
Date:,"Thu, 27 Aug 2020",Prob (F-statistic):,5.77e-88
Time:,13:18:46,Log-Likelihood:,-9.2507
No. Observations:,150,AIC:,22.5
Df Residuals:,148,BIC:,28.52
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,-0.0877,0.023,-3.856,0.000,-0.133,-0.043
x2,0.9308,0.023,40.942,0.000,0.886,0.976

0,1,2,3
Omnibus:,6.235,Durbin-Watson:,1.6
Prob(Omnibus):,0.044,Jarque-Bera (JB):,6.208
Skew:,0.366,Prob(JB):,0.0449
Kurtosis:,3.677,Cond. No.,1.47


In [41]:
# 꽃잎의 너비(petal_width : 0.931)가 꽃받침의 너비(sepal_width : -0.088)보다
# 꽃잎의 길이(petal_length)에 더 큰 영향을 미치는 것으로 나타났다.
# 근거 : 표준화된 회귀계수의 절대값

In [44]:
# 6단계 : 모형진단

# (1) Durbin-Watson: 1.600
# 오차의 독립성 가정 확인 
# 1.5 <= DW <= 2.5 : 독립성 가정을 만족
# 결론 : 독립성 가정을 만족함

# (2) Jarque-Bera (JB): 6.208
# Prob(JB): 0.0449
# 정규성 검정을 확인
# 결론 : 정규성 가정이 깨짐

ValueError: Samples must be one-dimensional.

###### 더미변수(Dummy Variable)

In [45]:
# 질적 자료를 X에 넣고 싶을 때에
# 질적 자료를 그대로 넣을 수는 없고
# 더미변수를 만들어서 더미변수를 회귀모형의 X에 넣는다.

In [48]:
# dummy_data = pandas.get_dummies(질적자료)
# data = data.join(dummy_data)
species_dummy = pd.get_dummies(iris.species,
                               prefix = "d")
species_dummy

iris = iris.join(species_dummy)
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,d_setosa,d_versicolor,d_virginica
0,5.1,3.5,1.4,0.2,setosa,1,0,0
1,4.9,3.0,1.4,0.2,setosa,1,0,0
2,4.7,3.2,1.3,0.2,setosa,1,0,0
3,4.6,3.1,1.5,0.2,setosa,1,0,0
4,5.0,3.6,1.4,0.2,setosa,1,0,0


In [49]:
Y = iris.petal_length
X = iris.loc[:, ["sepal_width", "petal_width", "d_setosa", "d_versicolor", "d_virginica"]]

In [50]:
iris_model_dummy = sm.OLS(endog = Y,
                          exog  = X).fit()
iris_model_dummy.summary()

0,1,2,3
Dep. Variable:,petal_length,R-squared:,0.957
Model:,OLS,Adj. R-squared:,0.956
Method:,Least Squares,F-statistic:,803.4
Date:,"Thu, 27 Aug 2020",Prob (F-statistic):,7.97e-98
Time:,13:38:08,Log-Likelihood:,-61.9
No. Observations:,150,AIC:,133.8
Df Residuals:,145,BIC:,148.9
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
sepal_width,0.2440,0.102,2.385,0.018,0.042,0.446
petal_width,0.8281,0.170,4.876,0.000,0.492,1.164
d_setosa,0.4218,0.337,1.251,0.213,-0.245,1.088
d_versicolor,2.4860,0.272,9.155,0.000,1.949,3.023
d_virginica,3.1485,0.339,9.276,0.000,2.478,3.819

0,1,2,3
Omnibus:,12.22,Durbin-Watson:,1.711
Prob(Omnibus):,0.002,Jarque-Bera (JB):,15.433
Skew:,0.514,Prob(JB):,0.000445
Kurtosis:,4.188,Cond. No.,55.2


In [51]:
Y = iris.petal_length
X = iris.loc[:, ["sepal_width", "petal_width", "d_versicolor", "d_virginica"]]

In [52]:
iris_model_dummy2 = sm.OLS(endog = Y,
                           exog  = X).fit()
iris_model_dummy2.summary()

0,1,2,3
Dep. Variable:,petal_length,R-squared (uncentered):,0.992
Model:,OLS,Adj. R-squared (uncentered):,0.992
Method:,Least Squares,F-statistic:,4616.0
Date:,"Thu, 27 Aug 2020",Prob (F-statistic):,1.49e-152
Time:,13:40:02,Log-Likelihood:,-62.705
No. Observations:,150,AIC:,133.4
Df Residuals:,146,BIC:,145.5
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
sepal_width,0.3696,0.020,18.936,0.000,0.331,0.408
petal_width,0.7505,0.158,4.738,0.000,0.437,1.064
d_versicolor,2.2409,0.188,11.895,0.000,1.869,2.613
d_virginica,2.9322,0.293,10.020,0.000,2.354,3.510

0,1,2,3
Omnibus:,12.612,Durbin-Watson:,1.697
Prob(Omnibus):,0.002,Jarque-Bera (JB):,16.035
Skew:,0.526,Prob(JB):,0.00033
Kurtosis:,4.207,Cond. No.,41.4


In [54]:
# VIF
VIF             = pd.DataFrame()
VIF["VIF"]      = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
VIF["Variable"] = X.columns
VIF

Unnamed: 0,VIF,Variable
0,3.927182,sepal_width
1,54.646215,petal_width
2,12.78508,d_versicolor
3,30.847802,d_virginica


In [57]:
Y = iris.petal_length
X = iris.loc[:, ["sepal_width", "petal_width", "d_versicolor"]]

iris_model_dummy3 = sm.OLS(endog = Y,
                           exog  = X).fit()
print(iris_model_dummy3.summary())

VIF             = pd.DataFrame()
VIF["VIF"]      = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
VIF["Variable"] = X.columns
VIF

                                 OLS Regression Results                                
Dep. Variable:           petal_length   R-squared (uncentered):                   0.987
Model:                            OLS   Adj. R-squared (uncentered):              0.986
Method:                 Least Squares   F-statistic:                              3651.
Date:                Thu, 27 Aug 2020   Prob (F-statistic):                   8.86e-138
Time:                        13:45:17   Log-Likelihood:                         -101.96
No. Observations:                 150   AIC:                                      209.9
Df Residuals:                     147   BIC:                                      218.9
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------

Unnamed: 0,VIF,Variable
0,2.972749,sepal_width
1,3.067393,petal_width
2,1.449498,d_versicolor


##### 변수선택 방법

In [59]:
# Forward Selection    : 전진 선택법
# Backward Elimination : 후진 소거법
# Stepwise             : 단계(단계 선택법)

# mlxtend 패키지
# from mlxtend.feature_selection import SequentialFeatureSelector as sfs
# from sklearn.linear_model import LinearRegression
result = sfs(LinearRegression(),
             k_features = 3,
             forward    = True,
             floating   = False,
             scoring    = "r2",
             cv         = 5).fit(X, Y)

result.k_feature_names_

('sepal_width', 'petal_width', 'd_versicolor')

In [60]:
# 문제2
# 예제 데이터 : diamonds
# Y : price
# X : carat, depth, table, x, y, z : 양적 자료
# X : cut, color, clarity          : 질적 자료

# 최종 모형에는 유의한 X들만 있도록 해 보세요.

In [61]:
diamonds = pd.read_excel(io         = "d:/SBA/diamonds.xlsx",
                         sheet_name = 0,
                         header     = 0)

In [62]:
# dummy variable
cut_dummy     = pd.get_dummies(diamonds.cut,     prefix = "d")
color_dummy   = pd.get_dummies(diamonds.color,   prefix = "d")
clarity_dummy = pd.get_dummies(diamonds.clarity, prefix = "d")

diamonds2 = diamonds.join(cut_dummy)
diamonds2 = diamonds2.join(color_dummy)
diamonds2 = diamonds2.join(clarity_dummy)
diamonds2.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,...,d_I,d_J,d_I1,d_IF,d_SI1,d_SI2,d_VS1,d_VS2,d_VVS1,d_VVS2
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,...,0,0,0,0,0,1,0,0,0,0
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,...,0,0,0,0,1,0,0,0,0,0
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,...,0,0,0,0,0,0,1,0,0,0
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63,...,1,0,0,0,0,0,0,1,0,0
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,...,0,1,0,0,0,1,0,0,0,0


In [63]:
diamonds2.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z', 'd_Fair', 'd_Good', 'd_Ideal', 'd_Premium', 'd_Very Good', 'd_D',
       'd_E', 'd_F', 'd_G', 'd_H', 'd_I', 'd_J', 'd_I1', 'd_IF', 'd_SI1',
       'd_SI2', 'd_VS1', 'd_VS2', 'd_VVS1', 'd_VVS2'],
      dtype='object')

In [64]:
Y = diamonds2.price
X = diamonds2[['carat', 'depth', 'table', 'x', 'y',
       'z', 'd_Fair', 'd_Good', 'd_Ideal', 'd_Premium', 'd_Very Good', 'd_D',
       'd_E', 'd_F', 'd_G', 'd_H', 'd_I', 'd_J', 'd_I1', 'd_IF', 'd_SI1',
       'd_SI2', 'd_VS1', 'd_VS2', 'd_VVS1', 'd_VVS2']]

In [66]:
diamonds_model = sm.OLS(endog = Y,
                        exog  = X).fit()
diamonds_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.92
Model:,OLS,Adj. R-squared:,0.92
Method:,Least Squares,F-statistic:,26880.0
Date:,"Thu, 27 Aug 2020",Prob (F-statistic):,0.0
Time:,14:33:18,Log-Likelihood:,-455730.0
No. Observations:,53940,AIC:,911500.0
Df Residuals:,53916,BIC:,911700.0
Df Model:,23,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
carat,1.126e+04,48.628,231.494,0.000,1.12e+04,1.14e+04
depth,-63.8061,4.535,-14.071,0.000,-72.694,-54.918
table,-26.4741,2.912,-9.092,0.000,-32.181,-20.767
x,-1008.2611,32.898,-30.648,0.000,-1072.741,-943.781
y,9.6089,19.333,0.497,0.619,-28.284,47.502
z,-50.1189,33.486,-1.497,0.134,-115.752,15.515
d_Fair,1879.3054,178.936,10.503,0.000,1528.589,2230.022
d_Good,2459.0569,171.813,14.312,0.000,2122.302,2795.812
d_Ideal,2712.2173,164.041,16.534,0.000,2390.696,3033.739

0,1,2,3
Omnibus:,14433.356,Durbin-Watson:,1.183
Prob(Omnibus):,0.0,Jarque-Bera (JB):,565680.446
Skew:,0.577,Prob(JB):,0.0
Kurtosis:,18.823,Cond. No.,1.16e+16


In [67]:
# y를 제거
Y = diamonds2.price
X = diamonds2[['carat', 'depth', 'table', 'x', 'z', 'd_Fair', 'd_Good', 'd_Ideal', 'd_Premium', 'd_Very Good', 'd_D',
       'd_E', 'd_F', 'd_G', 'd_H', 'd_I', 'd_J', 'd_I1', 'd_IF', 'd_SI1',
       'd_SI2', 'd_VS1', 'd_VS2', 'd_VVS1', 'd_VVS2']]

diamonds_model = sm.OLS(endog = Y,
                        exog  = X).fit()
diamonds_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.92
Model:,OLS,Adj. R-squared:,0.92
Method:,Least Squares,F-statistic:,28100.0
Date:,"Thu, 27 Aug 2020",Prob (F-statistic):,0.0
Time:,14:34:50,Log-Likelihood:,-455730.0
No. Observations:,53940,AIC:,911500.0
Df Residuals:,53917,BIC:,911700.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
carat,1.126e+04,48.602,231.630,0.000,1.12e+04,1.14e+04
depth,-64.0026,4.517,-14.168,0.000,-72.856,-55.149
table,-26.5012,2.911,-9.103,0.000,-32.207,-20.795
x,-1000.3541,28.795,-34.740,0.000,-1056.793,-943.915
z,-47.9253,33.194,-1.444,0.149,-112.986,17.135
d_Fair,1885.3651,178.519,10.561,0.000,1535.466,2235.264
d_Good,2465.6901,171.293,14.395,0.000,2129.955,2801.425
d_Ideal,2718.7170,163.518,16.626,0.000,2398.221,3039.213
d_Premium,2647.6525,167.585,15.799,0.000,2319.185,2976.120

0,1,2,3
Omnibus:,14432.531,Durbin-Watson:,1.183
Prob(Omnibus):,0.0,Jarque-Bera (JB):,565907.254
Skew:,0.577,Prob(JB):,0.0
Kurtosis:,18.826,Cond. No.,5.44e+17


In [68]:
# z를 제거
Y = diamonds2.price
X = diamonds2[['carat', 'depth', 'table', 'x', 'd_Fair', 'd_Good', 'd_Ideal', 'd_Premium', 'd_Very Good', 'd_D',
       'd_E', 'd_F', 'd_G', 'd_H', 'd_I', 'd_J', 'd_I1', 'd_IF', 'd_SI1',
       'd_SI2', 'd_VS1', 'd_VS2', 'd_VVS1', 'd_VVS2']]

diamonds_model = sm.OLS(endog = Y,
                        exog  = X).fit()
diamonds_model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.92
Model:,OLS,Adj. R-squared:,0.92
Method:,Least Squares,F-statistic:,29440.0
Date:,"Thu, 27 Aug 2020",Prob (F-statistic):,0.0
Time:,14:35:17,Log-Likelihood:,-455730.0
No. Observations:,53940,AIC:,911500.0
Df Residuals:,53918,BIC:,911700.0
Df Model:,21,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
carat,1.126e+04,48.600,231.626,0.000,1.12e+04,1.14e+04
depth,-66.7693,4.091,-16.322,0.000,-74.787,-58.752
table,-26.4573,2.911,-9.089,0.000,-32.163,-20.752
x,-1029.4779,20.549,-50.098,0.000,-1069.755,-989.201
d_Fair,1956.5290,171.582,11.403,0.000,1620.228,2292.830
d_Good,2536.7695,164.067,15.462,0.000,2215.196,2858.343
d_Ideal,2789.7893,155.934,17.891,0.000,2484.158,3095.420
d_Premium,2719.2875,160.073,16.988,0.000,2405.543,3033.032
d_Very Good,2683.3491,160.321,16.737,0.000,2369.119,2997.579

0,1,2,3
Omnibus:,14433.691,Durbin-Watson:,1.183
Prob(Omnibus):,0.0,Jarque-Bera (JB):,566407.977
Skew:,0.577,Prob(JB):,0.0
Kurtosis:,18.833,Cond. No.,5.98e+17


In [69]:
# VIF
VIF             = pd.DataFrame()
VIF["VIF"]      = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
VIF["Variable"] = X.columns
VIF

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,VIF,Variable
0,22.413762,carat
1,1.450534,depth
2,1.786944,table
3,22.442478,x
4,inf,d_Fair
5,inf,d_Good
6,inf,d_Ideal
7,inf,d_Premium
8,inf,d_Very Good
9,inf,d_D


In [73]:
# x 제거
Y = diamonds2.price
X = diamonds2[['carat', 'depth', 'table', 'd_Fair', 'd_Good', 'd_Ideal', 'd_Premium', 'd_Very Good', 'd_D',
       'd_E', 'd_F', 'd_G', 'd_H', 'd_I', 'd_J', 'd_I1', 'd_IF', 'd_SI1',
       'd_SI2', 'd_VS1', 'd_VS2', 'd_VVS1', 'd_VVS2']]

diamonds_model = sm.OLS(endog = Y,
                        exog  = X).fit()
print(diamonds_model.summary())

VIF             = pd.DataFrame()
VIF["VIF"]      = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
VIF["Variable"] = X.columns
VIF

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.916
Model:                            OLS   Adj. R-squared:                  0.916
Method:                 Least Squares   F-statistic:                 2.942e+04
Date:                Thu, 27 Aug 2020   Prob (F-statistic):               0.00
Time:                        14:39:59   Log-Likelihood:            -4.5696e+05
No. Observations:               53940   AIC:                         9.140e+05
Df Residuals:                   53919   BIC:                         9.141e+05
Df Model:                          20                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
carat        8895.1940     12.079    736.390      

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,VIF,Variable
0,1.323098,carat
1,1.378257,depth
2,1.786714,table
3,inf,d_Fair
4,inf,d_Good
5,inf,d_Ideal
6,inf,d_Premium
7,inf,d_Very Good
8,inf,d_D
9,inf,d_E


In [74]:
# d_Ideal, d_Premium, d_Very Good, d_G, d_VS2 제거
Y = diamonds2.price
X = diamonds2[['carat', 'depth', 'table', 'd_Fair', 'd_Good', 'd_D',
       'd_E', 'd_F', 'd_H', 'd_I', 'd_J', 'd_I1', 'd_IF', 'd_SI1',
       'd_SI2', 'd_VS1', 'd_VVS1', 'd_VVS2']]

diamonds_model = sm.OLS(endog = Y,
                        exog  = X).fit()
print(diamonds_model.summary())

VIF             = pd.DataFrame()
VIF["VIF"]      = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
VIF["Variable"] = X.columns
VIF

                                 OLS Regression Results                                
Dep. Variable:                  price   R-squared (uncentered):                   0.957
Model:                            OLS   Adj. R-squared (uncentered):              0.957
Method:                 Least Squares   F-statistic:                          6.730e+04
Date:                Thu, 27 Aug 2020   Prob (F-statistic):                        0.00
Time:                        14:42:46   Log-Likelihood:                     -4.5698e+05
No. Observations:               53940   AIC:                                  9.140e+05
Df Residuals:                   53922   BIC:                                  9.142e+05
Df Model:                          18                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

Unnamed: 0,VIF,Variable
0,5.052256,carat
1,396.496292,depth
2,402.825601,table
3,1.081024,d_Fair
4,1.121925,d_Good
5,1.660286,d_D
6,1.919034,d_E
7,1.858188,d_F
8,1.763519,d_H
9,1.519069,d_I


In [75]:
# table 제거
Y = diamonds2.price
X = diamonds2[['carat', 'depth', 'd_Fair', 'd_Good', 'd_D',
       'd_E', 'd_F', 'd_H', 'd_I', 'd_J', 'd_I1', 'd_IF', 'd_SI1',
       'd_SI2', 'd_VS1', 'd_VVS1', 'd_VVS2']]

diamonds_model = sm.OLS(endog = Y,
                        exog  = X).fit()
print(diamonds_model.summary())

VIF             = pd.DataFrame()
VIF["VIF"]      = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
VIF["Variable"] = X.columns
VIF

                                 OLS Regression Results                                
Dep. Variable:                  price   R-squared (uncentered):                   0.957
Model:                            OLS   Adj. R-squared (uncentered):              0.957
Method:                 Least Squares   F-statistic:                          7.085e+04
Date:                Thu, 27 Aug 2020   Prob (F-statistic):                        0.00
Time:                        14:43:37   Log-Likelihood:                     -4.5713e+05
No. Observations:               53940   AIC:                                  9.143e+05
Df Residuals:                   53923   BIC:                                  9.144e+05
Df Model:                          17                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

Unnamed: 0,VIF,Variable
0,4.95794,carat
1,11.963318,depth
2,1.079549,d_Fair
3,1.116897,d_Good
4,1.657294,d_D
5,1.911634,d_E
6,1.854175,d_F
7,1.76266,d_H
8,1.518812,d_I
9,1.294428,d_J


In [76]:
# depth 제거
Y = diamonds2.price
X = diamonds2[['carat', 'd_Fair', 'd_Good', 'd_D',
       'd_E', 'd_F', 'd_H', 'd_I', 'd_J', 'd_I1', 'd_IF', 'd_SI1',
       'd_SI2', 'd_VS1', 'd_VVS1', 'd_VVS2']]

diamonds_model = sm.OLS(endog = Y,
                        exog  = X).fit()
print(diamonds_model.summary())

VIF             = pd.DataFrame()
VIF["VIF"]      = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
VIF["Variable"] = X.columns
VIF

                                 OLS Regression Results                                
Dep. Variable:                  price   R-squared (uncentered):                   0.938
Model:                            OLS   Adj. R-squared (uncentered):              0.938
Method:                 Least Squares   F-statistic:                          5.070e+04
Date:                Thu, 27 Aug 2020   Prob (F-statistic):                        0.00
Time:                        14:44:20   Log-Likelihood:                     -4.6724e+05
No. Observations:               53940   AIC:                                  9.345e+05
Df Residuals:                   53924   BIC:                                  9.346e+05
Df Model:                          16                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

Unnamed: 0,VIF,Variable
0,3.53255,carat
1,1.077585,d_Fair
2,1.109305,d_Good
3,1.294208,d_D
4,1.415621,d_E
5,1.457966,d_F
6,1.512059,d_H
7,1.388195,d_I
8,1.237164,d_J
9,1.107607,d_I1


In [77]:
# MSE
mean_squared_error(y_true = diamonds.price,
                   y_pred = diamonds_model.predict(X))

1956104.63077452