# VIF 산출

- 변수 간 다중공선성을 진단하고 해결하기 위한 지표로 VIF(Variance Inflation Factor, 분산팽창계수)를 산출

- 모든 변수의 VIF가 20 이하가 될 때까지, VIF가 가장 높은 변수를 하나씩 제거하며 VIF를 반복 산출

- GDS_GROSS_MASS_35 / COR_FINANCIAL_VALUE / IMP_TRANSPORT_MODE_AT_BODR_26 / TRD_COUNTRY_2 4개 칼럼 삭제

In [1]:
import numpy as np
import pandas as pd
import warnings
import sys

# Install a global "ignore" filter so warnings raised after this point are
# suppressed (presumably to keep the notebook output readable).
warnings.filterwarnings('ignore')

In [2]:
def syspath(path):
    """Register *path* on the module search path if it is not there yet.

    Args:
        path: Directory to add to ``sys.path``.

    Returns:
        None. ``sys.path`` is modified in place; a path that is already
        listed is left untouched, so repeated calls do not create duplicates.
    """
    if path in sys.path:
        # Already importable from this location; nothing to do.
        return
    sys.path.append(path)

In [4]:
# Use the calculate_vif function of the Preprocessing module to compute VIF
# Use the Encoder class of the Preprocessing module for label encoding

# Add the module directory to sys.path before importing from it
# (presumably Preprocessing.py lives in ./dataset/mod; verify the location)
syspath('./dataset/mod')

import Preprocessing
from Preprocessing import Encoder

In [5]:
# Load the dataset
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files
df = pd.read_pickle('./dataset/20_basic_feature_df.pkl')

# Apply label encoding for the analysis
en = Encoder(df)
le = en.label()
le.head()

Unnamed: 0,TRD_NAME_2,TRD_COUNTRY_2,TRD_ADDR_2,CUS_TOTAL_NUMBER_OF_ITEMS_5,CUS_TOTAL_NUMBER_OF_PACKAGES_6,CUS_REF_NO_7,CON_TIN_8,CON_NAME_8,CON_COUNTRY_8,CON_ADDR_8,...,CAL_ADDITIONAL_RATE_OF_TAX_47,PAM_FINANCIAL_VALUE_47,CAL_METHOD_OF_PAYMENT_47,IMP_DATE_OF_DECLARATION_54,REP_TIN_54,PERSON_NAME_54,GEND_REFERENCE_54,GEND_ISSUE_DATE_54,ACCEPTANCE_DATE,LABEL
0,43123,84,23498,1,64.0,105237,620010597,55562,18,42267,...,1,553.76,0,20170214,212,254,850,20150624.0,87118,0
1,43123,84,23498,1,64.0,105237,620010597,55562,18,42267,...,0,17286.89,4,20170214,212,254,850,20150624.0,87118,0
2,43123,84,23498,1,64.0,105237,620010597,55562,18,42267,...,0,62232.82,0,20170214,212,254,850,20150624.0,87118,0
3,54,10,2079,1,1213.0,122477,560002910,52636,18,25011,...,1,553.76,0,20170214,212,972,735,20150624.0,87205,0
4,54,10,2079,1,1213.0,122477,560002910,52636,18,25011,...,0,8662.33,4,20170214,212,972,735,20150624.0,87205,0


In [5]:
# Remove the LABEL column from the label-encoded data
X1 = le.drop('LABEL', axis=1)

# Compute VIF using the module helper
Preprocessing.calculate_vif(X1)

  import pandas.util.testing as tm


Unnamed: 0,VARIABLES,VIF
26,GDS_GROSS_MASS_35,3402.018726
34,IDG_NET_MASS_38,3398.975041
36,COR_FINANCIAL_VALUE,243.665655
38,STC_FINANCIAL_VALUE_46,209.097957
20,IMP_TRANSPORT_MODE_AT_BODR_26,116.447707
19,IMP_INLAND_TRANSPORT_MODE_25,116.427501
1,TRD_COUNTRY_2,22.609762
12,IMP_CNT_OF_DISPATCH_EXP_CD_15,20.381486
40,CAL_TAX_BASE_47,9.84898
9,IMP_TRADING_COUNTRY_11,8.082479


In [6]:
# Drop the column with the highest VIF: GDS_GROSS_MASS_35 (3402.018726)

X2 = X1.drop('GDS_GROSS_MASS_35', axis=1)

Preprocessing.calculate_vif(X2)

Unnamed: 0,VARIABLES,VIF
35,COR_FINANCIAL_VALUE,243.665473
37,STC_FINANCIAL_VALUE_46,209.0977
20,IMP_TRANSPORT_MODE_AT_BODR_26,116.447626
19,IMP_INLAND_TRANSPORT_MODE_25,116.427258
1,TRD_COUNTRY_2,22.609722
12,IMP_CNT_OF_DISPATCH_EXP_CD_15,20.381486
39,CAL_TAX_BASE_47,9.848957
9,IMP_TRADING_COUNTRY_11,8.082385
10,VAL_FINANCIAL_VALUE_12,4.184226
25,IDG_COUNTRY_OF_ORIGIN_34,3.542077


In [7]:
# Drop the column with the highest VIF: COR_FINANCIAL_VALUE (243.665473)
# Removed so far: GDS_GROSS_MASS_35 / COR_FINANCIAL_VALUE

X3 = X2.drop('COR_FINANCIAL_VALUE', axis=1)

Preprocessing.calculate_vif(X3)

Unnamed: 0,VARIABLES,VIF
20,IMP_TRANSPORT_MODE_AT_BODR_26,116.447418
19,IMP_INLAND_TRANSPORT_MODE_25,116.42659
1,TRD_COUNTRY_2,22.609703
12,IMP_CNT_OF_DISPATCH_EXP_CD_15,20.381145
36,STC_FINANCIAL_VALUE_46,9.008403
9,IMP_TRADING_COUNTRY_11,8.082352
38,CAL_TAX_BASE_47,8.051732
10,VAL_FINANCIAL_VALUE_12,4.169795
25,IDG_COUNTRY_OF_ORIGIN_34,3.541875
13,IMP_COUNTRY_OF_ORIGIN_16,2.925262


In [8]:
# Drop the column with the highest VIF: IMP_TRANSPORT_MODE_AT_BODR_26 (116.447418)
# Removed so far: GDS_GROSS_MASS_35 / COR_FINANCIAL_VALUE / IMP_TRANSPORT_MODE_AT_BODR_26

X4 = X3.drop('IMP_TRANSPORT_MODE_AT_BODR_26', axis=1)

Preprocessing.calculate_vif(X4)

Unnamed: 0,VARIABLES,VIF
1,TRD_COUNTRY_2,22.60958
12,IMP_CNT_OF_DISPATCH_EXP_CD_15,20.381144
35,STC_FINANCIAL_VALUE_46,9.008396
9,IMP_TRADING_COUNTRY_11,8.082321
37,CAL_TAX_BASE_47,8.051731
10,VAL_FINANCIAL_VALUE_12,4.169784
24,IDG_COUNTRY_OF_ORIGIN_34,3.54144
13,IMP_COUNTRY_OF_ORIGIN_16,2.925173
31,IDG_ADD_NATIONAL_PROC_37,2.79721
26,PRF_PREFERENCE_CODE_2,2.599183


In [9]:
# Drop the column with the highest VIF: TRD_COUNTRY_2 (22.609580)
# Removed so far: GDS_GROSS_MASS_35 / COR_FINANCIAL_VALUE / IMP_TRANSPORT_MODE_AT_BODR_26 / TRD_COUNTRY_2

X5 = X4.drop('TRD_COUNTRY_2', axis=1)

Preprocessing.calculate_vif(X5)

# Every remaining column now has a VIF of 20 or below, so the VIF iteration ends here

Unnamed: 0,VARIABLES,VIF
34,STC_FINANCIAL_VALUE_46,9.008282
36,CAL_TAX_BASE_47,8.05171
11,IMP_CNT_OF_DISPATCH_EXP_CD_15,7.19922
8,IMP_TRADING_COUNTRY_11,7.19295
9,VAL_FINANCIAL_VALUE_12,4.169492
23,IDG_COUNTRY_OF_ORIGIN_34,3.535129
12,IMP_COUNTRY_OF_ORIGIN_16,2.919614
30,IDG_ADD_NATIONAL_PROC_37,2.797044
25,PRF_PREFERENCE_CODE_2,2.598195
16,TOT_FINANCIAL_VALUE_22,2.01352
