In [1]:
import pandas as pd

In [24]:
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")
loan = pd.read_csv("./data/loan_before.csv")

In [25]:
train.columns

Index(['SK_ID_CURR', 'TARGET', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS',
       'HOUR_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_LAST_PHONE_CHANGE',
       'AMT_REQ_CREDIT_BUREAU_YEAR'],
      dtype='object')

1. SK_ID_CURR: 아이디
2. TARGET: 갚았는지 여부
3. AMT_INCOMR_TOTAL: 수입
4. AMT_CREDIT: 대출금앱
5. AMT_ANNUITY: 1달마다 갚아야 하는 금액
6. NAME_TYPE_SUITE: 동행자
7. NAME_INCOME_TYPE: 직업 종류
8. REGION_POPULATION_RELATIVE: 지역의 인구
9. DAYS_ID_PUBLISH: ID 문서를 변경한 날짜
10. HOUR_APPR_PROCESS_START: 대출신청시간
11. EXT_SOURCE_1,2,3: 신용점수1,2,3
12. AMT_REQ_CREDIT_BUREAU_YEAR: 신용평가기관에 해당 사람에 대한 신용정보를 조회한 개수

In [26]:
loan.columns

Index(['SK_ID_CURR', 'DAYS_CREDIT', 'CNT_CREDIT_PROLONG', 'AMT_CREDIT_SUM',
       'CREDIT_TYPE'],
      dtype='object')

### 문제인식

1. 어떤 요소가 대출금 상환 여부에 큰 영향을 주는가?
2. 그리고 그 요소들은 대출금 상환 여부에 어떤 영향을 미칠까?

### 방법론

#### 분석과정

이 질문에 대한 해답을 얻기 위해 해석가능한 머신러닝을 활용

**프로세스**

Step 1: feature enginnering 수행<br>
Step 2: 머신러닝 모델 훈련<br>
Step 3: shap value를 통해 어떤 변수가 가장 큰 영향을 주는지 알아본다.<br>
Step 4: 영향을 많이 주는 5개의 변수와 대출금 상환 여부와의 관계를 살펴본다.<br>

## Feature Engineering

In [27]:
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_TYPE_SUITE,...,DAYS_ID_PUBLISH,OWN_CAR_AGE,CNT_FAM_MEMBERS,HOUR_APPR_PROCESS_START,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_YEAR
0,162716,0,0.0,0,1,0,101250.0,480060.0,22509.0,Unaccompanied,...,-2256,,2.0,15,Kindergarten,,0.665732,,-139.0,1.0
1,361317,0,0.0,0,1,2,225000.0,237024.0,12231.0,Unaccompanied,...,-4133,,4.0,3,Business Entity Type 3,0.575699,0.53514,0.670652,-957.0,0.0
2,246925,0,1.0,0,0,0,171000.0,247675.5,26005.5,Unaccompanied,...,-4340,,2.0,10,Business Entity Type 3,0.092273,0.002272,0.154744,-827.0,1.0
3,294331,0,1.0,1,0,0,225000.0,180000.0,9000.0,Unaccompanied,...,-1621,12.0,2.0,12,Trade: type 7,0.221815,0.120444,,-569.0,2.0
4,152173,0,0.0,0,1,0,180000.0,337500.0,12852.0,Unaccompanied,...,-2870,,1.0,12,XNA,,0.722415,0.723837,-1544.0,4.0


AMT_CREDIT에서 AMT_ANNUITY를 나누어 해당 사람이 몇개월에 걸쳐서 돈을 갚아야 하는지 변수화

In [28]:
train['AMT_CREDIT_TO_ANNUITY_RATIO']=train['AMT_CREDIT']/train['AMT_ANNUITY']
test['AMT_CREDIT_TO_ANNUITY_RATIO']=test['AMT_CREDIT']/test['AMT_ANNUITY']

In [29]:
loan.head()

Unnamed: 0,SK_ID_CURR,DAYS_CREDIT,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,CREDIT_TYPE
0,400486,-2413,0,180000.0,Consumer credit
1,400486,-941,0,40500.0,Credit card
2,400486,-1110,0,114727.5,Consumer credit
3,400486,-2133,0,225000.0,Consumer credit
4,400486,-391,0,142290.0,Consumer credit


대출 평균 금액 변수화

In [30]:
loan.groupby(['SK_ID_CURR'])['AMT_CREDIT_SUM'].sum().reset_index()

Unnamed: 0,SK_ID_CURR,AMT_CREDIT_SUM
0,100026,5625000.000
1,100041,822305.160
2,100084,98100.000
3,100088,1777500.000
4,100100,3163743.450
...,...,...
25707,456153,2277901.575
25708,456212,1090489.500
25709,456238,1109502.000
25710,456253,3960000.000


loan 파일에서 AMT_CREDIT_SUM(대출 금액), DAYS_CREDIT(일정), CNT_CREDIT_PROLONG(연장 횟수)의 평균을 column으로 추가한다.

In [31]:
train = pd.merge(train,loan.groupby(['SK_ID_CURR'])['AMT_CREDIT_SUM'].mean().reset_index(), on='SK_ID_CURR',how='left')
train.head()

Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_TYPE_SUITE,...,CNT_FAM_MEMBERS,HOUR_APPR_PROCESS_START,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_YEAR,AMT_CREDIT_TO_ANNUITY_RATIO,AMT_CREDIT_SUM
0,162716,0,0.0,0,1,0,101250.0,480060.0,22509.0,Unaccompanied,...,2.0,15,Kindergarten,,0.665732,,-139.0,1.0,21.327469,315000.0
1,361317,0,0.0,0,1,2,225000.0,237024.0,12231.0,Unaccompanied,...,4.0,3,Business Entity Type 3,0.575699,0.53514,0.670652,-957.0,0.0,19.378955,2067966.0
2,246925,0,1.0,0,0,0,171000.0,247675.5,26005.5,Unaccompanied,...,2.0,10,Business Entity Type 3,0.092273,0.002272,0.154744,-827.0,1.0,9.523966,62193.86
3,294331,0,1.0,1,0,0,225000.0,180000.0,9000.0,Unaccompanied,...,2.0,12,Trade: type 7,0.221815,0.120444,,-569.0,2.0,20.0,27704.43
4,152173,0,0.0,0,1,0,180000.0,337500.0,12852.0,Unaccompanied,...,1.0,12,XNA,,0.722415,0.723837,-1544.0,4.0,26.260504,106875.0


In [32]:
test = pd.merge(test,loan.groupby(['SK_ID_CURR'])['AMT_CREDIT_SUM'].mean().reset_index(), on='SK_ID_CURR',how='left')
test.head()

Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,NAME_TYPE_SUITE,NAME_INCOME_TYPE,...,CNT_FAM_MEMBERS,HOUR_APPR_PROCESS_START,ORGANIZATION_TYPE,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_YEAR,AMT_CREDIT_TO_ANNUITY_RATIO,AMT_CREDIT_SUM
0,333864,1.0,1,0,1,225000.0,315000.0,16002.0,Unaccompanied,Commercial associate,...,2.0,14,Self-employed,,0.664203,0.179822,-1116.0,0.0,19.685039,2356490.0
1,253018,0.0,0,0,1,90000.0,225000.0,11250.0,Unaccompanied,Working,...,3.0,12,Business Entity Type 3,0.399164,0.259984,,-982.0,0.0,20.0,39327.77
2,360625,1.0,0,0,0,202500.0,888840.0,37494.0,Unaccompanied,Working,...,1.0,10,Self-employed,0.212372,0.470406,0.234015,-2052.0,2.0,23.706193,63571.5
3,287255,1.0,1,1,2,202500.0,168102.0,20079.0,Unaccompanied,Working,...,4.0,14,Transport: type 4,0.82304,0.64861,0.304672,-2420.0,3.0,8.37203,3420000.0
4,281149,0.0,0,1,0,306000.0,734166.0,49261.5,Unaccompanied,Working,...,2.0,12,Business Entity Type 3,,0.655703,0.315472,-1511.0,2.0,14.903444,137832.5


In [33]:
train = pd.merge(train,loan.groupby(['SK_ID_CURR'])['DAYS_CREDIT'].mean().reset_index(), on='SK_ID_CURR',how='left')
test = pd.merge(test,loan.groupby(['SK_ID_CURR'])['DAYS_CREDIT'].mean().reset_index(), on='SK_ID_CURR',how='left')

train = pd.merge(train,loan.groupby(['SK_ID_CURR'])['CNT_CREDIT_PROLONG'].mean().reset_index(), on='SK_ID_CURR',how='left')
test = pd.merge(test,loan.groupby(['SK_ID_CURR'])['CNT_CREDIT_PROLONG'].mean().reset_index(), on='SK_ID_CURR',how='left')

사람 아이디 별로 대출받은 횟수도 추가한다.

In [34]:
#대출 받은 횟수
train = pd.merge(train,loan.groupby(['SK_ID_CURR']).size().reset_index().rename(columns={0:'COUNT'}))
test = pd.merge(test,loan.groupby(['SK_ID_CURR']).size().reset_index().rename(columns={0:'COUNT'}))

## 불필요한 변수들 제거

In [35]:
train.columns

Index(['SK_ID_CURR', 'TARGET', 'CODE_GENDER', 'FLAG_OWN_CAR',
       'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT',
       'AMT_ANNUITY', 'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE',
       'NAME_EDUCATION_TYPE', 'NAME_HOUSING_TYPE',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS',
       'HOUR_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'EXT_SOURCE_1',
       'EXT_SOURCE_2', 'EXT_SOURCE_3', 'DAYS_LAST_PHONE_CHANGE',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'AMT_CREDIT_TO_ANNUITY_RATIO',
       'AMT_CREDIT_SUM', 'DAYS_CREDIT', 'CNT_CREDIT_PROLONG', 'COUNT'],
      dtype='object')

In [36]:
train.dtypes

SK_ID_CURR                       int64
TARGET                           int64
CODE_GENDER                    float64
FLAG_OWN_CAR                     int64
FLAG_OWN_REALTY                  int64
CNT_CHILDREN                     int64
AMT_INCOME_TOTAL               float64
AMT_CREDIT                     float64
AMT_ANNUITY                    float64
NAME_TYPE_SUITE                 object
NAME_INCOME_TYPE                object
NAME_EDUCATION_TYPE             object
NAME_HOUSING_TYPE               object
REGION_POPULATION_RELATIVE     float64
DAYS_BIRTH                       int64
DAYS_EMPLOYED                  float64
DAYS_ID_PUBLISH                  int64
OWN_CAR_AGE                    float64
CNT_FAM_MEMBERS                float64
HOUR_APPR_PROCESS_START          int64
ORGANIZATION_TYPE               object
EXT_SOURCE_1                   float64
EXT_SOURCE_2                   float64
EXT_SOURCE_3                   float64
DAYS_LAST_PHONE_CHANGE         float64
AMT_REQ_CREDIT_BUREAU_YEA

object의 column들을과 알수 없는 변수 column들을 추후 모델 해석을 위해 제거합니다.

In [37]:
object_col = ['CODE_GENDER','FLAG_OWN_CAR', 'NAME_TYPE_SUITE','NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_HOUSING_TYPE','ORGANIZATION_TYPE']
unknown_col = ['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']

In [38]:
train = train.drop(object_col,axis = 1)
train = train.drop(unknown_col,axis = 1)

test = test.drop(object_col,axis = 1)
test = test.drop(unknown_col,axis = 1)

## 모델링

모델링 하기 전에 서로 상관관계가 높은 input 변수를 제거한다. 왜냐하면 input 변수가 높은 상관성을 가질 때, shap value는 제대로 된 설명력을 발휘하지 못하기 때문이다.

In [39]:
train.columns

Index(['SK_ID_CURR', 'TARGET', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS',
       'HOUR_APPR_PROCESS_START', 'DAYS_LAST_PHONE_CHANGE',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'AMT_CREDIT_TO_ANNUITY_RATIO',
       'AMT_CREDIT_SUM', 'DAYS_CREDIT', 'CNT_CREDIT_PROLONG', 'COUNT'],
      dtype='object')

In [40]:
input_col = ['FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY',
       'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS',
       'HOUR_APPR_PROCESS_START', 'DAYS_LAST_PHONE_CHANGE',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'AMT_CREDIT_TO_ANNUITY_RATIO',
       'AMT_CREDIT_SUM', 'DAYS_CREDIT', 'CNT_CREDIT_PROLONG', 'COUNT']

In [42]:
corr = train[input_col].corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_ID_PUBLISH,OWN_CAR_AGE,CNT_FAM_MEMBERS,HOUR_APPR_PROCESS_START,DAYS_LAST_PHONE_CHANGE,AMT_REQ_CREDIT_BUREAU_YEAR,AMT_CREDIT_TO_ANNUITY_RATIO,AMT_CREDIT_SUM,DAYS_CREDIT,CNT_CREDIT_PROLONG,COUNT
FLAG_OWN_REALTY,1.0,0.004368,0.000512,-0.047388,-0.003087,0.007268,-0.110141,-0.016104,0.006866,0.02991,0.010759,-0.112053,0.029512,0.090621,-0.09356,-0.002745,0.000174,-0.00979,0.008414
CNT_CHILDREN,0.004368,1.0,0.031787,0.004584,0.023912,-0.030151,0.34027,0.071713,-0.031796,-0.01001,0.884268,-0.005156,-0.00359,-0.03627,-0.025808,0.035864,0.026285,-0.012065,0.002649
AMT_INCOME_TOTAL,0.000512,0.031787,1.0,0.365903,0.445519,0.184945,0.071785,-0.030528,0.034613,-0.121152,0.030406,0.088055,-0.038259,0.03365,0.074494,0.241929,-0.013266,0.016117,0.116635
AMT_CREDIT,-0.047388,0.004584,0.365903,1.0,0.772711,0.094415,-0.040899,-0.075741,0.010953,-0.104716,0.064527,0.044119,-0.074191,-0.037685,0.658202,0.135435,-0.068411,-0.000384,0.046902
AMT_ANNUITY,-0.003087,0.023912,0.445519,0.772711,1.0,0.131086,0.021131,-0.040397,0.019053,-0.106253,0.076044,0.044996,-0.064976,0.001054,0.11704,0.128144,-0.052613,-0.005724,0.013588
REGION_POPULATION_RELATIVE,0.007268,-0.030151,0.184945,0.094415,0.131086,1.0,-0.019229,0.014822,0.005369,-0.085211,-0.02419,0.175761,-0.050975,0.016257,0.003954,0.077984,-0.010819,0.003701,-0.034289
DAYS_BIRTH,-0.110141,0.34027,0.071785,-0.040899,0.021131,-0.019229,1.0,0.345506,0.245414,-0.036979,0.293922,0.08998,0.07838,-0.074412,-0.090867,0.053205,0.201829,0.028239,-0.063004
DAYS_EMPLOYED,-0.016104,0.071713,-0.030528,-0.075741,-0.040397,0.014822,0.345506,1.0,0.082578,0.005907,0.036594,0.004561,0.137885,0.002834,-0.080358,-0.018131,0.160521,0.000124,-0.148704
DAYS_ID_PUBLISH,0.006866,-0.031796,0.034613,0.010953,0.019053,0.005369,0.245414,0.082578,1.0,-0.008155,-0.027486,0.027295,0.079271,-0.039274,-0.010789,0.033384,0.188471,0.022736,-0.100673
OWN_CAR_AGE,0.02991,-0.01001,-0.121152,-0.104716,-0.106253,-0.085211,-0.036979,0.005907,-0.008155,1.0,-0.035347,-0.07829,0.011068,-0.025215,-0.05513,-0.094781,-0.006665,0.011323,-0.025711


높은 상관성을 가지는 변수는 다음과 같다.

1. CNT_CHILDREN, CNT_FAM_MEMBERS
2. AMT_CREDIT_TO_ANNUITY_RATIO, AMT_CREDIT
3. AMT_ANNUITY, AMT_CREDIT

TARGET과 더 상관성이 높은 변수를 남기도록 한다.

In [43]:
train['CNT_CHILDREN'].corr(train['TARGET'])

0.02020408753274091

In [44]:
train['CNT_FAM_MEMBERS'].corr(train['TARGET'])

0.015364860802964925

In [45]:
del train['CNT_FAM_MEMBERS']
del test['CNT_FAM_MEMBERS']

In [46]:
print(train['AMT_CREDIT_TO_ANNUITY_RATIO'].corr(train['TARGET']))
print(train['AMT_CREDIT'].corr(train['TARGET']))

-0.0170488245810787
-0.01724199702034605


In [48]:
del train['AMT_CREDIT']
del test['AMT_CREDIT']

In [49]:
train.columns

Index(['SK_ID_CURR', 'TARGET', 'FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_ANNUITY', 'REGION_POPULATION_RELATIVE',
       'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE',
       'HOUR_APPR_PROCESS_START', 'DAYS_LAST_PHONE_CHANGE',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'AMT_CREDIT_TO_ANNUITY_RATIO',
       'AMT_CREDIT_SUM', 'DAYS_CREDIT', 'CNT_CREDIT_PROLONG', 'COUNT'],
      dtype='object')

In [50]:
input_col= ['FLAG_OWN_REALTY', 'CNT_CHILDREN',
       'AMT_INCOME_TOTAL', 'AMT_ANNUITY', 'REGION_POPULATION_RELATIVE',
       'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE',
       'HOUR_APPR_PROCESS_START', 'DAYS_LAST_PHONE_CHANGE',
       'AMT_REQ_CREDIT_BUREAU_YEAR', 'AMT_CREDIT_TO_ANNUITY_RATIO',
       'AMT_CREDIT_SUM', 'DAYS_CREDIT', 'CNT_CREDIT_PROLONG', 'COUNT']

In [51]:
from xgboost import XGBClassifier

In [52]:
model = XGBClassifier(n_estimators = 100, learning_rate = 0.1)
model.fit(train[input_col],train['TARGET'])





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [53]:
model.predict(test[input_col])

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)