
# **数据挖掘——Home Credit Default Risk**

Authors：李林（3120220938）、杨洋（3220211141）、敬甲男（3220221052）、李翰杰（3120220936）

github地址：https://github.com/leealim/kaggle-Home-Credit-Default-Risk

---

## 特征工程——特征选择

首先，处理合并表后的缺失值。  
然后，通过多种方法筛选特征

> 相关性排除
> 特征重要性排除

---


In [1]:
# 引入本部分所需要的包，并定义需要的值和函数

import pandas as pd
import numpy as np
import os

source_dir="..\\data\\table_merge"
result_dir="..\\data\\choosing"

app_tr_path = source_dir+"\\application_train.csv"
app_te_path = source_dir+"\\application_test.csv"
bur_path = source_dir+"\\bureau.csv"
bur_bal_path = source_dir+"\\bureau_balance.csv"
pos_path = source_dir+"\\POS_CASH_balance.csv"
cre_path = source_dir+"\\credit_card_balance.csv"
pre_path = source_dir+"\\previous_application.csv"
ins_path = source_dir+"\\installments_payments.csv"
hom_path = "..\\data\\home-credit-default-risk\\HomeCredit_columns_description.csv"  # 列描述表
hom = pd.read_csv(hom_path)

if not os.path.exists(result_dir):
    os.makedirs(result_dir)


In [2]:
app_tr = pd.read_csv(app_tr_path)
app_te = pd.read_csv(app_te_path)

In [3]:
app_tr.isnull().any(axis=0)

SK_ID_CURR                    False
NAME_CONTRACT_TYPE            False
FLAG_OWN_CAR                  False
FLAG_OWN_REALTY               False
CNT_CHILDREN                  False
                              ...  
WALLSMATERIAL_MODE_Wooden     False
EMERGENCYSTATE_MODE_MyNull    False
EMERGENCYSTATE_MODE_No        False
EMERGENCYSTATE_MODE_Yes       False
TARGET                        False
Length: 203, dtype: bool

In [4]:
app_te.isnull().any(axis=1)

0        False
1        False
2        False
3        False
4        False
         ...  
48739    False
48740    False
48741    False
48742    False
48743    False
Length: 48744, dtype: bool

发现存在大量缺失值，这里简单用其平均值替换

In [5]:
for c in app_tr.columns:
    mean_num=app_tr[c].mean()
    if app_tr[c].isnull().sum()>0:
        app_tr[c].fillna(mean_num,inplace=True)
    if str(c)!="TARGET" and app_te[c].isnull().sum()>0:
        app_te[c].fillna(mean_num,inplace=True)

In [6]:
app_tr.isnull().any(axis=0).sum()

0

In [7]:
# 根据相关性排除部分列

threshold = 0.9

corr_matrix = app_tr.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

app_tr = app_tr.drop(columns = to_drop)
app_te = app_te.drop(columns = to_drop)

In [8]:
# 根据特征重要性选择列

import lightgbm as lgb
from sklearn.model_selection import train_test_split
import re

In [9]:
# 准备数据和模型
train = app_tr.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
train_labels = app_tr['TARGET']

feature_importances = np.zeros(train.shape[1])

model = lgb.LGBMClassifier(objective='binary', boosting_type = 'goss', n_estimators = 10000, class_weight = 'balanced')

In [10]:
# 计算特征重要性
for i in range(2):
    train_features, valid_features, train_y, valid_y = train_test_split(train, train_labels, test_size = 0.25, random_state = i)
    model.fit(train_features, train_y, early_stopping_rounds=100, eval_set = [(valid_features, valid_y)], 
              eval_metric = 'auc', verbose = 200)
    feature_importances += model.feature_importances_

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.598139
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1	valid_0's binary_logloss: 0.598139


In [11]:
feature_importances = feature_importances / 2
feature_importances = pd.DataFrame({'feature': list(train.columns), 'importance': feature_importances}).sort_values('importance', ascending = False)

feature_importances.head()

Unnamed: 0,feature,importance
0,SK_ID_CURR,2.0
5,AMT_INCOME_TOTAL,1.0
190,TARGET,1.0
6,AMT_CREDIT,0.5
3,FLAG_OWN_REALTY,0.0


In [12]:
print(app_tr.shape)
print(app_te.shape)

(290621, 191)
(48744, 190)


In [13]:
feature_importances

Unnamed: 0,feature,importance
0,SK_ID_CURR,2.0
5,AMT_INCOME_TOTAL,1.0
190,TARGET,1.0
6,AMT_CREDIT,0.5
3,FLAG_OWN_REALTY,0.0
...,...,...
68,NAME_TYPE_SUITE_Spousepartner,0.0
69,NAME_TYPE_SUITE_Unaccompanied,0.0
70,NAME_INCOME_TYPE_Businessman,0.0
71,NAME_INCOME_TYPE_Commercialassociate,0.0


In [14]:
# 得到特征重要性大于0的特征

app_tr_res = pd.DataFrame()
app_te_res = pd.DataFrame()
for index, row in feature_importances.iterrows():
    if row['importance'] > 0.0:
        c = row['feature']
        app_tr_res[c] = app_tr[c]
        if c != 'TARGET':
            app_te_res[c] = app_te[c]

In [15]:
app_tr_res

Unnamed: 0,SK_ID_CURR,AMT_INCOME_TOTAL,TARGET,AMT_CREDIT
0,100002,202500.0,1,406597.5
1,100003,270000.0,0,1293502.5
2,100004,67500.0,0,135000.0
3,100006,135000.0,0,312682.5
4,100007,121500.0,0,513000.0
...,...,...,...,...
290616,456249,112500.0,0,225000.0
290617,456251,157500.0,0,254700.0
290618,456252,72000.0,0,269550.0
290619,456254,171000.0,1,370107.0


In [16]:
app_te_res

Unnamed: 0,SK_ID_CURR,AMT_INCOME_TOTAL,AMT_CREDIT
0,100001,135000.0,568800.0
1,100005,99000.0,222768.0
2,100013,202500.0,663264.0
3,100028,315000.0,1575000.0
4,100038,180000.0,625500.0
...,...,...,...
48739,456221,121500.0,412560.0
48740,456222,157500.0,622413.0
48741,456223,202500.0,315000.0
48742,456224,225000.0,450000.0


In [17]:
# 使用均值归一化

for c in app_tr.columns:
    mean_num=app_tr[c].mean()
    std_num=app_tr[c].std()
    if str(c)=="TARGET" or str(c)=="SK_ID_CURR":
        continue
    app_tr[c]=(app_tr[c]-mean_num)/ std_num
    app_te[c]=(app_te[c]-mean_num)/ std_num
app_tr

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_Yes,TARGET
0,100002,-0.316760,-0.712885,0.642653,-0.579391,0.417004,-0.472921,-0.165293,-0.138066,1.521449,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,1.944590,-0.133929,-0.087935,1
1,100003,-0.316760,-0.712885,-1.556045,-0.579391,1.184640,1.796178,0.614127,-1.257267,-0.150922,...,-0.062699,5.723434,-0.086574,-0.074521,-0.072318,-0.518730,-0.514245,-0.133929,-0.087935,0
2,100004,3.156958,1.402745,0.642653,-0.579391,-1.118267,-1.167788,-1.437432,-0.781203,-0.673195,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,-0.514245,-0.133929,-0.087935,0
3,100006,-0.316760,-0.712885,0.642653,-0.579391,-0.350631,-0.713197,0.188061,-0.928841,-0.663807,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,-0.514245,-0.133929,-0.087935,0
4,100007,-0.316760,-0.712885,0.642653,-0.579391,-0.504158,-0.200696,-0.366208,0.585234,-0.876059,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,-0.514245,-0.133929,-0.087935,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290616,456249,-0.316760,-0.712885,0.642653,-0.579391,-0.606510,-0.937528,-0.353132,0.155229,-1.895418,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,1.927778,-0.514245,-0.133929,-0.087935,0
290617,456251,-0.316760,-0.712885,-1.556045,-0.579391,-0.094753,-0.861542,0.037216,0.871122,1.552131,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,1.944590,-0.133929,-0.087935,0
290618,456252,-0.316760,-0.712885,0.642653,-0.579391,-1.067091,-0.823549,-1.065262,0.328610,-1.069078,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,1.944590,-0.133929,-0.087935,0
290619,456254,-0.316760,-0.712885,0.642653,-0.579391,0.058774,-0.566280,-0.483886,-1.127305,0.949033,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,1.944590,-0.133929,-0.087935,1


In [18]:
# 使用均值归一化

for c in app_tr_res.columns:
    mean_num=app_tr_res[c].mean()
    std_num=app_tr_res[c].std()
    if str(c)=="TARGET" or str(c)=="SK_ID_CURR":
        continue
    app_tr_res[c]=(app_tr_res[c]-mean_num)/ std_num
app_tr_res

Unnamed: 0,SK_ID_CURR,AMT_INCOME_TOTAL,TARGET,AMT_CREDIT
0,100002,0.417004,1,-0.472921
1,100003,1.184640,0,1.796178
2,100004,-1.118267,0,-1.167788
3,100006,-0.350631,0,-0.713197
4,100007,-0.504158,0,-0.200696
...,...,...,...,...
290616,456249,-0.606510,0,-0.937528
290617,456251,-0.094753,0,-0.861542
290618,456252,-1.067091,0,-0.823549
290619,456254,0.058774,1,-0.566280


In [19]:
# 使用均值归一化

for c in app_te_res.columns:
    mean_num=app_te_res[c].mean()
    std_num=app_te_res[c].std()
    if str(c)=="TARGET" or str(c)=="SK_ID_CURR":
        continue
    app_te_res[c]=(app_te_res[c]-mean_num)/ std_num
app_te_res

Unnamed: 0,SK_ID_CURR,AMT_INCOME_TOTAL,AMT_CREDIT
0,100001,-0.427804,0.142474
1,100005,-0.782405,-0.804529
2,100013,0.237072,0.400998
3,100028,1.345200,2.896191
4,100038,0.015447,0.297648
...,...,...,...
48739,456221,-0.560780,-0.285116
48740,456222,-0.206179,0.289199
48741,456223,0.237072,-0.552113
48742,456224,0.458698,-0.182652


In [20]:
# 输出结果

app_tr_res.to_csv(result_dir+"\\application_train.csv",index=False)
app_te_res.to_csv(result_dir+"\\application_test.csv",index=False)