
# **数据挖掘——Home Credit Default Risk**

Authors：李林（3120220938）、杨洋（3220211141）、敬甲男（3220221052）、李翰杰（3120220936）

github地址：https://github.com/leealim/kaggle-Home-Credit-Default-Risk

---

## 特征工程——特征选择

首先，处理合并表后的缺失值。  
然后，依次通过以下两种方法筛选特征：

> - 相关性排除：剔除与其他特征高度相关的列
> - 特征重要性排除：剔除模型特征重要性为 0 的列

---


In [1]:
# 引入本部分所需要的包，并定义需要的值和函数

import pandas as pd
import numpy as np
import os

# Input/output locations. Paths are assembled with os.path.join so the
# notebook runs unchanged on Windows, Linux and macOS (the separator is
# chosen by the OS instead of being hard-coded as a backslash).
source_dir = os.path.join("..", "data", "table_merge")
result_dir = os.path.join("..", "data", "choosing")

app_tr_path = os.path.join(source_dir, "application_train.csv")
app_te_path = os.path.join(source_dir, "application_test.csv")
bur_path = os.path.join(source_dir, "bureau.csv")
bur_bal_path = os.path.join(source_dir, "bureau_balance.csv")
pos_path = os.path.join(source_dir, "POS_CASH_balance.csv")
cre_path = os.path.join(source_dir, "credit_card_balance.csv")
pre_path = os.path.join(source_dir, "previous_application.csv")
ins_path = os.path.join(source_dir, "installments_payments.csv")
# Column description table shipped with the original competition data.
hom_path = os.path.join(
    "..", "data", "home-credit-default-risk", "HomeCredit_columns_description.csv")
hom = pd.read_csv(hom_path)

# Create the output directory if needed; exist_ok avoids the race between a
# separate existence check and the creation call.
os.makedirs(result_dir, exist_ok=True)


In [2]:
# Load the merged training and test application tables.
app_tr, app_te = (pd.read_csv(csv_path) for csv_path in (app_tr_path, app_te_path))

In [3]:
# Column-wise check: True for every training column that has a missing value.
app_tr.isna().any(axis="index")

SK_ID_CURR                    False
NAME_CONTRACT_TYPE            False
FLAG_OWN_CAR                  False
FLAG_OWN_REALTY               False
CNT_CHILDREN                  False
                              ...  
WALLSMATERIAL_MODE_Wooden     False
EMERGENCYSTATE_MODE_MyNull    False
EMERGENCYSTATE_MODE_No        False
EMERGENCYSTATE_MODE_Yes       False
TARGET                        False
Length: 203, dtype: bool

In [4]:
# Row-wise check: True for every test row that has at least one missing value.
app_te.isna().any(axis="columns")

0        False
1        False
2        False
3        False
4        False
         ...  
48739    False
48740    False
48741    False
48742    False
48743    False
Length: 48744, dtype: bool

发现存在大量缺失值，这里简单地用训练集中各列的平均值进行填充（测试集同样使用训练集的均值，以保证两者处理一致）。

In [5]:
# Impute missing values with the per-column mean of the TRAINING set.
# The same training means are applied to the test set so both frames are
# transformed consistently.
#
# The result is assigned back instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on a temporary
# column object, emits a chained-assignment warning in pandas 2.x and has no
# effect at all once Copy-on-Write is enabled.
train_means = app_tr.mean(numeric_only=True)

app_tr = app_tr.fillna(train_means)
# TARGET does not exist in the test set, so its mean is not passed along.
app_te = app_te.fillna(train_means.drop(labels="TARGET", errors="ignore"))

In [6]:
# Number of training columns that still contain a missing value (expected: 0).
app_tr.isna().any(axis="index").sum()

0

In [10]:
# Drop one column out of every pair of highly correlated features.

# Absolute Pearson correlation above which the later column of a pair is dropped.
threshold = 0.9

# The row identifier and the label are never candidates for removal. Keeping
# them out of the matrix guarantees that TARGET cannot end up in `to_drop`
# (which would destroy the label and make the test-set drop raise a KeyError).
protected_cols = [c for c in ("SK_ID_CURR", "TARGET") if c in app_tr.columns]

corr_matrix = app_tr.drop(columns=protected_cols).corr().abs()
# Keep only the strict upper triangle so each pair is examined once and a
# column is only compared against the columns that precede it.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

app_tr = app_tr.drop(columns=to_drop)
app_te = app_te.drop(columns=to_drop)
app_tr

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_Yes,TARGET
0,100002,0,0,1,0,202500.0,406597.5,24700.5,0.018801,-9461,...,0,0,0,0,0,0,1,0,0,1
1,100003,0,0,0,0,270000.0,1293502.5,35698.5,0.003541,-16765,...,0,1,0,0,0,0,0,0,0,0
2,100004,1,1,1,0,67500.0,135000.0,6750.0,0.010032,-19046,...,0,0,0,0,0,0,0,0,0,0
3,100006,0,0,1,0,135000.0,312682.5,29686.5,0.008019,-19005,...,0,0,0,0,0,0,0,0,0,0
4,100007,0,0,1,0,121500.0,513000.0,21865.5,0.028663,-19932,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290616,456249,0,0,1,0,112500.0,225000.0,22050.0,0.022800,-24384,...,0,0,0,0,0,1,0,0,0,0
290617,456251,0,0,0,0,157500.0,254700.0,27558.0,0.032561,-9327,...,0,0,0,0,0,0,1,0,0,0
290618,456252,0,0,1,0,72000.0,269550.0,12001.5,0.025164,-20775,...,0,0,0,0,0,0,1,0,0,0
290619,456254,0,0,1,0,171000.0,370107.0,20205.0,0.005313,-11961,...,0,0,0,0,0,0,1,0,0,1


In [20]:

# Rank features by XGBoost feature importance, averaged over several
# random train/test splits.

from sklearn.model_selection import train_test_split
import xgboost

# Number of random splits whose importances are averaged; it is also used as
# the divisor below, so the two can no longer drift apart.
N_IMPORTANCE_ROUNDS = 2

# Everything except the row identifier and the label is a candidate feature.
# Selecting by name avoids relying on SK_ID_CURR being the first column and
# TARGET the last one.
feature_frame = app_tr.drop(columns=["SK_ID_CURR", "TARGET"])
labels = app_tr["TARGET"]

model = xgboost.XGBClassifier()
importance_sum = np.zeros(feature_frame.shape[1])
for i in range(N_IMPORTANCE_ROUNDS):
    # The loop index doubles as the split seed, so every round sees a
    # different but reproducible partition.
    x_train, x_test, y_train, y_test = train_test_split(
        feature_frame, labels, test_size=0.25, random_state=i)
    model.fit(x_train, y_train)
    importance_sum += model.feature_importances_

feature_importances = pd.DataFrame({
    'feature': list(feature_frame.columns),
    'importance': importance_sum / N_IMPORTANCE_ROUNDS,
}).sort_values('importance', ascending=False)
# Display the ranking as the cell output.
feature_importances



Unnamed: 0,feature,importance
29,EXT_SOURCE_3,0.030307
28,EXT_SOURCE_2,0.024228
61,CODE_GENDER_F,0.018737
76,NAME_EDUCATION_TYPE_Higher education,0.018405
1,FLAG_OWN_CAR,0.018315
...,...,...
42,FLAG_DOCUMENT_10,0.000000
69,NAME_INCOME_TYPE_Businessman,0.000000
161,ORGANIZATION_TYPE_Telecom,0.000000
162,ORGANIZATION_TYPE_Trade: type 1,0.000000


In [22]:
# Widen the display limits so the complete list of candidates is visible.
pd.set_option("display.max_rows", 200)
pd.set_option('max_colwidth',40)

# Features whose averaged importance is exactly zero are removal candidates.
has_zero_importance = feature_importances["importance"].eq(0)
del_fea = feature_importances.loc[has_zero_importance]
del_fea

Unnamed: 0,feature,importance
48,FLAG_DOCUMENT_16,0.0
43,FLAG_DOCUMENT_11,0.0
44,FLAG_DOCUMENT_12,0.0
49,FLAG_DOCUMENT_17,0.0
52,FLAG_DOCUMENT_20,0.0
13,FLAG_MOBIL,0.0
45,FLAG_DOCUMENT_13,0.0
54,AMT_REQ_CREDIT_BUREAU_HOUR,0.0
50,FLAG_DOCUMENT_18,0.0
51,FLAG_DOCUMENT_19,0.0


In [None]:
# Remove the zero-importance features from both the training and test frames.
zero_importance_cols = del_fea["feature"]
app_tr = app_tr.drop(columns=zero_importance_cols)
app_te = app_te.drop(columns=zero_importance_cols)

In [47]:
# Report (rows, columns) of the training frame, then of the test frame.
for frame in (app_tr, app_te):
    print(frame.shape)

(290621, 191)
(48744, 190)


In [48]:
# Standardize every feature to zero mean and unit variance (z-score).
# Mean and standard deviation come from the TRAINING set only and are reused
# for the test set so both frames share the same scale.

# The row identifier and the label are left untouched.
feature_cols = [c for c in app_tr.columns if c not in ("SK_ID_CURR", "TARGET")]

train_mean = app_tr[feature_cols].mean()
# A constant column has std == 0; dividing by it would fill the column with
# NaN/inf. Using 1 instead leaves such a column at (value - mean).
train_std = app_tr[feature_cols].std().replace(0, 1)

app_tr[feature_cols] = (app_tr[feature_cols] - train_mean) / train_std
app_te[feature_cols] = (app_te[feature_cols] - train_mean) / train_std
app_tr

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,...,HOUSETYPE_MODE_terraced house,WALLSMATERIAL_MODE_Block,WALLSMATERIAL_MODE_Mixed,WALLSMATERIAL_MODE_Monolithic,WALLSMATERIAL_MODE_Others,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,EMERGENCYSTATE_MODE_Yes,TARGET
0,100002,-0.316760,-0.712885,0.642653,-0.579391,0.417004,-0.472921,-0.165293,-0.138066,1.521449,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,1.944590,-0.133929,-0.087935,1
1,100003,-0.316760,-0.712885,-1.556045,-0.579391,1.184640,1.796178,0.614127,-1.257267,-0.150922,...,-0.062699,5.723434,-0.086574,-0.074521,-0.072318,-0.518730,-0.514245,-0.133929,-0.087935,0
2,100004,3.156958,1.402745,0.642653,-0.579391,-1.118267,-1.167788,-1.437432,-0.781203,-0.673195,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,-0.514245,-0.133929,-0.087935,0
3,100006,-0.316760,-0.712885,0.642653,-0.579391,-0.350631,-0.713197,0.188061,-0.928841,-0.663807,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,-0.514245,-0.133929,-0.087935,0
4,100007,-0.316760,-0.712885,0.642653,-0.579391,-0.504158,-0.200696,-0.366208,0.585234,-0.876059,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,-0.514245,-0.133929,-0.087935,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
290616,456249,-0.316760,-0.712885,0.642653,-0.579391,-0.606510,-0.937528,-0.353132,0.155229,-1.895418,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,1.927778,-0.514245,-0.133929,-0.087935,0
290617,456251,-0.316760,-0.712885,-1.556045,-0.579391,-0.094753,-0.861542,0.037216,0.871122,1.552131,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,1.944590,-0.133929,-0.087935,0
290618,456252,-0.316760,-0.712885,0.642653,-0.579391,-1.067091,-0.823549,-1.065262,0.328610,-1.069078,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,1.944590,-0.133929,-0.087935,0
290619,456254,-0.316760,-0.712885,0.642653,-0.579391,0.058774,-0.566280,-0.483886,-1.127305,0.949033,...,-0.062699,-0.174720,-0.086574,-0.074521,-0.072318,-0.518730,1.944590,-0.133929,-0.087935,1


In [49]:
# Write the selected and standardized tables to the result directory.
# os.path.join picks the separator of the running OS instead of a hard-coded
# backslash; index=False keeps the row index out of the CSV files.

app_tr.to_csv(os.path.join(result_dir, "application_train.csv"), index=False)
app_te.to_csv(os.path.join(result_dir, "application_test.csv"), index=False)