
# **数据挖掘——Home Credit Default Risk**

Authors：李林（3120220938）、杨洋（3220211141）、敬甲男（3220221052）、李翰杰（3120220936）

github地址：https://github.com/leealim/kaggle-Home-Credit-Default-Risk

---

## 数据预处理——缺失值处理

共八张表，逐个进行处理：
- application_{train|test}.csv:客户申请表
- bureau.csv/bureau_balance.csv:客户历史借款记录
- POS_CASH_balance.csv:客户POS和现金贷款历史
- credit_card_balance.csv:客户信用卡的snapshot历史
- previous_application.csv:客户历史申请记录
- installments_payments.csv:客户历史贷款的分期还款记录

---


In [18]:
# 引入本部分所需要的包，并定义需要的值和函数

import pandas as pd
import numpy as np
import os


# Directory that holds the raw Kaggle CSV files. Paths are built with
# os.path.join instead of hard-coded "\\" separators so the notebook also
# runs on non-Windows systems.
source_dir = os.curdir
if not os.path.exists(source_dir):
    raise Exception('请补充数据集！')
# Directory that receives the cleaned tables written by this section.
result_dir = os.path.join(os.curdir, "data", "miss_value_handling")

# One path per raw input table.
app_tr_path = os.path.join(source_dir, "application_train.csv")
app_te_path = os.path.join(source_dir, "application_test.csv")
bur_path = os.path.join(source_dir, "bureau.csv")
bur_bal_path = os.path.join(source_dir, "bureau_balance.csv")
pos_path = os.path.join(source_dir, "POS_CASH_balance.csv")
cre_path = os.path.join(source_dir, "credit_card_balance.csv")
pre_path = os.path.join(source_dir, "previous_application.csv")
ins_path = os.path.join(source_dir, "installments_payments.csv")



In [4]:
# Create the output directory on first use.
if not os.path.exists(result_dir):
    os.makedirs(result_dir)

# Column-description table; missing_values_table() joins against it to
# attach a description to every column it reports.
hom_path = source_dir+"\\HomeCredit_columns_description.csv"
hom = pd.read_csv(hom_path)


In [5]:
def missing_values_table(df, table_name, columns_description=None):
    """Summarise the columns of ``df`` that contain missing values.

    Prints ``"Total <n> columns missing values"`` and returns one row per
    affected column, ordered by descending share of missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    table_name : str
        Value of the ``Table`` column in the description table that
        identifies ``df``; rows describing other tables are discarded.
    columns_description : pandas.DataFrame, optional
        Description table with at least the columns ``Table``, ``Row``,
        ``Description`` and ``Special``. Defaults to the notebook-level
        ``hom`` frame, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        Columns ``Row``, ``Description``, ``Special``, ``Missing Values``,
        ``% of Total Values`` followed by the ``df.describe()`` statistics
        (NaN for columns that ``describe()`` does not report).
    """
    if columns_description is None:
        # Backward-compatible default: the globally loaded description table.
        columns_description = hom

    # Count nulls once and derive the percentage from the same Series.
    null_counts = df.isnull().sum()
    null_percent = 100 * null_counts / len(df)
    summary = pd.concat(
        [null_counts.rename('Missing Values'),
         null_percent.rename('% of Total Values')],
        axis=1)
    summary = summary.sort_values(
        '% of Total Values', ascending=False).round(1)

    # Keep only columns that actually have missing values.
    has_missing = summary['Missing Values'] != 0
    print("Total " + str(has_missing.sum()) + " columns missing values")
    summary = summary.loc[has_missing]

    # Attach the human-readable description. A column name can occur in
    # several source tables, so filter on the requested table afterwards.
    summary = summary.merge(
        columns_description, how="left", left_index=True, right_on='Row')
    # The saved-index column is only present in some copies of the file.
    summary = summary.drop(columns=['Unnamed: 0'], errors='ignore')
    summary = summary.loc[summary["Table"] == table_name]
    summary = summary.reindex(
        columns=["Row", "Description", "Special", "Missing Values", "% of Total Values"])
    summary = summary.reset_index(drop=True)

    # Append summary statistics for the columns describe() covers.
    summary = summary.merge(
        df.describe().T, how="left", left_on="Row", right_index=True)
    return summary



### 1. **application_{train|test}.csv**

In [5]:
# Load the training application table and show its basic summary statistics

app_tr = pd.read_csv(filepath_or_buffer=app_tr_path)
app_tr.describe()


Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,307511.0,307511.0,307511.0,307511.0,307511.0,307499.0,307233.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,265992.0,265992.0,265992.0,265992.0,265992.0,265992.0
mean,278180.518577,0.080729,0.417052,168797.9,599026.0,27108.573909,538396.2,0.020868,-16036.995067,63815.045904,...,0.00813,0.000595,0.000507,0.000335,0.006402,0.007,0.034362,0.267395,0.265474,1.899974
std,102790.175348,0.272419,0.722121,237123.1,402490.8,14493.737315,369446.5,0.013831,4363.988632,141275.766519,...,0.089798,0.024387,0.022518,0.018299,0.083849,0.110757,0.204685,0.916002,0.794056,1.869295
min,100002.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,-25229.0,-17912.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,-19682.0,-2760.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,-15750.0,-1213.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,367142.5,0.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,-12413.0,-289.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,456255.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,-7489.0,365243.0,...,1.0,1.0,1.0,1.0,4.0,9.0,8.0,27.0,261.0,25.0


In [6]:
# Missing-value analysis of the training application table

# Compact display: at most 20 rows, descriptions truncated to 40 characters.
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 40
t = missing_values_table(df=app_tr, table_name="application_{train|test}.csv")
t


Total 67 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,COMMONAREA_MEDI,Normalized information about buildin...,normalized,214865,69.9,92646.0,0.044595,0.076144,0.000000e+00,0.007900,0.020800,0.051300,1.000
1,COMMONAREA_AVG,Normalized information about buildin...,normalized,214865,69.9,92646.0,0.044621,0.076036,0.000000e+00,0.007800,0.021100,0.051500,1.000
2,COMMONAREA_MODE,Normalized information about buildin...,normalized,214865,69.9,92646.0,0.042553,0.074445,0.000000e+00,0.007200,0.019000,0.049000,1.000
3,NONLIVINGAPARTMENTS_MODE,Normalized information about buildin...,normalized,213514,69.4,93997.0,0.008076,0.046276,0.000000e+00,0.000000,0.000000,0.003900,1.000
4,NONLIVINGAPARTMENTS_AVG,Normalized information about buildin...,normalized,213514,69.4,93997.0,0.008809,0.047732,0.000000e+00,0.000000,0.000000,0.003900,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
62,EXT_SOURCE_2,Normalized score from external data ...,normalized,660,0.2,306851.0,0.514393,0.191060,8.173617e-08,0.392457,0.565961,0.663617,0.855
63,AMT_GOODS_PRICE,For consumer loans it is the price o...,,278,0.1,307233.0,538396.207429,369446.460540,4.050000e+04,238500.000000,450000.000000,679500.000000,4050000.000
64,AMT_ANNUITY,Loan annuity,,12,0.0,307499.0,27108.573909,14493.737315,1.615500e+03,16524.000000,24903.000000,34596.000000,258025.500
65,CNT_FAM_MEMBERS,How many family members does client ...,,2,0.0,307509.0,2.152665,0.910682,1.000000e+00,2.000000,2.000000,3.000000,20.000


In [7]:
# Columns with only a small share of missing values (below 3%)

t_small = t[t["% of Total Values"].lt(3)]
t_small


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
57,NAME_TYPE_SUITE,Who was accompanying client when he ...,,1292,0.4,,,,,,,,
58,OBS_30_CNT_SOCIAL_CIRCLE,How many observation of client's soc...,,1021,0.3,306490.0,1.422245,2.400989,0.0,0.0,0.0,2.0,348.0
59,DEF_30_CNT_SOCIAL_CIRCLE,How many observation of client's soc...,,1021,0.3,306490.0,0.143421,0.446698,0.0,0.0,0.0,0.0,34.0
60,OBS_60_CNT_SOCIAL_CIRCLE,How many observation of client's soc...,,1021,0.3,306490.0,1.405292,2.379803,0.0,0.0,0.0,2.0,344.0
61,DEF_60_CNT_SOCIAL_CIRCLE,How many observation of client's soc...,,1021,0.3,306490.0,0.100049,0.362291,0.0,0.0,0.0,0.0,24.0
62,EXT_SOURCE_2,Normalized score from external data ...,normalized,660,0.2,306851.0,0.514393,0.19106,8.173617e-08,0.392457,0.565961,0.663617,0.855
63,AMT_GOODS_PRICE,For consumer loans it is the price o...,,278,0.1,307233.0,538396.207429,369446.46054,40500.0,238500.0,450000.0,679500.0,4050000.0
64,AMT_ANNUITY,Loan annuity,,12,0.0,307499.0,27108.573909,14493.737315,1615.5,16524.0,24903.0,34596.0,258025.5
65,CNT_FAM_MEMBERS,How many family members does client ...,,2,0.0,307509.0,2.152665,0.910682,1.0,2.0,2.0,3.0,20.0
66,DAYS_LAST_PHONE_CHANGE,How many days before application did...,,1,0.0,307510.0,-962.858788,826.808487,-4292.0,-1570.0,-757.0,-274.0,0.0


In [8]:
# Columns with a large share of missing values (3% or more)

# ">=" makes this the exact complement of t_small (< 3); with a strict ">"
# a column at exactly 3.0% would appear in neither view.
t_large = t.loc[t["% of Total Values"] >= 3]
t_large

Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,COMMONAREA_MEDI,Normalized information about buildin...,normalized,214865,69.9,92646.0,0.044595,0.076144,0.0,0.0079,0.0208,0.0513,1.0
1,COMMONAREA_AVG,Normalized information about buildin...,normalized,214865,69.9,92646.0,0.044621,0.076036,0.0,0.0078,0.0211,0.0515,1.0
2,COMMONAREA_MODE,Normalized information about buildin...,normalized,214865,69.9,92646.0,0.042553,0.074445,0.0,0.0072,0.0190,0.0490,1.0
3,NONLIVINGAPARTMENTS_MODE,Normalized information about buildin...,normalized,213514,69.4,93997.0,0.008076,0.046276,0.0,0.0000,0.0000,0.0039,1.0
4,NONLIVINGAPARTMENTS_AVG,Normalized information about buildin...,normalized,213514,69.4,93997.0,0.008809,0.047732,0.0,0.0000,0.0000,0.0039,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,AMT_REQ_CREDIT_BUREAU_DAY,Number of enquiries to Credit Bureau...,,41519,13.5,265992.0,0.007000,0.110757,0.0,0.0000,0.0000,0.0000,9.0
53,AMT_REQ_CREDIT_BUREAU_WEEK,Number of enquiries to Credit Bureau...,,41519,13.5,265992.0,0.034362,0.204685,0.0,0.0000,0.0000,0.0000,8.0
54,AMT_REQ_CREDIT_BUREAU_MON,Number of enquiries to Credit Bureau...,,41519,13.5,265992.0,0.267395,0.916002,0.0,0.0000,0.0000,0.0000,27.0
55,AMT_REQ_CREDIT_BUREAU_QRT,Number of enquiries to Credit Bureau...,,41519,13.5,265992.0,0.265474,0.794056,0.0,0.0000,0.0000,0.0000,261.0


可以发现，各列的缺失值数量差距很大。对于缺失比例小于百分之三的列（即上面的 t_small），我们采取删去含缺失值的对应行的措施。


In [9]:
# Drop rows that have a null in any of the low-missing-rate columns

app_tr = (
    app_tr
    .dropna(subset=t_small["Row"].tolist(), how="any")  # a single null in these columns removes the row
    .reset_index(drop=True)                             # renumber rows after the removal
)
app_tr

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304526,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,0,,,,,,
304527,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,0,,,,,,
304528,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0
304529,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


对于缺失值比例较大的列，我们逐一进行研究处理。首先，为每个特征附上完整的描述和统计数据。

In [10]:
# Recompute the summary after the row drop and widen the display so that
# the full column descriptions are readable.
pd.options.display.max_rows = 400
pd.options.display.max_colwidth = 400
t = missing_values_table(df=app_tr, table_name="application_{train|test}.csv")
t




Total 57 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,COMMONAREA_MEDI,"Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor",normalized,212870,69.9,91661.0,0.044544,0.076043,0.0,0.0079,0.0208,0.0513,1.0
1,COMMONAREA_AVG,"Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor",normalized,212870,69.9,91661.0,0.044564,0.075932,0.0,0.0078,0.0211,0.0514,1.0
2,COMMONAREA_MODE,"Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor",normalized,212870,69.9,91661.0,0.04251,0.074343,0.0,0.0072,0.019,0.0489,1.0
3,NONLIVINGAPARTMENTS_MODE,"Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor",normalized,211544,69.5,92987.0,0.008061,0.046265,0.0,0.0,0.0,0.0039,1.0
4,NONLIVINGAPARTMENTS_AVG,"Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor",normalized,211544,69.5,92987.0,0.008795,0.047732,0.0,0.0,0.0,0.0039,1.0
5,NONLIVINGAPARTMENTS_MEDI,"Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor",normalized,211544,69.5,92987.0,0.008637,0.047412,0.0,0.0,0.0,0.0039,1.0
6,FONDKAPREMONT_MODE,"Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor",normalized,208352,68.4,,,,,,,,
7,LIVINGAPARTMENTS_MODE,"Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor",normalized,208259,68.4,96272.0,0.105537,0.097673,0.0,0.0542,0.0762,0.1313,1.0
8,LIVINGAPARTMENTS_AVG,"Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor",normalized,208259,68.4,96272.0,0.100662,0.092368,0.0,0.0504,0.0756,0.121,1.0
9,LIVINGAPARTMENTS_MEDI,"Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor",normalized,208259,68.4,96272.0,0.101845,0.093431,0.0,0.0513,0.0761,0.1231,1.0


可以发现还有一些存在大量缺失值的分类数据。对于这些数据，我们把缺失值单独作为一个类别（"MyNull"），这样在转化为数值数据时会多出一个类别。
另外，这里面有大量缺失的房产数据，可以把每个客户在这些房产特征上的缺失个数汇总为一个新特征（MY_HOUSE_OWN），用来表明其拥有房产的可信度。



In [11]:
# Fill missing values of categorical columns with an explicit "MyNull" category

# Pick the non-numeric columns by dtype. This matches the columns that
# describe() leaves without statistics, but does not misfire on a numeric
# column whose description happens to be missing.
non_numeric_cols = app_tr.select_dtypes(exclude="number").columns
rows = t.loc[t["Row"].isin(non_numeric_cols), "Row"]

for col in rows:
    app_tr[str(col)] = app_tr[str(col)].fillna(value="MyNull")

# Show the first filled column as a sanity check.
app_tr[str(rows.iloc[0])]

0         reg oper account
1         reg oper account
2                   MyNull
3                   MyNull
4                   MyNull
                ...       
304526    reg oper account
304527    reg oper account
304528    reg oper account
304529              MyNull
304530              MyNull
Name: FONDKAPREMONT_MODE, Length: 304531, dtype: object

In [12]:
# Recompute the summary after the categorical fill, back in compact display.
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 40
t = missing_values_table(df=app_tr, table_name="application_{train|test}.csv")
t

Total 52 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,COMMONAREA_AVG,Normalized information about buildin...,normalized,212870,69.9,91661.0,0.044564,0.075932,0.0,0.0078,0.0211,0.0514,1.0
1,COMMONAREA_MEDI,Normalized information about buildin...,normalized,212870,69.9,91661.0,0.044544,0.076043,0.0,0.0079,0.0208,0.0513,1.0
2,COMMONAREA_MODE,Normalized information about buildin...,normalized,212870,69.9,91661.0,0.042510,0.074343,0.0,0.0072,0.0190,0.0489,1.0
3,NONLIVINGAPARTMENTS_MODE,Normalized information about buildin...,normalized,211544,69.5,92987.0,0.008061,0.046265,0.0,0.0000,0.0000,0.0039,1.0
4,NONLIVINGAPARTMENTS_AVG,Normalized information about buildin...,normalized,211544,69.5,92987.0,0.008795,0.047732,0.0,0.0000,0.0000,0.0039,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
47,AMT_REQ_CREDIT_BUREAU_DAY,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.006981,0.110358,0.0,0.0000,0.0000,0.0000,9.0
48,AMT_REQ_CREDIT_BUREAU_WEEK,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.034484,0.204615,0.0,0.0000,0.0000,0.0000,8.0
49,AMT_REQ_CREDIT_BUREAU_MON,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.267782,0.915330,0.0,0.0000,0.0000,0.0000,27.0
50,AMT_REQ_CREDIT_BUREAU_QRT,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.266127,0.795735,0.0,0.0000,0.0000,0.0000,261.0


In [13]:
# Re-interpret the housing features: remember which columns are to be dropped
# (they are removed later together with the test table) and fill their nulls
# with a placeholder value in the meantime.
print(app_tr.shape)
house_description = "Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor"
t_house = t[t["Description"].eq(house_description)]
app_tr_drop_list = t_house["Row"].tolist()
# Per-row count of housing fields that are missing.
temp = app_tr[app_tr_drop_list].isnull().sum(axis=1)
app_tr["MY_HOUSE_OWN"] = temp
for house_col in app_tr_drop_list:
    app_tr[house_col] = app_tr[house_col].fillna(0)
app_tr

(304531, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,MY_HOUSE_OWN
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,43
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,,,,,,,43
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,43
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
304526,456251,0,Cash loans,M,N,N,0,157500.0,254700.0,27558.0,...,0,0,0,,,,,,,0
304527,456252,0,Cash loans,F,N,Y,0,72000.0,269550.0,12001.5,...,0,0,0,,,,,,,0
304528,456253,0,Cash loans,F,N,Y,0,153000.0,677664.0,29979.0,...,0,0,0,1.0,0.0,0.0,1.0,0.0,1.0,3
304529,456254,1,Cash loans,F,N,Y,0,171000.0,370107.0,20205.0,...,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,27


In [14]:
# Recompute the summary after the housing columns have been filled.
t = missing_values_table(df=app_tr, table_name="application_{train|test}.csv")
t

Total 9 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,OWN_CAR_AGE,Age of client's car,,200912,66.0,103619.0,12.070682,11.935821,0.0,5.0,9.0,15.0,91.0
1,EXT_SOURCE_1,Normalized score from external data ...,normalized,171652,56.4,132879.0,0.501986,0.211049,0.014568,0.333967,0.505819,0.674901,0.962693
2,EXT_SOURCE_3,Normalized score from external data ...,normalized,60251,19.8,244280.0,0.510764,0.194843,0.000527,0.37065,0.535276,0.669057,0.89601
3,AMT_REQ_CREDIT_BUREAU_YEAR,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,1.905904,1.869645,0.0,0.0,1.0,3.0,25.0
4,AMT_REQ_CREDIT_BUREAU_QRT,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.266127,0.795735,0.0,0.0,0.0,0.0,261.0
5,AMT_REQ_CREDIT_BUREAU_MON,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.267782,0.91533,0.0,0.0,0.0,0.0,27.0
6,AMT_REQ_CREDIT_BUREAU_WEEK,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.034484,0.204615,0.0,0.0,0.0,0.0,8.0
7,AMT_REQ_CREDIT_BUREAU_DAY,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.006981,0.110358,0.0,0.0,0.0,0.0,9.0
8,AMT_REQ_CREDIT_BUREAU_HOUR,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.006385,0.083786,0.0,0.0,0.0,0.0,4.0


目前，还有9列缺失值，可以发现OWN_CAR_AGE是车辆拥有年限数据，缺失值可以置零   
可以看到EXT_SOURCE_1和EXT_SOURCE_3比较平滑，直接用平均值代替  
AMT_REQ_CREDIT_BUREAU_* 这一组列用0填补，缺失值可能说明这些客户并没有对应的enquiries

In [15]:
# Handle OWN_CAR_AGE: missing values are set to 0

app_tr = app_tr.fillna(value={"OWN_CAR_AGE": 0})
app_tr["OWN_CAR_AGE"]


0          0.0
1          0.0
2         26.0
3          0.0
4          0.0
          ... 
304526     0.0
304527     0.0
304528     0.0
304529     0.0
304530     0.0
Name: OWN_CAR_AGE, Length: 304531, dtype: float64

In [16]:
# Handle EXT_SOURCE_1 and EXT_SOURCE_3: fill with the column mean

# Assign the filled column back instead of calling fillna(inplace=True) on
# the selected column: that chained form raises a FutureWarning in pandas 2.x
# and does not modify the frame once Copy-on-Write is active.
tr_1_mean_val = app_tr["EXT_SOURCE_1"].mean()
app_tr["EXT_SOURCE_1"] = app_tr["EXT_SOURCE_1"].fillna(tr_1_mean_val)
# Despite its name, tr_2_mean_val holds the EXT_SOURCE_3 mean; the name is
# kept because the test-table cell reuses it.
tr_2_mean_val = app_tr["EXT_SOURCE_3"].mean()
app_tr["EXT_SOURCE_3"] = app_tr["EXT_SOURCE_3"].fillna(tr_2_mean_val)


In [17]:
# Recompute the summary; the columns still listed are filled in the next cell.
t = missing_values_table(df=app_tr, table_name="application_{train|test}.csv")
t

Total 6 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,AMT_REQ_CREDIT_BUREAU_YEAR,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,1.905904,1.869645,0.0,0.0,1.0,3.0,25.0
1,AMT_REQ_CREDIT_BUREAU_QRT,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.266127,0.795735,0.0,0.0,0.0,0.0,261.0
2,AMT_REQ_CREDIT_BUREAU_MON,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.267782,0.91533,0.0,0.0,0.0,0.0,27.0
3,AMT_REQ_CREDIT_BUREAU_WEEK,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.034484,0.204615,0.0,0.0,0.0,0.0,8.0
4,AMT_REQ_CREDIT_BUREAU_DAY,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.006981,0.110358,0.0,0.0,0.0,0.0,9.0
5,AMT_REQ_CREDIT_BUREAU_HOUR,Number of enquiries to Credit Bureau...,,41108,13.5,263423.0,0.006385,0.083786,0.0,0.0,0.0,0.0,4.0


In [18]:
# Handle the AMT_REQ_CREDIT_BUREAU_* columns: fill missing values with 0

# Every column still listed in t is filled. The result is assigned back
# because fillna(inplace=True) on a selected column is chained in-place
# mutation, which stops working under pandas Copy-on-Write.
for s in t["Row"].tolist():
    app_tr[s] = app_tr[s].fillna(0)

In [19]:
# Final check: the summary should now report no columns with missing values.
t = missing_values_table(df=app_tr, table_name="application_{train|test}.csv")
t

Total 0 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max


至此，主表缺失值处理完毕。
此外，也对app_te表进行处理


In [20]:
# Apply the train-table treatment to the test table.
app_te = pd.read_csv(app_te_path)

# Step 1: columns with few missing values. Instead of dropping test rows,
# impute them from the cleaned training table.
t = missing_values_table(app_te, "application_{train|test}.csv")
t_small = t.loc[t["% of Total Values"] < 3]
# ">=" keeps t_large the exact complement of t_small.
t_large = t.loc[t["% of Total Values"] >= 3]
for c in t_small["Row"].tolist():
    if app_tr[c].dtype == object:
        # Most frequent training category. The builtin max() would instead
        # return the lexicographically largest string.
        fill_value = app_tr[c].mode().iloc[0]
    else:
        fill_value = app_tr[c].mean()
    # Assign back rather than fillna(inplace=True) on a selected column,
    # which is chained in-place mutation and a no-op under Copy-on-Write.
    app_te[c] = app_te[c].fillna(fill_value)

# Step 2: remaining categorical columns get the explicit "MyNull" category.
t = missing_values_table(app_te, "application_{train|test}.csv")
non_numeric_cols = app_te.select_dtypes(exclude="number").columns
rows = t.loc[t["Row"].isin(non_numeric_cols), "Row"]
for col in rows:
    app_te[str(col)] = app_te[str(col)].fillna(value="MyNull")

# Step 3: housing features. Count the missing fields per row, remember the
# columns for the later drop, and fill their nulls with a placeholder.
t = missing_values_table(app_te, "application_{train|test}.csv")
t_house = t.loc[t["Description"] ==
                "Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor"]
temp = app_te.loc[:, t_house["Row"].tolist()].isnull().sum(axis=1)
app_te["MY_HOUSE_OWN"] = temp
app_te_drop_list = t_house["Row"].tolist()
for col in app_te_drop_list:
    app_te[col] = app_te[col].fillna(value=0)

# Show the imputed low-missing-rate columns.
t = missing_values_table(app_te, "application_{train|test}.csv")
app_te.loc[:,t_small["Row"].tolist()]


Total 64 columns missing values
Total 57 columns missing values
Total 52 columns missing values
Total 9 columns missing values


Unnamed: 0,NAME_TYPE_SUITE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,AMT_ANNUITY,EXT_SOURCE_2
0,Unaccompanied,0.0,0.0,0.0,0.0,20560.5,0.789654
1,Unaccompanied,0.0,0.0,0.0,0.0,17370.0,0.291656
2,Unaccompanied,0.0,0.0,0.0,0.0,69777.0,0.699787
3,Unaccompanied,0.0,0.0,0.0,0.0,49018.5,0.509677
4,Unaccompanied,0.0,0.0,0.0,0.0,32067.0,0.425687
...,...,...,...,...,...,...,...
48739,Unaccompanied,0.0,1.0,1.0,0.0,17473.5,0.648575
48740,Unaccompanied,0.0,2.0,2.0,0.0,31909.5,0.684596
48741,Unaccompanied,0.0,0.0,0.0,0.0,33205.5,0.632770
48742,Family,0.0,0.0,0.0,0.0,25128.0,0.445701


In [21]:
# The columns still missing match the train table, so the same fills apply.

app_te["OWN_CAR_AGE"] = app_te["OWN_CAR_AGE"].fillna(value=0)
# Use the means computed on the training table, and assign back instead of
# fillna(inplace=True) on a selected column (a no-op under Copy-on-Write).
app_te["EXT_SOURCE_1"] = app_te["EXT_SOURCE_1"].fillna(tr_1_mean_val)
app_te["EXT_SOURCE_3"] = app_te["EXT_SOURCE_3"].fillna(tr_2_mean_val)
# Fill every column that is still listed with 0.
t=missing_values_table(app_te,"application_{train|test}.csv")
for s in t["Row"].tolist():
    app_te[s] = app_te[s].fillna(0)
t=missing_values_table(app_te,"application_{train|test}.csv")
t

Total 6 columns missing values
Total 0 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max


In [22]:
# Finally, drop the columns that were marked for removal

# Union of the housing columns collected for the train and test tables.
house_cols_to_drop = list(set(app_tr_drop_list+app_te_drop_list))
for frame in (app_tr, app_te):
    frame.drop(columns=house_cols_to_drop, inplace=True)
print(app_tr.shape)
print(app_te.shape)

(304531, 80)
(48744, 79)


In [22]:
# Save the results

# os.path.join picks the platform's separator instead of a hard-coded "\\".
app_tr.to_csv(os.path.join(result_dir, "application_train.csv"), index=False)
app_te.to_csv(os.path.join(result_dir, "application_test.csv"), index=False)


### 2. **previous_application.csv**

In [23]:
# Load the previous-application table and show its basic summary statistics
pre = pd.read_csv(filepath_or_buffer=pre_path)
pre.describe()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,HOUR_APPR_PROCESS_START,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,...,RATE_INTEREST_PRIVILEGED,DAYS_DECISION,SELLERPLACE_AREA,CNT_PAYMENT,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
count,1670214.0,1670214.0,1297979.0,1670214.0,1670213.0,774370.0,1284699.0,1670214.0,1670214.0,774370.0,...,5951.0,1670214.0,1670214.0,1297984.0,997149.0,997149.0,997149.0,997149.0,997149.0,997149.0
mean,1923089.0,278357.2,15955.12,175233.9,196114.0,6697.402,227847.3,12.48418,0.9964675,0.079637,...,0.773503,-880.6797,313.9511,16.05408,342209.855039,13826.269337,33767.774054,76582.403064,81992.343838,0.33257
std,532598.0,102814.8,14782.14,292779.8,318574.6,20921.5,315396.6,3.334028,0.05932963,0.107823,...,0.100879,779.0997,7127.443,14.56729,88916.115834,72444.869708,106857.034789,149647.415123,153303.516729,0.471134
min,1000001.0,100001.0,0.0,0.0,0.0,-0.9,0.0,0.0,0.0,-1.5e-05,...,0.37315,-2922.0,-1.0,0.0,-2922.0,-2892.0,-2801.0,-2889.0,-2874.0,0.0
25%,1461857.0,189329.0,6321.78,18720.0,24160.5,0.0,50841.0,10.0,1.0,0.0,...,0.715645,-1300.0,-1.0,6.0,365243.0,-1628.0,-1242.0,-1314.0,-1270.0,0.0
50%,1923110.0,278714.5,11250.0,71046.0,80541.0,1638.0,112320.0,12.0,1.0,0.051605,...,0.835095,-581.0,3.0,12.0,365243.0,-831.0,-361.0,-537.0,-499.0,0.0
75%,2384280.0,367514.0,20658.42,180360.0,216418.5,7740.0,234000.0,15.0,1.0,0.108909,...,0.852537,-280.0,82.0,24.0,365243.0,-411.0,129.0,-74.0,-44.0,1.0
max,2845382.0,456255.0,418058.1,6905160.0,6905160.0,3060045.0,6905160.0,23.0,1.0,1.0,...,1.0,-1.0,4000000.0,84.0,365243.0,365243.0,365243.0,365243.0,365243.0,1.0


In [24]:
# Missing-value analysis of the previous-application table

# Compact display: at most 20 rows, descriptions truncated to 40 characters.
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 40
t = missing_values_table(df=pre, table_name="previous_application.csv")
t

Total 16 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,RATE_INTEREST_PRIVILEGED,Interest rate normalized on previous...,normalized,1664263,99.6,5951.0,0.773503,0.100879,0.37315,0.715645,0.835095,0.852537,1.0
1,RATE_INTEREST_PRIMARY,Interest rate normalized on previous...,normalized,1664263,99.6,5951.0,0.188357,0.087671,0.034781,0.160716,0.189122,0.19333,1.0
2,AMT_DOWN_PAYMENT,Down payment on the previous applica...,,895844,53.6,774370.0,6697.402139,20921.49541,-0.9,0.0,1638.0,7740.0,3060045.0
3,RATE_DOWN_PAYMENT,Down payment rate normalized on prev...,normalized,895844,53.6,774370.0,0.079637,0.107823,-1.5e-05,0.0,0.051605,0.108909,1.0
4,NAME_TYPE_SUITE,Who accompanied client when applying...,,820405,49.1,,,,,,,,
5,NFLAG_INSURED_ON_APPROVAL,Did the client requested insurance d...,,673065,40.3,997149.0,0.33257,0.471134,0.0,0.0,0.0,1.0,1.0
6,DAYS_TERMINATION,Relative to application date of curr...,time only relative to the application,673065,40.3,997149.0,81992.343838,153303.516729,-2874.0,-1270.0,-499.0,-44.0,365243.0
7,DAYS_LAST_DUE,Relative to application date of curr...,time only relative to the application,673065,40.3,997149.0,76582.403064,149647.415123,-2889.0,-1314.0,-537.0,-74.0,365243.0
8,DAYS_LAST_DUE_1ST_VERSION,Relative to application date of curr...,time only relative to the application,673065,40.3,997149.0,33767.774054,106857.034789,-2801.0,-1242.0,-361.0,129.0,365243.0
9,DAYS_FIRST_DUE,Relative to application date of curr...,time only relative to the application,673065,40.3,997149.0,13826.269337,72444.869708,-2892.0,-1628.0,-831.0,-411.0,365243.0


In [25]:
# Columns with only a small share of missing values (below 3%)

t_small = t[t["% of Total Values"].lt(3)]
t_small


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
14,PRODUCT_COMBINATION,Detailed product combination of the ...,,346,0.0,,,,,,,,
15,AMT_CREDIT,Final credit amount on the previous ...,,1,0.0,1670213.0,196114.021218,318574.616546,0.0,24160.5,80541.0,216418.5,6905160.0


In [26]:
# Columns with a large share of missing values (3% or more)

# ">=" makes this the exact complement of t_small (< 3); with a strict ">"
# a column at exactly 3.0% would appear in neither view.
t_large = t.loc[t["% of Total Values"] >= 3]
t_large

Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,RATE_INTEREST_PRIVILEGED,Interest rate normalized on previous...,normalized,1664263,99.6,5951.0,0.773503,0.100879,0.37315,0.715645,0.835095,0.852537,1.0
1,RATE_INTEREST_PRIMARY,Interest rate normalized on previous...,normalized,1664263,99.6,5951.0,0.188357,0.087671,0.034781,0.160716,0.189122,0.19333,1.0
2,AMT_DOWN_PAYMENT,Down payment on the previous applica...,,895844,53.6,774370.0,6697.402139,20921.49541,-0.9,0.0,1638.0,7740.0,3060045.0
3,RATE_DOWN_PAYMENT,Down payment rate normalized on prev...,normalized,895844,53.6,774370.0,0.079637,0.107823,-1.5e-05,0.0,0.051605,0.108909,1.0
4,NAME_TYPE_SUITE,Who accompanied client when applying...,,820405,49.1,,,,,,,,
5,NFLAG_INSURED_ON_APPROVAL,Did the client requested insurance d...,,673065,40.3,997149.0,0.33257,0.471134,0.0,0.0,0.0,1.0,1.0
6,DAYS_TERMINATION,Relative to application date of curr...,time only relative to the application,673065,40.3,997149.0,81992.343838,153303.516729,-2874.0,-1270.0,-499.0,-44.0,365243.0
7,DAYS_LAST_DUE,Relative to application date of curr...,time only relative to the application,673065,40.3,997149.0,76582.403064,149647.415123,-2889.0,-1314.0,-537.0,-74.0,365243.0
8,DAYS_LAST_DUE_1ST_VERSION,Relative to application date of curr...,time only relative to the application,673065,40.3,997149.0,33767.774054,106857.034789,-2801.0,-1242.0,-361.0,129.0,365243.0
9,DAYS_FIRST_DUE,Relative to application date of curr...,time only relative to the application,673065,40.3,997149.0,13826.269337,72444.869708,-2892.0,-1628.0,-831.0,-411.0,365243.0


可以发现，各列的缺失值数量差距很大。对于缺失比例小于百分之三的列（即上面的 t_small），我们采取删去含缺失值的对应行的措施。

In [27]:
# Drop every row that has a missing value in any of the columns listed in
# t_small["Row"], then renumber the index from 0.

pre = pre.dropna(subset=list(t_small["Row"]), how="any").reset_index(drop=True)
pre

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,...,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.430,17145.0,17145.0,0.0,17145.0,SATURDAY,15,...,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,...,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,...,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,...,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,...,XNA,24.0,high,Cash Street: high,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1669862,2300464,352015,Consumer loans,14704.290,267295.5,311400.0,0.0,267295.5,WEDNESDAY,12,...,Furniture,30.0,low_normal,POS industry with interest,365243.0,-508.0,362.0,-358.0,-351.0,0.0
1669863,2357031,334635,Consumer loans,6622.020,87750.0,64291.5,29250.0,87750.0,TUESDAY,15,...,Furniture,12.0,middle,POS industry with interest,365243.0,-1604.0,-1274.0,-1304.0,-1297.0,0.0
1669864,2659632,249544,Consumer loans,11520.855,105237.0,102523.5,10525.5,105237.0,MONDAY,12,...,Consumer electronics,10.0,low_normal,POS household with interest,365243.0,-1457.0,-1187.0,-1187.0,-1181.0,0.0
1669865,2785582,400317,Cash loans,18821.520,180000.0,191880.0,,180000.0,WEDNESDAY,9,...,XNA,12.0,low_normal,Cash X-Sell: low,365243.0,-1155.0,-825.0,-825.0,-817.0,1.0


对于缺失值比例较大的列，我们逐一进行研究处理。首先，查看每个特征的缺失情况及其统计数据。

In [28]:
# Recompute the missing-value summary for `pre`, then raise the pandas display
# limits (max rows 400, max column width 400) before showing the table.
t = missing_values_table(pre, "previous_application.csv")
pd.set_option("display.max_rows", 400)
pd.set_option('max_colwidth',400)
t

Total 14 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,RATE_INTEREST_PRIVILEGED,Interest rate normalized on previous credit,normalized,1663916,99.6,5951.0,0.773503,0.100879,0.37315,0.715645,0.835095,0.852537,1.0
1,RATE_INTEREST_PRIMARY,Interest rate normalized on previous credit,normalized,1663916,99.6,5951.0,0.188357,0.087671,0.034781,0.160716,0.189122,0.19333,1.0
2,RATE_DOWN_PAYMENT,Down payment rate normalized on previous credit,normalized,895497,53.6,774370.0,0.079637,0.107823,-1.5e-05,0.0,0.051605,0.108909,1.0
3,AMT_DOWN_PAYMENT,Down payment on the previous application,,895497,53.6,774370.0,6697.402139,20921.49541,-0.9,0.0,1638.0,7740.0,3060045.0
4,NAME_TYPE_SUITE,Who accompanied client when applying for the previous application,,820058,49.1,,,,,,,,
5,NFLAG_INSURED_ON_APPROVAL,Did the client requested insurance during the previous application,,672718,40.3,997149.0,0.33257,0.471134,0.0,0.0,0.0,1.0,1.0
6,DAYS_FIRST_DRAWING,Relative to application date of current application when was the first disbursement of the previous application,time only relative to the application,672718,40.3,997149.0,342209.855039,88916.115834,-2922.0,365243.0,365243.0,365243.0,365243.0
7,DAYS_FIRST_DUE,Relative to application date of current application when was the first due supposed to be of the previous application,time only relative to the application,672718,40.3,997149.0,13826.269337,72444.869708,-2892.0,-1628.0,-831.0,-411.0,365243.0
8,DAYS_LAST_DUE_1ST_VERSION,Relative to application date of current application when was the first due of the previous application,time only relative to the application,672718,40.3,997149.0,33767.774054,106857.034789,-2801.0,-1242.0,-361.0,129.0,365243.0
9,DAYS_LAST_DUE,Relative to application date of current application when was the last due date of the previous application,time only relative to the application,672718,40.3,997149.0,76582.403064,149647.415123,-2889.0,-1314.0,-537.0,-74.0,365243.0


可以发现其中NAME_TYPE_SUITE为分类数据，表示之前申请时有谁陪同，缺失值可以置为Unaccompanied。

In [32]:
# Fill the categorical gaps: a missing NAME_TYPE_SUITE becomes "Unaccompanied".

name_type_suite = pre["NAME_TYPE_SUITE"]
pre["NAME_TYPE_SUITE"] = name_type_suite.fillna(value="Unaccompanied")

In [33]:
# Show the first 10 values of NAME_TYPE_SUITE after the fill.
pre["NAME_TYPE_SUITE"].head(10)

0      Unaccompanied
1      Unaccompanied
2    Spouse, partner
3      Unaccompanied
4      Unaccompanied
5             Family
6      Unaccompanied
7      Unaccompanied
8      Unaccompanied
9      Unaccompanied
Name: NAME_TYPE_SUITE, dtype: object

In [34]:
# Recompute the missing-value summary for `pre` and set the pandas display
# limits to max rows 20 and max column width 40 before showing it.
t=missing_values_table(pre,"previous_application.csv")
pd.set_option("display.max_rows", 20)
pd.set_option('max_colwidth',40)
t

Total 13 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,RATE_INTEREST_PRIMARY,Interest rate normalized on previous...,normalized,1663916,99.6,5951.0,0.188357,0.087671,0.034781,0.160716,0.189122,0.19333,1.0
1,RATE_INTEREST_PRIVILEGED,Interest rate normalized on previous...,normalized,1663916,99.6,5951.0,0.773503,0.100879,0.37315,0.715645,0.835095,0.852537,1.0
2,AMT_DOWN_PAYMENT,Down payment on the previous applica...,,895497,53.6,774370.0,6697.402139,20921.49541,-0.9,0.0,1638.0,7740.0,3060045.0
3,RATE_DOWN_PAYMENT,Down payment rate normalized on prev...,normalized,895497,53.6,774370.0,0.079637,0.107823,-1.5e-05,0.0,0.051605,0.108909,1.0
4,NFLAG_INSURED_ON_APPROVAL,Did the client requested insurance d...,,672718,40.3,997149.0,0.33257,0.471134,0.0,0.0,0.0,1.0,1.0
5,DAYS_TERMINATION,Relative to application date of curr...,time only relative to the application,672718,40.3,997149.0,81992.343838,153303.516729,-2874.0,-1270.0,-499.0,-44.0,365243.0
6,DAYS_LAST_DUE,Relative to application date of curr...,time only relative to the application,672718,40.3,997149.0,76582.403064,149647.415123,-2889.0,-1314.0,-537.0,-74.0,365243.0
7,DAYS_LAST_DUE_1ST_VERSION,Relative to application date of curr...,time only relative to the application,672718,40.3,997149.0,33767.774054,106857.034789,-2801.0,-1242.0,-361.0,129.0,365243.0
8,DAYS_FIRST_DUE,Relative to application date of curr...,time only relative to the application,672718,40.3,997149.0,13826.269337,72444.869708,-2892.0,-1628.0,-831.0,-411.0,365243.0
9,DAYS_FIRST_DRAWING,Relative to application date of curr...,time only relative to the application,672718,40.3,997149.0,342209.855039,88916.115834,-2922.0,365243.0,365243.0,365243.0,365243.0


其中，RATE_INTEREST_PRIMARY和RATE_INTEREST_PRIVILEGED比较平滑，缺失值可以用平均值代替；
其他数据为上次申请相关数据，缺失值可能代表这是第一次申请，可以置0。

In [40]:
# Impute RATE_INTEREST_PRIMARY and RATE_INTEREST_PRIVILEGED with their column means.
#
# The filled Series is assigned back to the column rather than calling
# `pre[col].fillna(..., inplace=True)`: that form mutates an intermediate Series
# (chained assignment), which emits a warning on recent pandas 2.x and leaves
# `pre` unchanged when Copy-on-Write is active (the default from pandas 3.0).

pre_1_mean = pre["RATE_INTEREST_PRIMARY"].mean()
pre["RATE_INTEREST_PRIMARY"] = pre["RATE_INTEREST_PRIMARY"].fillna(pre_1_mean)

pre_2_mean = pre["RATE_INTEREST_PRIVILEGED"].mean()
pre["RATE_INTEREST_PRIVILEGED"] = pre["RATE_INTEREST_PRIVILEGED"].fillna(pre_2_mean)



0.7735025434991628

In [44]:
# Zero-fill the remaining columns, one column at a time.

for column in (
    "AMT_DOWN_PAYMENT",
    "RATE_DOWN_PAYMENT",
    "NFLAG_INSURED_ON_APPROVAL",
    "DAYS_TERMINATION",
    "DAYS_LAST_DUE",
    "DAYS_LAST_DUE_1ST_VERSION",
    "DAYS_FIRST_DUE",
    "DAYS_FIRST_DRAWING",
    "AMT_GOODS_PRICE",
    "AMT_ANNUITY",
    "CNT_PAYMENT",
):
    pre[column] = pre[column].fillna(value=0)

In [46]:
# Recompute the missing-value summary for `pre` to check what is still missing.
t = missing_values_table(pre, "previous_application.csv")
t

Total 0 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max


In [47]:
# Save the cleaned table without the index column.
# os.path.join replaces the hand-concatenated "\\" so the file name is joined to
# result_dir with the separator of the running platform (same path on Windows).

pre.to_csv(os.path.join(result_dir, "previous_application.csv"), index=False)


### 3. **POS_CASH_balance.csv**

In [48]:
# Load POS_CASH_balance.csv from pos_path and show its summary statistics.
pos = pd.read_csv(pos_path)
pos.describe()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,SK_DPD,SK_DPD_DEF
count,10001360.0,10001360.0,10001360.0,9975287.0,9975271.0,10001360.0,10001360.0
mean,1903217.0,278403.9,-35.01259,17.08965,10.48384,11.60693,0.6544684
std,535846.5,102763.7,26.06657,11.99506,11.10906,132.714,32.76249
min,1000001.0,100001.0,-96.0,1.0,0.0,0.0,0.0
25%,1434405.0,189550.0,-54.0,10.0,3.0,0.0,0.0
50%,1896565.0,278654.0,-28.0,12.0,7.0,0.0,0.0
75%,2368963.0,367429.0,-13.0,24.0,14.0,0.0,0.0
max,2843499.0,456255.0,-1.0,92.0,85.0,4231.0,3595.0


In [49]:
# Missing-value analysis for `pos`; display limits are set to max rows 20 and
# max column width 40 before the summary table is shown.

t = missing_values_table(pos, "POS_CASH_balance.csv")
pd.set_option("display.max_rows", 20)
pd.set_option('max_colwidth',40)
t

Total 2 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,CNT_INSTALMENT_FUTURE,Installments left to pay on the prev...,,26087,0.3,9975271.0,10.48384,11.109058,0.0,3.0,7.0,14.0,85.0
1,CNT_INSTALMENT,Term of previous credit (can change ...,,26071,0.3,9975287.0,17.08965,11.995056,1.0,10.0,12.0,24.0,92.0


可以发现，两列的缺失值比例均小于百分之一，我们采取删去对应行的措施。

In [51]:
# Drop every row that has a missing value in any column listed in t["Row"],
# then renumber the index from 0.

pos = pos.dropna(subset=list(t["Row"]), how="any").reset_index(drop=True)
pos

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0
...,...,...,...,...,...,...,...,...
9975169,2448283,226558,-20,6.0,0.0,Active,843,0
9975170,1717234,141565,-19,12.0,0.0,Active,602,0
9975171,1283126,315695,-21,10.0,0.0,Active,609,0
9975172,1082516,450255,-22,12.0,0.0,Active,614,0


In [53]:
# Recompute the missing-value summary for `pos` to check what is still missing.
t = missing_values_table(pos, "POS_CASH_balance.csv")
t

Total 0 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max


In [54]:
# Save the cleaned table without the index column.
# os.path.join replaces the hand-concatenated "\\" so the file name is joined to
# result_dir with the separator of the running platform (same path on Windows).

pos.to_csv(os.path.join(result_dir, "POS_CASH_balance.csv"), index=False)


### 4. **credit_card_balance.csv**

In [6]:
# Load credit_card_balance.csv from cre_path and show its summary statistics.

cre= pd.read_csv(cre_path)
cre.describe()


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,...,AMT_RECEIVABLE_PRINCIPAL,AMT_RECIVABLE,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,SK_DPD,SK_DPD_DEF
count,3840312.0,3840312.0,3840312.0,3840312.0,3840312.0,3090496.0,3840312.0,3090496.0,3090496.0,3535076.0,...,3840312.0,3840312.0,3840312.0,3090496.0,3840312.0,3090496.0,3090496.0,3535076.0,3840312.0,3840312.0
mean,1904504.0,278324.2,-34.52192,58300.16,153808.0,5961.325,7433.388,288.1696,2968.805,3540.204,...,55965.88,58088.81,58098.29,0.309449,0.7031439,0.004812496,0.5594791,20.82508,9.283667,0.331622
std,536469.5,102704.5,26.66775,106307.0,165145.7,28225.69,33846.08,8201.989,20796.89,5600.154,...,102533.6,105965.4,105971.8,1.100401,3.190347,0.08263861,3.240649,20.05149,97.5157,21.47923
min,1000018.0,100006.0,-96.0,-420250.2,0.0,-6827.31,-6211.62,0.0,0.0,0.0,...,-423305.8,-420250.2,-420250.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1434385.0,189517.0,-55.0,0.0,45000.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
50%,1897122.0,278396.0,-28.0,0.0,112500.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0
75%,2369328.0,367580.0,-11.0,89046.69,180000.0,0.0,0.0,0.0,0.0,6633.911,...,85359.24,88899.49,88914.51,0.0,0.0,0.0,0.0,32.0,0.0,0.0
max,2843496.0,456250.0,-1.0,1505902.0,1350000.0,2115000.0,2287098.0,1529847.0,2239274.0,202882.0,...,1472317.0,1493338.0,1493338.0,51.0,165.0,12.0,165.0,120.0,3260.0,3260.0


In [7]:
# Missing-value analysis for `cre`; display limits are set to max rows 20 and
# max column width 40 before the summary table is shown.

t = missing_values_table(cre, "credit_card_balance.csv")
pd.set_option("display.max_rows", 20)
pd.set_option('max_colwidth',40)
t


Total 9 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,AMT_PAYMENT_CURRENT,How much did the client pay during t...,,767988,20.0,3072324.0,10280.537702,36078.084953,0.0,152.37,2702.7,9000.0,4289207.445
1,AMT_DRAWINGS_ATM_CURRENT,Amount drawing at ATM during the mon...,,749816,19.5,3090496.0,5961.324822,28225.688579,-6827.31,0.0,0.0,0.0,2115000.0
2,CNT_DRAWINGS_POS_CURRENT,Number of drawings for goods during ...,,749816,19.5,3090496.0,0.559479,3.240649,0.0,0.0,0.0,0.0,165.0
3,AMT_DRAWINGS_OTHER_CURRENT,Amount of other drawings during the ...,,749816,19.5,3090496.0,288.169582,8201.989345,0.0,0.0,0.0,0.0,1529847.0
4,AMT_DRAWINGS_POS_CURRENT,Amount drawing or buying goods durin...,,749816,19.5,3090496.0,2968.804848,20796.887047,0.0,0.0,0.0,0.0,2239274.16
5,CNT_DRAWINGS_OTHER_CURRENT,Number of other drawings during this...,,749816,19.5,3090496.0,0.004812,0.082639,0.0,0.0,0.0,0.0,12.0
6,CNT_DRAWINGS_ATM_CURRENT,Number of drawings at ATM during thi...,,749816,19.5,3090496.0,0.309449,1.100401,0.0,0.0,0.0,0.0,51.0
7,CNT_INSTALMENT_MATURE_CUM,Number of paid installments on the p...,,305236,7.9,3535076.0,20.825084,20.051494,0.0,4.0,15.0,32.0,120.0
8,AMT_INST_MIN_REGULARITY,Minimal installment for this month o...,,305236,7.9,3535076.0,3540.204129,5600.154122,0.0,0.0,0.0,6633.91125,202882.005


可以发现，这些属性都和上一次贷款或这个月内的上一次贷款相关，缺失的原因可能是因为这是本月内第一次申请贷款，因此可将缺失值置为0。

In [13]:
# Zero-fill every column listed in the missing-value summary t["Row"].
missing_columns = list(t["Row"])
cre[missing_columns] = cre[missing_columns].fillna(value=0)

In [14]:
# Recompute the missing-value summary for `cre` to check what is still missing.
t = missing_values_table(cre, "credit_card_balance.csv")
pd.set_option("display.max_rows", 20)
pd.set_option('max_colwidth',40)
t

Total 0 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max


In [15]:
# Save the cleaned table without the index column.
# os.path.join replaces the hand-concatenated "\\" so the file name is joined to
# result_dir with the separator of the running platform (same path on Windows).

cre.to_csv(os.path.join(result_dir, "credit_card_balance.csv"), index=False)

### 5. **installments_payments.csv**

In [16]:
# Load installments_payments.csv from ins_path and show its summary statistics.

ins= pd.read_csv(ins_path)
ins.describe()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
count,13605400.0,13605400.0,13605400.0,13605400.0,13605400.0,13602500.0,13605400.0,13602500.0
mean,1903365.0,278444.9,0.8566373,18.8709,-1042.27,-1051.114,17050.91,17238.22
std,536202.9,102718.3,1.035216,26.66407,800.9463,800.5859,50570.25,54735.78
min,1000001.0,100001.0,0.0,1.0,-2922.0,-4921.0,0.0,0.0
25%,1434191.0,189639.0,0.0,4.0,-1654.0,-1662.0,4226.085,3398.265
50%,1896520.0,278685.0,1.0,8.0,-818.0,-827.0,8884.08,8125.515
75%,2369094.0,367530.0,1.0,19.0,-361.0,-370.0,16710.21,16108.42
max,2843499.0,456255.0,178.0,277.0,-1.0,-1.0,3771488.0,3771488.0


In [19]:
# Missing-value analysis for `ins`; display limits are set to max rows 20 and
# max column width 40 before the summary table is shown.

t = missing_values_table(ins, "installments_payments.csv")
pd.set_option("display.max_rows", 20)
pd.set_option('max_colwidth',40)
t

Total 2 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max
0,DAYS_ENTRY_PAYMENT,When was the installments of previou...,time only relative to the application,2905,0.0,13602496.0,-1051.113684,800.585883,-4921.0,-1662.0,-827.0,-370.0,-1.0
1,AMT_PAYMENT,What the client actually paid on pre...,,2905,0.0,13602496.0,17238.22325,54735.783981,0.0,3398.265,8125.515,16108.425,3771487.845


可以发现，两列的缺失值比例均小于百分之一，可以删去对应行。

In [20]:
# Drop every row that has a missing value in any column listed in t["Row"],
# then renumber the index from 0.

ins = ins.dropna(subset=list(t["Row"]), how="any").reset_index(drop=True)
ins

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NUM_INSTALMENT_VERSION,NUM_INSTALMENT_NUMBER,DAYS_INSTALMENT,DAYS_ENTRY_PAYMENT,AMT_INSTALMENT,AMT_PAYMENT
0,1054186,161674,1.0,6,-1180.0,-1187.0,6948.360,6948.360
1,1330831,151639,0.0,34,-2156.0,-2156.0,1716.525,1716.525
2,2085231,193053,2.0,1,-63.0,-63.0,25425.000,25425.000
3,2452527,199697,1.0,3,-2418.0,-2426.0,24350.130,24350.130
4,2714724,167756,1.0,2,-1383.0,-1366.0,2165.040,2160.585
...,...,...,...,...,...,...,...,...
13602491,2006721,442291,1.0,3,-1311.0,-1318.0,2934.225,2934.225
13602492,1126000,428449,0.0,12,-301.0,-302.0,6793.470,6750.000
13602493,1519070,444122,1.0,5,-399.0,-407.0,4363.830,4363.830
13602494,2784672,444977,0.0,4,-157.0,-157.0,373.005,373.005


In [21]:
# Recompute the missing-value summary for `ins` to check what is still missing.
t = missing_values_table(ins, "installments_payments.csv")
pd.set_option("display.max_rows", 20)
pd.set_option('max_colwidth',40)
t

Total 0 columns missing values


Unnamed: 0,Row,Description,Special,Missing Values,% of Total Values,count,mean,std,min,25%,50%,75%,max


In [22]:
# Save the cleaned table without the index column.
# os.path.join replaces the hand-concatenated "\\" so the file name is joined to
# result_dir with the separator of the running platform (same path on Windows).

ins.to_csv(os.path.join(result_dir, "installments_payments.csv"), index=False)