# 信贷申请评分卡_A卡案例

主要项目流程：数据获取、数据清洗、特征工程、模型建立、模型评价、模型优化等

In [1]:
import pandas as pd

# 1.数据获取

In [2]:
# 数据基本描述：信贷申请数据--42535个样本，144个特征

In [3]:
data = pd.read_excel('LoanStats_2018Q3.xlsx')
data.shape

(42535, 144)

In [4]:
# Names of every column (feature) in the raw data, in column order
feature_list = data.columns.tolist()
print(feature_list)

['id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_title', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'issue_d', 'loan_status', 'pymnt_plan', 'url', 'desc', 'purpose', 'title', 'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_d', 'last_pymnt_amnt', 'next_pymnt_d', 'last_credit_pull_d', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'application_type', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m', 'mths

In [5]:
df = data.copy()

In [6]:
df.head()

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_title,...,hardship_payoff_balance_amount,hardship_last_payment_amount,disbursement_method,debt_settlement_flag,debt_settlement_flag_date,settlement_status,settlement_date,settlement_amount,settlement_percentage,settlement_term
0,1,5000,5000,4975.0,36 months,0.1065,162.87,B,B2,,...,,,Cash,N,NaT,,NaT,,,
1,2,2500,2500,2500.0,60 months,0.1527,59.83,C,C4,Ryder,...,,,Cash,N,NaT,,NaT,,,
2,3,2400,2400,2400.0,36 months,0.1596,84.33,C,C5,,...,,,Cash,N,NaT,,NaT,,,
3,4,10000,10000,10000.0,36 months,0.1349,339.31,C,C1,AIR RESOURCES BOARD,...,,,Cash,N,NaT,,NaT,,,
4,5,3000,3000,3000.0,60 months,0.1269,67.79,B,B5,University Medical Group,...,,,Cash,N,NaT,,NaT,,,


# 2.数据清洗

In [7]:
# 2.1 An application scorecard must not use information that only exists after
#     the loan has been granted, so the 18 features listed below are removed.
# NOTE(review): 'next_pymnt_d' is still present in the feature list printed in
#     step 2.7 and looks like post-loan information too - confirm whether it
#     belongs in this list.
del_feature_list = [
    'sub_grade',
    'grade',
    'initial_list_status',
    'out_prncp',
    'out_prncp_inv',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_prncp',
    'total_rec_int',
    'total_rec_late_fee',
    'recoveries',
    'collection_recovery_fee',
    'last_pymnt_d',
    'last_pymnt_amnt',
    'last_credit_pull_d',
    'collections_12_mths_ex_med',
    'policy_code',
    'disbursement_method',
]
print(df.shape)
# drop() already returns a new DataFrame, so `df` itself is left untouched.
df_01 = df.drop(columns=del_feature_list)
df_01.shape

(42535, 144)


(42535, 126)

In [9]:
# 2.2 Remove the fields treated as customer-private information
customer_info = ['emp_title', 'title']
df_02 = df_01.drop(columns=customer_info)
df_02.shape

(42535, 124)

In [10]:
# 2.3 De-duplicate on 'id', then discard rows in which every field is missing
df_03 = (
    df_02
    .drop_duplicates(subset='id')
    .dropna(axis='index', how='all')
)
df_03.shape

(42535, 124)

In [13]:
# 2.4 Drop every field whose share of missing values is strictly above 0.95
null_threshold = df_03.shape[0] * 0.95
null_counts = df_03.isnull().sum()  # number of missing values per column
cols_null = null_counts[null_counts > null_threshold].index.tolist()
print(len(cols_null))
df_04 = df_03.drop(columns=cols_null)
df_04.shape

86


(42535, 38)

In [27]:
# 2.5 Inspect the value distribution of each feature: when one single value
#     accounts for more than 95% of all rows (e.g. an effectively constant
#     column) the feature is dropped. This is a prerequisite for binning.
# value_counts() leaves NaN out by default, so only non-missing values compete
# for the "most frequent value" count.
dominance_threshold = df_04.shape[0] * 0.95
col_handle = [
    name
    for name in df_04.columns
    if df_04[name].value_counts().max() > dominance_threshold
]
print(col_handle)
df_05 = df_04.drop(columns=col_handle)
df_05.shape

['pymnt_plan', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens', 'hardship_flag', 'debt_settlement_flag']


(42535, 30)

In [28]:
# 2.6 Drop the 'id' column
df_06 = df_05.drop(columns='id')
df_06.shape

(42535, 29)

In [29]:
# 2.7 剩余特征列表
df_06.columns

Index(['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'emp_length', 'home_ownership', 'annual_inc',
       'verification_status', 'issue_d', 'loan_status', 'desc', 'purpose',
       'zip_code', 'addr_state', 'dti', 'delinq_2yrs', 'earliest_cr_line',
       'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record',
       'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
       'next_pymnt_d', 'pub_rec_bankruptcies'],
      dtype='object')

In [76]:
# 2.8 Inspect the loan-term distribution and keep only the 36-period loans
print(df_06['term'].value_counts())
# Turn the raw labels into integers. replace() leaves non-matching values as
# they are, so running this cell a second time keeps the 36 / 60 values intact.
df_06['term'] = (
    df_06['term']
    .replace(' 36 months', 36)
    .replace(' 60 months', 60)
)
# Take an explicit copy: without it df_final is a boolean-mask selection of
# df_06, and any later column assignment on df_final raises
# SettingWithCopyWarning.
df_final = df_06[df_06['term'] == 36].copy()
df_final.shape

36    31534
60    11001
Name: term, dtype: int64


(31534, 29)

In [78]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31534 entries, 0 to 42534
Data columns (total 29 columns):
loan_amnt                 31534 non-null int64
funded_amnt               31534 non-null int64
funded_amnt_inv           31534 non-null float64
term                      31534 non-null int64
int_rate                  31534 non-null float64
installment               31534 non-null float64
emp_length                30699 non-null object
home_ownership            31534 non-null object
annual_inc                31530 non-null float64
verification_status       31534 non-null object
issue_d                   31534 non-null datetime64[ns]
loan_status               31534 non-null object
desc                      22055 non-null object
purpose                   31534 non-null object
zip_code                  31534 non-null object
addr_state                31534 non-null object
dti                       31534 non-null float64
delinq_2yrs               31505 non-null object
earliest_cr_line 

In [84]:
# 2.9 Data type conversion (object -> int64 etc.): first list the columns whose dtype is object
def obj_to_int(df_final):
    """List the columns of ``df_final`` whose dtype is ``object``.

    Despite its name, this helper performs no conversion: it only collects the
    matching column labels, prints the resulting list and returns it.

    Args:
        df_final: DataFrame whose columns are inspected.

    Returns:
        The labels, in column order, of the columns whose dtype compares equal
        to ``'object'``.
    """
    object_columns = [
        name for name in df_final.columns if df_final[name].dtypes == 'object'
    ]
    print(object_columns)
    return object_columns
        
obj_feature = obj_to_int(df_final)

['emp_length', 'home_ownership', 'verification_status', 'loan_status', 'desc', 'purpose', 'zip_code', 'addr_state', 'delinq_2yrs', 'total_acc']


In [87]:
df_final[obj_feature]

Unnamed: 0,emp_length,home_ownership,verification_status,loan_status,desc,purpose,zip_code,addr_state,delinq_2yrs,total_acc
0,10+ years,RENT,Verified,Fully Paid,Borrower added on 12/22/11 > I need to upgra...,credit_card,860xx,AZ,0,9
2,10+ years,RENT,Not Verified,Fully Paid,,small_business,606xx,IL,0,10
3,10+ years,RENT,Source Verified,Fully Paid,Borrower added on 12/21/11 > to pay for prop...,other,917xx,CA,0,37
5,3 years,RENT,Source Verified,Fully Paid,,wedding,852xx,AZ,0,12
7,9 years,RENT,Source Verified,Fully Paid,Borrower added on 12/16/11 > Downpayment for...,car,900xx,CA,0,4
11,10+ years,OWN,Source Verified,Fully Paid,,debt_consolidation,913xx,CA,0,34
12,< 1 year,RENT,Source Verified,Charged Off,Borrower added on 12/15/11 > Plan to pay off...,debt_consolidation,245xx,VA,0,9
13,3 years,RENT,Source Verified,Fully Paid,Borrower added on 12/19/11 > I intend to pay...,credit_card,606xx,IL,0,11
14,3 years,RENT,Source Verified,Charged Off,,other,951xx,CA,0,29
15,< 1 year,RENT,Not Verified,Fully Paid,,debt_consolidation,641xx,MO,0,23
