### 特征选择
进行特征工程处理，并采用IV值和随机森林等选择特征

 #### IV值选择特征

In [20]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

In [21]:
def CalcV(Xvar, Yvar):
    N_0 = np.sum(Yvar==0)
    N_1 = np.sum(Yvar==1)
    N_0_group = np.zeros(np.unique(Xvar).shape)
    N_1_group = np.zeros(np.unique(Xvar).shape)
    iv = 0
    
    for i in range(len(np.unique(Xvar))):
        N_0_group[i] = Yvar[(Xvar==np.unique(Xvar)[i]) & (Yvar==0)].count()
        N_1_group[i] = Yvar[(Xvar==np.unique(Xvar)[i]) & (Yvar==1)].count()
    iv = np.sum((N_0_group/N_0 - N_1_group/N_1) * np.log((N_0_group/N_0) / (N_1_group/N_1)))
   
    
    # 处理iv值是极端的情况
    if iv >= 1.0:
        iv = 1.0
    
    return iv

def caliv_batch(df, Yvar):
    iv_list = []
    for col in df.columns:
        iv = CalcV(df[col], Yvar)
        iv_list.append(iv)
    names = list(df.columns)
    iv_df = pd.DataFrame({'Var': names, 'IV': iv_list}, columns=['Var', 'IV'])
    return iv_df, iv_list

df_data = pd.read_csv('data_cleaned.csv')
im_iv, ivl = caliv_batch(df_data.iloc[:, :-1], df_data.iloc[:, -1])
im_iv.to_excel('IV result.xlsx', index=0)

#### 随机森林选择特征

In [22]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
X_train, Y_train = df_data.iloc[:, :-1], df_data.iloc[:, -1]
feat_labels = X_train.columns
forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=1)
forest.fit(X_train, Y_train)
importance = forest.feature_importances_
imp_result = np.argsort(importance)[::-1]

for i in range(X_train.shape[1]):
   print("%2d. %-*s %f" % (i+1, 30, feat_labels[i], importance[imp_result[i]]))

 1. custid                         0.051275
 2. low_volume_percent             0.045285
 3. middle_volume_percent          0.035271
 4. take_amount_in_later_12_month_highest 0.029851
 5. trans_amount_increase_rate_lately 0.024138
 6. trans_activity_month           0.021701
 7. trans_activity_day             0.017378
 8. transd_mcc                     0.017197
 9. trans_days_interval_filter     0.016149
10. trans_days_interval            0.015715
11. regional_mobility              0.015530
12. student_feature                0.015319
13. repayment_capability           0.015216
14. is_high_user                   0.015036
15. number_of_trans_from_2011      0.015015
16. first_transaction_time         0.014967
17. historical_trans_amount        0.014405
18. historical_trans_day           0.014325
19. rank_trad_1_month              0.014247
20. trans_amount_3_month           0.014062
21. avg_consume_less_12_valid_month 0.013939
22. abs                            0.013813
23. top_trans_count_l

In [29]:
result_output = pd.DataFrame([feat_labels, importance[imp_result]]).T
result_output.columns = ['Var', 'importance']
result_output.to_csv('RandomForestResult.csv', index=0)