# 基本信息

In [1]:
from sklearn.metrics import confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score,precision_recall_curve,roc_curve
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
from sklearn.model_selection import GridSearchCV,StratifiedKFold,KFold,GroupKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures,OneHotEncoder,LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib
from tensorflow import keras
import graphviz
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import lightgbm as lgb
import random

# Reproducibility: a single seed shared by the stdlib RNG, NumPy and the
# models trained further down.
seed = 100
random.seed(seed)
np.random.seed(seed)

# Plot styling.
sns.set(style="white", color_codes=True)

# Display options: show every column, and print floats with one decimal
# place instead of scientific notation.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda value: '%.1f' % value)
# Uncomment to also show every row:
# pd.set_option('display.max_rows', None)

In [2]:
# Load the labelled training set and report its dimensions.
trainfile = 'train.csv'
train_data = pd.read_csv(trainfile)
n_train_rows, n_train_cols = train_data.shape
print("rows:", n_train_rows, " columns:", n_train_cols)

rows: 1521787  columns: 23


In [3]:
# Load the test set and report its dimensions.
testfile = 'test.csv'
test_data = pd.read_csv(testfile)
n_test_rows, n_test_cols = test_data.shape
print("rows:", n_test_rows, " columns:", n_test_cols)

rows: 421665  columns: 22


In [4]:
# Convert the time column to hours.
def change_time(data):
    """Overwrite ``data['loctm']`` with its value divided by 10000, truncated.

    The column presumably holds HHMMSS-style clock values, so the result is
    the hour (e.g. 123456 -> 12) -- TODO confirm the encoding against the
    data dictionary.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a numeric ``loctm`` column. It is modified in place.

    Returns
    -------
    None
    """
    # Vectorised equivalent of ``int(value / 10000)`` applied row by row:
    # true division followed by an integer cast truncates toward zero, exactly
    # like ``int()``, but avoids a Python-level loop over every row.
    data['loctm'] = (data['loctm'] / 10000).astype('int64')

In [5]:
# Reduce `loctm` to the hour in both frames (change_time edits in place).
for frame in (train_data, test_data):
    change_time(frame)

In [6]:
# Encode the 'N'/'Y' flag values as 0/1 wherever they occur in either frame.
flag_codes = {'N': 0, 'Y': 1}
train_data = train_data.replace(flag_codes)
test_data = test_data.replace(flag_codes)

In [7]:
# Base columns whose pairwise interactions are engineered in the next cell.
combine_features = ['acqic','stocn','scity','mcc','mchno','csmcu','ecfg','etymd','stscd','loctm']

# Every unordered pair "<a>__<b>" with <a> listed before <b>, in list order
# (n * (n - 1) / 2 names). The comprehension replaces a nested loop that
# carried an unreachable `if i == 10: break` guard.
cb_feature = [
    left + '__' + right
    for position, left in enumerate(combine_features)
    for right in combine_features[position + 1:]
]

In [8]:
# Feature interactions, label-encoded.
# Each "<left>__<right>" name in cb_feature becomes a column holding the string
# "<left value>_<right value>", which is then mapped to integer codes by a
# LabelEncoder fitted on train and test together so that both frames share
# one code book.
if 'bacno__cano' not in cb_feature:
    # Guarded so that re-running this cell does not add the pair twice.
    cb_feature.append('bacno__cano')
print(cb_feature)
print(len(cb_feature))

for feature in cb_feature:
    f1, f2 = feature.split('__')
    train_combo = train_data[f1].astype(str) + '_' + train_data[f2].astype(str)
    test_combo = test_data[f1].astype(str) + '_' + test_data[f2].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_combo, test_combo], ignore_index=True))
    train_data[feature] = le.transform(train_combo)
    test_data[feature] = le.transform(test_combo)

['acqic__stocn', 'acqic__scity', 'acqic__mcc', 'acqic__mchno', 'acqic__csmcu', 'acqic__ecfg', 'acqic__etymd', 'acqic__stscd', 'acqic__loctm', 'stocn__scity', 'stocn__mcc', 'stocn__mchno', 'stocn__csmcu', 'stocn__ecfg', 'stocn__etymd', 'stocn__stscd', 'stocn__loctm', 'scity__mcc', 'scity__mchno', 'scity__csmcu', 'scity__ecfg', 'scity__etymd', 'scity__stscd', 'scity__loctm', 'mcc__mchno', 'mcc__csmcu', 'mcc__ecfg', 'mcc__etymd', 'mcc__stscd', 'mcc__loctm', 'mchno__csmcu', 'mchno__ecfg', 'mchno__etymd', 'mchno__stscd', 'mchno__loctm', 'csmcu__ecfg', 'csmcu__etymd', 'csmcu__stscd', 'csmcu__loctm', 'ecfg__etymd', 'ecfg__stscd', 'ecfg__loctm', 'etymd__stscd', 'etymd__loctm', 'stscd__loctm', 'bacno__cano']
46


In [9]:
# Count (frequency) encoding.
# For every original and interaction column, add "<column>_count_full": the
# number of rows in train + test combined that carry the same value.
ori_features = ['acqic','bacno','cano','stocn','scity','csmcu','mcc','mchno','ecfg','etymd','stscd','loctm']
frec_feature = ori_features + cb_feature
cf_feature = []

for feature in frec_feature:
    count_col = feature + '_count_full'
    # Computed once per feature and reused for both frames; dropna=False keeps
    # a count for missing values as well.
    full_counts = pd.concat(
        [train_data[feature], test_data[feature]], ignore_index=True
    ).value_counts(dropna=False)
    train_data[count_col] = train_data[feature].map(full_counts)
    test_data[count_col] = test_data[feature].map(full_counts)
    cf_feature.append(count_col)

print(cf_feature)
print(len(cf_feature))

['acqic_count_full', 'bacno_count_full', 'cano_count_full', 'stocn_count_full', 'scity_count_full', 'csmcu_count_full', 'mcc_count_full', 'mchno_count_full', 'ecfg_count_full', 'etymd_count_full', 'stscd_count_full', 'loctm_count_full', 'acqic__stocn_count_full', 'acqic__scity_count_full', 'acqic__mcc_count_full', 'acqic__mchno_count_full', 'acqic__csmcu_count_full', 'acqic__ecfg_count_full', 'acqic__etymd_count_full', 'acqic__stscd_count_full', 'acqic__loctm_count_full', 'stocn__scity_count_full', 'stocn__mcc_count_full', 'stocn__mchno_count_full', 'stocn__csmcu_count_full', 'stocn__ecfg_count_full', 'stocn__etymd_count_full', 'stocn__stscd_count_full', 'stocn__loctm_count_full', 'scity__mcc_count_full', 'scity__mchno_count_full', 'scity__csmcu_count_full', 'scity__ecfg_count_full', 'scity__etymd_count_full', 'scity__stscd_count_full', 'scity__loctm_count_full', 'mcc__mchno_count_full', 'mcc__csmcu_count_full', 'mcc__ecfg_count_full', 'mcc__etymd_count_full', 'mcc__stscd_count_full', 

In [None]:
# Group statistics: for every `bacno__cano` group, the mean and the population
# standard deviation (ddof=0) of each listed feature, mapped back onto both
# frames as "<feature>_to_mean_<uid>" / "<feature>_to_std_<uid>".
# The statistics are computed over train and test combined (the original note
# on this cell asked whether the two frames need to be merged; here they are).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train_test = pd.concat([train_data, test_data], sort=False)
for uid in ['bacno__cano']:
    grouped = train_test.groupby(uid)
    for feature in ['acqic','csmcu','conam','loctm','ecfg','etymd','mcc','mchno','stocn','scity','stscd']:
        col_mean = grouped[feature].mean()
        train_data[feature + '_to_mean_' + uid] = train_data[uid].map(col_mean)
        test_data[feature + '_to_mean_' + uid] = test_data[uid].map(col_mean)

        col_std = grouped[feature].std(ddof = 0)
        train_data[feature + '_to_std_' + uid] = train_data[uid].map(col_std)
        test_data[feature + '_to_std_' + uid] = test_data[uid].map(col_std)

# Idea kept for reference, not executed: ratio of `conam` to its per-group mean.
#     train_data['conam_devide_mean_' + feature] = train_data['conam']/ train_data.groupby([feature])['conam'].transform('mean')
#     test_data['conam_devide_mean_' + feature] = test_data['conam'] / test_data.groupby([feature])['conam'].transform('mean')

In [None]:
# Forward feature screening: for each candidate column (every interaction and
# count feature), train XGBoost on the baseline features plus that single
# candidate and record the out-of-fold AUC in `auc[candidate]`.
# Baseline feature set that every run starts from.
model1_features = ['acqic','stocn','scity','csmcu','conam','mcc','mchno','ecfg','etymd','stscd','loctm']

splits = 3
# GroupKFold keeps all rows that share a `locdt` value in the same fold, so a
# given `locdt` never appears in both the training and validation part.
folds = GroupKFold(n_splits = splits)
split_groups = train_data['locdt']
# Maps candidate column name -> out-of-fold ROC AUC.
auc = {}
# Candidate columns to screen (not test-set features, despite the name).
test_feature = cb_feature + cf_feature
print(len(test_feature))
for col in test_feature:
    feature = model1_features.copy()
    feature.append(col)
    print(feature)
    X_train = train_data[feature].copy()
    Y_train = train_data['fraud_ind'].copy()
    # Out-of-fold predictions: each row is scored by the one model that did not
    # train on it.
    oof = np.zeros(len(X_train))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train.values, Y_train.values,groups=split_groups)):
        print("Fold {}".format(fold_))
        train_df, train_lb = X_train.iloc[trn_idx], Y_train.iloc[trn_idx]
        valid_df, valid_lb = X_train.iloc[val_idx], Y_train.iloc[val_idx]
    
    
        # missing=-1 makes XGBoost treat -1 as the missing-value marker;
        # tree_method='gpu_hist' requires a GPU-enabled XGBoost build.
        model = XGBClassifier(booster = 'gbtree', objective = 'binary:logistic',
                      learning_rate = 0.01, n_estimators = 2000, max_depth = 15, missing = -1,
                      subsample = 0.8, 
#                     colsample_bytree = 0.7,
                      random_state = seed,
                      tree_method='gpu_hist' )
        
        # Early stopping watches the last eval_set entry (the validation fold);
        # the training fold is included only so its AUC is logged alongside.
        model.fit(train_df,train_lb,eval_set = [(train_df,train_lb),(valid_df,valid_lb)],eval_metric='auc',early_stopping_rounds = 200,verbose=200)
    
        # Probability of the positive class (column 1) for the validation rows.
        valid_pre = model.predict_proba(valid_df)[:,1]       
        
        
        oof[val_idx] = valid_pre           
    print("-"*100,'\r\n')  
    auc[col] = roc_auc_score(Y_train,oof)
# NOTE(review): after the loop, X_train, Y_train, oof and model still refer to
# the LAST candidate's run; later cells read X_train / Y_train from here.

104
['acqic', 'stocn', 'scity', 'csmcu', 'conam', 'mcc', 'mchno', 'ecfg', 'etymd', 'stscd', 'loctm', 'acqic__stocn']
Fold 0
[0]	validation_0-auc:0.95131	validation_1-auc:0.957491
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.97983	validation_1-auc:0.973637
[400]	validation_0-auc:0.985564	validation_1-auc:0.976086
[600]	validation_0-auc:0.990225	validation_1-auc:0.977019
[800]	validation_0-auc:0.993048	validation_1-auc:0.977468
Stopping. Best iteration:
[774]	validation_0-auc:0.992823	validation_1-auc:0.977499

Fold 1
[0]	validation_0-auc:0.957415	validation_1-auc:0.945193
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.9811	validation_1-auc:0.968458
[400]	validation_0-auc:0.987388	validation_1-auc:0.972896
[600]	valida

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.98113	validation_1-auc:0.968469
[400]	validation_0-auc:0.987402	validation_1-auc:0.972913
[600]	validation_0-auc:0.990917	validation_1-auc:0.972889
[800]	validation_0-auc:0.993349	validation_1-auc:0.973743
[1000]	validation_0-auc:0.994515	validation_1-auc:0.973684
Stopping. Best iteration:
[903]	validation_0-auc:0.994047	validation_1-auc:0.973792

Fold 2
[0]	validation_0-auc:0.955059	validation_1-auc:0.949228
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.978758	validation_1-auc:0.971097
[400]	validation_0-auc:0.984915	validation_1-auc:0.97558
[600]	validation_0-auc:0.990566	validation_1-auc:0.977131
[800]	validation_0-auc:0.993122	validation_1-auc:0.97683
Stopping. Best iteration:
[609]	validation_0-auc:0.990749	validation_1-auc:0.977188

------------------------

['acqic', 'stocn', 'scity', 'csmcu', 'conam', 'mcc', 'mchno', 'ecfg', 'etymd', 'stscd', 'loctm', 'stocn__scity']
Fold 0
[0]	validation_0-auc:0.950387	validation_1-auc:0.956301
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.979589	validation_1-auc:0.973215
[400]	validation_0-auc:0.985431	validation_1-auc:0.976235
[600]	validation_0-auc:0.990215	validation_1-auc:0.977205
[800]	validation_0-auc:0.992916	validation_1-auc:0.977732
[1000]	validation_0-auc:0.994162	validation_1-auc:0.977639
Stopping. Best iteration:
[847]	validation_0-auc:0.993314	validation_1-auc:0.977761

Fold 1
[0]	validation_0-auc:0.957727	validation_1-auc:0.945289
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.980855	validation_1-auc:0.96849
[400]	valida

[800]	validation_0-auc:0.992889	validation_1-auc:0.977752
[1000]	validation_0-auc:0.994135	validation_1-auc:0.977672
Stopping. Best iteration:
[854]	validation_0-auc:0.993303	validation_1-auc:0.977776

Fold 1
[0]	validation_0-auc:0.956948	validation_1-auc:0.943877
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.980843	validation_1-auc:0.968465
[400]	validation_0-auc:0.987084	validation_1-auc:0.972695
[600]	validation_0-auc:0.990654	validation_1-auc:0.973036
[800]	validation_0-auc:0.993081	validation_1-auc:0.974051
[1000]	validation_0-auc:0.994342	validation_1-auc:0.973728
Stopping. Best iteration:
[810]	validation_0-auc:0.993166	validation_1-auc:0.974128

Fold 2
[0]	validation_0-auc:0.955237	validation_1-auc:0.950014
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't impr

[600]	validation_0-auc:0.991111	validation_1-auc:0.972759
[800]	validation_0-auc:0.993451	validation_1-auc:0.973677
[1000]	validation_0-auc:0.994603	validation_1-auc:0.973625
Stopping. Best iteration:
[913]	validation_0-auc:0.994206	validation_1-auc:0.973752

Fold 2
[0]	validation_0-auc:0.955589	validation_1-auc:0.951024
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.978748	validation_1-auc:0.971515
[400]	validation_0-auc:0.98496	validation_1-auc:0.975625
[600]	validation_0-auc:0.990759	validation_1-auc:0.977208
[800]	validation_0-auc:0.993201	validation_1-auc:0.976914
Stopping. Best iteration:
[609]	validation_0-auc:0.990915	validation_1-auc:0.977261

---------------------------------------------------------------------------------------------------- 

['acqic', 'stocn', 'scity', 'csmcu', 'conam', 'mcc', 'mchno', 'ecfg', 'etymd', 'stscd', 'loctm', 'sc

['acqic', 'stocn', 'scity', 'csmcu', 'conam', 'mcc', 'mchno', 'ecfg', 'etymd', 'stscd', 'loctm', 'scity__stscd']
Fold 0
[0]	validation_0-auc:0.951274	validation_1-auc:0.957326
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.979908	validation_1-auc:0.97373
[400]	validation_0-auc:0.985657	validation_1-auc:0.976142
[600]	validation_0-auc:0.9903	validation_1-auc:0.977026
[800]	validation_0-auc:0.993081	validation_1-auc:0.977384
Stopping. Best iteration:
[758]	validation_0-auc:0.99272	validation_1-auc:0.977437

Fold 1
[0]	validation_0-auc:0.957367	validation_1-auc:0.945204
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.981175	validation_1-auc:0.968477
[400]	validation_0-auc:0.98746	validation_1-auc:0.972871
[600]	validation_

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.981145	validation_1-auc:0.968461
[400]	validation_0-auc:0.987417	validation_1-auc:0.972843
[600]	validation_0-auc:0.990801	validation_1-auc:0.97289
[800]	validation_0-auc:0.993317	validation_1-auc:0.97373
[1000]	validation_0-auc:0.994498	validation_1-auc:0.973677
Stopping. Best iteration:
[873]	validation_0-auc:0.993855	validation_1-auc:0.97382

Fold 2
[0]	validation_0-auc:0.955108	validation_1-auc:0.949372
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 200 rounds.
[200]	validation_0-auc:0.97865	validation_1-auc:0.971144
[400]	validation_0-auc:0.984923	validation_1-auc:0.975607
[600]	validation_0-auc:0.990539	validation_1-auc:0.977187
Stopping. Best iteration:
[585]	validation_0-auc:0.990311	validation_1-auc:0.977224

-----------------------------------------------------------------------------------

In [13]:
# Persist the per-candidate AUC scores from the forward-selection loop as a
# one-row CSV (one column per candidate feature). The file holds AUC values,
# not out-of-fold predictions, so the log message says so.
print("save forward feature auc file...")
auc_table = pd.DataFrame([auc])
auc_table.to_csv('forward_feature.csv', index=False)

save oof file...


In [None]:
# Feature importance of the most recently fitted XGBoost `model` (the last
# fold of the last forward-selection candidate), whose training columns are
# exactly `X_train.columns`.
# This cell previously read `clf`, which is only created by the LightGBM cell
# further down, so it raised NameError when the notebook was run top to bottom.
feature_importances = pd.DataFrame()
feature_importances['feature'] = X_train.columns
feature_importances['importance'] = model.feature_importances_
plt.figure(figsize=(10, 10))
sns.barplot(data=feature_importances.sort_values(by='importance', ascending=False).head(100), x='importance', y='feature')
plt.title('TOP feature importance (last forward-selection model)');

In [None]:
# LightGBM cross-validation: trains one booster per GroupKFold split, collects
# out-of-fold predictions, per-fold feature importances and the fold-averaged
# prediction for X_test, then reports out-of-fold AUC and F1.
# NOTE(review): `params` and `X_test` are not defined in any cell visible in
# this notebook, and X_train / Y_train are whatever the forward-selection loop
# left behind (its last candidate) -- confirm the intended inputs before running.
feature_importances = pd.DataFrame()
feature_importances['feature'] = X_train.columns

splits = 3
# folds = KFold(n_splits = splits, shuffle=True, random_state=seed)
# GroupKFold keeps rows sharing a `locdt` value inside a single fold.
folds = GroupKFold(n_splits = splits)
split_groups = train_data['locdt']

# Out-of-fold predictions for the training rows, and the running
# fold-average of the predictions for X_test.
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train.values, Y_train.values,groups=split_groups)):
    print("Fold {}".format(fold_))
    print(len(trn_idx),len(val_idx))
    train_df, train_lb = X_train.iloc[trn_idx], Y_train.iloc[trn_idx]
    valid_df, valid_lb = X_train.iloc[val_idx], Y_train.iloc[val_idx]
    
    trn_data = lgb.Dataset(train_df, label=train_lb)
    val_data = lgb.Dataset(valid_df, label=valid_lb)
    
    # Up to 5000 rounds with early stopping after 200 rounds without
    # improvement; metrics are logged every 200 rounds.
    clf = lgb.train(params,
                    trn_data,
                    num_boost_round= 5000,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=200,
                    early_stopping_rounds = 200)
    
    valid_pre = clf.predict(valid_df)
    oof[val_idx] = valid_pre
    # One importance column per fold ("fold_1", "fold_2", ...); the next cell
    # averages them.
    feature_importances['fold_{}'.format(fold_ + 1)] = clf.feature_importance()
    # Each fold contributes 1/splits of the final test prediction.
    predictions += clf.predict(X_test) / splits  
    print(predictions[:5])
    print("-"*100,'\r\n')

print("auc_oof:",roc_auc_score(Y_train,oof))
# Hard labels at a fixed 0.5 threshold, used only for the F1 report.
predict = [int(item>0.5) for  item in oof]
print("f1:",f1_score(Y_train,predict))   

In [None]:
# Average the per-fold importances and plot the top 100 features.
# Only the "fold_*" columns are averaged: the string `feature` column cannot
# take part in a row-wise mean (pandas >= 2.0 raises instead of silently
# dropping it), and excluding a previously computed `average` column keeps the
# cell idempotent when it is re-run.
fold_columns = [name for name in feature_importances.columns if str(name).startswith('fold_')]
feature_importances['average'] = feature_importances[fold_columns].mean(axis=1)
plt.figure(figsize=(10, 10))
sns.barplot(data=feature_importances.sort_values(by='average', ascending=False).head(100), x='average', y='feature')
plt.title('TOP feature importance over cv folds average');