In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import featuretools as ft
import lightgbm as lgb
import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from random import sample 
import pickle
import eli5
from eli5.sklearn import PermutationImportance
from lightgbm import LGBMClassifier
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import math

%matplotlib inline
RSEED = 50

# Load Original Features

In [93]:
feat_num = 439

In [80]:
# 取出feature文件
with open('./data/features%s_na.pickle'%(feat_num), 'rb') as handle:
    df_total_raw = pickle.load(handle)

In [81]:
# 取出要删掉的列
with open('./data/feat%s_rm_pm_importance100.pickle'%(437), 'rb') as handle:
    to_drop = pickle.load(handle)

In [96]:
# 删掉不用的feature
df_total = df_total_raw.drop(list(to_drop),axis=1)
print(df_total.shape)
df_total.head()

(1097231, 339)


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_38,DeviceType,DeviceInfo,P_email,R_email,screen_width,pdc_amt_ratio,card1_fq_enc,addr1_fq_enc,card_mv_day_fq
0,2987000,0.0,86400,68.5,0,13926,-999.0,150.0,0,142.0,...,0,0,0,0,0,0,0.445929,56,43035.0,1
1,2987001,0.0,86401,29.0,0,2755,404.0,150.0,1,102.0,...,0,0,0,1,0,0,0.188788,1338,76902.0,1
2,2987002,0.0,86469,59.0,0,4663,490.0,150.0,2,166.0,...,0,0,0,2,0,0,0.384085,1794,48387.0,1
3,2987003,0.0,86499,50.0,0,18132,567.0,150.0,1,117.0,...,0,0,0,3,0,0,0.325496,7635,17455.0,11
4,2987004,0.0,86506,50.0,1,4497,514.0,150.0,1,102.0,...,1,1,1,1,0,1,0.686003,30,7107.0,1


In [97]:
# 把train和test文件分开
df_train = df_total[df_total['isFraud'].notnull()]
features_test = df_total[df_total['isFraud'].isnull()]
features_test_new = features_test.drop(columns = ['isFraud', 'TransactionID'])

In [84]:
# 区分train文件中的正例和负例
train_pos = df_train[df_train['isFraud']==1]
train_neg = df_train[df_train['isFraud']==0]
print(train_pos.shape,train_neg.shape)

(20663, 339) (569877, 339)


In [85]:
train_neg_sfd = train_neg.sample(frac=1)
train_neg_sfd.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_38,DeviceType,DeviceInfo,P_email,R_email,screen_width,pdc_amt_ratio,card1_fq_enc,addr1_fq_enc,card_mv_day_fq
470109,3457109,0.0,12141452,117.0,0,9353,555.0,150.0,2,226.0,...,0,0,0,0,0,0,0.76166,35,72580.0,1
288994,3275994,0.0,7101350,88.213,2,9633,130.0,185.0,2,138.0,...,2,2,3,6,2,0,2.118819,8909,-999.0,1
235160,3222160,0.0,5576973,226.0,0,17085,361.0,150.0,2,226.0,...,0,0,0,2,0,0,1.471242,166,38890.0,1
545430,3532430,0.0,14401516,213.0,0,16876,490.0,150.0,2,195.0,...,0,0,0,6,0,0,1.386613,648,77069.0,3
388546,3375546,0.0,9747712,652.27,0,14182,562.0,150.0,1,102.0,...,0,0,0,3,0,0,4.246224,687,35929.0,4


In [86]:
size = math.floor(train_neg.shape[0] * 0.2)
print(size)

113975


# Prepare Model

In [87]:
categorical = ['ProductCD', 'card2', 'card3', 'card4', 'card5','card6',
              'addr1','addr2','P_email','R_email','M1','M2','M3',
              'M4','M5','M6','M7','M8','M9','DeviceType','DeviceInfo','dow','hour',
              'Device_name','Device_version','screen_width','screen_height',
               'P_email_suffix','R_email_suffix','id_30_OS','id_30_version',
              'is_card_freq_Device','is_wide','is_long','is_zero','is_win8_vista',
              'is_windows_otheros','is_card_freq_pdc','is_card_freq_addr1'] 
ids = [ 'id_%s'%(i) for i in range(12,39)]
categorical = categorical + ids
categorical = list(set(categorical).intersection(df_total.columns))

In [88]:
params = {'num_leaves': 491,
          'min_child_weight': 0.03454472573214212,
          'feature_fraction': 0.3797454081646243,
          'bagging_fraction': 0.4181193142567742,
          'min_data_in_leaf': 106,
          'objective': 'binary',
          'max_depth': -1,
          'learning_rate': 0.006883242363721497,
          "boosting_type":"gbdt",#'goss'
          "bagging_seed": 11,
          "metric": 'auc',
          "verbosity": -1,
          'reg_alpha': 0.3899927210061127,
          'reg_lambda': 0.6485237330340494,
          'random_state': 47
          #'num_threads':10
          #'device' :'gpu',
          #'is_unbalance':True
          #'scale_pos_weight':9
         }

# Train Model

In [90]:
predictions = np.zeros(len(features_test_new))
ave_auc = 0
splits = 5

In [91]:
for k in range(0,splits):
    print("Fold {}".format(k))
    # 生成采样数据集
    print('Generate Train Data')
    train_neg_sample = train_neg_sfd.iloc[size*k:size*(k+1)]
    df_train_sample = pd.concat([train_pos,train_neg_sample]).sort_index()
    # 定义label和feature
    labels_train = df_train_sample['isFraud']
    features_train = df_train_sample.drop(columns = ['isFraud', 'TransactionID'])
    valid_num = math.floor(features_train.shape[0]*0.8)
    print(features_train.shape,labels_train.shape,valid_num)
    
    print('Training Begin')
    # 训练数据
    train_set = lgb.Dataset(features_train.iloc[0:valid_num,:], label=labels_train.values[0:valid_num],categorical_feature=categorical)#
    valid_set = lgb.Dataset(features_train.iloc[valid_num:,:], label=labels_train.values[valid_num:],categorical_feature=categorical)#
    valid_results = {}
    model = lgb.train(params,train_set,num_boost_round = 10000, 
                  valid_sets = [train_set, valid_set],
                  verbose_eval=500,
                  early_stopping_rounds = 500,
                  evals_result=valid_results)
    print('Training Finish,Predicting')
    pred = model.predict(features_train.iloc[valid_num:,:])
    auc_score = roc_auc_score(labels_train.values[valid_num:], pred)
    ave_auc += auc_score / splits
    predictions += model.predict(features_test_new) / splits

Fold 0
Generate Train Data
(134638, 337) (134638,) 107710
Training Begin




Training until validation scores don't improve for 500 rounds.
[500]	training's auc: 0.984754	valid_1's auc: 0.920122
[1000]	training's auc: 0.997308	valid_1's auc: 0.92937
[1500]	training's auc: 0.999629	valid_1's auc: 0.930752
[2000]	training's auc: 0.999963	valid_1's auc: 0.93121
[2500]	training's auc: 0.999997	valid_1's auc: 0.931528
[3000]	training's auc: 1	valid_1's auc: 0.931536
Early stopping, best iteration is:
[2819]	training's auc: 0.999999	valid_1's auc: 0.931597
Training Finish,Predicting


ValueError: Input data must be 2 dimensional and non empty.

In [78]:
# feat438
ave_auc

0.9314257983202661

In [64]:
# feat439 drop transactionDT
ave_auc

0.9298575946302156

In [51]:
# feat439 baseline
ave_auc

0.9315174059157163

In [58]:
id_test = features_test['TransactionID']
submission = pd.DataFrame({'TransactionID': id_test, 
                            'isFraud': predictions})
submission.to_csv('./data/sub_feat%s_easyensemble.csv'%(feat_num), index = False)