In [1]:
%matplotlib inline
import pandas as pd
import lightgbm as lgb
import numpy as np
import os
from __future__ import division
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.model_selection import ParameterGrid
import datetime
# Raise pandas' display limits to 1000 rows and 1000 columns so wide/long
# frames are rendered in full instead of being truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

In [2]:
# Read the training CSV; the `date` column is parsed into datetimes so it
# can later be compared against datetime cut-offs.
df_train = pd.read_csv(
    filepath_or_buffer='data/atec_anti_fraud_train.csv',
    parse_dates=['date'],
)

In [3]:
# Keep only the rows whose label differs from -1, then drop the reference to
# the raw frame so its memory can be reclaimed.
# NOTE(review): -1 presumably marks unlabelled samples; confirm against the
# dataset description.
df_train_clean = df_train[df_train['label'] != -1]
del df_train

In [4]:
# Date boundaries for the time-based train / hold-out split.
cut_date_train = datetime.datetime(year=2017, month=10, day=20)
cut_date_test = datetime.datetime(year=2017, month=11, day=1)

# Target vector and feature matrix. The identifier, the target and the date
# columns are removed from the features.
y = df_train_clean['label']
X = df_train_clean.drop(['id', 'label', 'date'], axis='columns')

In [5]:
# Time-based split: rows dated on or before `cut_date_train` form the
# training set, rows dated on or after `cut_date_test` form the hold-out set.
# Rows dated strictly between the two cut-offs end up in neither set.
_train_rows = df_train_clean.date <= cut_date_train
_test_rows = df_train_clean.date >= cut_date_test

x_train, y_train = X[_train_rows], y[_train_rows]
x_test, y_test = X[_test_rows], y[_test_rows]

# The masks are only needed for the selection above.
del _train_rows, _test_rows

In [6]:
# Columns with at most `num_values` distinct values (a missing value counts
# as one distinct value) are recorded, by position, as categorical features.
#
# Columns are inspected by position rather than looked up through a
# synthesized 'f<k>' name, so each stored index always refers to the column
# that was actually examined, whatever the columns are called and in
# whatever order they appear.
#
# NOTE(review): LightGBM documents categorical features as non-negative
# integer codes; confirm the selected columns satisfy that.
num_values = 2000
category_feature = [
    position
    for position in range(X.shape[1])
    if len(X.iloc[:, position].unique()) <= num_values
]

In [7]:
# Drop the names of the full (unsplit) frames; only the train / hold-out
# pieces are used from here on.
del df_train_clean
del X
del y

In [8]:
def atec_metric(preds, train_data):
    """Custom LightGBM evaluation function: weighted TPR at low FPR levels.

    The score is ``0.4 * TPR@FPR<=0.001 + 0.3 * TPR@FPR<=0.005 +
    0.3 * TPR@FPR<=0.01``, computed from the ROC curve of ``preds`` against
    the labels stored in ``train_data``.

    Parameters
    ----------
    preds : array-like
        Predicted scores for the rows of ``train_data``.
    train_data : object exposing ``get_label()``
        Dataset being evaluated (a ``lgb.Dataset`` when called by LightGBM).

    Returns
    -------
    tuple
        ``('atec_metric', score, True)``; the trailing ``True`` marks the
        metric as "higher is better".
    """
    fpr, tpr, _ = roc_curve(train_data.get_label(), preds)

    def tpr_within(fpr_cap):
        # ROC points come back in increasing-FPR order, so the last point
        # whose FPR is within the cap is the one taken for that level.
        return tpr[fpr <= fpr_cap][-1]

    score = 0.4 * tpr_within(0.001) + 0.3 * tpr_within(0.005) + 0.3 * tpr_within(0.01)
    return 'atec_metric', score, True

In [9]:
# One name per feature column, numbered from 1.
# NOTE(review): 'feasture_' looks like a misspelling of 'feature_'. It is
# kept as-is because these strings become the feature names of the datasets.
feature_name = ['feasture_%d' % position for position in range(1, x_train.shape[1] + 1)]

# Training dataset; raw data is retained (free_raw_data=False).
lgb_train = lgb.Dataset(
    data=x_train,
    label=y_train,
    feature_name=feature_name,
    categorical_feature=category_feature,
    free_raw_data=False,
)

# Hold-out dataset, built with the training dataset as its reference.
lgb_valid = lgb.Dataset(
    data=x_test,
    label=y_test,
    reference=lgb_train,
    feature_name=feature_name,
    categorical_feature=category_feature,
    free_raw_data=False,
)

In [19]:
# LightGBM training configuration passed to `lgb.train`.
params = {
    # Task definition.
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'device': 'cpu',
    'verbose': 0,
    # Stop once the validation score has not improved for this many rounds.
    'early_stopping_round': 100,
    # Tree growth and regularisation.
    'learning_rate': 0.01,
    'num_leaves': 128,
    'lambda_l1': 0.006,
    # Column and row subsampling.
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    # A plain integer: this is a single training run, not a parameter grid,
    # so the value must not be wrapped in a list.
    'bagging_freq': 5,
    # Seeds for the two subsampling steps. The key must be spelled exactly
    # 'feature_fraction_seed'; a misspelled key is not recognised as this
    # parameter.
    'bagging_seed': 3,
    'feature_fraction_seed': 2,
}


In [20]:
# Train for at most 10000 boosting rounds. Each round is scored on the
# hold-out set (reported as 'valid_set') with the metric from `params` and
# with the custom `atec_metric`.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_valid],
    valid_names=['valid_set'],
    feval=atec_metric,
)



[1]	valid_set's auc: 0.906756	valid_set's atec_metric: 0.334205
Training until validation scores don't improve for 100 rounds.
[2]	valid_set's auc: 0.907598	valid_set's atec_metric: 0.357752
[3]	valid_set's auc: 0.895386	valid_set's atec_metric: 0.365601
[4]	valid_set's auc: 0.901819	valid_set's atec_metric: 0.365407
[5]	valid_set's auc: 0.901023	valid_set's atec_metric: 0.374612
[6]	valid_set's auc: 0.90524	valid_set's atec_metric: 0.392151
[7]	valid_set's auc: 0.916416	valid_set's atec_metric: 0.403682
[8]	valid_set's auc: 0.919101	valid_set's atec_metric: 0.402326
[9]	valid_set's auc: 0.917304	valid_set's atec_metric: 0.407171
[10]	valid_set's auc: 0.918454	valid_set's atec_metric: 0.410078
[11]	valid_set's auc: 0.923234	valid_set's atec_metric: 0.411725
[12]	valid_set's auc: 0.923265	valid_set's atec_metric: 0.417248
[13]	valid_set's auc: 0.923234	valid_set's atec_metric: 0.409012
[14]	valid_set's auc: 0.924214	valid_set's atec_metric: 0.413469
[15]	valid_set's auc: 0.924058	valid_

[127]	valid_set's auc: 0.957416	valid_set's atec_metric: 0.460078
[128]	valid_set's auc: 0.957537	valid_set's atec_metric: 0.460756
[129]	valid_set's auc: 0.957954	valid_set's atec_metric: 0.459884
[130]	valid_set's auc: 0.958004	valid_set's atec_metric: 0.462016
[131]	valid_set's auc: 0.957629	valid_set's atec_metric: 0.461047
[132]	valid_set's auc: 0.95779	valid_set's atec_metric: 0.463081
[133]	valid_set's auc: 0.957854	valid_set's atec_metric: 0.463081
[134]	valid_set's auc: 0.957847	valid_set's atec_metric: 0.462209
[135]	valid_set's auc: 0.957967	valid_set's atec_metric: 0.462694
[136]	valid_set's auc: 0.958089	valid_set's atec_metric: 0.46657
[137]	valid_set's auc: 0.95816	valid_set's atec_metric: 0.464244
[138]	valid_set's auc: 0.958243	valid_set's atec_metric: 0.464922
[139]	valid_set's auc: 0.958293	valid_set's atec_metric: 0.466376
[140]	valid_set's auc: 0.958599	valid_set's atec_metric: 0.465504
[141]	valid_set's auc: 0.958623	valid_set's atec_metric: 0.464922
[142]	valid_s

[252]	valid_set's auc: 0.968802	valid_set's atec_metric: 0.487403
[253]	valid_set's auc: 0.968817	valid_set's atec_metric: 0.487306
[254]	valid_set's auc: 0.968815	valid_set's atec_metric: 0.486143
[255]	valid_set's auc: 0.968845	valid_set's atec_metric: 0.486919
[256]	valid_set's auc: 0.968969	valid_set's atec_metric: 0.486822
[257]	valid_set's auc: 0.969007	valid_set's atec_metric: 0.486047
[258]	valid_set's auc: 0.969083	valid_set's atec_metric: 0.48595
[259]	valid_set's auc: 0.969115	valid_set's atec_metric: 0.485465
[260]	valid_set's auc: 0.969227	valid_set's atec_metric: 0.485465
[261]	valid_set's auc: 0.969328	valid_set's atec_metric: 0.484981
[262]	valid_set's auc: 0.969352	valid_set's atec_metric: 0.484593
[263]	valid_set's auc: 0.969622	valid_set's atec_metric: 0.486919
[264]	valid_set's auc: 0.969699	valid_set's atec_metric: 0.488275
[265]	valid_set's auc: 0.969732	valid_set's atec_metric: 0.488953
[266]	valid_set's auc: 0.969749	valid_set's atec_metric: 0.48905
[267]	valid_

[377]	valid_set's auc: 0.97436	valid_set's atec_metric: 0.510562
[378]	valid_set's auc: 0.974362	valid_set's atec_metric: 0.511531
[379]	valid_set's auc: 0.974404	valid_set's atec_metric: 0.511434
[380]	valid_set's auc: 0.974422	valid_set's atec_metric: 0.510853
[381]	valid_set's auc: 0.974447	valid_set's atec_metric: 0.510756
[382]	valid_set's auc: 0.974465	valid_set's atec_metric: 0.511725
[383]	valid_set's auc: 0.974493	valid_set's atec_metric: 0.510562
[384]	valid_set's auc: 0.974495	valid_set's atec_metric: 0.510465
[385]	valid_set's auc: 0.974514	valid_set's atec_metric: 0.510368
[386]	valid_set's auc: 0.974862	valid_set's atec_metric: 0.512209
[387]	valid_set's auc: 0.974938	valid_set's atec_metric: 0.511822
[388]	valid_set's auc: 0.975085	valid_set's atec_metric: 0.512984
[389]	valid_set's auc: 0.975279	valid_set's atec_metric: 0.513081
[390]	valid_set's auc: 0.975687	valid_set's atec_metric: 0.515019
[391]	valid_set's auc: 0.975711	valid_set's atec_metric: 0.514244
[392]	valid

[502]	valid_set's auc: 0.979203	valid_set's atec_metric: 0.511143
[503]	valid_set's auc: 0.97923	valid_set's atec_metric: 0.510853
[504]	valid_set's auc: 0.979242	valid_set's atec_metric: 0.510562
[505]	valid_set's auc: 0.979258	valid_set's atec_metric: 0.510174
[506]	valid_set's auc: 0.979266	valid_set's atec_metric: 0.510174
[507]	valid_set's auc: 0.97929	valid_set's atec_metric: 0.510853
[508]	valid_set's auc: 0.979293	valid_set's atec_metric: 0.509302
[509]	valid_set's auc: 0.979313	valid_set's atec_metric: 0.509109
[510]	valid_set's auc: 0.979309	valid_set's atec_metric: 0.509012
[511]	valid_set's auc: 0.979305	valid_set's atec_metric: 0.508721
[512]	valid_set's auc: 0.979332	valid_set's atec_metric: 0.509012
[513]	valid_set's auc: 0.979369	valid_set's atec_metric: 0.509399
[514]	valid_set's auc: 0.979378	valid_set's atec_metric: 0.509787
[515]	valid_set's auc: 0.979363	valid_set's atec_metric: 0.508721
[516]	valid_set's auc: 0.97942	valid_set's atec_metric: 0.509787
[517]	valid_s