In [9]:
%matplotlib inline
import pandas as pd
import lightgbm as lgb
import numpy as np
import os
from __future__ import division
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.model_selection import ParameterGrid
from imblearn.over_sampling import RandomOverSampler
import datetime
# Raise pandas' display limits so wide/long frames are not truncated when shown.
pd.set_option("display.max_columns", 1000)
pd.set_option("display.max_rows", 1000)

In [15]:
# Load the raw training table; `date` is parsed to datetimes so it can be
# compared against datetime cut-offs when the data is split by time.
df_train = pd.read_csv(
    "data/atec_anti_fraud_train.csv",
    parse_dates=["date"],
)

In [16]:
# Keep only the rows whose label is not -1 (presumably -1 marks unlabeled
# samples -- TODO confirm against the data description).
df_train_clean = df_train[df_train['label'].ne(-1)]
# Drop the reference to the raw frame; note this makes the cell non-rerunnable
# without reloading the CSV first.
del df_train

In [18]:
# Time boundaries for the split: rows dated on/before `cut_date_train` are used
# for training, rows dated on/after `cut_date_test` for validation.
cut_date_train = datetime.datetime(year=2017, month=10, day=20)
cut_date_test = datetime.datetime(year=2017, month=11, day=1)

# Feature matrix: everything except the identifier, the target and the date.
X = df_train_clean.drop(['id', 'label', 'date'], axis='columns')
# Target vector.
y = df_train_clean.loc[:, 'label']

In [19]:
# Out-of-time split: build each date mask once, then slice features and labels
# with it and keep only the underlying NumPy arrays.
in_train_period = df_train_clean['date'] <= cut_date_train
in_test_period = df_train_clean['date'] >= cut_date_test

x_train, y_train = X[in_train_period].values, y[in_train_period].values
x_test, y_test = X[in_test_period].values, y[in_test_period].values

# The masks are only needed inside this cell.
del in_train_period, in_test_period

In [21]:
# Columns with at most `num_values` distinct values are treated as categorical.
# Columns are selected by POSITION rather than by a hard-coded 'f<N>' name: the
# collected indices are later used positionally against `X.values`, so
# positional lookup guarantees index i always refers to the same column,
# whatever the columns are called or however they are ordered.
num_values = 2000
category_feature = [
    i for i in range(X.shape[1])
    # `.unique()` counts NaN as one distinct value, same as the original check.
    if len(X.iloc[:, i].unique()) <= num_values
]

In [22]:
# Only the NumPy arrays produced by the split are used from here on, so drop
# the references to the pandas objects they were taken from.
del X, y
del df_train_clean

In [23]:
def atec_metric(preds, train_data):
    """Custom LightGBM evaluation metric: a weighted sum of TPRs at low FPRs.

    The score is ``0.4*TPR@FPR<=0.001 + 0.3*TPR@FPR<=0.005 + 0.3*TPR@FPR<=0.01``,
    where each TPR is the last ROC point whose FPR does not exceed the cut-off.

    Parameters
    ----------
    preds : predicted scores for the rows of ``train_data``.
    train_data : object exposing ``get_label()`` (the dataset being evaluated).

    Returns
    -------
    tuple
        ``('atec_metric', score, True)``; the trailing ``True`` is the
        "higher is better" flag in LightGBM's ``feval`` return convention.
    """
    fpr, tpr, _ = roc_curve(train_data.get_label(), preds)

    # For each FPR budget, take the last TPR among the ROC points within it.
    tpr_at_0001 = tpr[fpr <= 0.001][-1]
    tpr_at_0005 = tpr[fpr <= 0.005][-1]
    tpr_at_001 = tpr[fpr <= 0.01][-1]

    score = 0.4 * tpr_at_0001 + 0.3 * tpr_at_0005 + 0.3 * tpr_at_001
    return 'atec_metric', score, True

In [55]:
# Randomly oversample the positive class (label 1) up to 10% of the number of
# negatives (label 0); the negative count itself is left unchanged.
# Counting is vectorized and converted to a plain int (the builtin `sum`
# iterates element by element and yields a NumPy scalar).
num_0 = int(np.sum(y_train == 0))
# A fixed `random_state` makes the resampled set -- and therefore the trained
# model and its logged metrics -- reproducible across runs.
res = RandomOverSampler({0: num_0, 1: int(num_0 * 0.1)}, random_state=0)
# NOTE(review): newer imbalanced-learn releases rename `fit_sample` to
# `fit_resample`; kept as-is to match the version this notebook was run with.
x_resample, y_resample = res.fit_sample(x_train, y_train)

In [56]:
# One display name per feature column (1-based). The 'feasture_' spelling is
# kept as-is because the names become part of the trained model.
feature_name = ['feasture_%d' % (idx + 1) for idx in range(x_train.shape[1])]
#W_weight=9*(1-y_train)+1

# Options shared by the training and validation datasets.
dataset_kwargs = dict(
    free_raw_data=False,
    feature_name=feature_name,
    categorical_feature=category_feature
)
# Train on the oversampled data; validate on the untouched later-period split.
lgb_train = lgb.Dataset(x_resample, y_resample, **dataset_kwargs)
lgb_valid = lgb.Dataset(x_test, y_test, reference=lgb_train, **dataset_kwargs)

In [57]:
# LightGBM training configuration.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'device': 'cpu',
    'verbose': 0,
    # Early-stopping patience, in boosting rounds.
    'early_stopping_round': 100,
    'learning_rate': 0.01,
    'num_leaves': 128,
    'lambda_l1': 0.006,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    # Must be a plain integer (was mistakenly wrapped in a list).
    'bagging_freq': 5,
    'bagging_seed': 3,
    # Key was misspelled as 'feature_fration_seed', which LightGBM does not
    # recognise, so the seed was never actually applied.
    'feature_fraction_seed': 2
}


In [58]:
# Train the booster. Both datasets are evaluated every round (under the names
# 'train_set' and 'valid_set') with the built-in metric from `params` plus the
# custom `atec_metric`.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=10000,
    valid_sets=[lgb_train, lgb_valid],
    valid_names=['train_set', 'valid_set'],
    feval=atec_metric
)



[1]	train_set's auc: 0.972747	train_set's atec_metric: 0.394136	valid_set's auc: 0.960088	valid_set's atec_metric: 0.288178
Training until validation scores don't improve for 100 rounds.
[2]	train_set's auc: 0.979482	train_set's atec_metric: 0.434658	valid_set's auc: 0.968686	valid_set's atec_metric: 0.356202
[3]	train_set's auc: 0.980176	train_set's atec_metric: 0.435913	valid_set's auc: 0.969742	valid_set's atec_metric: 0.352713
[4]	train_set's auc: 0.980136	train_set's atec_metric: 0.436632	valid_set's auc: 0.969849	valid_set's atec_metric: 0.351453
[5]	train_set's auc: 0.980012	train_set's atec_metric: 0.437091	valid_set's auc: 0.969615	valid_set's atec_metric: 0.353779
[6]	train_set's auc: 0.980676	train_set's atec_metric: 0.514063	valid_set's auc: 0.970323	valid_set's atec_metric: 0.375
[7]	train_set's auc: 0.98154	train_set's atec_metric: 0.52137	valid_set's auc: 0.970428	valid_set's atec_metric: 0.372965
[8]	train_set's auc: 0.981726	train_set's atec_metric: 0.524822	valid_set'

[67]	train_set's auc: 0.987597	train_set's atec_metric: 0.574117	valid_set's auc: 0.973487	valid_set's atec_metric: 0.449322
[68]	train_set's auc: 0.987621	train_set's atec_metric: 0.577374	valid_set's auc: 0.973492	valid_set's atec_metric: 0.455426
[69]	train_set's auc: 0.987655	train_set's atec_metric: 0.578072	valid_set's auc: 0.973602	valid_set's atec_metric: 0.456008
[70]	train_set's auc: 0.987679	train_set's atec_metric: 0.580212	valid_set's auc: 0.973617	valid_set's atec_metric: 0.459205
[71]	train_set's auc: 0.987711	train_set's atec_metric: 0.582304	valid_set's auc: 0.973709	valid_set's atec_metric: 0.456492
[72]	train_set's auc: 0.987739	train_set's atec_metric: 0.583794	valid_set's auc: 0.97382	valid_set's atec_metric: 0.46095
[73]	train_set's auc: 0.987764	train_set's atec_metric: 0.584226	valid_set's auc: 0.97393	valid_set's atec_metric: 0.459593
[74]	train_set's auc: 0.987792	train_set's atec_metric: 0.586198	valid_set's auc: 0.973953	valid_set's atec_metric: 0.462694
[75

[133]	train_set's auc: 0.990636	train_set's atec_metric: 0.638588	valid_set's auc: 0.976619	valid_set's atec_metric: 0.475581
[134]	train_set's auc: 0.990676	train_set's atec_metric: 0.639175	valid_set's auc: 0.976654	valid_set's atec_metric: 0.47626
[135]	train_set's auc: 0.990702	train_set's atec_metric: 0.640174	valid_set's auc: 0.97671	valid_set's atec_metric: 0.475775
[136]	train_set's auc: 0.990756	train_set's atec_metric: 0.640319	valid_set's auc: 0.976515	valid_set's atec_metric: 0.473837
[137]	train_set's auc: 0.990802	train_set's atec_metric: 0.642204	valid_set's auc: 0.97656	valid_set's atec_metric: 0.478004
[138]	train_set's auc: 0.990834	train_set's atec_metric: 0.643952	valid_set's auc: 0.976568	valid_set's atec_metric: 0.477422
[139]	train_set's auc: 0.99086	train_set's atec_metric: 0.644832	valid_set's auc: 0.976599	valid_set's atec_metric: 0.475097
[140]	train_set's auc: 0.99089	train_set's atec_metric: 0.646406	valid_set's auc: 0.97667	valid_set's atec_metric: 0.47315

[199]	train_set's auc: 0.993301	train_set's atec_metric: 0.694377	valid_set's auc: 0.977967	valid_set's atec_metric: 0.497384
[200]	train_set's auc: 0.993322	train_set's atec_metric: 0.695628	valid_set's auc: 0.977983	valid_set's atec_metric: 0.496415
[201]	train_set's auc: 0.993397	train_set's atec_metric: 0.696211	valid_set's auc: 0.978035	valid_set's atec_metric: 0.497674
[202]	train_set's auc: 0.993452	train_set's atec_metric: 0.696788	valid_set's auc: 0.978064	valid_set's atec_metric: 0.49719
[203]	train_set's auc: 0.993489	train_set's atec_metric: 0.697022	valid_set's auc: 0.978065	valid_set's atec_metric: 0.496512
[204]	train_set's auc: 0.993522	train_set's atec_metric: 0.697241	valid_set's auc: 0.978064	valid_set's atec_metric: 0.497384
[205]	train_set's auc: 0.993535	train_set's atec_metric: 0.69739	valid_set's auc: 0.978098	valid_set's atec_metric: 0.495736
[206]	train_set's auc: 0.993562	train_set's atec_metric: 0.698675	valid_set's auc: 0.978106	valid_set's atec_metric: 0.4

[265]	train_set's auc: 0.99519	train_set's atec_metric: 0.736911	valid_set's auc: 0.978918	valid_set's atec_metric: 0.495252
[266]	train_set's auc: 0.995209	train_set's atec_metric: 0.73752	valid_set's auc: 0.978944	valid_set's atec_metric: 0.495155
[267]	train_set's auc: 0.995259	train_set's atec_metric: 0.73806	valid_set's auc: 0.978971	valid_set's atec_metric: 0.496802
[268]	train_set's auc: 0.995278	train_set's atec_metric: 0.739388	valid_set's auc: 0.979038	valid_set's atec_metric: 0.49564
[269]	train_set's auc: 0.995296	train_set's atec_metric: 0.739882	valid_set's auc: 0.979041	valid_set's atec_metric: 0.49438
[270]	train_set's auc: 0.995314	train_set's atec_metric: 0.740607	valid_set's auc: 0.979053	valid_set's atec_metric: 0.495736
[271]	train_set's auc: 0.995349	train_set's atec_metric: 0.741145	valid_set's auc: 0.979084	valid_set's atec_metric: 0.496512
[272]	train_set's auc: 0.995366	train_set's atec_metric: 0.742	valid_set's auc: 0.979086	valid_set's atec_metric: 0.49719
[

[331]	train_set's auc: 0.996482	train_set's atec_metric: 0.781139	valid_set's auc: 0.980005	valid_set's atec_metric: 0.503585
[332]	train_set's auc: 0.996495	train_set's atec_metric: 0.7814	valid_set's auc: 0.980016	valid_set's atec_metric: 0.503295
[333]	train_set's auc: 0.996506	train_set's atec_metric: 0.781839	valid_set's auc: 0.980024	valid_set's atec_metric: 0.50436
[334]	train_set's auc: 0.996518	train_set's atec_metric: 0.782527	valid_set's auc: 0.980024	valid_set's atec_metric: 0.50436
[335]	train_set's auc: 0.996535	train_set's atec_metric: 0.783094	valid_set's auc: 0.980024	valid_set's atec_metric: 0.503973
[336]	train_set's auc: 0.996546	train_set's atec_metric: 0.783668	valid_set's auc: 0.980034	valid_set's atec_metric: 0.504264
[337]	train_set's auc: 0.996561	train_set's atec_metric: 0.783698	valid_set's auc: 0.980048	valid_set's atec_metric: 0.503876
[338]	train_set's auc: 0.996571	train_set's atec_metric: 0.784639	valid_set's auc: 0.980064	valid_set's atec_metric: 0.504

[397]	train_set's auc: 0.997296	train_set's atec_metric: 0.816302	valid_set's auc: 0.97912	valid_set's atec_metric: 0.505039
[398]	train_set's auc: 0.997302	train_set's atec_metric: 0.816964	valid_set's auc: 0.979115	valid_set's atec_metric: 0.50436
[399]	train_set's auc: 0.99731	train_set's atec_metric: 0.817415	valid_set's auc: 0.979121	valid_set's atec_metric: 0.503488
[400]	train_set's auc: 0.997317	train_set's atec_metric: 0.818126	valid_set's auc: 0.979101	valid_set's atec_metric: 0.504167
[401]	train_set's auc: 0.997325	train_set's atec_metric: 0.818548	valid_set's auc: 0.979087	valid_set's atec_metric: 0.503391
[402]	train_set's auc: 0.997333	train_set's atec_metric: 0.81865	valid_set's auc: 0.97908	valid_set's atec_metric: 0.502326
[403]	train_set's auc: 0.997339	train_set's atec_metric: 0.819034	valid_set's auc: 0.97907	valid_set's atec_metric: 0.502519
[404]	train_set's auc: 0.997346	train_set's atec_metric: 0.81946	valid_set's auc: 0.979092	valid_set's atec_metric: 0.502326