In [1]:
%matplotlib inline
import pandas as pd
import lightgbm as lgb
import numpy as np
import os
from __future__ import division
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,roc_curve
from sklearn.model_selection import ParameterGrid
import datetime
# Raise both pandas display limits to 1000 so wide/long frames are shown in full.
pd.set_option(
    'display.max_rows', 1000,
    'display.max_columns', 1000,
)

In [2]:
# Load the raw training table; the 'date' column is parsed into datetimes on read
# so it can be compared against datetime cut-offs later.
df_train = pd.read_csv(
    'data/atec_anti_fraud_train.csv',
    parse_dates=['date'],
)

In [3]:
# Keep only the rows whose label is not -1
# (presumably -1 marks unlabelled samples - TODO confirm against the data description).
df_train_clean = df_train.loc[df_train['label'].ne(-1)]
# The unfiltered frame is not used again; drop the name.
del df_train

In [4]:
# Boundaries of the time-based split: training uses rows dated up to and
# including cut_date_train, evaluation uses rows dated from cut_date_test onwards.
cut_date_train = datetime.datetime(year=2017, month=10, day=20)
cut_date_test = datetime.datetime(year=2017, month=11, day=1)

# Target vector, and the feature matrix with the identifier, target and
# timestamp columns removed.
y = df_train_clean['label']
X = df_train_clean.drop(['id', 'label', 'date'], axis=1)

In [5]:
# Time-based hold-out: each boolean mask is computed once and applied to both the
# features and the target. Rows dated strictly between the two cut-offs fall into
# neither split.
train_mask = df_train_clean.date <= cut_date_train
test_mask = df_train_clean.date >= cut_date_test

x_train, y_train = X[train_mask], y[train_mask]
x_test, y_test = X[test_mask], y[test_mask]

In [6]:
# Collect the 0-based positions of columns to be treated as categorical: a column
# qualifies when it has at most `num_values` distinct values. Column i is looked
# up by name as 'f<i+1>', so the frame's columns are assumed to be named f1..fN.
num_values = 2000
category_feature = [
    position
    for position in range(X.shape[1])
    if len(X['f' + str(position + 1)].unique()) <= num_values
]

In [7]:
# Only the x_/y_ train and test splits are used from here on; drop the names of
# the intermediate frames they were derived from.
del X, y, df_train_clean

In [8]:
def atec_metric(preds, train_data):
    """Custom evaluation function: weighted TPR at three low false-positive rates.

    The ROC curve of ``preds`` against the labels stored in ``train_data`` is
    computed, and for each FPR cap the TPR of the last ROC point whose FPR does
    not exceed the cap is taken. The score is
    ``0.4 * TPR@0.001 + 0.3 * TPR@0.005 + 0.3 * TPR@0.01``.

    Parameters
    ----------
    preds : array-like
        Predicted scores, passed to ``roc_curve`` as ``y_score``.
    train_data : object exposing ``get_label()``
        Dataset whose labels are used as ground truth.

    Returns
    -------
    tuple
        ``('atec_metric', score, True)``; the trailing ``True`` is intended as
        LightGBM's "higher is better" flag for custom eval functions.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(train_data.get_label(), preds)

    def tpr_within(fpr_cap):
        # Last ROC point that still satisfies the FPR cap.
        return true_pos_rate[false_pos_rate <= fpr_cap][-1]

    weighted_tpr = 0.4*tpr_within(0.001) + 0.3*tpr_within(0.005) + 0.3*tpr_within(0.01)
    return 'atec_metric', weighted_tpr, True

In [9]:
# Column labels handed to LightGBM, numbered from 1.
# NOTE(review): the 'feasture_' prefix looks like a typo for 'feature_'; it is kept
# unchanged because these names may be referenced elsewhere (e.g. saved models).
feature_name = ['feasture_'+str(number) for number in range(1, x_train.shape[1] + 1)]

# Options shared by the training and validation datasets.
dataset_options = dict(
    free_raw_data=False,
    feature_name=feature_name,
    categorical_feature=category_feature,
)
lgb_train = lgb.Dataset(x_train, y_train, **dataset_options)
# The validation set references the training set so both are constructed consistently.
lgb_valid = lgb.Dataset(x_test, y_test, reference=lgb_train, **dataset_options)

In [10]:
# Base LightGBM settings shared by every configuration of the grid search;
# the per-run grid values are merged on top of a copy of this dict.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    device='cpu',
    verbose=0,
    early_stopping_round=100,
    learning_rate=0.01,
)


In [13]:
# Search space: two tree sizes crossed with eleven L1 penalties
# (0.000, 0.002, ..., 0.020); the sampling settings are held fixed.
params_grid = dict(
    num_leaves=[64, 128],
    # lambda_l2=np.arange(11) * 0.002,   (L2 sweep left disabled)
    lambda_l1=np.arange(11) * 0.002,
    feature_fraction=[0.9],
    bagging_fraction=[0.8],
    bagging_freq=[5],
)
# Every combination of the values above.
params_list = ParameterGrid(params_grid)

In [14]:
# Exhaustive search over `params_list`: each configuration is trained with early
# stopping against `lgb_valid` and ranked by the custom ATEC metric on `x_test`.
#
# NOTE(review): `lgb_valid` is built from the same x_test/y_test that is scored
# here, so one hold-out drives early stopping AND model selection; `best_score`
# is therefore an optimistic estimate. Consider a separate validation split.
best_params = None  # full parameter dict of the best run; stays None if no run scores above 0
scores = dict()     # ATEC score of every run, keyed by the sorted (name, value) grid pairs
best_score = 0
for p in params_list:
    # Start from the shared base settings and overlay this grid point.
    p0 = params.copy()
    p0.update(p)
    gbm = lgb.train(p0, lgb_train, num_boost_round=10000, valid_sets=lgb_valid, valid_names='valid_set', feval=atec_metric, verbose_eval=False)
    # Pass best_iteration explicitly so the score reflects the early-stopped model
    # regardless of the installed LightGBM version's default for `num_iteration`.
    valid_preds = gbm.predict(x_test, num_iteration=gbm.best_iteration)
    score = atec_metric(valid_preds, lgb_valid)[1]
    # Keep every result, not just the winner, so the whole sweep can be inspected.
    scores[tuple(sorted(p.items()))] = score
    print("params: ", p, "\n"+"score: ", score)
    if score > best_score:
        best_score = score
        best_params = p0



params:  {'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.9, 'lambda_l1': 0.0, 'num_leaves': 64} 
score:  0.5111434108527132
params:  {'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.9, 'lambda_l1': 0.0, 'num_leaves': 128} 
score:  0.5147286821705426
params:  {'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.9, 'lambda_l1': 0.002, 'num_leaves': 64} 
score:  0.5095930232558139
params:  {'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.9, 'lambda_l1': 0.002, 'num_leaves': 128} 
score:  0.5151162790697674
params:  {'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.9, 'lambda_l1': 0.004, 'num_leaves': 64} 
score:  0.5107558139534883
params:  {'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.9, 'lambda_l1': 0.004, 'num_leaves': 128} 
score:  0.5121124031007751
params:  {'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.9, 'lambda_l1': 0.006, 'num_leaves': 64} 
score:  0.510

In [15]:
# Report the winning score together with the full parameter dict that produced it.
print(best_score, best_params)

0.5185077519379845 {'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'device': 'cpu', 'verbose': 0, 'learning_rate': 0.01, 'bagging_fraction': 0.8, 'bagging_freq': 5, 'feature_fraction': 0.9, 'lambda_l1': 0.006, 'num_leaves': 128}
