In [None]:
# Import core libraries
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Raise pandas' display limits so wide and long frames are printed in full.
for display_option in ('display.width', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 10000)

# Apply the "bmh" style sheet first, then override the font on top of it.
plt.style.use("bmh")
plt.rc('font', family='MicroSoft YaHei', size=13)

# Silence every warning for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
# Load the competition data (tab-separated text files).
data_path = '../移动广告反欺诈算法挑战赛/'

test_df = pd.read_csv(data_path + 'round1_iflyad_anticheat_testdata_feature.txt', delimiter="\t")
train_df = pd.read_csv(data_path + 'round1_iflyad_anticheat_traindata.txt', delimiter="\t")

# Stack the training rows on top of the test rows so both are preprocessed
# together. ignore_index=True gives the combined frame a fresh, unique
# 0..N-1 index; without it both halves keep their own 0-based labels and
# any later label-based row selection becomes ambiguous or fails.
df_data = pd.concat([train_df, test_df], axis=0, ignore_index=True)
df_data.head(5)

In [None]:
# Number of rows in the training and test sets
len(train_df), len(test_df)

In [None]:
# Inspect how the target labels are distributed in the training set
label_counts = train_df['label'].value_counts()
label_counts

In [None]:
# Summarize the combined train+test frame: column dtypes, non-null counts
# and memory usage.
df_data.info()

In [None]:
# Report, for every column, how many distinct non-null values it contains.
for position, column in enumerate(df_data.columns, start=1):
    distinct_count = df_data[column].nunique()
    print("{:2}、{:15}      The number of types of features is：{}".format(position, column, distinct_count))

In [None]:
from sklearn.preprocessing import LabelEncoder


# Label-encode every object-typed column except the row identifier `sid`.
# The builtin `object` is used for the dtype comparison because the
# `np.object` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
object_cols = list(df_data.dtypes[df_data.dtypes == object].index)
for col in object_cols:
    if col != 'sid':
        lab = LabelEncoder()
        # Cast to str first so mixed-type values and missing entries are
        # encoded as ordinary string categories.
        df_data[col] = lab.fit_transform(df_data[col].astype(str))

In [None]:
# Columns excluded from the model's feature matrix.
drop_list = ['sid', 'label', 'nginxtime', 'ip', 'macmd5', 'imeimd5', 'adidmd5']

# df_data is the training rows followed by the test rows, so split it by
# position. A positional split does not depend on what the row labels look
# like (label slicing breaks when the concatenated index is not a unique
# 0..N-1 range).
n_train = train_df.shape[0]
train_data = df_data.iloc[:n_train]
test_data = df_data.iloc[n_train:]

train_label = train_data['label']
train_fea = train_data.drop(drop_list, axis=1)

# Keep the test ids aligned with the test feature rows.
result_df = pd.DataFrame()
result_df['sid'] = test_data['sid']
test_fea = test_data.drop(drop_list, axis=1)

In [None]:
# LightGBM training parameters, passed to lgb.train below.
lgb_param = dict(
    learning_rate=0.01,
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    feature_fraction=0.6,
    bagging_fraction=0.8,
    bagging_freq=5,
    num_leaves=1000,
    verbose=-1,
    max_depth=-1,
    seed=2019,
)

In [None]:
import lightgbm as lgb
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold


def eval_func(y_pred, train_data):
    """Custom evaluation function for LightGBM that reports the F1 score.

    Args:
        y_pred: Predicted scores for the rows of ``train_data``; they are
            rounded to the nearest integer to obtain hard class labels.
        train_data: Dataset object exposing ``get_label()``, which supplies
            the ground-truth labels.

    Returns:
        A ``(name, value, flag)`` tuple: the metric name ``'f1'``, the F1
        score, and ``True`` (passed to LightGBM as the "higher is better"
        flag of a custom metric).
    """
    hard_labels = np.round(y_pred)
    return 'f1', f1_score(train_data.get_label(), hard_labels), True

# Stratified K-fold cross-validation: train one LightGBM model per fold,
# collect out-of-fold predictions for the training rows and accumulate the
# test-set predictions across folds.
fold = 5
skf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=2019)

# oof_lgb: out-of-fold prediction for every training row.
# prediction_lgb: running SUM of the per-fold test predictions; divide by
# the number of folds to obtain the averaged prediction.
oof_lgb = np.zeros(train_fea.shape[0])
prediction_lgb = np.zeros(test_fea.shape[0])

for i, (train_index, test_index) in enumerate(skf.split(train_fea, train_label)):
    print('fold:', i+1)

    # skf.split yields positional row numbers, so select with .iloc; this
    # stays correct whatever the frame's index labels are.
    X_train, X_valid = train_fea.iloc[train_index], train_fea.iloc[test_index]
    y_train, y_valid = train_label.iloc[train_index], train_label.iloc[test_index]

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # NOTE(review): newer LightGBM releases expect early stopping and
    # logging to be passed as callbacks instead of the early_stopping_rounds /
    # verbose_eval keywords - confirm against the installed version.
    lgb_model = lgb.train(lgb_param, lgb_train, num_boost_round=40000, valid_sets=[lgb_train, lgb_valid],
                          valid_names=['train', 'valid'], feval=eval_func, early_stopping_rounds=200, verbose_eval=100)

    # Each training row is in exactly one validation fold, so plain
    # assignment fills the out-of-fold vector.
    oof_lgb[test_index] = lgb_model.predict(X_valid, num_iteration=lgb_model.best_iteration)
    prediction_lgb += lgb_model.predict(test_fea, num_iteration=lgb_model.best_iteration)

    # Hard labels from the test predictions averaged over the folds so far.
    # .copy() makes dis_df an independent frame rather than a slice of test_df.
    dis_df = test_df[['sid']].copy()
    dis_df['label'] = (prediction_lgb / (i + 1) > 0.5).astype(int)

    # Metrics are computed on this fold's validation rows only.
    fold_oof = oof_lgb[test_index]
    print('****************************************************************************************************')
    print('information of fold {}'.format(str(i+1)))
    print('roc_auc_score: ', roc_auc_score(y_valid, fold_oof))
    print('f1_score: ', f1_score(y_valid, np.round(fold_oof)))
    print('label distribution:', dis_df['label'].value_counts())
    print('====================================================================================================')

In [None]:
# Write the submission file.
# .copy() makes `sub` an independent frame, so adding a column does not
# operate on a slice of test_df.
sub = test_df[['sid']].copy()
# prediction_lgb is the sum of the per-fold predictions: average it over the
# number of folds actually used, then threshold at 0.5 to get hard labels.
sub['label'] = (prediction_lgb / fold > 0.5).astype(int)
sub.to_csv('baseline.csv', index=False)