## 1.导入工具库

In [56]:
import os
from datetime import timedelta, datetime

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from scipy import sparse
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder

import warnings
# Suppress every warning raised from here on.
warnings.filterwarnings("ignore")

import logging
# Configure the root logger: WARNING and above, each record prefixed with
# timestamp, source file name and line number.
logging.basicConfig(level = logging.WARNING,
                    format = '%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

## 2.数据集

|特征类别|变量|数据类型|解释|
|-------|-----|-----|-----|
|数据类别|变量|数值格式|解释|
|基本数据|sid|string|样本id/请求会话sid|
|基本数据|label|int|是否作弊|
|媒体信息|pkgname|string|包名(MD5加密)|
|媒体信息|ver|string|app版本|
|媒体信息|adunitshowid|string|对外广告位ID（MD5加密）|
|媒体信息|mediashowid|string|对外媒体ID（MD5加密）|
|媒体信息|apptype|int|app所属分类|
|时间|nginxtime|bigint|请求到达服务时间，单位ms|
|IP信息|ip|string|客户端IP地址|
|IP信息|city|string|城市|
|IP信息|province|string|省份|
|IP信息|reqrealip|string|请求的http协议头携带IP，有可能是下游服务器的ip|
|设备信息|adidmd5|string|Android ID的MD5值|
|设备信息|imeimd5|string|imei的MD5值|
|设备信息|idfamd5|string|idfa的MD5值|
|设备信息|openudidmd5|string|openudid的MD5值|
|设备信息|macmd5|string|mac的MD5值|
|设备信息|dvctype|int|设备类型 0 – 未知,1 – PC,2 – 手机, 3– 平板,4– 电视盒,5– 智能电视,6 – 可穿戴设备,7 – 智能家电,8 - 音箱,9 - 智能硬件|
|设备信息|model|string|机型|
|设备信息|make|string|厂商|
|设备信息|ntt|int|网络类型 0-未知, 1-有线网, 2-WIFI, 3-蜂窝网络未知, 4-2G, 5-3G, 6–4G|
|设备信息|carrier|string|运营商 0-未知, 46000-移动, 46001-联通, 46003-电信|
|设备信息|os|string|操作系统 : Android, iOS|
|设备信息|osv|string|操作系统版本|
|设备信息|orientation|int|横竖屏:0竖屏，1横屏|
|设备信息|lan|string|语言|
|设备信息|h|int|设备高|
|设备信息|w|int|设备宽|
|设备信息|ppi|int|屏幕密度|

## 3.读取数据集

In [58]:
print("=" * 40, "read data", "=" * 40, sep="\n")

# Load the tab-separated train and test files from the local data directory.
path = "/Users/zfwang/data/mldata/aichallenge_2019_ad_fraud/data/"
df_train = pd.read_csv(
    path + "round1_iflyad_anticheat_traindata.txt",
    sep="\t",
)
df_test = pd.read_csv(
    path + "round1_iflyad_anticheat_testdata_feature.txt",
    sep="\t",
)

# Stack train on top of test under a fresh 0..n-1 index; rows whose label is
# missing after the concat receive the sentinel -1.
df_uni = pd.concat([df_train, df_test], ignore_index=True)
df_uni["label"] = df_uni["label"].fillna(-1).astype(int)

# Preview the combined frame with all columns shown, then report the shapes.
with pd.option_context("display.max_columns", None):
    print(df_uni.head())
print("=" * 23, "当前数据集的形状：", "=" * 23, sep="\n")
print("train data shape:", df_train.shape)
print("test data shape:", df_test.shape)
print("unioned train and test data shape:", df_uni.shape)

read data
                            adidmd5                      adunitshowid  \
0  eb4fa0cf77cdd57a6993a4e42c8fa4cf  907d0f8c29663840491577a21c7b612a   
1  a4ae6efcfaee62fb6da44a9c05753259  10199dc8fea2e173525bc3151bd84312   
2  3404f799628be2cf8fda7428aac2cca4  83f2ecfe65f936f5f2ed59f8e8ff1d01   
3  941517d46b7261d98592425672bb86f5  9f1eadd9092b19bc86ee0cacde1c867f   
4  fc08a6724db4c2dcf9b5af3a4e0b6f38  2af944462e43cd2f59acbbfd37445413   

   apptype  carrier  city  dvctype       h idfamd5  \
0    280.0  46000.0  石家庄市      2.0  2340.0   empty   
1    319.0      0.0   开封市      2.0  1080.0   empty   
2    273.0  46000.0   长春市      2.0  2196.0   empty   
3    265.0      0.0   曲靖市      2.0     0.0   empty   
4    336.0  46000.0   深圳市      2.0  2244.0   empty   

                            imeimd5               ip  label    lan  \
0  930b3a8ecff4f586c63fec5b96693f32    183.197.47.83      1  zh-CN   
1  6e38deaa1f7b4118015c550bb21913d2    106.34.14.149      1    NaN   
2  3fa42b06332c7

## 4.特征工程

### 4.1 特征分类

In [61]:
print("=" * 23, "特征分类", "=" * 23, sep="\n")

# Columns handled as categorical features.
cat_cols = [
    # media
    'pkgname', 'ver', 'adunitshowid', 'mediashowid', 'apptype',
    # IP / location
    'ip', 'reqrealip', 'city', 'province',
    # device identifiers
    'adidmd5', 'imeimd5', 'idfamd5', 'openudidmd5', 'macmd5',
    # device attributes
    'dvctype', 'model', 'make', 'ntt', 'carrier', 'os', 'osv',
    'orientation', 'lan', 'h', 'w', 'ppi',
]

# Columns removed before the feature matrix is built.
drop_cols = ['sid', 'label', 'nginxtime']

特征分类


### 4.2 缺失值填充

In [62]:
# Fill missing values in every categorical column with the placeholder
# string 'null_value', so that "missing" becomes a category of its own.
print("=" * 23)
print('fill null')
print("=" * 23)
for cat_col in cat_cols:
    if df_uni[cat_col].isnull().any():
        # Assign the filled column back explicitly. Calling
        # fillna(inplace=True) on the selection df_uni[cat_col] modifies an
        # intermediate object and, under pandas copy-on-write, never
        # reaches df_uni.
        df_uni[cat_col] = df_uni[cat_col].fillna('null_value')

fill null


### 4.3 生成特征

In [63]:
# Stage banner for the value-count features.
print("=" * 23, "generate some features:", "=" * 23, sep="\n")
def gen_value_counts(data, col):
    """Return, for each row of ``data``, how many rows share its ``col`` value.

    Args:
        data: DataFrame containing the column ``col``.
        col: Name of the column whose value frequencies are computed.

    Returns:
        A Series named ``'tmp'`` that carries the same index as ``data``.
        Rows whose value is missing (and therefore not counted by
        ``value_counts``) get 0.
    """
    print('value counts', col)
    # Map each value to its frequency directly. Unlike a merge, this keeps
    # the caller's index, so assigning the result back as a new column lines
    # up row by row even when the index is not 0..n-1.
    counts = data[col].value_counts()
    return data[col].map(counts).fillna(0).rename('tmp')

# Columns that each get a frequency feature named "vc_<column>".
value_counts_col = [
    'pkgname', 'adunitshowid', 'ip', 'reqrealip',
    'adidmd5', 'imeimd5', 'idfamd5', 'macmd5',
]

for col in value_counts_col:
    df_uni['vc_{}'.format(col)] = gen_value_counts(df_uni, col)

# Report the frame shape after the new columns were added.
print("=" * 23, "当前df_uni的形状：", "=" * 23, df_uni.shape, sep="\n")

generate some features:
value counts pkgname
value counts adunitshowid
value counts ip
value counts reqrealip
value counts adidmd5
value counts imeimd5
value counts idfamd5
value counts macmd5
当前df_uni的形状：
(1100000, 37)


### 4.4 低频类别值分箱

In [64]:
# Stage banner for the rare-value binning step.
print("=" * 23, 'cut features:', "=" * 23, sep="\n")
def cut_col(data, col_name, cut_list):
    """Collapse rare values of a column into shared ``'count_<box>'`` buckets.

    Each value is looked up by how many rows carry it. Walking ``cut_list``
    in the given order, the first threshold ``box`` with ``count <= box``
    replaces the value by the string ``'count_' + str(box)``. Values that
    match no threshold (and missing values, which have no count) are kept.

    Args:
        data: DataFrame containing ``col_name``; it is not modified.
        col_name: Name of the column to bin.
        cut_list: Iterable of count thresholds, checked in order.

    Returns:
        An object-dtype Series with the same index as ``data``.
    """
    print('cutting', col_name)

    values = data[col_name]
    # Per-row frequency of the row's own value; NaN where the value is missing.
    counts = values.map(values.value_counts()).to_numpy(dtype=float)

    binned = values.to_numpy(dtype=object, copy=True)
    # Rows not yet claimed by an earlier threshold ("first match wins").
    pending = np.ones(len(binned), dtype=bool)
    for box in cut_list:
        hit = pending & (counts <= box)  # comparisons with NaN are False
        binned[hit] = 'count_' + str(box)
        pending &= ~hit

    return pd.Series(binned, index=values.index)

# Maps a group of column names to the count thresholds used to bin them.
cut_col_dict = {
    (
        'pkgname', 'ver', 'reqrealip', 'adidmd5', 'imeimd5',
        'openudidmd5', 'macmd5', 'model', 'make',
    ): [3],
    ('ip',): [3, 5, 10],
}

# Overwrite each listed column with its binned version.
for cut_cols in cut_col_dict:
    cut_list = cut_col_dict[cut_cols]
    for col in cut_cols:
        df_uni[col] = cut_col(df_uni, col, cut_list)

cut features:
cutting pkgname
cutting ver
cutting reqrealip
cutting adidmd5
cutting imeimd5
cutting openudidmd5
cutting macmd5
cutting model
cutting make
cutting ip


### 4.5 日期特征处理

In [65]:
print("=" * 23)
print('feature time')
print("=" * 23)
# nginxtime is in milliseconds: convert to seconds, parse as a timestamp and
# shift by 8 hours to Beijing time.
df_uni['datetime'] = pd.to_datetime(df_uni['nginxtime'] / 1000, unit = 's') + timedelta(hours = 8)

# Hour of day of the request.
df_uni['hour'] = df_uni['datetime'].dt.hour

# Whole calendar days since the earliest request. Midnight-normalized dates
# are subtracted rather than day-of-month numbers, so the offset stays
# correct when the data spans a month boundary.
df_uni['day'] = (
    df_uni['datetime'].dt.normalize() - df_uni['datetime'].min().normalize()
).dt.days

# 'hour' joins the categorical features; 'datetime' and 'day' are helper
# columns that get dropped before modelling.
cat_cols += ['hour']
drop_cols += ['datetime', 'day']

feature time


### 4.6 类别特征整数编码

In [66]:
print("=" * 23)
print('post process')
print("=" * 23)
# Integer-encode every categorical column: codes run 0..k-1 in order of
# first appearance. pd.factorize replaces the former zip(unique(), range(nunique()))
# mapping, which silently dropped one category whenever a column contained
# NaN (unique() counts NaN, nunique() does not). With factorize a missing
# value gets the code -1.
for col in cat_cols:
    df_uni[col] = pd.factorize(df_uni[col])[0]

post process


### 4.7 重新创建 train, validation, test 数据集

In [68]:
# Boolean row masks over df_uni based on the day offset, plus the full label
# vector: days up to 6 form the complete training window, days up to 5 are
# used for fitting, day 6 for validation and day 7 for testing.
all_train_index = df_uni['day'].le(6).to_numpy()
train_index     = df_uni['day'].le(5).to_numpy()
valid_index     = df_uni['day'].eq(6).to_numpy()
test_index      = df_uni['day'].eq(7).to_numpy()
train_label     = df_uni['label'].to_numpy()

print("all_train_index.shape:", all_train_index.shape)
print("train_index.shape:", train_index.shape)
print("valid_index.shape:", valid_index.shape)
print("test_index.shape:", test_index.shape)
print("train_label.shape:", train_label.shape)

all_train_index.shape: (1100000,)
train_index.shape: (1100000,)
valid_index.shape: (1100000,)
test_index.shape: (1100000,)
train_label.shape: (1100000,)


### 4.8 删除无用特征

In [69]:
# Remove the columns listed in drop_cols; names that are not present in the
# frame are skipped.
for col in drop_cols:
    if col not in df_uni.columns:
        continue
    df_uni.drop(columns=[col], inplace=True)

print("=" * 23, "当前df_uni的形状：", "=" * 23, df_uni.shape, sep="\n")

当前df_uni的形状：
(1100000, 35)


### 4.9 类别特征One-Hot编码

In [None]:
# One-hot encode the categorical columns.
ohe = OneHotEncoder()
mtx_cat = ohe.fit_transform(df_uni[cat_cols])

# All remaining columns are treated as numeric. Keep them in DataFrame column
# order: building the list from a set yields an arbitrary order that can
# differ between interpreter runs, which would shuffle the feature columns
# and make results irreproducible.
num_cols = [c for c in df_uni.columns if c not in cat_cols]
mtx_num = sparse.csr_matrix(df_uni[num_cols].astype(float).values)

# Final feature matrix: numeric columns first, then the one-hot block, in CSR
# format so it can be row-indexed with boolean masks.
mtx_uni = sparse.hstack([mtx_num, mtx_cat]).tocsr()

### 4.10 模型数据准备

In [70]:
def col_filter(mtx_train, y_train, mtx_test, func=chi2, percentile=90):
    """Keep only the best-scoring feature columns of a train/test matrix pair.

    A ``SelectPercentile`` selector is fitted on the training matrix and
    labels only, then the same column selection is applied to both matrices.

    Args:
        mtx_train: Training feature matrix.
        y_train: Training labels, one per row of ``mtx_train``.
        mtx_test: Test feature matrix with the same columns as ``mtx_train``.
        func: Scoring function handed to ``SelectPercentile``.
        percentile: Percentage of columns to keep.

    Returns:
        Tuple ``(mtx_train, mtx_test)`` reduced to the selected columns.
    """
    feature_select = SelectPercentile(func, percentile=percentile)
    mtx_train = feature_select.fit_transform(mtx_train, y_train)
    mtx_test = feature_select.transform(mtx_test)
    return mtx_train, mtx_test

# Select features using all training-window rows, then apply the same
# selection to the test rows.
all_train_x, test_x = col_filter(
    mtx_uni[all_train_index, :],
    train_label[all_train_index],
    mtx_uni[test_index, :]
)

# all_train_x only contains the rows picked by all_train_index, so the
# full-length masks are restricted to those same rows before indexing it.
# Truncating the mask with [:n] is only right when every training row
# precedes every other row in df_uni.
train_x = all_train_x[train_index[all_train_index], :]
train_y = train_label[train_index]

val_x = all_train_x[valid_index[all_train_index], :]
val_y = train_label[valid_index]

NameError: name 'all_train_x' is not defined

## 5.训练模型

### 5.1 定义模型评估指标(metric)

In [71]:
# Confusion matrix
def confusion_matrix():
    """Placeholder for a confusion-matrix metric; not implemented yet."""
    return None

# Precision and recall
def precision_recall():
    """Placeholder for precision/recall metrics; not implemented yet."""
    return None

# F1 score
def lgb_f1(labels, preds):
    """Evaluation metric: F1 of the rounded predictions.

    Args:
        labels: True labels.
        preds: Predicted scores; rounded with ``np.round`` before scoring.

    Returns:
        Tuple ``('f1', score, True)``. The file passes this function as
        ``eval_metric`` to LightGBM; the trailing ``True`` is presumably the
        "higher is better" flag of that interface.
    """
    hard_preds = np.round(preds)
    return 'f1', f1_score(labels, hard_preds), True

### 5.2 模型训练

In [None]:
print("=" * 23, 'training...', "=" * 23, sep="\n")

# Binary LightGBM classifier with a fixed seed.
lgb = LGBMClassifier(
    objective='binary',
    random_seed=2019,
    n_jobs=-1,
    # boosting schedule
    n_estimators=4000,
    learning_rate=0.1,
    # tree shape
    num_leaves=64,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=9,
    # row / column subsampling
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    # regularisation
    reg_alpha=1,
    reg_lambda=5,
)

# Fit on the training split while evaluating the custom F1 metric on both the
# training and the validation split.
lgb.fit(
    train_x,
    train_y,
    eval_set=[(train_x, train_y), (val_x, val_y)],
    eval_names=['train', 'val'],
    eval_metric=lgb_f1,
    early_stopping_rounds=100,
    verbose=10,
)

print('best score', lgb.best_score_)

## 6.模型预测

In [None]:
print("=" * 23, "predict:", "=" * 23, sep="\n")

# Refit on the complete training window with the number of trees fixed to the
# best iteration of the previous fit, then predict labels for the test rows.
all_train_y = train_label[all_train_index]
lgb.n_estimators = lgb.best_iteration_
lgb.fit(all_train_x, all_train_y)
test_y = lgb.predict(test_x)

## 7.生成submission文件

In [None]:
print("=" * 23)
print("generate submission file")
print("=" * 23)
# Build the frame from raw arrays so that a length mismatch between the test
# ids and the predictions raises immediately; pd.concat(axis=1) would align on
# the index and silently pad the shorter side with NaN.
# NOTE(review): this still assumes test_y is in the same row order as df_test
# - confirm against how test_index selects rows.
df_sub = pd.DataFrame({"sid": df_test["sid"].values, "label": test_y})

# Write a timestamped CSV (month-day_hour-minute-second) into the working directory.
df_sub.to_csv("submit-{}.csv".format(datetime.now().strftime("%m%d_%H%M%S")),
              sep = ",",
              index = False)