In [1]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# 由于电脑内存的缘故,无法将所有数据集加载进来,只取部分数据,另外没有进行预测,只进行了交叉验证

dtypes = {
        'ip': 'uint32',
        'app': 'uint16',
        'device': 'uint16',
        'os': 'uint16',
        'channel': 'uint16',
        'is_attributed': 'uint8',
        'click_id': 'uint32',
    }

# 去除attributed_time字段,该字段和is_attributed重合
x_train = pd.read_csv("./data/train.csv", nrows=10000000, parse_dates=['click_time'], dtype=dtypes,
                     usecols=['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'])

print("train shape: {}".format(x_train.shape))
print("train columns: {}".format(x_train.columns))

train shape: (10000000, 7)
train columns: Index(['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed'], dtype='object')


In [4]:
# 0-1的比例为533
x_train.device.value_counts()

1       9381146
2        456617
3032     104393
0         46476
59         1618
40          462
6           458
16          334
18          247
33          204
21          190
154         151
3033        151
37          145
30          126
46          123
114         122
7           121
88          117
109         113
67          111
748         103
136          96
78           95
82           95
97           95
374          95
50           91
211          81
60           75
         ...   
924           1
919           1
915           1
913           1
911           1
903           1
901           1
892           1
886           1
875           1
870           1
972           1
977           1
984           1
1047          1
1067          1
1059          1
1058          1
1055          1
1053          1
1048          1
1046          1
1002          1
1044          1
1036          1
1016          1
1014          1
1012          1
1003          1
674           1
Name: device, Length: 94

In [5]:
# 0-1的比例为792
x_train.is_attributed[x_train.device.isin([1, 2, 3032])].value_counts()

0    9929624
1      12532
Name: is_attributed, dtype: int64

In [83]:
# 从点击时间提取出日,时两个特征
x_train['day'] = x_train['click_time'].dt.day.astype('uint8')
x_train['hour'] = x_train['click_time'].dt.hour.astype('uint8')
gc.collect()
x_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,day,hour
0,83230,3,1,13,379,2017-11-06 14:32:21,0,6,14
1,17357,3,1,19,379,2017-11-06 14:33:34,0,6,14
2,35810,3,1,13,379,2017-11-06 14:34:12,0,6,14
3,45745,14,1,13,478,2017-11-06 14:34:52,0,6,14
4,161007,3,1,13,379,2017-11-06 14:35:08,0,6,14


In [84]:
# 对ip,day,hour做聚合
gp = x_train[['ip', 'day', 'hour', 'channel']].groupby(by=['ip', 'day', 'hour'])[['channel']].count().reset_index().rename(index=str, columns={"channel": 'qty'})
x_train = x_train.merge(gp, on=['ip', 'day', 'hour'], how='left')
del gp
gc.collect()
x_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,day,hour,qty
0,83230,3,1,13,379,2017-11-06 14:32:21,0,6,14,1
1,17357,3,1,19,379,2017-11-06 14:33:34,0,6,14,1
2,35810,3,1,13,379,2017-11-06 14:34:12,0,6,14,1
3,45745,14,1,13,478,2017-11-06 14:34:52,0,6,14,1
4,161007,3,1,13,379,2017-11-06 14:35:08,0,6,14,1


In [85]:
# 对ip,app做聚合
gp = x_train[['ip', 'app', 'channel']].groupby(by=['ip', 'app'])[['channel']].count().reset_index().rename(index=str, columns={"channel": 'ip_app_count'})
x_train = x_train.merge(gp, on=['ip', 'app'], how='left')
del gp
gc.collect()
x_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,day,hour,qty,ip_app_count
0,83230,3,1,13,379,2017-11-06 14:32:21,0,6,14,1,206
1,17357,3,1,19,379,2017-11-06 14:33:34,0,6,14,1,138
2,35810,3,1,13,379,2017-11-06 14:34:12,0,6,14,1,39
3,45745,14,1,13,478,2017-11-06 14:34:52,0,6,14,1,487
4,161007,3,1,13,379,2017-11-06 14:35:08,0,6,14,1,26


In [86]:
# 对ip,app,os做聚合
gp = x_train[['ip', 'app', 'os', 'channel']].groupby(by=['ip', 'app', 'os'])[['channel']].count().reset_index().rename(index=str, columns={"channel": 'ip_app_os_count'})
x_train = x_train.merge(gp, on=['ip', 'app', 'os'], how='left')
del gp
gc.collect()
x_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,day,hour,qty,ip_app_count,ip_app_os_count
0,83230,3,1,13,379,2017-11-06 14:32:21,0,6,14,1,206,80
1,17357,3,1,19,379,2017-11-06 14:33:34,0,6,14,1,138,23
2,35810,3,1,13,379,2017-11-06 14:34:12,0,6,14,1,39,7
3,45745,14,1,13,478,2017-11-06 14:34:52,0,6,14,1,487,116
4,161007,3,1,13,379,2017-11-06 14:35:08,0,6,14,1,26,4


In [87]:
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000000 entries, 0 to 9999999
Data columns (total 12 columns):
ip                 uint32
app                uint16
device             uint16
os                 uint16
channel            uint16
click_time         datetime64[ns]
is_attributed      uint8
day                uint8
hour               uint8
qty                int64
ip_app_count       int64
ip_app_os_count    int64
dtypes: datetime64[ns](1), int64(3), uint16(4), uint32(1), uint8(3)
memory usage: 524.5 MB


In [88]:
# 将一些字段的分配内存设置小
x_train['qty'] = x_train['qty'].astype('uint16')
x_train['ip_app_count'] = x_train['ip_app_count'].astype('uint16')
x_train['ip_app_os_count'] = x_train['ip_app_os_count'].astype('uint16')
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000000 entries, 0 to 9999999
Data columns (total 12 columns):
ip                 uint32
app                uint16
device             uint16
os                 uint16
channel            uint16
click_time         datetime64[ns]
is_attributed      uint8
day                uint8
hour               uint8
qty                uint16
ip_app_count       uint16
ip_app_os_count    uint16
dtypes: datetime64[ns](1), uint16(7), uint32(1), uint8(3)
memory usage: 352.9 MB


In [89]:
x_train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,day,hour,qty,ip_app_count,ip_app_os_count
0,83230,3,1,13,379,2017-11-06 14:32:21,0,6,14,1,206,80
1,17357,3,1,19,379,2017-11-06 14:33:34,0,6,14,1,138,23
2,35810,3,1,13,379,2017-11-06 14:34:12,0,6,14,1,39,7
3,45745,14,1,13,478,2017-11-06 14:34:52,0,6,14,1,487,116
4,161007,3,1,13,379,2017-11-06 14:35:08,0,6,14,1,26,4


In [90]:
x_val = x_train[:1000000]
x_train = x_train[1000000:]
print("validate shape: {}".format(x_val.shape))
print("train shape: {}".format(x_train.shape))

validate shape: (1000000, 12)
train shape: (9000000, 12)


In [91]:
import lightgbm as lgb
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                 feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10, categorical_features=None):
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric':metrics,
        'learning_rate': 0.01,
        #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 8,
        'verbose': 0,
    }

    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    evals_results = {}

    bst1 = lgb.train(lgb_params, 
                     xgtrain, 
                     valid_sets=[xgtrain, xgvalid], 
                     valid_names=['train','valid'], 
                     evals_result=evals_results, 
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=10, 
                     feval=feval)

    n_estimators = bst1.best_iteration
    print("\nModel Report")
    print("n_estimators : ", n_estimators)
    print(metrics+":", evals_results['valid'][metrics][n_estimators-1])

    return bst1

In [92]:
target = 'is_attributed'
predictors = ['app','device','os', 'channel', 'hour', 'day', 'qty', 'ip_app_count', 'ip_app_os_count']
categorical = ['app','device','os', 'channel', 'hour']
params = {
    'learning_rate': 0.1,
    #'is_unbalance': 'true', # replaced with scale_pos_weight argument
    'num_leaves': 7,  # we should let it be smaller than 2^(max_depth)
    'max_depth': 3,  # -1 means no limit
    'min_child_samples': 100,  # Minimum number of data need in a child(min_data_in_leaf)
    'max_bin': 100,  # Number of bucketed bin for feature values
    'subsample': 0.7,  # Subsample ratio of the training instance.
    'subsample_freq': 1,  # frequence of subsample, <=0 means no enable
    'colsample_bytree': 0.7,  # Subsample ratio of columns when constructing each tree.
    'min_child_weight': 0,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
    'scale_pos_weight':99 # because training data is extremely unbalanced 
}

bst = lgb_modelfit_nocv(params, 
                        x_train, 
                        x_val, 
                        predictors, 
                        target, 
                        objective='binary', 
                        metrics='auc',
                        early_stopping_rounds=50, 
                        verbose_eval=True, 
                        num_boost_round=300, 
                        categorical_features=categorical)


preparing validation datasets




Training until validation scores don't improve for 50 rounds.
[10]	train's auc: 0.956842	valid's auc: 0.950153
[20]	train's auc: 0.962326	valid's auc: 0.955538
[30]	train's auc: 0.965	valid's auc: 0.957476
[40]	train's auc: 0.967409	valid's auc: 0.95961
[50]	train's auc: 0.969603	valid's auc: 0.961548
[60]	train's auc: 0.971086	valid's auc: 0.962994
[70]	train's auc: 0.972004	valid's auc: 0.963362
[80]	train's auc: 0.973033	valid's auc: 0.963827
[90]	train's auc: 0.973706	valid's auc: 0.964159
[100]	train's auc: 0.974197	valid's auc: 0.964323
[110]	train's auc: 0.974521	valid's auc: 0.964496
[120]	train's auc: 0.974937	valid's auc: 0.96476
[130]	train's auc: 0.975224	valid's auc: 0.964972
[140]	train's auc: 0.975565	valid's auc: 0.965039
[150]	train's auc: 0.975839	valid's auc: 0.965019
[160]	train's auc: 0.97611	valid's auc: 0.965025
[170]	train's auc: 0.976421	valid's auc: 0.965087
[180]	train's auc: 0.976593	valid's auc: 0.964884
Early stopping, best iteration is:
[133]	train's auc: