In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import gc
import time
import numpy as np
import pandas as pd
from sklearn.model_selection  import train_test_split
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test_supplement.csv', 'train_sample.csv', 'train.csv', 'sample_submission.csv', 'test.csv']


In [2]:
# 为什么train里没有attributed_time？从后面可以知道这一行缺失值太多了。故删除这列特征
train_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_columns  = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']
# 重新指定下数据内型，节省内存
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

In [3]:
train = pd.read_csv("../input/train_sample.csv",dtype=dtypes)
train.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,29540,3,1,42,489,2017-11-08 03:57:46,,0
1,26777,11,1,25,319,2017-11-09 11:02:14,,0
2,140926,12,1,13,140,2017-11-07 04:36:14,,0
3,69375,2,1,19,377,2017-11-09 13:17:20,,0
4,119166,9,2,15,445,2017-11-07 12:11:37,,0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
ip                 100000 non-null uint32
app                100000 non-null uint16
device             100000 non-null uint16
os                 100000 non-null uint16
channel            100000 non-null uint16
click_time         100000 non-null object
attributed_time    251 non-null object
is_attributed      100000 non-null uint8
dtypes: object(2), uint16(4), uint32(1), uint8(1)
memory usage: 2.8+ MB


In [5]:
train.isnull().sum()  # 100000行样本，attributed_time几乎全部丢失了，故不用这列特征

ip                     0
app                    0
device                 0
os                     0
channel                0
click_time             0
attributed_time    99749
is_attributed          0
dtype: int64

In [6]:
#import featuretools as tf 自动特征提取以后熟悉了再用


del train  # 删除前面创建的test数据集
import gc
gc.collect()

18

In [7]:
#选择我们命名的列train_columns为特征列
train = pd.read_csv("../input/train_sample.csv",usecols=train_columns, dtype=dtypes)
train.head()  # 比之前少了attributed_time这列特征

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,29540,3,1,42,489,2017-11-08 03:57:46,0
1,26777,11,1,25,319,2017-11-09 11:02:14,0
2,140926,12,1,13,140,2017-11-07 04:36:14,0
3,69375,2,1,19,377,2017-11-09 13:17:20,0
4,119166,9,2,15,445,2017-11-07 12:11:37,0


In [8]:

# test = pd.read_csv("../input/test.csv",dtype=dtypes)
# test.head()   #内存瞬间上了1个多G

# del test  # 删除前面创建的test数据集
# import gc
# gc.collect()

In [9]:
#用我们自己指定的特征，其实这个test数据集与上面注释掉的test没区别
test = pd.read_csv("../input/test.csv",usecols=test_columns,dtype=dtypes)
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [10]:

# 给定数据集df，生成‘dow’和‘doy’特征后,删除'click_time', 'datetime'这两列特征,返回具有新特征的数据集df
def timeFeatures(df):
    # Make some new features with click_time column
    df['datetime'] = pd.to_datetime(df['click_time'])
    df['dow']      = df['datetime'].dt.dayofweek
    df["doy"]      = df["datetime"].dt.dayofyear
    #df["dteom"]    = df["datetime"].dt.daysinmonth - df["datetime"].dt.day
    df.drop(['click_time', 'datetime'], axis=1, inplace=True)  # 生成‘dow’和‘doy’特征后，删除'click_time', 'datetime'这两列特征
    return df  # 返回输入的数据集


In [11]:
train_for_fun = train.copy()

# 测试例子1
timeFeatures(train_for_fun)
train_for_fun.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,dow,doy
0,29540,3,1,42,489,0,2,312
1,26777,11,1,25,319,0,3,313
2,140926,12,1,13,140,0,1,311
3,69375,2,1,19,377,0,3,313
4,119166,9,2,15,445,0,1,311


In [12]:
# 测试例子2
def dataPreProcessTime(df):
    df['click_time'] = pd.to_datetime(df['click_time']).dt.date
    df['click_time'] = df['click_time'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
    return df
train_for_fun2 = train.copy()
dataPreProcessTime(train_for_fun2)
train_for_fun2.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,29540,3,1,42,489,20171108,0
1,26777,11,1,25,319,20171109,0
2,140926,12,1,13,140,20171107,0
3,69375,2,1,19,377,20171109,0
4,119166,9,2,15,445,20171107,0


In [13]:
train = dataPreProcessTime(train)
test = dataPreProcessTime(test)
gc.collect()

59

In [14]:
# Drop the IP and the columns from target
y = train['is_attributed']
train.drop(['is_attributed'], axis=1, inplace=True)

In [15]:
def timeFeatures(df):
    # Make some new features with click_time column
    df['datetime'] = pd.to_datetime(df['click_time'])
    df['dow']      = df['datetime'].dt.dayofweek
    df["doy"]      = df["datetime"].dt.dayofyear
    #df["dteom"]    = df["datetime"].dt.daysinmonth - df["datetime"].dt.day
    df.drop(['click_time', 'datetime'], axis=1, inplace=True)
    return df

In [16]:
# Drop IP and ID from test rows
sub = pd.DataFrame()
sub['click_id'] = test['click_id'].astype('int')
test.drop(['click_id'], axis=1, inplace=True)
gc.collect()

21

In [17]:
# Some feature engineering
nrow_train = train.shape[0]
merge = pd.concat([train, test])
del train, test
gc.collect()

0

In [18]:
# Count the number of clicks by ip
ip_count = merge.groupby(['ip'])['app'].count().reset_index()
ip_count.columns = ['ip', 'clicks_by_ip']
merge = pd.merge(merge, ip_count, on='ip', how='left', sort=False)
merge['clicks_by_ip'] = merge['clicks_by_ip'].astype('uint16')
merge.drop('ip', axis=1, inplace=True)  #去掉ip特征值是由原因的，它的AUC贡献几乎没有
# Pranav Kernel: https://www.kaggle.com/pranav84/xgboost-on-hist-mode-ip-addresses-dropped
# Andy Kernel: https://www.kaggle.com/aharless/jo-o-s-xgboost-with-memory-usage-enhancements
# https://www.kaggle.com/aharless/predicting-with-just-ip-ranges-as-coded 此时AUC才

In [19]:
train = merge[:nrow_train]
test = merge[nrow_train:]

In [20]:
del merge
gc.collect()

42

In [21]:
# Set the params(this params from Pranav kernel) for xgboost model
params = {'eta': 0.3,
          'tree_method': "hist",
          'grow_policy': "lossguide",
          'max_leaves': 1400,  
          'max_depth': 0, 
          'subsample': 0.9, 
          'colsample_bytree': 0.7, 
          'colsample_bylevel':0.7,
          'min_child_weight':0,
          'alpha':4,
          'objective': 'binary:logistic', 
          'scale_pos_weight':9,
          'eval_metric': 'auc', 
          'nthread':8,
          'random_state': 99, 
          'silent': True}

In [22]:
# Change this for validation with 10% from train
using_test = True

if (using_test == False):
    # Get 10% of train dataset to use as validation
    # 原始训练集 = 训练集 + 验证集
    # 类似x1 = X_train, x2 = X_test, y1 = y_train, y2 = y_test
    x1, x2, y1, y2 = train_test_split(train, y, test_size=0.1, shuffle=False)
    del train, y
    dtrain = xgb.DMatrix(x1, y1)
    del x1, y1
    dvalid = xgb.DMatrix(x2, y2)
    del x2, y2 
    watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
    earlystop = 20
    nrounds = 300
    verbose = 2
else:
    dtrain = xgb.DMatrix(train, y)
    del train, y
    watchlist = [(dtrain, 'train')]
    verbose = 1
    earlystop = None
    nrounds = 14
gc.collect()

dtest = xgb.DMatrix(test)
del test
gc.collect()

model = xgb.train(params, dtrain, nrounds, watchlist, maximize=True, 
                  early_stopping_rounds=earlystop, verbose_eval=verbose)

del dtrain


[01:03:39] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
[0]	train-auc:0.924801
[1]	train-auc:0.922497
[2]	train-auc:0.931988
[3]	train-auc:0.941509
[4]	train-auc:0.948939
[5]	train-auc:0.941742
[6]	train-auc:0.95212
[7]	train-auc:0.952433
[8]	train-auc:0.947973
[9]	train-auc:0.948529
[10]	train-auc:0.948172
[11]	train-auc:0.980696
[12]	train-auc:0.986234
[13]	train-auc:0.987095


In [23]:
sub['is_attributed'] = model.predict(dtest, ntree_limit=model.best_ntree_limit)
# sub.to_csv('xgb_sub_2.csv',index=False)

In [27]:
train = pd.read_csv("../input/train_sample.csv",usecols=train_columns, dtype=dtypes)
y = train['is_attributed']
y.value_counts()

0    99749
1      251
Name: is_attributed, dtype: int64

In [37]:
sub[sub.is_attributed == 0.008108175]

Unnamed: 0,click_id,is_attributed
2,2,0.008108
87,87,0.008108
1475,1473,0.008108
1893,1893,0.008108
2021,2021,0.008108
2278,2278,0.008108
3197,3197,0.008108
3829,3828,0.008108
4239,4239,0.008108
4547,4547,0.008108


In [38]:
len(sub)

18790469