In [1]:
import gc
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
from logzero import logger
import sys

nthread = 8

def read_feather(name: str):
    logger.info("loading {}".format(name))
    df = pd.read_feather(name, nthreads=nthread).astype("float")
    logger.info(df.columns)
    return df

def show_top(var_names):
    print("{}{: >25}{}{: >12}{}".format('|','Variable Name','|','Memory','|'))
    print(" ------------------------------------ ")
    for var_name in var_names:
        memory = sys.getsizeof(eval(var_name))//1024//1024//1024
        if memory == 0:
            continue
        print("{}{: >25}{}{: >10}GB{}".format('|', var_name, '|', memory, '|'))

# Extract

小さいデータで学習して、データを間引く

In [2]:
ls -lha ../data/

total 217G
drwxrwxr-x 2 ubuntu ubuntu 4.0K May  2 12:51 [0m[01;34m.[0m/
drwxrwxr-x 7 ubuntu ubuntu 4.0K Apr 27 18:29 [01;34m..[0m/
-rw-rw-r-- 1 ubuntu ubuntu 3.3G Apr 25 19:19 basic_table
-rw-rw-r-- 1 ubuntu ubuntu 850M Apr 25 19:19 channel_categorical
-rw-rw-r-- 1 ubuntu ubuntu 1.9G Apr 25 19:19 click_count_table
-rw-rw-r-- 1 ubuntu ubuntu 802M Apr 25 19:19 click_time_interval_ip_device_os_app_hash
-rw-rw-r-- 1 ubuntu ubuntu 706M Apr 26 07:28 down_sampling_train
-rw-rw-r-- 1 ubuntu ubuntu 3.2G Apr 25 19:19 intervals_table
-rw-rw-r-- 1 ubuntu ubuntu  10G Apr 25 19:19 misc_counts
-rw-rw-r-- 1 ubuntu ubuntu  16G Apr 25 19:19 multi_basic
-rw-rw-r-- 1 ubuntu ubuntu  12G Apr 25 21:45 next_prev_channels
-rw-r--r-- 1 ubuntu ubuntu  16G Apr 28 04:12 prev_next_click_basic
-rw-r--r-- 1 ubuntu ubuntu  31G Apr 28 05:27 prev_next_click_for_2_columns
-rw-r--r-- 1 ubuntu ubuntu  31G Apr 28 15:36 prev_next_click_for_3_columns
-rw-r--r-- 1 ubuntu ubuntu  19G Apr 29 12:27 prev_next_click_for_45_col

In [3]:
prev_importances = [
 (1271, 'next_channel_by_device'),
 (1030, 'prev_channel_by_device'),
 (524, 'click_count_by_ip_os_device_dow_hour'),
 (522, 'second_of_day'),
 (487, 'ip_app_os_var'),
 (471, 'app'),
 (435, 'ip_dow_channel_var'),
 (410, 'channel'),
 (389, 'next_channel_by_ip_os'),
 (377, 'sin_of_day'),
 (373, 'cos_of_day'),
 (366, 'ip_app_count'),
 (353, 'device_app'),
 (351, 'click_count_by_ip'),
 (343, 'ip_app_channel_mean'),
 (332, 'ip_device_countuniq'),
 (332, 'click_time_interval_ip_device_os_app_hash'),
 (328, 'ip_app_countuniq'),
 (324, 'app_channel'),
 (321, 'ip_channel_countuniq'),
 (321, 'ip_app_os_countuniq'),
 (321, 'ip_app_channel_var'),
 (313, 'ip_device_os_app_countuniq'),
 (307, 'device_app_channel'),
 (307, 'click_time_interval_by_ip'),
 (299, 'click_count_by_ip_os_device_dow'),
 (297, 'next_channel_by_ip_os_device_app'),
 (292, 'rank_by_ip_os_device_dow'),
 (291, 'ip'),
 (278, 'next_channel_by_ip'),
 (273, 'app_channel_countuniq'),
 (272, 'ip_device_os_cumcount'),
 (258, 'ip_os'),
 (257, 'next_channel_by_ip_app'),
 (256, 'os'),
 (247, 'next_channel_by_ip_device'),
 (240, 'ip_dow_hour_count'),
 (235, 'next_channel_by_ip_os_device'),
 (229, 'ip_dow_hour_countuniq'),
 (228, 'minute'),
 (228, 'click_time_interval_by_ip_os_device'),
 (224, 'hour')]
prev_columns = [c for i, c in prev_importances]
prev_columns

['next_channel_by_device',
 'prev_channel_by_device',
 'click_count_by_ip_os_device_dow_hour',
 'second_of_day',
 'ip_app_os_var',
 'app',
 'ip_dow_channel_var',
 'channel',
 'next_channel_by_ip_os',
 'sin_of_day',
 'cos_of_day',
 'ip_app_count',
 'device_app',
 'click_count_by_ip',
 'ip_app_channel_mean',
 'ip_device_countuniq',
 'click_time_interval_ip_device_os_app_hash',
 'ip_app_countuniq',
 'app_channel',
 'ip_channel_countuniq',
 'ip_app_os_countuniq',
 'ip_app_channel_var',
 'ip_device_os_app_countuniq',
 'device_app_channel',
 'click_time_interval_by_ip',
 'click_count_by_ip_os_device_dow',
 'next_channel_by_ip_os_device_app',
 'rank_by_ip_os_device_dow',
 'ip',
 'next_channel_by_ip',
 'app_channel_countuniq',
 'ip_device_os_cumcount',
 'ip_os',
 'next_channel_by_ip_app',
 'os',
 'next_channel_by_ip_device',
 'ip_dow_hour_count',
 'next_channel_by_ip_os_device',
 'ip_dow_hour_countuniq',
 'minute',
 'click_time_interval_by_ip_os_device',
 'hour']

In [4]:
%%time

data_files = [
    "../data/basic_table",
    "../data/click_count_table",
    "../data/click_time_interval_ip_device_os_app_hash",
    "../data/intervals_table",
    "../data/misc_counts",
    "../data/multi_basic",
    "../data/next_prev_channels",
    "../data/prev_next_click_basic",
#     "../data/prev_next_click_for_2_columns",
#     "../data/prev_next_click_for_3_columns",
#     "../data/prev_next_click_for_45_columns",
    "../data/ranks_table",
    "../data/time_of_day",
    "../data/time_table",
]

dfs = []
for file in data_files:
    df = read_feather(file)
    dropping = [c for c in df.columns if c not in prev_columns]
    df.drop(columns=dropping, inplace=True)
    dfs.append(df)

[I 180502 14:13:30 <ipython-input-1-29f11ab86043>:14] loading ../data/basic_table
[I 180502 14:14:11 <ipython-input-1-29f11ab86043>:16] Index(['app', 'channel', 'click_id', 'device', 'ip', 'is_attributed', 'os'], dtype='object')
[I 180502 14:14:13 <ipython-input-1-29f11ab86043>:14] loading ../data/click_count_table
[I 180502 14:14:35 <ipython-input-1-29f11ab86043>:16] Index(['click_count_by_ip', 'click_count_by_ip_os_device_dow',
           'click_count_by_ip_os_device_dow_hour'],
          dtype='object')
[I 180502 14:14:37 <ipython-input-1-29f11ab86043>:14] loading ../data/click_time_interval_ip_device_os_app_hash
[I 180502 14:14:47 <ipython-input-1-29f11ab86043>:16] Index(['click_time_interval_ip_device_os_app_hash', 'first_click_interval_ip_device_os_app_hash'], dtype='object')
[I 180502 14:14:48 <ipython-input-1-29f11ab86043>:14] loading ../data/intervals_table
[I 180502 14:15:30 <ipython-input-1-29f11ab86043>:16] Index(['click_time_interval_by_ip', 'click_time_interval_by_ip_os_d

CPU times: user 1min 37s, sys: 2min 46s, total: 4min 24s
Wall time: 13min 12s


In [5]:
click_data = pd.concat(dfs, axis=1, copy=False)

In [6]:
del dfs
gc.collect()

105

In [7]:
show_top(dir())

|            Variable Name|      Memory|
 ------------------------------------ 
|               click_data|        63GB|
|                       df|         3GB|


In [8]:
click_data.columns

Index(['app', 'channel', 'ip', 'os', 'click_count_by_ip',
       'click_count_by_ip_os_device_dow',
       'click_count_by_ip_os_device_dow_hour',
       'click_time_interval_ip_device_os_app_hash',
       'click_time_interval_by_ip', 'click_time_interval_by_ip_os_device',
       'ip_device_os_cumcount', 'ip_channel_countuniq',
       'ip_dow_hour_countuniq', 'ip_app_countuniq', 'ip_app_os_countuniq',
       'ip_device_countuniq', 'app_channel_countuniq',
       'ip_device_os_app_countuniq', 'ip_dow_hour_count', 'ip_app_count',
       'ip_dow_channel_var', 'ip_app_os_var', 'ip_app_channel_var',
       'ip_app_channel_mean', 'ip_os', 'device_app', 'app_channel',
       'device_app_channel', 'next_channel_by_device',
       'prev_channel_by_device', 'next_channel_by_ip', 'next_channel_by_ip_os',
       'next_channel_by_ip_device', 'next_channel_by_ip_app',
       'next_channel_by_ip_os_device', 'next_channel_by_ip_os_device_app',
       'rank_by_ip_os_device_dow', 'second_of_day', 'sin_o

In [9]:
click_data["dow"] = read_feather("../data/time_table")["dow"]

[I 180502 14:31:30 <ipython-input-1-29f11ab86043>:14] loading ../data/time_table
[I 180502 14:31:33 <ipython-input-1-29f11ab86043>:16] Index(['dow', 'hour', 'minute', 'second'], dtype='object')


In [10]:
click_data = click_data[click_data["dow"] <= 3]

In [18]:
show_top(dir())

|            Variable Name|      Memory|
 ------------------------------------ 
|               click_data|        61GB|


In [17]:
del df

In [12]:
is_attributed = read_feather("../data/basic_table")[:click_data.shape[0]]

[I 180502 14:34:16 <ipython-input-1-29f11ab86043>:14] loading ../data/basic_table
[I 180502 14:34:56 <ipython-input-1-29f11ab86043>:16] Index(['app', 'channel', 'click_id', 'device', 'ip', 'is_attributed', 'os'], dtype='object')


In [13]:
click_data["is_attributed"] = is_attributed["is_attributed"]

In [14]:
del is_attributed
gc.collect()

7

In [15]:
click_data.to_feather("../data/tmp")

In [2]:
click_data = read_feather("../data/tmp")

[I 180502 13:57:55 <ipython-input-1-29f11ab86043>:14] loading ../data/tmp
[I 180502 14:07:42 <ipython-input-1-29f11ab86043>:16] Index(['app', 'channel', 'ip', 'os', 'click_count_by_ip',
           'click_count_by_ip_os_device_dow',
           'click_count_by_ip_os_device_dow_hour',
           'click_time_interval_ip_device_os_app_hash',
           'click_time_interval_by_ip', 'click_time_interval_by_ip_os_device',
           'ip_device_os_cumcount', 'ip_channel_countuniq',
           'ip_dow_hour_countuniq', 'ip_app_countuniq', 'ip_app_os_countuniq',
           'ip_device_countuniq', 'app_channel_countuniq',
           'ip_device_os_app_countuniq', 'ip_dow_hour_count', 'ip_app_count',
           'ip_app_os_count', 'ip_dow_channel_var', 'ip_app_os_var',
           'ip_app_channel_var', 'ip_app_channel_mean', 'ip_os_device_app_hash',
           'ip_os', 'device_app', 'app_channel', 'device_app_channel',
           'next_channel_by_device', 'prev_channel_by_device',
           'next_chann

In [None]:
%%time
def do_train(df, day):
    logger.info("train started")
    train = df[df["dow"] != day]
    logger.info("train created")
    test = df[df["dow"] == day]
    logger.info("test created")
    
    y_train = train["is_attributed"].astype("int")
    train.drop(columns=["is_attributed"], inplace=True)
    test.drop(columns=["is_attributed"], inplace=True)
    gc.collect()
    
    logger.info("dropped")
    train = train.values
    gc.collect()
    
    logger.info("Generating matrix...")
    dtrain = xgb.DMatrix(train, y_train, nthread=nthread)
    del train, y_train
    gc.collect()
    
    logger.info("testing...")
    params = {'eta': 0.6,
          'tree_method': "hist",
          'grow_policy': "lossguide",
          'max_leaves': 1400,
          'max_depth': 0,
          'subsample': 0.9,
          'colsample_bytree': 0.7,
          'colsample_bylevel': 0.7,
          'min_child_weight': 0,
          'alpha': 4,
          'objective': 'binary:logistic',
          'scale_pos_weight': 9,
          'eval_metric': 'auc',
          'nthread': nthread,
          'random_state': 99,
          'silent': False}
    
    watchlist = [(dtrain, 'train')]
    model = xgb.train(params, dtrain, 15, watchlist, maximize=True, verbose_eval=1)
    del dtrain
    gc.collect()

    logger.info("Generating test matrix")
    test = xgb.DMatrix(test.values, nthread=nthread)
    gc.collect()
    
    return model.predict(test, ntree_limit=model.best_ntree_limit)
    
predicts = []
for day in range(4):
    logger.info("day {}".format(day))
    predicts.append(do_train(click_data, day))

[I 180502 14:48:44 <timed exec>:53] day 0
[I 180502 14:48:44 <timed exec>:2] train started
[I 180502 14:53:54 <timed exec>:4] train created
[I 180502 14:53:56 <timed exec>:6] test created
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
[I 180502 14:54:26 <timed exec>:13] dropped
[I 180502 14:54:26 <timed exec>:17] Generating matrix...
[I 180502 14:58:02 <timed exec>:22] testing...


[0]	train-auc:0.971319
[1]	train-auc:0.973401
[2]	train-auc:0.975875
[3]	train-auc:0.978341
[4]	train-auc:0.980457
[5]	train-auc:0.98101
[6]	train-auc:0.982078
[7]	train-auc:0.982384
[8]	train-auc:0.982537
[9]	train-auc:0.983656
[10]	train-auc:0.984648
[11]	train-auc:0.98559
[12]	train-auc:0.985688
[13]	train-auc:0.985622
[14]	train-auc:0.985923


[I 180502 15:18:50 <timed exec>:45] Generating test matrix
[I 180502 15:18:56 <timed exec>:53] day 1
[I 180502 15:18:56 <timed exec>:2] train started
[I 180502 15:19:21 <timed exec>:4] train created
[I 180502 15:19:34 <timed exec>:6] test created
[I 180502 15:20:02 <timed exec>:13] dropped
[I 180502 15:20:02 <timed exec>:17] Generating matrix...
[I 180502 15:20:47 <timed exec>:22] testing...


[0]	train-auc:0.968843
[1]	train-auc:0.975119
[2]	train-auc:0.976101
[3]	train-auc:0.978401
[4]	train-auc:0.980212
[5]	train-auc:0.980661
[6]	train-auc:0.981355
[7]	train-auc:0.982051
[8]	train-auc:0.98253
[9]	train-auc:0.983299
[10]	train-auc:0.984378
[11]	train-auc:0.985384
[12]	train-auc:0.986058
[13]	train-auc:0.986484
[14]	train-auc:0.986055


[I 180502 15:36:09 <timed exec>:45] Generating test matrix
[I 180502 15:36:49 <timed exec>:53] day 2
[I 180502 15:36:49 <timed exec>:2] train started
[I 180502 15:37:14 <timed exec>:4] train created
[I 180502 15:37:27 <timed exec>:6] test created
[I 180502 15:37:55 <timed exec>:13] dropped
[I 180502 15:37:55 <timed exec>:17] Generating matrix...
[I 180502 15:38:41 <timed exec>:22] testing...


[0]	train-auc:0.960732
[1]	train-auc:0.971809
[2]	train-auc:0.974655
[3]	train-auc:0.976758
[4]	train-auc:0.977789
[5]	train-auc:0.978991


In [20]:
len(predicts)

4

In [21]:
predicts[0]

array([8.2362583e-04, 8.2676363e-04, 8.2676363e-04, ..., 2.9272158e-04,
       6.6526383e-01, 3.4567919e-02], dtype=float32)

In [24]:
predicts = np.concatenate(predicts, axis=0)

In [25]:
predicts

array([0.00082363, 0.00082676, 0.00082676, ..., 0.00263758, 0.00010159,
       0.00251996], dtype=float32)

In [26]:
is_attributed = read_feather("../data/basic_table")[["is_attributed"]]

[I 180502 18:34:20 <ipython-input-1-29f11ab86043>:14] loading ../data/basic_table
[I 180502 18:35:01 <ipython-input-1-29f11ab86043>:16] Index(['app', 'channel', 'click_id', 'device', 'ip', 'is_attributed', 'os'], dtype='object')


In [30]:
is_attributed = is_attributed[:len(predicts)]

In [32]:
is_attributed["prediction"] = predicts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [34]:
is_attributed["abs"] = np.abs(is_attributed["is_attributed"] - is_attributed["prediction"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [38]:
is_attributed.drop(columns=["is_attributed"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [39]:
is_attributed.to_feather("../data/down_sampling_train")

In [40]:
is_attributed

Unnamed: 0,prediction,abs
0,0.000824,0.000824
1,0.000827,0.000827
2,0.000827,0.000827
3,0.000202,0.000202
4,0.003911,0.003911
5,0.004696,0.004696
6,0.000502,0.000502
7,0.000977,0.000977
8,0.001174,0.001174
9,0.001130,0.001130


# Validation

全特徴量でやっていく

In [None]:
ls -lha