In [1]:
import gc
import time
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt

def read_hdf(name: str, data_dir: str = "../data"):
    """Load the object stored under key ``name`` in ``<data_dir>/<name>.hdf``.

    The same string is used for the file's base name and for the HDF key,
    so one argument identifies both.

    Parameters
    ----------
    name : str
        Base name of the ``.hdf`` file and the key to read from it.
    data_dir : str, default "../data"
        Directory that holds the file. The default keeps existing
        single-argument calls reading from the same place as before.

    Returns
    -------
    Whatever ``pd.read_hdf`` returns for that file and key.
    """
    path = "{}/{}.hdf".format(data_dir, name)
    return pd.read_hdf(path, key=name)



In [3]:
# List every .hdf file in ../data (long format, human-readable sizes).
ls -lha ../data/*.hdf

-rw-rw-r-- 1 ubuntu ubuntu 7.7M Apr 19 17:21 ../data/click_count_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 158M Apr 19 17:29 ../data/click_count_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 529M Apr 19 17:33 ../data/click_count_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 6.1G Apr 19 22:18 ../data/click_counts.hdf
-rw-rw-r-- 1 ubuntu ubuntu  18G Apr 19 10:09 ../data/click_data.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 20:16 ../data/click_time_interval_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 20:48 ../data/click_time_interval_by_ip_os_device.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 20:58 ../data/click_time_interval_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 21:05 ../data/click_time_interval_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 7.6G Apr 19 11:00 ../data/click_times.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.0G Apr 19 11:26 ../data/ip_os_device.hdf
-rw-rw-r-- 1 ubuntu ubuntu 4.6G Apr 19 17:02 ../data/ip_os_device_dow.hdf
-rw-rw-r-- 1 ubu

In [4]:
%%time
# Read the "click_counts" table through the read_hdf helper.
click_counts = read_hdf("click_counts")

CPU times: user 1.08 s, sys: 2.72 s, total: 3.8 s
Wall time: 1min 43s


In [5]:
%%time
# Read the "ranks" table through the read_hdf helper.
ranks = read_hdf("ranks")

CPU times: user 432 ms, sys: 3.32 s, total: 3.75 s
Wall time: 1min 44s


In [6]:
%%time
# Read the "click_data" table through the read_hdf helper
# (the largest file in the directory listing, 18G on disk).
click_data = read_hdf("click_data")

CPU times: user 13.7 s, sys: 9.1 s, total: 22.8 s
Wall time: 3min 7s


In [7]:
%%time
# Read the "click_times" table through the read_hdf helper.
click_times = read_hdf("click_times")

CPU times: user 1.07 s, sys: 3.61 s, total: 4.68 s
Wall time: 2min 9s


In [12]:
# Display the (rows, columns) shape of each loaded frame, in load-independent
# fixed order: click_counts, ranks, click_data, click_times.
tuple(frame.shape for frame in (click_counts, ranks, click_data, click_times))

((203694359, 3), (203694359, 3), (203694359, 9), (203694359, 4))

In [15]:
%%time
# Put the four frames side by side (column-wise) in a single frame.
merged_data = pd.concat(
    objs=[click_data, click_counts, click_times, ranks],
    axis="columns",
)

CPU times: user 16.9 s, sys: 7.29 s, total: 24.2 s
Wall time: 24.2 s


In [17]:
%%time
# Unbind the four source frames now that merged_data holds their columns,
# then remove the click_time column from merged_data in place.
del click_data, click_counts, click_times, ranks
merged_data.drop(columns="click_time", inplace=True)

CPU times: user 15.7 s, sys: 17.6 s, total: 33.3 s
Wall time: 33.2 s


In [18]:
# Run a garbage-collection pass; the cell displays gc.collect()'s return value.
gc.collect()

4565

In [None]:
%%time
# Persist the merged frame under the key "merged_click_data".
# mode="w" recreates the file rather than appending to an existing one:
# HDF5 does not reclaim the space of a replaced node, so re-running this
# cell with the default mode="a" would keep growing the file.
# `key` is passed by keyword because newer pandas releases make every
# to_hdf argument after the path keyword-only.
# The PerformanceWarning in this cell's output is caused by the object-typed
# `attributed_time` column, which PyTables has to pickle.
merged_data.to_hdf("../data/merged_click_data.hdf", key="merged_click_data", mode="w")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['attributed_time']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [2]:
# List the .hdf files in ../data again (long format, human-readable sizes).
ls -lha ../data/*.hdf

-rw-rw-r-- 1 ubuntu ubuntu 7.7M Apr 19 17:21 ../data/click_count_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 158M Apr 19 17:29 ../data/click_count_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 529M Apr 19 17:33 ../data/click_count_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 6.1G Apr 19 22:18 ../data/click_counts.hdf
-rw-rw-r-- 1 ubuntu ubuntu  18G Apr 19 10:09 ../data/click_data.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 20:16 ../data/click_time_interval_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 20:48 ../data/click_time_interval_by_ip_os_device.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 20:58 ../data/click_time_interval_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 21:05 ../data/click_time_interval_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 7.6G Apr 19 11:00 ../data/click_times.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.0G Apr 19 11:26 ../data/ip_os_device.hdf
-rw-rw-r-- 1 ubuntu ubuntu 4.6G Apr 19 17:02 ../data/ip_os_device_dow.hdf
-rw-rw-r-- 1 ubu

In [5]:
%%time
# Load the four click_time_interval_* tables through the read_hdf helper;
# each variable carries the same name as the file/key it is read from.
click_time_interval_by_ip = read_hdf("click_time_interval_by_ip")
click_time_interval_by_ip_os_device = read_hdf("click_time_interval_by_ip_os_device")
click_time_interval_by_ip_os_device_dow = read_hdf("click_time_interval_by_ip_os_device_dow")
click_time_interval_by_ip_os_device_dow_hour = read_hdf("click_time_interval_by_ip_os_device_dow_hour")

CPU times: user 3.14 s, sys: 7.33 s, total: 10.5 s
Wall time: 3min 28s


In [10]:
%%time
# Combine the four interval tables column-wise into one frame.
click_time_intervals = pd.concat(
    objs=[
        click_time_interval_by_ip,
        click_time_interval_by_ip_os_device,
        click_time_interval_by_ip_os_device_dow,
        click_time_interval_by_ip_os_device_dow_hour,
    ],
    axis="columns",
)

CPU times: user 4.11 s, sys: 1.54 s, total: 5.66 s
Wall time: 5.66 s


In [14]:
%%time
# Persist the combined interval frame under the key "click_time_intervals".
# mode="w" recreates the file instead of appending (the default mode="a"
# never reclaims space when a key is rewritten); `key` is passed by keyword
# because newer pandas releases make it keyword-only.
click_time_intervals.to_hdf("../data/click_time_intervals.hdf", key="click_time_intervals", mode="w")

CPU times: user 5.87 s, sys: 12 s, total: 17.9 s
Wall time: 17.9 s


In [3]:
%%time
# Read the combined "click_time_intervals" table back through read_hdf.
click_time_intervals = read_hdf("click_time_intervals")

CPU times: user 1.41 s, sys: 5.56 s, total: 6.97 s
Wall time: 58.5 s


In [2]:
%%time
# Read the "merged_click_data" table through the read_hdf helper.
merged_click_data = read_hdf("merged_click_data")

CPU times: user 1.27 s, sys: 9.94 s, total: 11.2 s
Wall time: 11.2 s


In [6]:
# Inspect the dtype of every column before downcasting.
merged_click_data.dtypes

app                                      uint16
attributed_time                          object
channel                                  uint16
click_id                                float64
device                                   uint16
ip                                       uint32
is_attributed                           float64
os                                       uint16
click_count_by_ip_os_device_dow           int64
click_count_by_ip_os_device_dow_hour      int64
click_count_by_ip                         int64
dow                                       int64
hour                                      int64
minute                                    int64
second                                    int64
rank_by_ip                                int64
rank_by_ip_os_device                      int64
rank_by_ip_os_device_dow                  int64
dtype: object

In [13]:
# Narrow the int64 count column to uint32, checking the value range first:
# astype() wraps out-of-range integers silently instead of raising.
_count_col = "click_count_by_ip_os_device_dow"
_count_min = merged_click_data[_count_col].min()
_count_max = merged_click_data[_count_col].max()
assert 0 <= _count_min and _count_max <= np.iinfo(np.uint32).max, (
    "{} range [{}, {}] does not fit in uint32".format(_count_col, _count_min, _count_max)
)
merged_click_data[_count_col] = merged_click_data[_count_col].astype("uint32")

In [15]:
# Narrow the int64 count column to uint16, checking the value range first:
# astype() wraps out-of-range integers silently instead of raising, and
# uint16 only holds values up to 65535.
_count_col = "click_count_by_ip_os_device_dow_hour"
_count_min = merged_click_data[_count_col].min()
_count_max = merged_click_data[_count_col].max()
assert 0 <= _count_min and _count_max <= np.iinfo(np.uint16).max, (
    "{} range [{}, {}] does not fit in uint16".format(_count_col, _count_min, _count_max)
)
merged_click_data[_count_col] = merged_click_data[_count_col].astype("uint16")

In [17]:
%%time
# Narrow the int64 count column to uint32, checking the value range first:
# astype() wraps out-of-range integers silently instead of raising.
_count_col = "click_count_by_ip"
_count_min = merged_click_data[_count_col].min()
_count_max = merged_click_data[_count_col].max()
assert 0 <= _count_min and _count_max <= np.iinfo(np.uint32).max, (
    "{} range [{}, {}] does not fit in uint32".format(_count_col, _count_min, _count_max)
)
merged_click_data[_count_col] = merged_click_data[_count_col].astype("uint32")

CPU times: user 2.47 s, sys: 3.31 s, total: 5.78 s
Wall time: 5.78 s


In [18]:
# Show (max, min) of the dow column to pick a dtype that can hold it.
merged_click_data["dow"].max(), merged_click_data["dow"].min()

(4, 0)

In [19]:
%%time
# The max/min displayed for dow are 4 and 0, so uint8 holds every value.
merged_click_data["dow"] = merged_click_data["dow"].astype("uint8")

CPU times: user 2.08 s, sys: 2.58 s, total: 4.66 s
Wall time: 4.66 s


In [20]:
%%time
# Downcast hour, minute and second to uint8, one column at a time and in
# that order.
for clock_part in ("hour", "minute", "second"):
    merged_click_data[clock_part] = merged_click_data[clock_part].astype("uint8")

CPU times: user 4.68 s, sys: 5.24 s, total: 9.92 s
Wall time: 9.92 s


In [21]:
# Re-inspect the column dtypes after the count and time-component downcasts.
merged_click_data.dtypes

app                                      uint16
attributed_time                          object
channel                                  uint16
click_id                                float64
device                                   uint16
ip                                       uint32
is_attributed                           float64
os                                       uint16
click_count_by_ip_os_device_dow          uint32
click_count_by_ip_os_device_dow_hour     uint16
click_count_by_ip                        uint32
dow                                       uint8
hour                                      uint8
minute                                    uint8
second                                    uint8
rank_by_ip                                int64
rank_by_ip_os_device                      int64
rank_by_ip_os_device_dow                  int64
dtype: object

In [22]:
# Show (max, min) of rank_by_ip_os_device to pick a dtype that can hold it.
merged_click_data["rank_by_ip_os_device"].max(), merged_click_data["rank_by_ip_os_device"].min()

(282426, 0)

In [23]:
%%time
# The displayed range is 0..282426: too large for uint16, fits in uint32.
merged_click_data["rank_by_ip_os_device"] = merged_click_data["rank_by_ip_os_device"].astype("uint32")

CPU times: user 1.03 s, sys: 1.21 s, total: 2.24 s
Wall time: 2.24 s


In [24]:
# Show (max, min) of rank_by_ip_os_device_dow to pick a dtype that can hold it.
merged_click_data["rank_by_ip_os_device_dow"].max(), merged_click_data["rank_by_ip_os_device_dow"].min()

(93200, 0)

In [25]:
%%time
# The displayed range is 0..93200: too large for uint16, fits in uint32.
merged_click_data["rank_by_ip_os_device_dow"] = merged_click_data["rank_by_ip_os_device_dow"].astype("uint32")

CPU times: user 720 ms, sys: 804 ms, total: 1.52 s
Wall time: 1.52 s


In [26]:
# Re-inspect the column dtypes; rank_by_ip is the only rank column still int64.
merged_click_data.dtypes

app                                      uint16
attributed_time                          object
channel                                  uint16
click_id                                float64
device                                   uint16
ip                                       uint32
is_attributed                           float64
os                                       uint16
click_count_by_ip_os_device_dow          uint32
click_count_by_ip_os_device_dow_hour     uint16
click_count_by_ip                        uint32
dow                                       uint8
hour                                      uint8
minute                                    uint8
second                                    uint8
rank_by_ip                                int64
rank_by_ip_os_device                     uint32
rank_by_ip_os_device_dow                 uint32
dtype: object

In [29]:
# Show (max, min) of rank_by_ip.
# NOTE(review): this cell's execution count (29) is later than the cast cell
# below (28), so the displayed values were computed after the downcast.
merged_click_data["rank_by_ip"].max(), merged_click_data["rank_by_ip"].min()

(1421255, 0)

In [28]:
%%time
# The displayed range for rank_by_ip is 0..1421255, which fits in uint32.
merged_click_data["rank_by_ip"] = merged_click_data["rank_by_ip"].astype("uint32")

CPU times: user 368 ms, sys: 448 ms, total: 816 ms
Wall time: 815 ms


In [30]:
# Remove the object-typed attributed_time column from the frame in place.
merged_click_data.drop(labels="attributed_time", axis="columns", inplace=True)

In [33]:
# Overwrite the stored frame with the downcast version.
# mode="w" truncates the existing file first: with the default mode="a",
# rewriting a key leaves the old node's space unreclaimed and the file only
# grows. `key` is passed by keyword because newer pandas releases make it
# keyword-only.
merged_click_data.to_hdf("../data/merged_click_data.hdf", key="merged_click_data", mode="w")

In [2]:
%%time
# Reload the "merged_click_data" table through the read_hdf helper.
merged_click_data = read_hdf("merged_click_data")

CPU times: user 40 ms, sys: 5.72 s, total: 5.76 s
Wall time: 5.76 s


In [4]:
# List the .hdf files in ../data; click_time_intervals.hdf now appears.
ls -lha ../data/*.hdf

-rw-rw-r-- 1 ubuntu ubuntu 7.7M Apr 19 17:21 ../data/click_count_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 158M Apr 19 17:29 ../data/click_count_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 529M Apr 19 17:33 ../data/click_count_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 6.1G Apr 19 22:18 ../data/click_counts.hdf
-rw-rw-r-- 1 ubuntu ubuntu  18G Apr 19 10:09 ../data/click_data.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 20:16 ../data/click_time_interval_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 20:48 ../data/click_time_interval_by_ip_os_device.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 20:58 ../data/click_time_interval_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 19 21:05 ../data/click_time_interval_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 7.6G Apr 20 00:15 ../data/click_time_intervals.hdf
-rw-rw-r-- 1 ubuntu ubuntu 7.6G Apr 19 11:00 ../data/click_times.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.0G Apr 19 11:26 ../data/ip_os_device.hdf
-rw-rw-r-- 1

In [5]:
%%time
# Read the combined "click_time_intervals" table through the read_hdf helper.
click_time_intervals = read_hdf("click_time_intervals")

CPU times: user 1.51 s, sys: 5.36 s, total: 6.86 s
Wall time: 43 s


In [8]:
%%time
# Append the click-interval columns to the merged frame, column-wise.
# NOTE(review): the result is bound back to merged_click_data, so running
# this cell twice in one session would add the interval columns a second time.
merged_click_data = pd.concat([merged_click_data, click_time_intervals], axis=1)

CPU times: user 19.2 s, sys: 8.44 s, total: 27.7 s
Wall time: 27.7 s


In [9]:
# Overwrite the stored frame, which now includes the interval columns.
# mode="w" truncates the existing file first: with the default mode="a",
# rewriting a key leaves the old node's space unreclaimed and the file only
# grows. `key` is passed by keyword because newer pandas releases make it
# keyword-only.
merged_click_data.to_hdf("../data/merged_click_data.hdf", key="merged_click_data", mode="w")

In [10]:
# Preview only the first ten rows instead of handing the whole frame to the
# display machinery; the frame has hundreds of millions of rows.
merged_click_data.head(10)

Unnamed: 0,app,channel,click_id,device,ip,is_attributed,os,click_count_by_ip_os_device_dow,click_count_by_ip_os_device_dow_hour,click_count_by_ip,...,hour,minute,second,rank_by_ip,rank_by_ip_os_device,rank_by_ip_os_device_dow,click_time_interval_by_ip,click_time_interval_by_ip_os_device,click_time_interval_by_ip_os_device_dow,click_time_interval_by_ip_os_device_dow_hour
0,3,379,,1,83230,0.0,13,434,1,28085,...,14,32,21,0,0,0,NaT,NaT,NaT,NaT
1,3,379,,1,17357,0.0,19,183,1,26234,...,14,33,34,0,0,0,NaT,NaT,NaT,NaT
2,3,379,,1,35810,0.0,13,34,1,11002,...,14,34,12,0,0,0,NaT,NaT,NaT,NaT
3,14,478,,1,45745,0.0,13,1501,1,188741,...,14,34,52,0,0,0,NaT,NaT,NaT,NaT
4,3,379,,1,161007,0.0,13,28,1,1171,...,14,35,8,0,0,0,NaT,NaT,NaT,NaT
5,3,379,,1,18787,0.0,16,1,1,4029,...,14,36,26,0,0,0,NaT,NaT,NaT,NaT
6,3,379,,1,103022,0.0,23,1,1,6099,...,14,37,44,0,0,0,NaT,NaT,NaT,NaT
7,3,379,,1,114221,0.0,19,51,1,2334,...,14,37,59,0,0,0,NaT,NaT,NaT,NaT
8,3,379,,1,165970,0.0,13,19,1,2014,...,14,38,10,0,0,0,NaT,NaT,NaT,NaT
9,64,459,,1,74544,0.0,22,5,1,3882,...,14,38,23,0,0,0,NaT,NaT,NaT,NaT


In [2]:
# Reload the "merged_click_data" table through the read_hdf helper.
merged_click_data = read_hdf("merged_click_data")

In [5]:
# Inspect the column dtypes of the reloaded frame.
merged_click_data.dtypes

app                                                      uint16
channel                                                  uint16
click_id                                                float64
device                                                   uint16
ip                                                       uint32
is_attributed                                           float64
os                                                       uint16
click_count_by_ip_os_device_dow                          uint32
click_count_by_ip_os_device_dow_hour                     uint16
click_count_by_ip                                        uint32
dow                                                       uint8
hour                                                      uint8
minute                                                    uint8
second                                                    uint8
rank_by_ip                                               uint32
rank_by_ip_os_device                    

In [8]:
%%time
# Express each click-interval column as a number of seconds by dividing it
# by a one-second timedelta; columns are processed in their original order.
one_second = np.timedelta64(1, "s")
for interval_col in (
    "click_time_interval_by_ip",
    "click_time_interval_by_ip_os_device",
    "click_time_interval_by_ip_os_device_dow",
    "click_time_interval_by_ip_os_device_dow_hour",
):
    merged_click_data[interval_col] = merged_click_data[interval_col] / one_second

CPU times: user 29.4 s, sys: 11.5 s, total: 40.9 s
Wall time: 34.3 s


In [9]:
# Inspect the column dtypes after converting the interval columns.
merged_click_data.dtypes

app                                              uint16
channel                                          uint16
click_id                                        float64
device                                           uint16
ip                                               uint32
is_attributed                                   float64
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [10]:
# Overwrite the stored frame with the interval columns converted to seconds.
# mode="w" truncates the existing file first: with the default mode="a",
# rewriting a key leaves the old node's space unreclaimed and the file only
# grows. `key` is passed by keyword because newer pandas releases make it
# keyword-only.
merged_click_data.to_hdf("../data/merged_click_data.hdf", key="merged_click_data", mode="w")