In [2]:
import gc
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import plot_importance

try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20; fall back to it
    # only on old releases that predate sklearn.model_selection.
    from sklearn.cross_validation import train_test_split

def read_hdf(name: str, data_dir: str = "../data"):
    """Load the pandas object stored under key ``name`` in ``<data_dir>/<name>.hdf``.

    The same string is used for the file stem and for the HDF5 key.

    Parameters
    ----------
    name : str
        File stem (without the ``.hdf`` extension) and HDF5 key.
    data_dir : str, optional
        Directory that holds the ``.hdf`` files. Defaults to ``"../data"``.

    Returns
    -------
    Whatever ``pd.read_hdf`` returns for that file and key.
    """
    return pd.read_hdf("{}/{}.hdf".format(data_dir, name), key=name)



In [2]:
ls -lha ../data/*.hdf

-rw-rw-r-- 1 ubuntu ubuntu 7.7M Apr 28 17:08 ../data/click_count_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 158M Apr 28 17:08 ../data/click_count_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 529M Apr 28 17:09 ../data/click_count_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 6.1G Apr 28 17:27 ../data/click_counts.hdf
-rw-rw-r-- 1 ubuntu ubuntu  11G Apr 28 16:21 ../data/click_data.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 17:40 ../data/click_time_interval_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 17:50 ../data/click_time_interval_by_ip_os_device.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 17:58 ../data/click_time_interval_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 18:07 ../data/click_time_interval_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 7.6G Apr 29 01:37 ../data/click_time_intervals.hdf
-rw-rw-r-- 1 ubuntu ubuntu 7.6G Apr 28 16:42 ../data/click_times.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.0G Apr 28 16:48 ../data/ip_os_device.hdf


In [3]:
%%time
# Load the table stored under key "click_counts" in ../data/click_counts.hdf.
click_counts = read_hdf("click_counts")

CPU times: user 936 ms, sys: 2.73 s, total: 3.66 s
Wall time: 1min 43s


In [4]:
%%time
# Load the table stored under key "ranks" in ../data/ranks.hdf.
# NOTE(review): ranks.hdf is absent from the directory listings shown in this
# notebook — confirm the file is produced before a clean top-to-bottom run.
ranks = read_hdf("ranks")

CPU times: user 300 ms, sys: 3.31 s, total: 3.61 s
Wall time: 1min 44s


In [5]:
%%time
# Load the table stored under key "click_data" in ../data/click_data.hdf.
click_data = read_hdf("click_data")

CPU times: user 13.7 s, sys: 9.42 s, total: 23.2 s
Wall time: 3min 8s


In [6]:
%%time
# Load the table stored under key "click_times" in ../data/click_times.hdf.
click_times = read_hdf("click_times")

CPU times: user 988 ms, sys: 3.53 s, total: 4.52 s
Wall time: 2min 9s


In [7]:
# Compare the shapes of the four tables; the row counts should agree before
# the column-wise concat in the next cell.
click_counts.shape, ranks.shape, click_data.shape, click_times.shape

((203694359, 3), (203694359, 3), (203694359, 9), (203694359, 4))

In [8]:
%%time
# Join the four tables side by side (column-wise) into a single frame.
merged_data = pd.concat(
    objs=[click_data, click_counts, click_times, ranks],
    axis="columns",
)

CPU times: user 17 s, sys: 6.94 s, total: 23.9 s
Wall time: 23.9 s


In [9]:
%%time
# The individual tables are no longer needed once merged_data exists; release
# the names so their memory can be reclaimed.
del click_data, click_counts, click_times, ranks
# Rebind rather than mutate in place: inplace=True gives no memory benefit and
# is discouraged in current pandas.
merged_data = merged_data.drop(columns=["click_time"])

CPU times: user 16.3 s, sys: 17.3 s, total: 33.6 s
Wall time: 33.5 s


In [10]:
# Force a garbage-collection pass after the source frames were deleted; the
# cell displays gc.collect()'s return value.
gc.collect()

42

In [11]:
%%time
# Persist the merged frame under key "merged_click_data" so read_hdf can load it.
# key= is passed by keyword (positional use is deprecated in recent pandas), and
# mode="w" writes a fresh file: HDF5 does not reclaim the space of a replaced
# node, so the default append mode would grow the file on every re-save.
# NOTE: attributed_time is an object column, so PyTables pickles it (see the
# PerformanceWarning in this cell's output).
merged_data.to_hdf("../data/merged_click_data.hdf", key="merged_click_data", mode="w")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['attributed_time']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


CPU times: user 26.8 s, sys: 27.5 s, total: 54.3 s
Wall time: 4min 15s


In [12]:
ls -lha ../data/*.hdf

-rw-rw-r-- 1 ubuntu ubuntu 7.7M Apr 28 17:08 ../data/click_count_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 158M Apr 28 17:08 ../data/click_count_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 529M Apr 28 17:09 ../data/click_count_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 6.1G Apr 28 17:27 ../data/click_counts.hdf
-rw-rw-r-- 1 ubuntu ubuntu  11G Apr 28 16:21 ../data/click_data.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 17:40 ../data/click_time_interval_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 17:50 ../data/click_time_interval_by_ip_os_device.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 17:58 ../data/click_time_interval_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 18:07 ../data/click_time_interval_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 7.6G Apr 28 16:42 ../data/click_times.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.0G Apr 28 16:48 ../data/ip_os_device.hdf
-rw-rw-r-- 1 ubuntu ubuntu 4.6G Apr 28 17:04 ../data/ip_os_device_dow.hdf
-rw-

In [13]:
%%time
# Load the four click_time_interval_by_* tables, one HDF5 file each.
(
    click_time_interval_by_ip,
    click_time_interval_by_ip_os_device,
    click_time_interval_by_ip_os_device_dow,
    click_time_interval_by_ip_os_device_dow_hour,
) = (
    read_hdf(table_name)
    for table_name in (
        "click_time_interval_by_ip",
        "click_time_interval_by_ip_os_device",
        "click_time_interval_by_ip_os_device_dow",
        "click_time_interval_by_ip_os_device_dow_hour",
    )
)

CPU times: user 3.38 s, sys: 7.01 s, total: 10.4 s
Wall time: 4min 35s


In [14]:
%%time
# Place the four interval tables side by side (column-wise) in one frame.
click_time_intervals = pd.concat(
    objs=[
        click_time_interval_by_ip,
        click_time_interval_by_ip_os_device,
        click_time_interval_by_ip_os_device_dow,
        click_time_interval_by_ip_os_device_dow_hour,
    ],
    axis="columns",
)

CPU times: user 3.62 s, sys: 1.76 s, total: 5.38 s
Wall time: 5.38 s


In [15]:
%%time
# Persist the combined interval table under key "click_time_intervals".
# key= is passed by keyword (positional use is deprecated in recent pandas), and
# mode="w" writes a fresh file so that re-running the cell does not grow it
# (HDF5 does not reclaim the space of a replaced node in append mode).
click_time_intervals.to_hdf("../data/click_time_intervals.hdf", key="click_time_intervals", mode="w")

CPU times: user 4.84 s, sys: 13.7 s, total: 18.5 s
Wall time: 18.5 s


In [16]:
%%time
# Reload click_time_intervals from the file written in the previous cell.
click_time_intervals = read_hdf("click_time_intervals")

CPU times: user 1.04 s, sys: 5.68 s, total: 6.72 s
Wall time: 6.72 s


In [17]:
%%time
# Load the merged frame from ../data/merged_click_data.hdf (key "merged_click_data").
merged_click_data = read_hdf("merged_click_data")

CPU times: user 10.9 s, sys: 18.3 s, total: 29.2 s
Wall time: 1min 25s


In [18]:
# Inspect the column dtypes; the following cells narrow the int64 columns.
merged_click_data.dtypes

app                                      uint16
attributed_time                          object
channel                                  uint16
click_id                                float64
device                                   uint16
ip                                       uint32
is_attributed                           float64
os                                       uint16
click_count_by_ip_os_device_dow           int64
click_count_by_ip_os_device_dow_hour      int64
click_count_by_ip                         int64
dow                                       int64
hour                                      int64
minute                                    int64
second                                    int64
rank_by_ip                                int64
rank_by_ip_os_device                      int64
rank_by_ip_os_device_dow                  int64
dtype: object

In [19]:
# Narrow click_count_by_ip_os_device_dow from int64 to uint32. astype does not
# range-check integer casts (out-of-range values wrap silently), so verify the
# values fit before converting.
assert merged_click_data["click_count_by_ip_os_device_dow"].min() >= 0, "click_count_by_ip_os_device_dow: negative value"
assert merged_click_data["click_count_by_ip_os_device_dow"].max() <= np.iinfo(np.uint32).max, "click_count_by_ip_os_device_dow: exceeds uint32 range"
merged_click_data["click_count_by_ip_os_device_dow"] = merged_click_data["click_count_by_ip_os_device_dow"].astype("uint32")

In [20]:
# Narrow click_count_by_ip_os_device_dow_hour from int64 to uint16. astype does
# not range-check integer casts, so a count above 65535 would wrap silently;
# verify the values fit before converting. (The rank columns checked later in
# this notebook do exceed 65535, so this is not a given.)
assert merged_click_data["click_count_by_ip_os_device_dow_hour"].min() >= 0, "click_count_by_ip_os_device_dow_hour: negative value"
assert merged_click_data["click_count_by_ip_os_device_dow_hour"].max() <= np.iinfo(np.uint16).max, "click_count_by_ip_os_device_dow_hour: exceeds uint16 range"
merged_click_data["click_count_by_ip_os_device_dow_hour"] = merged_click_data["click_count_by_ip_os_device_dow_hour"].astype("uint16")

In [21]:
%%time
# Narrow click_count_by_ip from int64 to uint32. astype does not range-check
# integer casts (out-of-range values wrap silently), so verify the values fit
# before converting.
assert merged_click_data["click_count_by_ip"].min() >= 0, "click_count_by_ip: negative value"
assert merged_click_data["click_count_by_ip"].max() <= np.iinfo(np.uint32).max, "click_count_by_ip: exceeds uint32 range"
merged_click_data["click_count_by_ip"] = merged_click_data["click_count_by_ip"].astype("uint32")

CPU times: user 2.52 s, sys: 3.19 s, total: 5.71 s
Wall time: 5.71 s


In [22]:
# Check the value range of "dow" before narrowing its dtype in the next cell.
merged_click_data["dow"].max(), merged_click_data["dow"].min()

(4, 0)

In [23]:
%%time
# int64 -> uint8: the range check in the previous cell showed values 0..4.
merged_click_data["dow"] = merged_click_data["dow"].astype("uint8")

CPU times: user 2.17 s, sys: 2.5 s, total: 4.67 s
Wall time: 4.67 s


In [24]:
%%time
# Narrow the hour / minute / second columns from int64 to uint8. astype does not
# range-check integer casts (out-of-range values wrap silently), so assert that
# each column fits before converting it.
for _col in ("hour", "minute", "second"):
    assert merged_click_data[_col].min() >= 0, "{}: negative value".format(_col)
    assert merged_click_data[_col].max() <= np.iinfo(np.uint8).max, "{}: exceeds uint8 range".format(_col)
    merged_click_data[_col] = merged_click_data[_col].astype("uint8")

CPU times: user 4.75 s, sys: 5.05 s, total: 9.8 s
Wall time: 9.8 s


In [25]:
# Re-inspect dtypes after the casts above; the rank_* columns are still int64.
merged_click_data.dtypes

app                                      uint16
attributed_time                          object
channel                                  uint16
click_id                                float64
device                                   uint16
ip                                       uint32
is_attributed                           float64
os                                       uint16
click_count_by_ip_os_device_dow          uint32
click_count_by_ip_os_device_dow_hour     uint16
click_count_by_ip                        uint32
dow                                       uint8
hour                                      uint8
minute                                    uint8
second                                    uint8
rank_by_ip                                int64
rank_by_ip_os_device                      int64
rank_by_ip_os_device_dow                  int64
dtype: object

In [26]:
# Check the value range of rank_by_ip_os_device before narrowing its dtype.
merged_click_data["rank_by_ip_os_device"].max(), merged_click_data["rank_by_ip_os_device"].min()

(282426, 0)

In [27]:
%%time
# int64 -> uint32: the previous cell showed 0..282426, which is too large for
# uint16 (max 65535) but fits uint32.
merged_click_data["rank_by_ip_os_device"] = merged_click_data["rank_by_ip_os_device"].astype("uint32")

CPU times: user 1.08 s, sys: 1.19 s, total: 2.27 s
Wall time: 2.27 s


In [28]:
# Check the value range of rank_by_ip_os_device_dow before narrowing its dtype.
merged_click_data["rank_by_ip_os_device_dow"].max(), merged_click_data["rank_by_ip_os_device_dow"].min()

(93200, 0)

In [29]:
%%time
# int64 -> uint32: the previous cell showed 0..93200, which is too large for
# uint16 (max 65535) but fits uint32.
merged_click_data["rank_by_ip_os_device_dow"] = merged_click_data["rank_by_ip_os_device_dow"].astype("uint32")

CPU times: user 720 ms, sys: 824 ms, total: 1.54 s
Wall time: 1.55 s


In [30]:
# Re-inspect dtypes; rank_by_ip is the remaining int64 column.
merged_click_data.dtypes

app                                      uint16
attributed_time                          object
channel                                  uint16
click_id                                float64
device                                   uint16
ip                                       uint32
is_attributed                           float64
os                                       uint16
click_count_by_ip_os_device_dow          uint32
click_count_by_ip_os_device_dow_hour     uint16
click_count_by_ip                        uint32
dow                                       uint8
hour                                      uint8
minute                                    uint8
second                                    uint8
rank_by_ip                                int64
rank_by_ip_os_device                     uint32
rank_by_ip_os_device_dow                 uint32
dtype: object

In [31]:
# Check the value range of rank_by_ip before narrowing its dtype.
merged_click_data["rank_by_ip"].max(), merged_click_data["rank_by_ip"].min()

(1421255, 0)

In [32]:
%%time
# int64 -> uint32: the previous cell showed 0..1421255, which fits uint32.
merged_click_data["rank_by_ip"] = merged_click_data["rank_by_ip"].astype("uint32")

CPU times: user 416 ms, sys: 392 ms, total: 808 ms
Wall time: 808 ms


In [33]:
# Drop attributed_time, the object-dtype column that PyTables had to pickle
# when the frame was first saved. Rebind rather than mutate in place:
# inplace=True gives no memory benefit and is discouraged in current pandas.
merged_click_data = merged_click_data.drop(columns=["attributed_time"])

In [34]:
# Overwrite the saved frame with the downcast version.
# key= is passed by keyword (positional use is deprecated in recent pandas).
# mode="w" recreates the file: with the default append mode the existing node
# is replaced but HDF5 does not reclaim its space, so the file keeps growing.
merged_click_data.to_hdf("../data/merged_click_data.hdf", key="merged_click_data", mode="w")

In [3]:
%%time
# Load the merged frame from ../data/merged_click_data.hdf (key "merged_click_data").
merged_click_data = read_hdf("merged_click_data")

CPU times: user 1.37 s, sys: 10.2 s, total: 11.6 s
Wall time: 11.7 s


In [4]:
ls -lha ../data/*.hdf

-rw-rw-r-- 1 ubuntu ubuntu 7.7M Apr 28 17:08 ../data/click_count_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 158M Apr 28 17:08 ../data/click_count_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 529M Apr 28 17:09 ../data/click_count_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 6.1G Apr 28 17:27 ../data/click_counts.hdf
-rw-rw-r-- 1 ubuntu ubuntu  11G Apr 28 16:21 ../data/click_data.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 17:40 ../data/click_time_interval_by_ip.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 17:50 ../data/click_time_interval_by_ip_os_device.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 17:58 ../data/click_time_interval_by_ip_os_device_dow.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.1G Apr 28 18:07 ../data/click_time_interval_by_ip_os_device_dow_hour.hdf
-rw-rw-r-- 1 ubuntu ubuntu 7.6G Apr 29 01:37 ../data/click_time_intervals.hdf
-rw-rw-r-- 1 ubuntu ubuntu 7.6G Apr 28 16:42 ../data/click_times.hdf
-rw-rw-r-- 1 ubuntu ubuntu 3.0G Apr 28 16:48 ../data/ip_os_device.hdf


In [5]:
# Show the dtypes of the frame as loaded from disk.
merged_click_data.dtypes

app                                                      uint16
channel                                                  uint16
click_id                                                float64
device                                                   uint16
ip                                                       uint32
is_attributed                                           float64
os                                                       uint16
click_count_by_ip_os_device_dow                          uint32
click_count_by_ip_os_device_dow_hour                     uint16
click_count_by_ip                                        uint32
dow                                                       uint8
hour                                                      uint8
minute                                                    uint8
second                                                    uint8
rank_by_ip                                               uint32
rank_by_ip_os_device                    

In [None]:
%%time
# Load the combined interval table from ../data/click_time_intervals.hdf.
click_time_intervals = read_hdf("click_time_intervals")

In [38]:
%%time
# Append the click-time-interval columns to the merged frame (column-wise).
# Guard against running this on a frame that already contains them (re-running
# the cell, or loading a file that was saved after this step), which would
# otherwise silently create duplicate column names.
_dupes = merged_click_data.columns.intersection(click_time_intervals.columns)
assert _dupes.empty, "columns already present in merged_click_data: {}".format(list(_dupes))
merged_click_data = pd.concat([merged_click_data, click_time_intervals], axis=1)

CPU times: user 18 s, sys: 7.81 s, total: 25.8 s
Wall time: 25.8 s


In [39]:
# Overwrite the saved frame, now including the interval columns.
# key= is passed by keyword (positional use is deprecated in recent pandas).
# mode="w" recreates the file: with the default append mode the existing node
# is replaced but HDF5 does not reclaim its space, so the file keeps growing.
merged_click_data.to_hdf("../data/merged_click_data.hdf", key="merged_click_data", mode="w")

In [1]:
# Preview the first rows of the merged frame.
# NOTE(review): the saved output is a NameError from running this cell on a
# fresh kernel before merged_click_data was loaded; the frame must exist first.
merged_click_data.head()

NameError: name 'merged_click_data' is not defined

In [3]:
# Load the merged frame from ../data/merged_click_data.hdf (key "merged_click_data").
merged_click_data = read_hdf("merged_click_data")

In [4]:
# Show the dtypes of the frame as loaded from disk.
merged_click_data.dtypes

app                                                      uint16
channel                                                  uint16
click_id                                                float64
device                                                   uint16
ip                                                       uint32
is_attributed                                           float64
os                                                       uint16
click_count_by_ip_os_device_dow                          uint32
click_count_by_ip_os_device_dow_hour                     uint16
click_count_by_ip                                        uint32
dow                                                       uint8
hour                                                      uint8
minute                                                    uint8
second                                                    uint8
rank_by_ip                                               uint32
rank_by_ip_os_device                    

In [5]:
# Run a garbage-collection pass; the cell displays gc.collect()'s return value.
gc.collect()

7

In [6]:
%%time
# Convert the four click_time_interval_* columns from timedelta to a number of
# seconds by dividing by a one-second timedelta.
# Only columns that are still timedelta are converted, so re-running the cell
# (or running it on a file that was saved after conversion) is a no-op instead
# of dividing already-converted numbers by a timedelta.
for _col in (
    "click_time_interval_by_ip",
    "click_time_interval_by_ip_os_device",
    "click_time_interval_by_ip_os_device_dow",
    "click_time_interval_by_ip_os_device_dow_hour",
):
    if merged_click_data[_col].dtype.kind == "m":  # "m" is numpy's timedelta kind
        merged_click_data[_col] = merged_click_data[_col] / np.timedelta64(1, "s")

CPU times: user 24.2 s, sys: 11.9 s, total: 36.1 s
Wall time: 31 s


In [7]:
# Re-inspect dtypes after the interval columns were divided by one second.
merged_click_data.dtypes

app                                              uint16
channel                                          uint16
click_id                                        float64
device                                           uint16
ip                                               uint32
is_attributed                                   float64
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [None]:
# Overwrite the saved frame with the interval columns converted to seconds.
# key= is passed by keyword (positional use is deprecated in recent pandas).
# mode="w" recreates the file: with the default append mode the existing node
# is replaced but HDF5 does not reclaim its space, so the file keeps growing.
merged_click_data.to_hdf("../data/merged_click_data.hdf", key="merged_click_data", mode="w")