In [1]:
import gc
import time
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt



In [2]:
# Column -> NumPy dtype mapping handed to pd.read_csv(dtype=...) when the
# click logs are loaded, so these columns are read as unsigned integers of
# the listed widths.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

In [3]:
%%time
train = pd.read_csv("../data/train.csv", dtype=dtypes)

CPU times: user 1min 44s, sys: 6.37 s, total: 1min 50s
Wall time: 2min 27s


In [4]:
%%time
test = pd.read_csv("../data/test.csv", dtype=dtypes)

CPU times: user 11.1 s, sys: 212 ms, total: 11.3 s
Wall time: 15.5 s


In [5]:
%%time
click_data = pd.concat([train, test])

CPU times: user 7.52 s, sys: 4.09 s, total: 11.6 s
Wall time: 11.6 s


In [6]:
%%time
click_data.sort_values(by=["click_time", "is_attributed"], inplace=True)

CPU times: user 33.6 s, sys: 13.7 s, total: 47.2 s
Wall time: 47.2 s


In [7]:
# Preview only the first rows of the sorted frame instead of asking the
# notebook to render the whole (very large) DataFrame.
click_data.head(10)

Unnamed: 0,app,attributed_time,channel,click_id,click_time,device,ip,is_attributed,os
0,3,,379,,2017-11-06 14:32:21,1,83230,0.0,13
1,3,,379,,2017-11-06 14:33:34,1,17357,0.0,19
2,3,,379,,2017-11-06 14:34:12,1,35810,0.0,13
3,14,,478,,2017-11-06 14:34:52,1,45745,0.0,13
4,3,,379,,2017-11-06 14:35:08,1,161007,0.0,13
5,3,,379,,2017-11-06 14:36:26,1,18787,0.0,16
6,3,,379,,2017-11-06 14:37:44,1,103022,0.0,23
7,3,,379,,2017-11-06 14:37:59,1,114221,0.0,19
8,3,,379,,2017-11-06 14:38:10,1,165970,0.0,13
9,64,,459,,2017-11-06 14:38:23,1,74544,0.0,22


In [8]:
# Replace the row labels left over from the concat/sort with a fresh 0..n-1
# index. reset_index() moves the old labels into a column named "index";
# running it a second time while that column still exists raises
# "ValueError: cannot insert index, already exists", so the call is skipped
# when the column is already present. This keeps the cell safe to re-run.
if "index" not in click_data.columns:
    click_data.reset_index(inplace=True)

# Bounded preview rather than rendering the full frame.
click_data.head(10)

Unnamed: 0,index,app,attributed_time,channel,click_id,click_time,device,ip,is_attributed,os
0,0,3,,379,,2017-11-06 14:32:21,1,83230,0.0,13
1,1,3,,379,,2017-11-06 14:33:34,1,17357,0.0,19
2,2,3,,379,,2017-11-06 14:34:12,1,35810,0.0,13
3,3,14,,478,,2017-11-06 14:34:52,1,45745,0.0,13
4,4,3,,379,,2017-11-06 14:35:08,1,161007,0.0,13
5,5,3,,379,,2017-11-06 14:36:26,1,18787,0.0,16
6,6,3,,379,,2017-11-06 14:37:44,1,103022,0.0,23
7,7,3,,379,,2017-11-06 14:37:59,1,114221,0.0,19
8,8,3,,379,,2017-11-06 14:38:10,1,165970,0.0,13
9,9,64,,459,,2017-11-06 14:38:23,1,74544,0.0,22


In [9]:
%time
click_data.drop(["index"], axis=1, inplace=True)
click_data

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.2 µs


Unnamed: 0,app,attributed_time,channel,click_id,click_time,device,ip,is_attributed,os
0,3,,379,,2017-11-06 14:32:21,1,83230,0.0,13
1,3,,379,,2017-11-06 14:33:34,1,17357,0.0,19
2,3,,379,,2017-11-06 14:34:12,1,35810,0.0,13
3,14,,478,,2017-11-06 14:34:52,1,45745,0.0,13
4,3,,379,,2017-11-06 14:35:08,1,161007,0.0,13
5,3,,379,,2017-11-06 14:36:26,1,18787,0.0,16
6,3,,379,,2017-11-06 14:37:44,1,103022,0.0,23
7,3,,379,,2017-11-06 14:37:59,1,114221,0.0,19
8,3,,379,,2017-11-06 14:38:10,1,165970,0.0,13
9,64,,459,,2017-11-06 14:38:23,1,74544,0.0,22


In [10]:
%%time
click_data["click_time"] = pd.to_datetime(click_data["click_time"])

CPU times: user 41.6 s, sys: 2.34 s, total: 44 s
Wall time: 43.8 s


In [11]:
# Show the dtype of every column, e.g. to check that click_time is now
# datetime64[ns] after the conversion in the previous cell.
click_data.dtypes

app                        uint16
attributed_time            object
channel                    uint16
click_id                  float64
click_time         datetime64[ns]
device                     uint16
ip                         uint32
is_attributed             float64
os                         uint16
dtype: object

In [12]:
%%time
click_data.to_hdf("../data/click_data.hdf", "click_data")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['attributed_time']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


CPU times: user 15 s, sys: 12.6 s, total: 27.6 s
Wall time: 27.8 s


In [13]:
%%time
click_data = pd.read_hdf("../data/click_data.hdf", "click_data")

CPU times: user 11 s, sys: 11.2 s, total: 22.2 s
Wall time: 22.2 s


In [14]:
# Check the frame reloaded from HDF5 with a bounded preview instead of
# rendering the whole DataFrame.
click_data.head(10)

Unnamed: 0,app,attributed_time,channel,click_id,click_time,device,ip,is_attributed,os
0,3,,379,,2017-11-06 14:32:21,1,83230,0.0,13
1,3,,379,,2017-11-06 14:33:34,1,17357,0.0,19
2,3,,379,,2017-11-06 14:34:12,1,35810,0.0,13
3,14,,478,,2017-11-06 14:34:52,1,45745,0.0,13
4,3,,379,,2017-11-06 14:35:08,1,161007,0.0,13
5,3,,379,,2017-11-06 14:36:26,1,18787,0.0,16
6,3,,379,,2017-11-06 14:37:44,1,103022,0.0,23
7,3,,379,,2017-11-06 14:37:59,1,114221,0.0,19
8,3,,379,,2017-11-06 14:38:10,1,165970,0.0,13
9,64,,459,,2017-11-06 14:38:23,1,74544,0.0,22
