In [1]:
import gc
import time
import numpy as np
import pandas as pd

In [2]:
ls -lah ../data/

total 7.7G
drwxrwxr-x 5 ubuntu ubuntu 4.0K Apr 29 15:09 [0m[01;34m.[0m/
drwxrwxr-x 7 ubuntu ubuntu 4.0K Apr 28 16:26 [01;34m..[0m/
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 16:52 [01;34m.ipynb_checkpoints[0m/
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 29 14:21 [01;34mkenkoooos[0m/
-rw-r--r-- 1 ubuntu ubuntu 6.3G Apr 29 16:13 merge.feather
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 17:13 [01;34mraw[0m/
-rw-rw-r-- 1 ubuntu ubuntu 1.4G Apr 28 17:20 [01;31mraw.zip[0m


In [3]:
path = '../data/'
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }
train_columns = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
test_columns  = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'click_id']

### Read csv


In [4]:
%%time
train = pd.read_csv(path + 'raw/train.csv', 
                    dtype = dtypes,
                    usecols = train_columns,
                    engine = 'c')
print(train.shape)

(184903890, 7)
CPU times: user 1min 34s, sys: 4.83 s, total: 1min 39s
Wall time: 1min 39s


In [5]:
train.dtypes

ip               uint32
app              uint16
device           uint16
os               uint16
channel          uint16
click_time       object
is_attributed     uint8
dtype: object

In [6]:
# drop messy records
train = train[train.click_time >= '2017-11-06 16:00:00'] 
print(train.shape)
train_size = train.shape[0]
gc.collect()

(184903407, 7)


0

In [7]:
%%time
test = pd.read_csv(path + 'raw/test.csv', 
                   dtype = dtypes,
                   usecols = test_columns,
                   engine = 'c')
test_size = test.shape[0]
print(test.shape)

(18790469, 7)
CPU times: user 10.6 s, sys: 168 ms, total: 10.8 s
Wall time: 10.8 s


In [8]:
merge = pd.concat([train, test])
print(merge.shape)

(203693876, 8)


In [9]:
merge.head()

Unnamed: 0,app,channel,click_id,click_time,device,ip,is_attributed,os
483,20,259,,2017-11-06 16:00:00,1,14901,0.0,17
484,2,477,,2017-11-06 16:00:00,2,5729,0.0,37
485,8,145,,2017-11-06 16:00:00,1,105475,0.0,19
486,26,121,,2017-11-06 16:00:00,1,93021,0.0,13
487,20,259,,2017-11-06 16:00:00,1,78507,0.0,30


In [10]:
del train, test
gc.collect()

7

In [11]:
click_time_numeric = merge.click_time.astype('datetime64[ns]')
merge = merge.drop(['click_time'], axis=1)
merge['click_time'] = click_time_numeric

In [12]:
print(merge.shape)
merge.head()

(203693876, 8)


Unnamed: 0,app,channel,click_id,device,ip,is_attributed,os,click_time
483,20,259,,1,14901,0.0,17,2017-11-06 16:00:00
484,2,477,,2,5729,0.0,37,2017-11-06 16:00:00
485,8,145,,1,105475,0.0,19,2017-11-06 16:00:00
486,26,121,,1,93021,0.0,13,2017-11-06 16:00:00
487,20,259,,1,78507,0.0,30,2017-11-06 16:00:00


In [13]:
merge.reset_index(inplace=True)

In [14]:
merge.dtypes

index                     int64
app                      uint16
channel                  uint16
click_id                float64
device                   uint16
ip                       uint32
is_attributed           float64
os                       uint16
click_time       datetime64[ns]
dtype: object

In [15]:
# drop original index
merge = merge.drop(['index'], axis=1)

### fillna and create 'is_test' variable

In [16]:
print(np.min(merge.click_id), np.max(merge.click_id))

0.0 18790468.0


In [17]:
merge['click_id'] = merge.click_id.fillna(99999999).astype('uint32')

In [18]:
merge['is_attributed'] = merge.click_id.fillna(99).astype('uint8')

In [19]:
merge['is_test'] = pd.Series([0]*train_size + [1]*test_size, dtype='bool')

In [20]:
print(merge.shape)
merge.head()

(203693876, 9)


Unnamed: 0,app,channel,click_id,device,ip,is_attributed,os,click_time,is_test
0,20,259,99999999,1,14901,255,17,2017-11-06 16:00:00,False
1,2,477,99999999,2,5729,255,37,2017-11-06 16:00:00,False
2,8,145,99999999,1,105475,255,19,2017-11-06 16:00:00,False
3,26,121,99999999,1,93021,255,13,2017-11-06 16:00:00,False
4,20,259,99999999,1,78507,255,30,2017-11-06 16:00:00,False


In [21]:
merge.dtypes

app                      uint16
channel                  uint16
click_id                 uint32
device                   uint16
ip                       uint32
is_attributed             uint8
os                       uint16
click_time       datetime64[ns]
is_test                    bool
dtype: object

In [22]:
merge.to_feather(path + 'merge.feather')