In [1]:
import gc
import time
from logzero import logger
import numpy as np
import pandas as pd

In [2]:
# List everything in ../data/ (long format, human-readable sizes, dotfiles included).
ls -lah ../data/

total 24G
drwxrwxr-x 5 ubuntu ubuntu 4.0K May  4 15:54 [0m[01;34m.[0m/
drwxrwxr-x 8 ubuntu ubuntu 4.0K Apr 30 17:30 [01;34m..[0m/
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 16:52 [01;34m.ipynb_checkpoints[0m/
-rw-r--r-- 1 ubuntu ubuntu 4.3K May  4 15:41 agg_by_channel.feather
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 click_data.feather
-rw-r--r-- 1 ubuntu ubuntu 1.1M May  4 16:12 count_user_by_channel_app.feather
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 29 14:21 [01;34mkenkoooos[0m/
-rw-r--r-- 1 ubuntu ubuntu  18G May  4 07:40 merge.feather
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 17:13 [01;34mraw[0m/
-rw-rw-r-- 1 ubuntu ubuntu 1.4G Apr 28 17:20 [01;31mraw.zip[0m
-rw-r--r-- 1 ubuntu ubuntu 8.4K May  4 15:44 uqcount_by_channel.feather


In [3]:
%time
merge = pd.read_feather("../data/merge.feather", nthreads=8)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.72 µs


In [11]:
# Keep only the rows that are not flagged as test data, then collect garbage.
merge = merge.loc[~merge.is_test]
gc.collect()

42

In [12]:
# Print the (rows, columns) shape, then display the dtype of every column.
print(merge.shape)
merge.dtypes

(126923238, 37)


app                                                uint16
channel                                            uint16
click_id                                           uint32
device                                             uint16
ip                                                 uint32
is_attributed                                       uint8
os                                                 uint16
click_time                                 datetime64[ns]
is_test                                              bool
dow                                                 uint8
hour                                                uint8
minute                                              uint8
min5                                                uint8
second                                              uint8
sin_time                                          float64
cos_time                                          float64
ip_os_device                                       uint64
ip_os_device_d

### preparation

In [13]:
%%time
# Group the click table by `channel`. With as_index=False the aggregated frames
# below carry `channel` as an ordinary column instead of as the index.
channel_grp = merge.groupby('channel', as_index=False)

CPU times: user 4 ms, sys: 48 ms, total: 52 ms
Wall time: 50.2 ms


In [14]:
# Remove the notebook-level name `merge` and run a garbage-collection pass.
# NOTE(review): `channel_grp` was built from this frame and is still used by the
# cells below, so the underlying data presumably stays in memory -- confirm.
del merge
gc.collect()

0

### simple count

In [15]:
%%time
count_by_channel = (channel_grp.ip.count().rename(columns={'ip': 'count_by_channel'}))

CPU times: user 1.93 s, sys: 1.05 s, total: 2.98 s
Wall time: 2.98 s


In [24]:
# Show the first rows of the per-channel click counts.
count_by_channel.head()

Unnamed: 0,channel,count_by_channel
0,0,1258
1,3,546455
2,4,735
3,5,9289
4,13,86058


In [23]:
%%time
attrib_by_channel = (channel_grp.is_attributed.sum().rename(columns={'is_attributed': 'attrib_by_channel'})).astype('uint32')

CPU times: user 896 ms, sys: 496 ms, total: 1.39 s
Wall time: 1.39 s


In [26]:
# Show the first rows of the per-channel attribution sums.
attrib_by_channel.head()

Unnamed: 0,channel,attrib_by_channel
0,0,87
1,3,283
2,4,54
3,5,5735
4,13,6


In [29]:
# Place the click counts and the attribution sums side by side, one row per
# channel. Both frames come from the same groupby, so their row indexes line up.
agg_by_channel = pd.concat(
    [count_by_channel, attrib_by_channel[['attrib_by_channel']]],
    axis=1,
)

In [30]:
# Persist the per-channel count/attribution table so later sections can reload it.
agg_by_channel.to_feather('../data/agg_by_channel.feather')

In [31]:
# The table is saved on disk; drop the in-memory name and collect garbage.
del agg_by_channel
gc.collect()

7

In [32]:
# List the feather files currently in ../data/ with their sizes.
ls -lah ../data/*.feather

-rw-r--r-- 1 ubuntu ubuntu 4.3K May  4 15:41 ../data/agg_by_channel.feather
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 ../data/click_data.feather
-rw-r--r-- 1 ubuntu ubuntu 3.2K May  4 11:58 ../data/count_by_channel.feather
-rw-r--r-- 1 ubuntu ubuntu  18G May  4 07:40 ../data/merge.feather
-rw-r--r-- 1 ubuntu ubuntu 7.8K May  4 11:58 ../data/uqcount_by_channel.feather


### unique count

In [35]:
%%time
uqcount_by_channel = pd.concat([
    channel_grp.app.nunique().to_frame('uqcount_app_by_channel')
    , channel_grp.ip.nunique().to_frame('uqcount_ip_by_channel')
    , channel_grp.device.nunique().to_frame('uqcount_device_by_channel')
    , channel_grp.os.nunique().to_frame('uqcount_os_by_channel')
    , channel_grp.ip_os_device_dow_hour.nunique().to_frame('uqcount_user_by_channel')
], axis=1)

CPU times: user 2min 46s, sys: 5.46 s, total: 2min 51s
Wall time: 2min 51s


In [36]:
# Show the first rows of the per-channel distinct counts.
uqcount_by_channel.head()

Unnamed: 0,uqcount_app_by_channel,uqcount_ip_by_channel,uqcount_device_by_channel,uqcount_os_by_channel,uqcount_user_by_channel
0,25,705,13,49,776
1,13,10646,12,184,279071
2,6,496,6,42,587
3,2,6831,7,44,8403
4,2,8995,6,108,36210


In [37]:
# Persist the per-channel distinct counts so later sections can reload them.
uqcount_by_channel.to_feather('../data/uqcount_by_channel.feather')

In [38]:
# The table is saved on disk; drop the in-memory name and collect garbage.
del uqcount_by_channel
gc.collect()

69

In [1]:
# List the feather files currently in ../data/ with their sizes.
ls -lah ../data/*.feather

-rw-r--r-- 1 ubuntu ubuntu 4.3K May  4 15:41 ../data/agg_by_channel.feather
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 ../data/click_data.feather
-rw-r--r-- 1 ubuntu ubuntu 1.1M May  4 15:45 ../data/count_mat_channel_app.feather
-rw-r--r-- 1 ubuntu ubuntu  18G May  4 07:40 ../data/merge.feather
-rw-r--r-- 1 ubuntu ubuntu 8.4K May  4 15:44 ../data/uqcount_by_channel.feather


### cross tabulation (app)

In [3]:
%time
merge = pd.read_feather("../data/merge.feather", nthreads=8)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


In [4]:
# Keep the three columns the cross tabulations need, for non-test rows only,
# and add a constant `counter` column of ones to sum over later.
# A single .loc selection replaces the chained `merge[[...]][mask]` indexing,
# which first copied every row (test rows included) and then assigned a new
# column onto the result of a chained selection. `assign` returns a new frame,
# so the new column never lands on a view of the original.
merge = (
    merge
    .loc[~merge.is_test, ['channel', 'app', 'ip_os_device_dow_hour']]
    .assign(counter=1)
)
gc.collect()

0

In [5]:
# Print the (rows, columns) shape and show the first rows of the reduced table.
print(merge.shape)
merge.head()

(126923238, 4)


Unnamed: 0,channel,app,ip_os_device_dow_hour,counter
0,153,23,765080350001103,1
1,134,18,765080350001103,1
2,315,15,765080350001103,1
3,469,11,765080350001103,1
4,379,3,765080350001103,1


In [6]:
# Collapse repeated (channel, app, user key) rows to a single row each;
# `counter` is 1 on every row, so it does not affect which rows are duplicates.
merge = merge.drop_duplicates()

In [7]:
# Print the shape after de-duplication and show the first rows.
print(merge.shape)
merge.head()

(83747805, 4)


Unnamed: 0,channel,app,ip_os_device_dow_hour,counter
0,153,23,765080350001103,1
1,134,18,765080350001103,1
2,315,15,765080350001103,1
3,469,11,765080350001103,1
4,379,3,765080350001103,1


In [8]:
%%time
count_user_by_channel_app = merge.pivot_table(values  = 'counter',
                                              index   = 'channel',
                                              columns = 'app', 
                                              aggfunc = sum,
                                              fill_value = 0)

CPU times: user 3.78 s, sys: 2.33 s, total: 6.12 s
Wall time: 6.11 s


In [9]:
# Build the `app_NNN` labels from the pivot table's own column index, so each
# label is guaranteed to sit on the column it describes. Labels were previously
# generated from set(merge.app): set iteration order is not guaranteed to match
# the pivot's sorted columns, and building the set rescanned the whole table.
applabel = ['app_{0:03d}'.format(int(app_id))
            for app_id in count_user_by_channel_app.columns]
count_user_by_channel_app.columns = applabel
# Move `channel` from the index into a regular column (feather needs a default index).
count_user_by_channel_app.reset_index(inplace=True)

In [10]:
# Show the first rows of the channel x app table.
count_user_by_channel_app.head()

Unnamed: 0,channel,app_000,app_001,app_002,app_003,app_004,app_005,app_006,app_007,app_008,...,app_758,app_759,app_760,app_761,app_763,app_764,app_765,app_766,app_767,app_768
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,13,0,36209,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Show the last rows of the channel x app table.
count_user_by_channel_app.tail()

Unnamed: 0,channel,app_000,app_001,app_002,app_003,app_004,app_005,app_006,app_007,app_008,...,app_758,app_759,app_760,app_761,app_763,app_764,app_765,app_766,app_767,app_768
194,490,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
195,496,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,497,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,498,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Persist the channel x app table so the PCA section can reload it.
count_user_by_channel_app.to_feather('../data/count_user_by_channel_app.feather')

In [13]:
# The table is saved on disk; drop the in-memory name and collect garbage.
del count_user_by_channel_app
gc.collect()

6090

In [14]:
# List the feather files currently in ../data/ with their sizes.
ls -lah ../data/*.feather

-rw-r--r-- 1 ubuntu ubuntu 4.3K May  4 15:41 ../data/agg_by_channel.feather
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 ../data/click_data.feather
-rw-r--r-- 1 ubuntu ubuntu 1.1M May  4 16:12 ../data/count_user_by_channel_app.feather
-rw-r--r-- 1 ubuntu ubuntu  18G May  4 07:40 ../data/merge.feather
-rw-r--r-- 1 ubuntu ubuntu 8.4K May  4 15:44 ../data/uqcount_by_channel.feather


### cross tabulation (channels clicked by the same user)

In [8]:
# Drop the `app` column, then collapse the rows that have become identical so
# each (channel, user key) pair appears once.
merge = merge.drop('app', axis=1).drop_duplicates()

In [9]:
# Print the shape after dropping `app` and de-duplicating, then show the first rows.
print(merge.shape)
merge.head()

(75266415, 3)


Unnamed: 0,channel,ip_os_device_dow_hour,counter
0,153,765080350001103,1
1,134,765080350001103,1
2,315,765080350001103,1
3,469,765080350001103,1
4,379,765080350001103,1


In [10]:
# Store the constant-one `counter` column as booleans.
merge['counter'] = merge.counter.astype(bool)

In [12]:
# Reshape to wide form: one row per user key, one (counter, channel) column per
# channel. Combinations absent from `merge` are left as missing values.
d = merge.pivot(index='ip_os_device_dow_hour', columns='channel')

In [17]:
# Treat (user key, channel) pairs that never occurred as False.
d = d.fillna(value=False)

In [18]:
# Show the first rows of the user-key x channel indicator table.
d.head()

Unnamed: 0_level_0,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter
channel,0,3,4,5,13,14,15,17,18,19,...,484,486,487,488,489,490,496,497,498,500
ip_os_device_dow_hour,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
10020001303,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10020001314,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10040001312,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10040001314,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10090001303,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
%%time
mat = np.dot(d.astype('bool').astype('uint16').values.T,
             d.astype('bool').astype('uint16').values)

CPU times: user 7min 41s, sys: 3.5 s, total: 7min 45s
Wall time: 7min 45s


In [20]:
# Wrap the channel x channel count matrix in a DataFrame (default integer labels).
count_channel_coincidence_per_user = pd.DataFrame(data=mat)

In [21]:
# The counts now live in count_channel_coincidence_per_user; drop the large
# intermediates and collect garbage.
del merge, mat, d
gc.collect()

101

In [25]:
# Label the columns `chan_NNN`, using the channel ids stored in
# agg_by_channel.feather. This cell used to read `merge.channel`, but `merge`
# has already been deleted by this point, so it raised NameError on a clean
# top-to-bottom run. It also relied on set() iteration order matching the
# sorted pivot columns.
# NOTE(review): assumes agg_by_channel.feather was built from the same non-test
# rows, so it lists exactly the channels in the matrix -- confirm. A length
# mismatch raises ValueError on the assignment below.
channel_ids = sorted(pd.read_feather('../data/agg_by_channel.feather')['channel'])
chanlabel = ['chan_{0:03d}'.format(int(channel_id)) for channel_id in channel_ids]
count_channel_coincidence_per_user.columns = chanlabel
count_channel_coincidence_per_user.head()

Unnamed: 0,chan_000,chan_003,chan_004,chan_005,chan_013,chan_014,chan_015,chan_017,chan_018,chan_019,...,chan_484,chan_486,chan_487,chan_488,chan_489,chan_490,chan_496,chan_497,chan_498,chan_500
0,776,46,0,0,4,0,1,21,5,98,...,7,0,52,2,140,5,20,55,0,0
1,46,16927,6,143,2478,1,262,6136,1586,22342,...,3382,107,8503,604,37836,1745,1724,11075,8,0
2,0,6,587,1,1,0,0,14,1,58,...,2,0,27,0,169,2,7,21,0,0
3,0,143,1,8403,43,0,5,123,17,372,...,58,1,171,10,537,36,118,210,0,0
4,4,2478,1,43,36210,1,1115,1083,2152,3988,...,5198,105,1635,1673,6331,1103,4,1999,1,0


In [26]:
# Persist the channel co-occurrence table so the PCA section can reload it.
count_channel_coincidence_per_user.to_feather('../data/count_channel_coincidence_per_user.feather')

In [27]:
# The table is saved on disk; drop the in-memory name and collect garbage.
del count_channel_coincidence_per_user
gc.collect()

1937

In [28]:
# List the feather files currently in ../data/ with their sizes.
ls -lah ../data/*.feather

-rw-r--r-- 1 ubuntu ubuntu 4.3K May  4 15:41 ../data/agg_by_channel.feather
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 ../data/click_data.feather
-rw-r--r-- 1 ubuntu ubuntu  93K May  4 16:57 ../data/count_channel_coincidence_per_user.feather
-rw-r--r-- 1 ubuntu ubuntu 1.1M May  4 16:12 ../data/count_user_by_channel_app.feather
-rw-r--r-- 1 ubuntu ubuntu  18G May  4 07:40 ../data/merge.feather
-rw-r--r-- 1 ubuntu ubuntu 8.4K May  4 15:44 ../data/uqcount_by_channel.feather


### PCA

In [39]:
from sklearn.decomposition import PCA

In [34]:
# Assemble the per-channel feature table by placing the four saved frames side
# by side.
# NOTE(review): the frames are aligned by their default row index (position),
# and `channel` appears twice in the result (shown as `channel.1` in the saved
# output) -- confirm every file lists the channels in the same order.
feature_files = [
    '../data/agg_by_channel.feather',
    '../data/uqcount_by_channel.feather',
    '../data/count_user_by_channel_app.feather',
    '../data/count_channel_coincidence_per_user.feather',
]
d = pd.concat([pd.read_feather(path) for path in feature_files], axis=1)

In [44]:
# Print the (rows, columns) shape of the combined feature table and show the first rows.
print(d.shape)
d.head()

(199, 882)


Unnamed: 0,channel,count_by_channel,attrib_by_channel,uqcount_app_by_channel,uqcount_ip_by_channel,uqcount_device_by_channel,uqcount_os_by_channel,uqcount_user_by_channel,channel.1,app_000,...,chan_484,chan_486,chan_487,chan_488,chan_489,chan_490,chan_496,chan_497,chan_498,chan_500
0,0,1258,87,25,705,13,49,776,0,0,...,7,0,52,2,140,5,20,55,0,0
1,3,546455,283,13,10646,12,184,279071,3,0,...,3382,107,8503,604,37836,1745,1724,11075,8,0
2,4,735,54,6,496,6,42,587,4,0,...,2,0,27,0,169,2,7,21,0,0
3,5,9289,5735,2,6831,7,44,8403,5,0,...,58,1,171,10,537,36,118,210,0,0
4,13,86058,6,2,8995,6,108,36210,13,0,...,5198,105,1635,1673,6331,1103,4,1999,1,0


In [37]:
# Separate the channel id and attribution sum (y) from the remaining feature columns (X).
target_cols = ['channel', 'attrib_by_channel']
y = d[target_cols]
X = d.drop(target_cols, axis=1)

In [52]:
# Fit a PCA with default settings on the feature matrix and project the rows
# onto the fitted components.
# NOTE(review): the features are raw counts and are not standardised before the
# fit -- confirm that is intended.
pca = PCA()
res = pca.fit_transform(X.values)

In [53]:
# Shape of the projected data: (rows, components kept).
res.shape

(199, 199)