In [1]:
import gc
import time
from logzero import logger
import numpy as np
import pandas as pd

In [2]:
ls -lah ../data/

total 24G
drwxrwxr-x 5 ubuntu ubuntu 4.0K May  4 16:57 [0m[01;34m.[0m/
drwxrwxr-x 8 ubuntu ubuntu 4.0K Apr 30 17:30 [01;34m..[0m/
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 16:52 [01;34m.ipynb_checkpoints[0m/
-rw-r--r-- 1 ubuntu ubuntu 4.3K May  5 02:58 agg_by_channel.feather
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 click_data.feather
-rw-r--r-- 1 ubuntu ubuntu  93K May  4 16:57 count_channel_coincidence_per_user.feather
-rw-r--r-- 1 ubuntu ubuntu 1.2M May  5 03:03 count_user_by_channel_app.feather
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 29 14:21 [01;34mkenkoooos[0m/
-rw-r--r-- 1 ubuntu ubuntu  18G May  4 07:40 merge.feather
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 17:13 [01;34mraw[0m/
-rw-rw-r-- 1 ubuntu ubuntu 1.4G Apr 28 17:20 [01;31mraw.zip[0m
-rw-r--r-- 1 ubuntu ubuntu 8.4K May  5 03:02 uqcount_by_channel.feather


In [3]:
%time
merge = pd.read_feather("../data/merge.feather", nthreads=8)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.96 µs


In [4]:
# Keep only the rows that are not flagged as test data, then trigger a GC pass.
merge = merge.loc[~merge.is_test]
gc.collect()

14

In [5]:
# Print the (rows, columns) of the filtered frame, then display each column's dtype.
print(merge.shape)
merge.dtypes

(126923238, 37)


app                                                uint16
channel                                            uint16
click_id                                           uint32
device                                             uint16
ip                                                 uint32
is_attributed                                       uint8
os                                                 uint16
click_time                                 datetime64[ns]
is_test                                              bool
dow                                                 uint8
hour                                                uint8
minute                                              uint8
min5                                                uint8
second                                              uint8
sin_time                                          float64
cos_time                                          float64
ip_os_device                                       uint64
ip_os_device_d

### preparation

In [6]:
%%time
channel_grp = merge.groupby('channel', as_index=False)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 253 µs


In [7]:
# Drop the notebook-level name for the full frame and run a collection pass.
# NOTE(review): `channel_grp` was built from this frame, so the data presumably
# stays alive through the groupby object - confirm before relying on the memory
# being released here.
del merge
gc.collect()

0

### simple count

In [8]:
%%time
count_by_channel = (channel_grp.ip.count().rename(columns={'ip': 'count_by_channel'}))

CPU times: user 2.44 s, sys: 1.11 s, total: 3.55 s
Wall time: 3.55 s


In [9]:
# Preview the first rows of the per-channel click counts.
count_by_channel.head()

Unnamed: 0,channel,count_by_channel
0,0,1258
1,3,546455
2,4,735
3,5,9289
4,13,86058


In [10]:
%%time
attrib_by_channel = (channel_grp.is_attributed.sum().rename(columns={'is_attributed': 'attrib_by_channel'})).astype('uint32')

CPU times: user 1 s, sys: 588 ms, total: 1.59 s
Wall time: 1.59 s


In [11]:
# Preview the first rows of the per-channel attribution totals.
attrib_by_channel.head()

Unnamed: 0,channel,attrib_by_channel
0,0,87
1,3,283
2,4,54
3,5,5735
4,13,6


In [12]:
# Put the attribution totals next to the click counts (column-wise concat;
# both frames come from the same groupby, so rows are aligned by position).
agg_by_channel = pd.concat(
    objs=[count_by_channel, attrib_by_channel.loc[:, 'attrib_by_channel']],
    axis=1,
)

In [13]:
# Persist the combined per-channel count/attribution table as a feather file.
agg_by_channel.to_feather('../data/agg_by_channel.feather')

In [14]:
# The table has been written to disk; drop the in-memory copy and collect garbage.
del agg_by_channel
gc.collect()

14

In [15]:
ls -lah ../data/*.feather

-rw-r--r-- 1 ubuntu ubuntu 4.3K May  5 04:22 ../data/agg_by_channel.feather
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 ../data/click_data.feather
-rw-r--r-- 1 ubuntu ubuntu  93K May  4 16:57 ../data/count_channel_coincidence_per_user.feather
-rw-r--r-- 1 ubuntu ubuntu 1.2M May  5 03:03 ../data/count_user_by_channel_app.feather
-rw-r--r-- 1 ubuntu ubuntu  18G May  4 07:40 ../data/merge.feather
-rw-r--r-- 1 ubuntu ubuntu 8.4K May  5 03:02 ../data/uqcount_by_channel.feather


### unique count

In [16]:
%%time
uqcount_by_channel = pd.concat([
    channel_grp.app.nunique().to_frame('uqcount_app_by_channel')
    , channel_grp.ip.nunique().to_frame('uqcount_ip_by_channel')
    , channel_grp.device.nunique().to_frame('uqcount_device_by_channel')
    , channel_grp.os.nunique().to_frame('uqcount_os_by_channel')
    , channel_grp.ip_os_device_dow_hour.nunique().to_frame('uqcount_user_by_channel')
], axis=1)

CPU times: user 3min 36s, sys: 3.48 s, total: 3min 39s
Wall time: 3min 39s


In [17]:
# Preview the first rows of the per-channel distinct-value counts.
uqcount_by_channel.head()

Unnamed: 0,uqcount_app_by_channel,uqcount_ip_by_channel,uqcount_device_by_channel,uqcount_os_by_channel,uqcount_user_by_channel
0,25,705,13,49,776
1,13,10646,12,184,279071
2,6,496,6,42,587
3,2,6831,7,44,8403
4,2,8995,6,108,36210


In [18]:
# Persist the per-channel distinct-value counts as a feather file.
uqcount_by_channel.to_feather('../data/uqcount_by_channel.feather')

In [19]:
# The table has been written to disk; drop the in-memory copy and collect garbage.
del uqcount_by_channel
gc.collect()

151

In [20]:
ls -lah ../data/*.feather

-rw-r--r-- 1 ubuntu ubuntu 4.3K May  5 04:22 ../data/agg_by_channel.feather
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 ../data/click_data.feather
-rw-r--r-- 1 ubuntu ubuntu  93K May  4 16:57 ../data/count_channel_coincidence_per_user.feather
-rw-r--r-- 1 ubuntu ubuntu 1.2M May  5 03:03 ../data/count_user_by_channel_app.feather
-rw-r--r-- 1 ubuntu ubuntu  18G May  4 07:40 ../data/merge.feather
-rw-r--r-- 1 ubuntu ubuntu 8.4K May  5 04:26 ../data/uqcount_by_channel.feather


### cross tabulation (app)

In [44]:
%time
merge = pd.read_feather("../data/merge.feather", nthreads=8)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 5.72 µs


In [22]:
# Select the non-test rows and the three needed columns in one `.loc` call and
# attach the constant `counter` column with `assign`. This avoids writing a new
# column onto the result of chained indexing (`df[cols][mask]`) and avoids
# materialising the column subset for rows that are discarded right after.
merge = (
    merge.loc[~merge.is_test, ['channel', 'app', 'ip_os_device_dow_hour']]
    .assign(counter=1)
)
gc.collect()

0

In [23]:
# Check the dimensions and first rows of the reduced frame (three columns plus `counter`).
print(merge.shape)
merge.head()

(126923238, 4)


Unnamed: 0,channel,app,ip_os_device_dow_hour,counter
0,153,23,765080350001103,1
1,134,18,765080350001103,1
2,315,15,765080350001103,1
3,469,11,765080350001103,1
4,379,3,765080350001103,1


In [24]:
# Keep a single row for every distinct combination of the remaining columns.
merge = merge.drop_duplicates()

In [25]:
# Check how many rows remain after removing duplicate rows.
print(merge.shape)
merge.head()

(83747805, 4)


Unnamed: 0,channel,app,ip_os_device_dow_hour,counter
0,153,23,765080350001103,1
1,134,18,765080350001103,1
2,315,15,765080350001103,1
3,469,11,765080350001103,1
4,379,3,765080350001103,1


In [26]:
%%time
count_user_by_channel_app = merge.pivot_table(values  = 'counter',
                                              index   = 'channel',
                                              columns = 'app', 
                                              aggfunc = sum,
                                              fill_value = 0)

CPU times: user 4.61 s, sys: 2.61 s, total: 7.22 s
Wall time: 7.22 s


In [27]:
# Build the labels from the pivot's own column index so that label k always
# describes column k. Iterating a `set` of app ids carries no ordering
# guarantee and could silently attach the wrong app id to a column; it also
# required a full pass over the very large `merge.app` column.
applabel = ['app_{0:03d}'.format(int(app_id))
            for app_id in count_user_by_channel_app.columns]
count_user_by_channel_app.columns = applabel
# Turn the `channel` index back into a regular column.
count_user_by_channel_app.reset_index(inplace=True)

In [28]:
# Preview the first rows of the channel x app table.
count_user_by_channel_app.head()

Unnamed: 0,channel,app_000,app_001,app_002,app_003,app_004,app_005,app_006,app_007,app_008,...,app_758,app_759,app_760,app_761,app_763,app_764,app_765,app_766,app_767,app_768
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,13,0,36209,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Preview the last rows of the channel x app table.
count_user_by_channel_app.tail()

Unnamed: 0,channel,app_000,app_001,app_002,app_003,app_004,app_005,app_006,app_007,app_008,...,app_758,app_759,app_760,app_761,app_763,app_764,app_765,app_766,app_767,app_768
194,490,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
195,496,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,497,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
197,498,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,500,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
# Persist the channel x app table as a feather file.
count_user_by_channel_app.to_feather('../data/count_user_by_channel_app.feather')

In [31]:
# The table has been written to disk; drop the in-memory copy and collect garbage.
del count_user_by_channel_app
gc.collect()

6090

In [32]:
ls -lah ../data/*.feather

-rw-r--r-- 1 ubuntu ubuntu 4.3K May  5 04:22 ../data/agg_by_channel.feather
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 ../data/click_data.feather
-rw-r--r-- 1 ubuntu ubuntu  93K May  4 16:57 ../data/count_channel_coincidence_per_user.feather
-rw-r--r-- 1 ubuntu ubuntu 1.1M May  5 04:27 ../data/count_user_by_channel_app.feather
-rw-r--r-- 1 ubuntu ubuntu  18G May  4 07:40 ../data/merge.feather
-rw-r--r-- 1 ubuntu ubuntu 8.4K May  5 04:26 ../data/uqcount_by_channel.feather


### cross tabulation (channels clicked by the same user)

In [33]:
# Remove the `app` column, then collapse rows that have become identical.
merge = merge.drop('app', axis=1)
merge = merge.drop_duplicates()

In [34]:
# Check the dimensions and first rows after dropping `app` and de-duplicating.
print(merge.shape)
merge.head()

(75266415, 3)


Unnamed: 0,channel,ip_os_device_dow_hour,counter
0,153,765080350001103,1
1,134,765080350001103,1
2,315,765080350001103,1
3,469,765080350001103,1
4,379,765080350001103,1


In [35]:
# Run a full garbage-collection pass; the displayed value is what gc.collect() returns.
gc.collect()

89

In [36]:
# Store the constant marker column as booleans.
merge['counter'] = merge.counter.astype(bool)

In [37]:
# Reshape to one row per `ip_os_device_dow_hour` value and one column per channel.
d = merge.pivot(index='ip_os_device_dow_hour', columns='channel')

In [38]:
# Combinations missing from the pivot are filled with False.
d = d.fillna(value=False)

In [39]:
# Preview the first rows of the pivoted indicator table.
d.head()

Unnamed: 0_level_0,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter,counter
channel,0,3,4,5,13,14,15,17,18,19,...,484,486,487,488,489,490,496,497,498,500
ip_os_device_dow_hour,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
10020001303,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10020001314,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10040001312,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10040001314,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10090001303,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [40]:
%%time
mat = np.dot(d.astype('bool').astype('uint16').values.T,
             d.astype('bool').astype('uint16').values)

CPU times: user 8min 20s, sys: 4.02 s, total: 8min 24s
Wall time: 8min 24s


In [41]:
# Wrap the raw co-occurrence array in a DataFrame (default integer labels).
count_channel_coincidence_per_user = pd.DataFrame(data=mat)

In [45]:
# Take the labels from the `channel` level of the pivot's columns, which is the
# order the rows/columns of `mat` were computed in. A `set` of channel ids has
# no guaranteed order, and the ids in `merge` only match the matrix if `merge`
# is still the exact frame the pivot was built from.
chanlabel = ['chan_{0:03d}'.format(int(chan_id))
             for chan_id in d.columns.get_level_values('channel')]
count_channel_coincidence_per_user.columns = chanlabel
count_channel_coincidence_per_user.head()

Unnamed: 0,chan_000,chan_003,chan_004,chan_005,chan_013,chan_014,chan_015,chan_017,chan_018,chan_019,...,chan_484,chan_486,chan_487,chan_488,chan_489,chan_490,chan_496,chan_497,chan_498,chan_500
0,776,46,0,0,4,0,1,21,5,98,...,7,0,52,2,140,5,20,55,0,0
1,46,16927,6,143,2478,1,262,6136,1586,22342,...,3382,107,8503,604,37836,1745,1724,11075,8,0
2,0,6,587,1,1,0,0,14,1,58,...,2,0,27,0,169,2,7,21,0,0
3,0,143,1,8403,43,0,5,123,17,372,...,58,1,171,10,537,36,118,210,0,0
4,4,2478,1,43,36210,1,1115,1083,2152,3988,...,5198,105,1635,1673,6331,1103,4,1999,1,0


In [42]:
# Release the large intermediates (source frame, raw matrix, pivot) and collect garbage.
del merge, mat, d
gc.collect()

111

In [46]:
# Persist the channel x channel co-occurrence table as a feather file.
count_channel_coincidence_per_user.to_feather('../data/count_channel_coincidence_per_user.feather')

In [47]:
# The table has been written to disk; drop the in-memory copy and collect garbage.
del count_channel_coincidence_per_user
gc.collect()

1803

In [48]:
ls -lah ../data/*.feather

-rw-r--r-- 1 ubuntu ubuntu 4.3K May  5 04:22 ../data/agg_by_channel.feather
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 ../data/click_data.feather
-rw-r--r-- 1 ubuntu ubuntu  93K May  5 04:53 ../data/count_channel_coincidence_per_user.feather
-rw-r--r-- 1 ubuntu ubuntu 1.1M May  5 04:27 ../data/count_user_by_channel_app.feather
-rw-r--r-- 1 ubuntu ubuntu  18G May  4 07:40 ../data/merge.feather
-rw-r--r-- 1 ubuntu ubuntu 8.4K May  5 04:26 ../data/uqcount_by_channel.feather


### pca

In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

In [2]:
# Read the four per-channel feature tables and place them side by side.
_feature_files = [
    '../data/agg_by_channel.feather',
    '../data/uqcount_by_channel.feather',
    '../data/count_user_by_channel_app.feather',
    '../data/count_channel_coincidence_per_user.feather',
]
d = pd.concat([pd.read_feather(path) for path in _feature_files], axis=1)

In [3]:
# Check the dimensions and first rows of the combined feature table.
print(d.shape)
d.head()

(199, 882)


Unnamed: 0,channel,count_by_channel,attrib_by_channel,uqcount_app_by_channel,uqcount_ip_by_channel,uqcount_device_by_channel,uqcount_os_by_channel,uqcount_user_by_channel,channel.1,app_000,...,chan_484,chan_486,chan_487,chan_488,chan_489,chan_490,chan_496,chan_497,chan_498,chan_500
0,0,1258,87,25,705,13,49,776,0,0,...,7,0,52,2,140,5,20,55,0,0
1,3,546455,283,13,10646,12,184,279071,3,0,...,3382,107,8503,604,37836,1745,1724,11075,8,0
2,4,735,54,6,496,6,42,587,4,0,...,2,0,27,0,169,2,7,21,0,0
3,5,9289,5735,2,6831,7,44,8403,5,0,...,58,1,171,10,537,36,118,210,0,0
4,13,86058,6,2,8995,6,108,36210,13,0,...,5198,105,1635,1673,6331,1103,4,1999,1,0


In [4]:
# `d` contains the `channel` label more than once (it is present in several of
# the concatenated tables). Keep the first occurrence of each label explicitly
# instead of slicing with `.iloc[:, 1:]`, which only yields
# (channel, attrib_by_channel) when `channel` happens to appear exactly twice.
y = d.loc[:, ['channel', 'attrib_by_channel']]
y = y.loc[:, ~y.columns.duplicated()]
# Everything except the channel id(s) and the target is a feature.
X = d.drop(['channel', 'attrib_by_channel'], axis=1)

In [5]:
n_components = 20
# Fix the seed so the projection is reproducible across runs.
# NOTE(review): for an input of this shape scikit-learn's automatic solver
# choice presumably selects the randomized SVD, which is stochastic - confirm
# against the installed version's PCA documentation.
pca = PCA(n_components=n_components, random_state=0)
res = pca.fit_transform(X.values)

In [6]:
# Check the dimensions of the PCA output.
res.shape

(199, 20)

In [7]:
# Running total of the explained-variance ratio over the fitted components.
pca.explained_variance_ratio_.cumsum()

array([0.84613685, 0.88533139, 0.91000717, 0.92937141, 0.94309187,
       0.95279619, 0.96147134, 0.96703268, 0.97245128, 0.97691818,
       0.98097505, 0.98406001, 0.98667171, 0.98860084, 0.99005921,
       0.99137541, 0.99250953, 0.9935579 , 0.99424756, 0.99482423])

In [8]:
# Name the components 01..k, with k taken from the PCA output itself rather
# than a hard-coded 20, so the labels stay correct if the number of components
# is changed.
col = ['channel_pca_{0:02d}'.format(k) for k in range(1, res.shape[1] + 1)]
res = pd.DataFrame(res, columns=col)
# Put the channel id back in front of the component scores.
channel_pca = pd.concat([y.channel, res], axis=1)

In [10]:
# Persist the per-channel PCA scores as a feather file.
channel_pca.to_feather('../data/channel_pca.feather')

In [9]:
# Preview the first rows of the per-channel PCA scores.
channel_pca.head()

Unnamed: 0,channel,channel_pca_01,channel_pca_02,channel_pca_03,channel_pca_04,channel_pca_05,channel_pca_06,channel_pca_07,channel_pca_08,channel_pca_09,...,channel_pca_11,channel_pca_12,channel_pca_13,channel_pca_14,channel_pca_15,channel_pca_16,channel_pca_17,channel_pca_18,channel_pca_19,channel_pca_20
0,0,-752818.244165,51908.446351,24084.129406,-61086.744195,69772.192004,9664.09881,-21196.799379,5825.773622,10385.103648,...,-12887.367785,-2545.875057,-37452.297652,4153.414052,8944.322702,5453.486828,7468.202736,724.31247,3826.247412,2832.185252
1,3,-130830.41228,-39690.161209,-75434.725347,-49293.680501,-2887.239246,114504.559921,175029.709959,-135512.24591,-48637.925197,...,-1681.408214,-1708.302756,47264.302591,-13309.292334,-12437.432243,-18154.741356,-21433.61699,-10974.927755,-27888.468723,-5670.686402
2,4,-753386.848518,51979.27963,24131.105319,-61155.07512,69854.031239,9622.948792,-21246.193832,5839.320265,10416.72236,...,-12984.261963,-2582.235473,-37749.390562,4175.201124,9028.090233,5526.026571,7551.844882,731.748658,3884.571108,2873.608418
3,5,-742193.57272,50291.630508,23837.067834,-58894.469669,67275.447628,8970.97859,-20197.406721,6673.033984,9845.711583,...,-12450.057179,-2376.740001,-36298.403254,3999.363226,8873.627135,5868.181044,7663.26033,326.569169,3127.659777,2897.295679
4,13,-659150.843687,39703.522279,19338.725568,-51718.550735,57973.518248,17399.89477,-9317.838809,17013.266838,2051.281541,...,-3649.617223,1863.984973,3756.45425,-2246.902155,11516.34475,-5239.480764,-10999.921091,2755.70152,15534.328965,6055.053807


### とりあえず回帰してみる (a quick trial regression)

In [None]:
# `add_constant`, `GLM` and `families` are exposed by `statsmodels.api`; the
# bare top-level package does not provide them as attributes.
import statsmodels.api as sm

In [92]:
# Scale every PCA score by the single largest score in the table, then append
# the intercept column as the last column.
pca_scores = channel_pca.iloc[:, 1:]
X1 = sm.add_constant(pca_scores / pca_scores.max().max(), prepend=False)

In [93]:
# Preview the first rows of the scaled design matrix (intercept column last).
X1.head()

Unnamed: 0,channel_pca_01,channel_pca_02,channel_pca_03,channel_pca_04,channel_pca_05,channel_pca_06,channel_pca_07,channel_pca_08,channel_pca_09,channel_pca_10,...,channel_pca_12,channel_pca_13,channel_pca_14,channel_pca_15,channel_pca_16,channel_pca_17,channel_pca_18,channel_pca_19,channel_pca_20,const
0,-0.067062,0.004624,0.002145,-0.005442,0.006215,0.000861,-0.001888,0.000519,0.000925,0.003068,...,-0.000227,-0.003336,0.00037,0.000797,0.000486,0.000665,6.5e-05,0.000341,0.000253,1.0
1,-0.011655,-0.003536,-0.00672,-0.004391,-0.000257,0.0102,0.015592,-0.012072,-0.004333,-0.00807,...,-0.000152,0.00421,-0.001186,-0.001108,-0.001617,-0.00191,-0.000978,-0.002484,-0.000519,1.0
2,-0.067112,0.00463,0.00215,-0.005448,0.006223,0.000857,-0.001893,0.00052,0.000928,0.003083,...,-0.00023,-0.003363,0.000372,0.000804,0.000492,0.000673,6.5e-05,0.000346,0.000257,1.0
3,-0.066115,0.00448,0.002123,-0.005246,0.005993,0.000799,-0.001799,0.000594,0.000877,0.00289,...,-0.000212,-0.003233,0.000356,0.00079,0.000523,0.000683,2.9e-05,0.000278,0.000259,1.0
4,-0.058718,0.003537,0.001723,-0.004607,0.005164,0.00155,-0.00083,0.001516,0.000183,0.002031,...,0.000166,0.000335,-0.0002,0.001026,-0.000467,-0.00098,0.000246,0.001384,0.000536,1.0


In [81]:
# Poisson regression of attributions with the click count as exposure. Under
# the log link the offset is added to the linear predictor, so it has to be
# log(count); passing the raw count would scale the mean by exp(count).
model = sm.GLM(y.attrib_by_channel,
               X1,
               family=sm.families.Poisson(),
               offset=np.log(X.count_by_channel))

In [96]:
# Same Poisson model on plain NumPy arrays, with log(click count) as offset.
attrib_counts = y.attrib_by_channel.values
design = X1.values
log_clicks = np.log(X.count_by_channel.values)
model = sm.GLM(attrib_counts, design, family=sm.families.Poisson(), offset=log_clicks)

In [97]:
# Print the dimensions of the response, the design matrix and the exposure column.
for model_input in (y.attrib_by_channel, X1, X.count_by_channel):
    print(model_input.shape)

(199,)
(199, 21)
(199,)


In [98]:
# Fit the GLM and print its regression summary table.
print(model.fit().summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                  199
Model:                            GLM   Df Residuals:                      178
Model Family:                 Poisson   Df Model:                           20
Link Function:                    log   Scale:                             1.0
Method:                          IRLS   Log-Likelihood:            -4.7362e+05
Date:                Sat, 05 May 2018   Deviance:                   9.4609e+05
Time:                        05:46:06   Pearson chi2:                 1.27e+08
No. Iterations:                     8                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1           -13.9893      0.024   -578.931      0.000     -14.037     -13.942
x2            25.7859      0.092    280.949      0.0