In [1]:
import gc
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
from logzero import logger

In [2]:
def do_count(df, group_cols, agg_type='uint32'):
    agg_name = "_".join(group_cols) + "_count"
    print("Aggregating by ", group_cols, '...')
    gp = df[group_cols][group_cols].groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    print(agg_name + " max value = ", df[agg_name].max())
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return(df)


def do_countuniq(df, group_cols, counted, agg_type='uint32'):
    agg_name = "_".join(group_cols) + "_{}_countuniq".format(counted)
    print("Counting unqiue ", counted, " by ", group_cols, '...')
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].nunique().reset_index().rename(columns={counted: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    print(agg_name + " max value = ", df[agg_name].max())
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return(df)


def do_cumcount(df, group_cols, counted, agg_type='uint32'):
    agg_name = "_".join(group_cols) + "_cumcount"
    print("Cumulative count by ", group_cols, '...')
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].cumcount()
    df[agg_name] = gp.values
    del gp
    print(agg_name + " max value = ", df[agg_name].max())
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return(df)


def do_mean(df, group_cols, counted, agg_type='float32'):
    agg_name = "_".join(group_cols) + "_mean"
    print("Calculating mean of ", counted, " by ", group_cols, '...')
    gp = df[group_cols + [counted]].groupby(group_cols)[counted].mean(
    ).reset_index().rename(columns={counted: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    print(agg_name + " max value = ", df[agg_name].max())
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return(df)


def do_var(df, group_cols, counted, agg_type='float32'):
    agg_name = "_".join(group_cols) + "_var"
    print("Calculating variance of ", counted, " by ", group_cols, '...')
    gp = df[group_cols + [counted]].groupby(group_cols)[counted].var(
    ).reset_index().rename(columns={counted: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    print(agg_name + " max value = ", df[agg_name].max())
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return(df)

In [3]:
%%time
df = pd.read_feather("../data/merged_click_data.hdf", "merged_click_data")

CPU times: user 52 ms, sys: 8.52 s, total: 8.57 s
Wall time: 8.57 s


In [4]:
df.dtypes

app                                              uint16
channel                                          uint16
click_id                                        float64
device                                           uint16
ip                                               uint32
is_attributed                                   float64
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [5]:
%%time
df = do_cumcount(df, ['ip', 'device', 'os'], 'app')
gc.collect()
df = do_cumcount(df, ['ip'], 'os')
gc.collect()
df = do_countuniq(df, ['ip'], 'channel', 'uint8')
gc.collect()
df = do_countuniq(df, ['ip', 'dow'], 'hour', 'uint8')
gc.collect()
df = do_countuniq(df, ['ip'], 'app', 'uint8')
gc.collect()
df = do_countuniq(df, ['ip', 'app'], 'os', 'uint8')
gc.collect()
df = do_countuniq(df, ['ip'], 'device', 'uint16')
gc.collect()
df = do_countuniq(df, ['app'], 'channel')
gc.collect()
df = do_countuniq(df, ['ip', 'device', 'os'], 'app')
gc.collect()
df = do_count(df, ['ip', 'dow', 'hour'])
gc.collect()
df = do_count(df, ['ip', 'app'])
gc.collect()
df = do_count(df, ['ip', 'app', 'os'], 'uint16')
gc.collect()
df = do_var(df, ['ip', 'dow', 'channel'], 'hour')
gc.collect()
df = do_var(df, ['ip', 'app', 'os'], 'hour')
gc.collect()
df = do_var(df, ['ip', 'app', 'channel'], 'dow')
gc.collect()
df = do_mean(df, ['ip', 'app', 'channel'], 'hour')
gc.collect()

Cumulative count by  ['ip', 'device', 'os'] ...
ip_device_os_cumcount max value =  282426
Cumulative count by  ['ip'] ...
ip_cumcount max value =  1421255
Counting unqiue  channel  by  ['ip'] ...
ip_channel_countuniq max value =  165
Counting unqiue  hour  by  ['ip', 'dow'] ...
ip_dow_hour_countuniq max value =  24
Counting unqiue  app  by  ['ip'] ...
ip_app_countuniq max value =  277
Counting unqiue  os  by  ['ip', 'app'] ...
ip_app_os_countuniq max value =  148
Counting unqiue  device  by  ['ip'] ...
ip_device_countuniq max value =  551
Counting unqiue  channel  by  ['app'] ...
app_channel_countuniq max value =  49
Counting unqiue  app  by  ['ip', 'device', 'os'] ...
ip_device_os_app_countuniq max value =  100
Aggregating by  ['ip', 'dow', 'hour'] ...
ip_dow_hour_count max value =  44259
Aggregating by  ['ip', 'app'] ...
ip_app_count max value =  220743
Aggregating by  ['ip', 'app', 'os'] ...
ip_app_os_count max value =  55159
Calculating variance of  hour  by  ['ip', 'dow', 'channel

In [6]:
df.columns

Index(['app', 'channel', 'click_id', 'device', 'ip', 'is_attributed', 'os',
       'click_count_by_ip_os_device_dow',
       'click_count_by_ip_os_device_dow_hour', 'click_count_by_ip', 'dow',
       'hour', 'minute', 'second', 'rank_by_ip', 'rank_by_ip_os_device',
       'rank_by_ip_os_device_dow', 'click_time_interval_by_ip',
       'click_time_interval_by_ip_os_device',
       'click_time_interval_by_ip_os_device_dow',
       'click_time_interval_by_ip_os_device_dow_hour', 'ip_device_os_cumcount',
       'ip_cumcount', 'ip_channel_countuniq', 'ip_dow_hour_countuniq',
       'ip_app_countuniq', 'ip_app_os_countuniq', 'ip_device_countuniq',
       'app_channel_countuniq', 'ip_device_os_app_countuniq',
       'ip_dow_hour_count', 'ip_app_count', 'ip_app_os_count',
       'ip_dow_channel_var', 'ip_app_os_var', 'ip_app_channel_var',
       'ip_app_channel_mean'],
      dtype='object')

In [7]:
df

Unnamed: 0,app,channel,click_id,device,ip,is_attributed,os,click_count_by_ip_os_device_dow,click_count_by_ip_os_device_dow_hour,click_count_by_ip,...,ip_device_countuniq,app_channel_countuniq,ip_device_os_app_countuniq,ip_dow_hour_count,ip_app_count,ip_app_os_count,ip_dow_channel_var,ip_app_os_var,ip_app_channel_var,ip_app_channel_mean
0,3,379,,1,83230,0.0,13,434,1,28085,...,26,49,49,1,5759,1431,7.893333,36.597950,1.389350,8.521276
1,3,379,,1,17357,0.0,19,183,1,26234,...,22,49,48,1,5245,1451,9.618462,25.959045,1.103880,8.236263
2,3,379,,1,35810,0.0,13,34,1,11002,...,16,49,41,1,2156,462,15.600000,33.348476,1.012026,8.872340
3,14,478,,1,45745,0.0,13,1501,1,188741,...,99,40,58,1,10547,2186,5.947712,36.870251,1.070305,10.361702
4,3,379,,1,161007,0.0,13,28,1,1171,...,5,49,28,1,232,80,10.800000,37.221359,0.619048,11.428572
5,3,379,,1,18787,0.0,16,1,1,4029,...,2,49,20,1,884,30,2.333333,18.975863,1.937909,10.277778
6,3,379,,1,103022,0.0,23,1,1,6099,...,11,49,4,1,1256,3,1.000000,44.333332,0.976950,11.208333
7,3,379,,1,114221,0.0,19,51,1,2334,...,4,49,29,1,404,88,32.000000,29.235109,2.028571,9.466666
8,3,379,,1,165970,0.0,13,19,1,2014,...,6,49,22,1,434,46,8.000000,26.908213,0.566667,14.333333
9,64,459,,1,74544,0.0,22,5,1,3882,...,6,3,18,1,32,1,13.809524,,0.555444,11.875000


In [8]:
%%time
df.to_hdf("../data/merged_click_data.hdf", "merged_click_data")

CPU times: user 36.5 s, sys: 2min 56s, total: 3min 32s
Wall time: 6min 16s


In [9]:
gc.collect()

95

In [10]:
%%time
df = pd.read_hdf("../data/merged_click_data.hdf", "merged_click_data")

CPU times: user 440 ms, sys: 12.6 s, total: 13 s
Wall time: 36.5 s


In [11]:
df.dtypes

app                                              uint16
channel                                          uint16
click_id                                        float64
device                                           uint16
ip                                               uint32
is_attributed                                   float64
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [12]:
%%time
MOD = int(1e9+7)
df["ip_os_device_app_hash"] = (df["ip"].astype(str)+"_"+df["os"].astype(str)+"_"+df["device"].astype(str)+"_"+df["app"].astype(str)).apply(hash)%MOD

CPU times: user 11min 13s, sys: 1min 17s, total: 12min 31s
Wall time: 12min 29s


In [13]:
df["ip_os_device_app_hash"] = df["ip_os_device_app_hash"].astype("uint32")

In [14]:
df.to_hdf("../data/merged_click_data.hdf", "merged_click_data")

# Add time difference features

In [15]:
df = pd.read_hdf("../data/merged_click_data.hdf", "merged_click_data")

In [16]:
df.dtypes

app                                              uint16
channel                                          uint16
click_id                                        float64
device                                           uint16
ip                                               uint32
is_attributed                                   float64
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [17]:
df = df[["dow", "hour", "minute", "second", "ip_os_device_app_hash"]]

In [18]:
df["click_time_second"] = df["dow"].astype("uint32")*24*60*60 + df["hour"].astype("uint32")*60*60 + df["minute"].astype("uint32")*60 + df["second"].astype("uint32")

In [19]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second
0,0,14,32,21,210218720,52341
1,0,14,33,34,808483042,52414
2,0,14,34,12,307460829,52452
3,0,14,34,52,250305947,52492
4,0,14,35,8,950166762,52508
5,0,14,36,26,90009919,52586
6,0,14,37,44,228641874,52664
7,0,14,37,59,261270055,52679
8,0,14,38,10,797366259,52690
9,0,14,38,23,87040044,52703


In [20]:
%%time
df.sort_values(by=["ip_os_device_app_hash", "click_time_second"], inplace=True)

CPU times: user 4min 26s, sys: 16.6 s, total: 4min 43s
Wall time: 4min 43s


In [21]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second
5983865,0,22,18,1,111,80281
20255942,1,3,17,36,111,98256
28230899,1,5,37,16,111,106636
32910114,1,7,12,4,111,112324
61244089,1,16,35,51,111,146151
64804845,1,20,43,33,111,161013
71205361,2,0,38,39,111,175119
73762271,2,1,24,26,111,177866
100583594,2,9,49,39,111,208179
104020559,2,10,50,55,111,211855


In [22]:
df["click_time_shift"] = df["click_time_second"].shift()

In [24]:
df["click_time_interval_ip_device_os_app_hash"] = df["click_time_second"] - df["click_time_shift"]

In [23]:
df["ip_os_device_app_hash_shift"] = df["ip_os_device_app_hash"].shift()

In [25]:
%%time
df["click_time_interval_ip_device_os_app_hash"].where(df["ip_os_device_app_hash"]==df["ip_os_device_app_hash_shift"], np.nan, inplace=True)

CPU times: user 1.84 s, sys: 644 ms, total: 2.48 s
Wall time: 2.48 s


In [26]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second,click_time_shift,ip_os_device_app_hash_shift,click_time_interval_ip_device_os_app_hash
5983865,0,22,18,1,111,80281,,,
20255942,1,3,17,36,111,98256,80281.0,111.0,17975.0
28230899,1,5,37,16,111,106636,98256.0,111.0,8380.0
32910114,1,7,12,4,111,112324,106636.0,111.0,5688.0
61244089,1,16,35,51,111,146151,112324.0,111.0,33827.0
64804845,1,20,43,33,111,161013,146151.0,111.0,14862.0
71205361,2,0,38,39,111,175119,161013.0,111.0,14106.0
73762271,2,1,24,26,111,177866,175119.0,111.0,2747.0
100583594,2,9,49,39,111,208179,177866.0,111.0,30313.0
104020559,2,10,50,55,111,211855,208179.0,111.0,3676.0


In [27]:
df["first_click_interval_ip_device_os_app_hash"] = df["click_time_interval_ip_device_os_app_hash"].isnull()

In [28]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second,click_time_shift,ip_os_device_app_hash_shift,click_time_interval_ip_device_os_app_hash,first_click_interval_ip_device_os_app_hash
5983865,0,22,18,1,111,80281,,,,True
20255942,1,3,17,36,111,98256,80281.0,111.0,17975.0,False
28230899,1,5,37,16,111,106636,98256.0,111.0,8380.0,False
32910114,1,7,12,4,111,112324,106636.0,111.0,5688.0,False
61244089,1,16,35,51,111,146151,112324.0,111.0,33827.0,False
64804845,1,20,43,33,111,161013,146151.0,111.0,14862.0,False
71205361,2,0,38,39,111,175119,161013.0,111.0,14106.0,False
73762271,2,1,24,26,111,177866,175119.0,111.0,2747.0,False
100583594,2,9,49,39,111,208179,177866.0,111.0,30313.0,False
104020559,2,10,50,55,111,211855,208179.0,111.0,3676.0,False


In [29]:
df.drop(columns=["ip_os_device_app_hash_shift", "click_time_shift"], inplace=True)

In [30]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second,click_time_interval_ip_device_os_app_hash,first_click_interval_ip_device_os_app_hash
5983865,0,22,18,1,111,80281,,True
20255942,1,3,17,36,111,98256,17975.0,False
28230899,1,5,37,16,111,106636,8380.0,False
32910114,1,7,12,4,111,112324,5688.0,False
61244089,1,16,35,51,111,146151,33827.0,False
64804845,1,20,43,33,111,161013,14862.0,False
71205361,2,0,38,39,111,175119,14106.0,False
73762271,2,1,24,26,111,177866,2747.0,False
100583594,2,9,49,39,111,208179,30313.0,False
104020559,2,10,50,55,111,211855,3676.0,False


In [31]:
df["click_time_interval_ip_device_os_app_hash"].max()

341856.0

In [32]:
df["click_time_interval_ip_device_os_app_hash"].fillna(1e9+7, inplace=True)

In [33]:
df["click_time_interval_ip_device_os_app_hash"] = df["click_time_interval_ip_device_os_app_hash"].astype("uint32")

In [34]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second,click_time_interval_ip_device_os_app_hash,first_click_interval_ip_device_os_app_hash
5983865,0,22,18,1,111,80281,1000000007,True
20255942,1,3,17,36,111,98256,17975,False
28230899,1,5,37,16,111,106636,8380,False
32910114,1,7,12,4,111,112324,5688,False
61244089,1,16,35,51,111,146151,33827,False
64804845,1,20,43,33,111,161013,14862,False
71205361,2,0,38,39,111,175119,14106,False
73762271,2,1,24,26,111,177866,2747,False
100583594,2,9,49,39,111,208179,30313,False
104020559,2,10,50,55,111,211855,3676,False


In [35]:
df.sort_index(inplace=True)

In [36]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second,click_time_interval_ip_device_os_app_hash,first_click_interval_ip_device_os_app_hash
0,0,14,32,21,210218720,52341,1000000007,True
1,0,14,33,34,808483042,52414,1000000007,True
2,0,14,34,12,307460829,52452,1000000007,True
3,0,14,34,52,250305947,52492,1000000007,True
4,0,14,35,8,950166762,52508,1000000007,True
5,0,14,36,26,90009919,52586,1000000007,True
6,0,14,37,44,228641874,52664,1000000007,True
7,0,14,37,59,261270055,52679,1000000007,True
8,0,14,38,10,797366259,52690,1000000007,True
9,0,14,38,23,87040044,52703,1000000007,True


In [37]:
df[["click_time_interval_ip_device_os_app_hash", "first_click_interval_ip_device_os_app_hash"]].to_hdf("../data/click_time_interval_ip_device_os_app_hash.hdf", "click_time_interval_ip_device_os_app_hash")

In [38]:
gc.collect()

200

# Add first click features

In [39]:
import pandas as pd
df = pd.read_hdf("../data/merged_click_data.hdf", "merged_click_data")

In [40]:
df["is_attributed"].fillna(0.0, inplace=True)
df["is_attributed"] = (df["is_attributed"] == 1.0)

In [41]:
df["is_attributed"] = df["is_attributed"].astype("uint8")

In [42]:
df["click_id"].fillna(1e9+7, inplace=True)

In [43]:
df["click_id"] = df["click_id"].astype("uint32")

In [44]:
df.dtypes

app                                              uint16
channel                                          uint16
click_id                                         uint32
device                                           uint16
ip                                               uint32
is_attributed                                     uint8
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [45]:
df["click_time_interval_by_ip"].fillna(1e9+7, inplace=True)
df["click_time_interval_by_ip"] = df["click_time_interval_by_ip"].astype("uint32")
df["click_time_interval_by_ip_os_device"].fillna(1e9+7, inplace=True)
df["click_time_interval_by_ip_os_device"] = df["click_time_interval_by_ip_os_device"].astype("uint32")
df["click_time_interval_by_ip_os_device_dow"].fillna(1e9+7, inplace=True)
df["click_time_interval_by_ip_os_device_dow"] = df["click_time_interval_by_ip_os_device_dow"].astype("uint32")
df["click_time_interval_by_ip_os_device_dow_hour"].fillna(1e9+7, inplace=True)
df["click_time_interval_by_ip_os_device_dow_hour"] = df["click_time_interval_by_ip_os_device_dow_hour"].astype("uint32")

In [46]:
df.dtypes

app                                              uint16
channel                                          uint16
click_id                                         uint32
device                                           uint16
ip                                               uint32
is_attributed                                     uint8
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [47]:
df["first_click_by_ip"] = df["click_time_interval_by_ip"]==1e9+7
df["first_click_by_ip_os_device"] = df["click_time_interval_by_ip_os_device"]==1e9+7
df["first_click_by_ip_os_device_dow"] = df["click_time_interval_by_ip_os_device_dow"]==1e9+7
df["first_click_by_ip_os_device_dow_hour"] = df["click_time_interval_by_ip_os_device_dow_hour"]==1e9+7

In [48]:
df.to_hdf("../data/merged_click_data.hdf", "merged_click_data")

# Add categorical features

In [1]:
%%time
import pandas as pd
from logzero import logger
df = pd.read_feather("../data/basic_table")

CPU times: user 892 ms, sys: 1.08 s, total: 1.98 s
Wall time: 1.98 s


In [2]:
df.dtypes

app              uint16
channel          uint16
click_id         uint32
device           uint16
ip               uint32
is_attributed     uint8
os               uint16
dtype: object

In [3]:
%%time
logger.info("app")
df["app"] = df["app"].astype(str)
logger.info("os")
df["os"] = df["os"].astype(str)
logger.info("device")
df["device"] = df["device"].astype(str)
logger.info("channel")
df["channel"] = df["channel"].astype(str)
logger.info("ip")
df["ip"] = df["ip"].astype(str)

[I 180429 11:44:23 <timed exec>:1] app
[I 180429 11:46:59 <timed exec>:3] os
[I 180429 11:49:35 <timed exec>:5] device
[I 180429 11:52:03 <timed exec>:7] channel
[I 180429 11:54:45 <timed exec>:9] ip


CPU times: user 12min 34s, sys: 32.5 s, total: 13min 6s
Wall time: 13min 3s


In [4]:
df.dtypes

app              object
channel          object
click_id         uint32
device           object
ip               object
is_attributed     uint8
os               object
dtype: object

In [5]:
columns = ["ip", "os", "device", "app", "channel"]
n = len(columns)

In [6]:
logger.info("started")
for i in range(n):
    for j in range(i+1, n):
        ci=columns[i]
        cj=columns[j]
        feature="{}_{}".format(ci, cj)
        logger.info("{} started".format(feature))
        df[feature] = pd.Categorical(df[ci]+"_"+df[cj]).codes
        logger.info("{} finished".format(feature))

[I 180429 11:57:27 <ipython-input-6-e942d1df4103>:1] started
[I 180429 11:57:27 <ipython-input-6-e942d1df4103>:7] ip_os started
[I 180429 11:59:02 <ipython-input-6-e942d1df4103>:9] ip_os finished
[I 180429 11:59:02 <ipython-input-6-e942d1df4103>:7] ip_device started
[I 180429 12:01:06 <ipython-input-6-e942d1df4103>:9] ip_device finished
[I 180429 12:01:06 <ipython-input-6-e942d1df4103>:7] ip_app started
[I 180429 12:03:26 <ipython-input-6-e942d1df4103>:9] ip_app finished
[I 180429 12:03:26 <ipython-input-6-e942d1df4103>:7] ip_channel started
[I 180429 12:06:17 <ipython-input-6-e942d1df4103>:9] ip_channel finished
[I 180429 12:06:17 <ipython-input-6-e942d1df4103>:7] os_device started
[I 180429 12:09:34 <ipython-input-6-e942d1df4103>:9] os_device finished
[I 180429 12:09:34 <ipython-input-6-e942d1df4103>:7] os_app started
[I 180429 12:12:54 <ipython-input-6-e942d1df4103>:9] os_app finished
[I 180429 12:12:54 <ipython-input-6-e942d1df4103>:7] os_channel started
[I 180429 12:16:14 <ipython

In [7]:
df.dtypes



app               object
channel           object
click_id          uint32
device            object
ip                object
is_attributed      uint8
os                object
ip_os              int32
ip_device          int32
ip_app             int32
ip_channel         int32
os_device          int16
os_app             int16
os_channel         int16
device_app         int16
device_channel     int16
app_channel        int16
dtype: object

In [8]:
import gc
gc.collect()

133

In [9]:
twos = ["{}_{}".format(columns[i], columns[j]) for i in range(n) for j in range(i+1, n)]

In [10]:
for two in twos:
    logger.info("{} {}".format(two, df[two].max()))

[I 180429 12:28:27 <ipython-input-10-80b0f64acc4d>:2] ip_os 3260003
[I 180429 12:28:28 <ipython-input-10-80b0f64acc4d>:2] ip_device 795046
[I 180429 12:28:29 <ipython-input-10-80b0f64acc4d>:2] ip_app 4135195
[I 180429 12:28:30 <ipython-input-10-80b0f64acc4d>:2] ip_channel 8419361
[I 180429 12:28:31 <ipython-input-10-80b0f64acc4d>:2] os_device 5897
[I 180429 12:28:32 <ipython-input-10-80b0f64acc4d>:2] os_app 14638
[I 180429 12:28:33 <ipython-input-10-80b0f64acc4d>:2] os_channel 19725
[I 180429 12:28:33 <ipython-input-10-80b0f64acc4d>:2] device_app 13140
[I 180429 12:28:34 <ipython-input-10-80b0f64acc4d>:2] device_channel 11565
[I 180429 12:28:35 <ipython-input-10-80b0f64acc4d>:2] app_channel 1456


In [11]:
df.drop(columns=["click_id", "is_attributed"], inplace=True)

In [12]:
gc.collect()

104

In [13]:
df["app"] = pd.Categorical(df["app"]).codes
logger.info("app")
df["os"] = pd.Categorical(df["os"]).codes
logger.info("os")
df["device"] = pd.Categorical(df["device"]).codes
logger.info("device")
df["channel"] = pd.Categorical(df["channel"]).codes
logger.info("channel")
df["ip"] = pd.Categorical(df["ip"]).codes
logger.info("ip")

[I 180429 12:30:30 <ipython-input-13-e56391e1c60f>:2] app
[I 180429 12:30:57 <ipython-input-13-e56391e1c60f>:4] os
[I 180429 12:31:19 <ipython-input-13-e56391e1c60f>:6] device
[I 180429 12:31:39 <ipython-input-13-e56391e1c60f>:8] channel
[I 180429 12:32:08 <ipython-input-13-e56391e1c60f>:10] ip


In [14]:
df = df.astype("uint64")

In [15]:
ones = columns
n = len(ones)

In [16]:
df.max()

app                   729
channel               201
device               3798
ip                 333167
os                    855
ip_os             3260003
ip_device          795046
ip_app            4135195
ip_channel        8419361
os_device            5897
os_app              14638
os_channel          19725
device_app          13140
device_channel      11565
app_channel          1456
dtype: uint64

In [17]:
for i in range(n):
    for j in range(i+1, n):
        for k in range(j+1, n):
            feature="{}_{}_{}".format(ones[i], ones[j], ones[k])
            one = ones[i]
            two = "{}_{}".format(ones[j], ones[k])
            logger.info(feature)
            df[feature] = pd.Categorical(df[one]*10000000+df[two]).codes
            logger.info("done")

[I 180429 12:33:12 <ipython-input-17-402fd42ae267>:7] ip_os_device
[I 180429 12:33:31 <ipython-input-17-402fd42ae267>:9] done
[I 180429 12:33:31 <ipython-input-17-402fd42ae267>:7] ip_os_app
[I 180429 12:34:02 <ipython-input-17-402fd42ae267>:9] done
[I 180429 12:34:02 <ipython-input-17-402fd42ae267>:7] ip_os_channel
[I 180429 12:34:52 <ipython-input-17-402fd42ae267>:9] done
[I 180429 12:34:52 <ipython-input-17-402fd42ae267>:7] ip_device_app
[I 180429 12:35:07 <ipython-input-17-402fd42ae267>:9] done
[I 180429 12:35:07 <ipython-input-17-402fd42ae267>:7] ip_device_channel
[I 180429 12:35:28 <ipython-input-17-402fd42ae267>:9] done
[I 180429 12:35:28 <ipython-input-17-402fd42ae267>:7] ip_app_channel
[I 180429 12:35:53 <ipython-input-17-402fd42ae267>:9] done
[I 180429 12:35:53 <ipython-input-17-402fd42ae267>:7] os_device_app
[I 180429 12:36:01 <ipython-input-17-402fd42ae267>:9] done
[I 180429 12:36:01 <ipython-input-17-402fd42ae267>:7] os_device_channel
[I 180429 12:36:09 <ipython-input-17-40

In [18]:
df.max()

app                        729.0
channel                    201.0
device                    3798.0
ip                      333167.0
os                         855.0
ip_os                  3260003.0
ip_device               795046.0
ip_app                 4135195.0
ip_channel             8419361.0
os_device                 5897.0
os_app                   14638.0
os_channel               19725.0
device_app               13140.0
device_channel           11565.0
app_channel               1456.0
ip_os_device           3669417.0
ip_os_app             23835240.0
ip_os_channel         43803139.0
ip_device_app          4795604.0
ip_device_channel      9446113.0
ip_app_channel        13206832.0
os_device_app            32952.0
os_device_channel        44789.0
os_app_channel           43287.0
device_app_channel       19606.0
dtype: float64

In [19]:
%%time
df = df.astype("int64")

MemoryError: 

In [20]:
%%time
df.max()

CPU times: user 27.2 s, sys: 9.12 s, total: 36.3 s
Wall time: 36.3 s


app                        729
channel                    201
device                    3798
ip                      333167
os                         855
ip_os                  3260003
ip_device               795046
ip_app                 4135195
ip_channel             8419361
os_device                 5897
os_app                   14638
os_channel               19725
device_app               13140
device_channel           11565
app_channel               1456
ip_os_device           3669417
ip_os_app             23835240
ip_os_channel         43803139
ip_device_app          4795604
ip_device_channel      9446113
ip_app_channel        13206832
os_device_app            32952
os_device_channel        44789
os_app_channel           43287
device_app_channel       19606
dtype: int64

In [21]:
%%time
for i in range(n):
    for j in range(i+1, n):
        for k in range(j+1, n):
            for l in range(k+1, n):
                two1="{}_{}".format(ones[i], ones[j])
                two2="{}_{}".format(ones[k], ones[l])
                feature = "{}_{}".format(two1, two2)
                logger.info(feature)
                df[feature] = pd.Categorical(df[two1]*1000000000+ df[two2]).codes
                logger.info("done")

[I 180429 12:39:00 <timed exec>:8] ip_os_device_app
[I 180429 12:39:31 <timed exec>:10] done
[I 180429 12:39:31 <timed exec>:8] ip_os_device_channel


MemoryError: 

In [22]:
df.dtypes

app                   uint64
channel               uint64
device                uint64
ip                    uint64
os                    uint64
ip_os                 uint64
ip_device             uint64
ip_app                uint64
ip_channel            uint64
os_device             uint64
os_app                uint64
os_channel            uint64
device_app            uint64
device_channel        uint64
app_channel           uint64
ip_os_device           int32
ip_os_app              int32
ip_os_channel          int32
ip_device_app          int32
ip_device_channel      int32
ip_app_channel         int32
os_device_app          int32
os_device_channel      int32
os_app_channel         int32
device_app_channel     int16
ip_os_device_app       int32
dtype: object

In [23]:
df["ip_os_device_app"].max()

24990688

In [24]:
df["channel"].max()

201

In [25]:
df["ip_os_device_app"] = df["ip_os_device_app"].astype("int64")

In [26]:
df["ip_os_device_app"] = df["ip_os_device_app"] * 1000

In [27]:
%%time
df["ip_os_device_app_channel"] = pd.Categorical(df["ip_os_device_app"]*1000+df["channel"]).codes

CPU times: user 57.8 s, sys: 5.08 s, total: 1min 2s
Wall time: 1min 2s


In [28]:
df.dtypes

app                         uint64
channel                     uint64
device                      uint64
ip                          uint64
os                          uint64
ip_os                       uint64
ip_device                   uint64
ip_app                      uint64
ip_channel                  uint64
os_device                   uint64
os_app                      uint64
os_channel                  uint64
device_app                  uint64
device_channel              uint64
app_channel                 uint64
ip_os_device                 int32
ip_os_app                    int32
ip_os_channel                int32
ip_device_app                int32
ip_device_channel            int32
ip_app_channel               int32
os_device_app                int32
os_device_channel            int32
os_app_channel               int32
device_app_channel           int16
ip_os_device_app             int64
ip_os_device_app_channel     int32
dtype: object

In [29]:
df.drop(columns=["ip", "app", "os", "device", "channel"], inplace=True)

In [30]:
df.dtypes

ip_os                       uint64
ip_device                   uint64
ip_app                      uint64
ip_channel                  uint64
os_device                   uint64
os_app                      uint64
os_channel                  uint64
device_app                  uint64
device_channel              uint64
app_channel                 uint64
ip_os_device                 int32
ip_os_app                    int32
ip_os_channel                int32
ip_device_app                int32
ip_device_channel            int32
ip_app_channel               int32
os_device_app                int32
os_device_channel            int32
os_app_channel               int32
device_app_channel           int16
ip_os_device_app             int64
ip_os_device_app_channel     int32
dtype: object

In [31]:
df.max()

ip_os                       3.260003e+06
ip_device                   7.950460e+05
ip_app                      4.135195e+06
ip_channel                  8.419361e+06
os_device                   5.897000e+03
os_app                      1.463800e+04
os_channel                  1.972500e+04
device_app                  1.314000e+04
device_channel              1.156500e+04
app_channel                 1.456000e+03
ip_os_device                3.669417e+06
ip_os_app                   2.383524e+07
ip_os_channel               4.380314e+07
ip_device_app               4.795604e+06
ip_device_channel           9.446113e+06
ip_app_channel              1.320683e+07
os_device_app               3.295200e+04
os_device_channel           4.478900e+04
os_app_channel              4.328700e+04
device_app_channel          1.960600e+04
ip_os_device_app            2.499069e+10
ip_os_device_app_channel    5.869162e+07
dtype: float64

In [32]:
df["ip_os_device_app"] = pd.Categorical(df["ip_os_device_app"]).codes

In [33]:
df_max = df.max()
df_max

ip_os                        3260003.0
ip_device                     795046.0
ip_app                       4135195.0
ip_channel                   8419361.0
os_device                       5897.0
os_app                         14638.0
os_channel                     19725.0
device_app                     13140.0
device_channel                 11565.0
app_channel                     1456.0
ip_os_device                 3669417.0
ip_os_app                   23835240.0
ip_os_channel               43803139.0
ip_device_app                4795604.0
ip_device_channel            9446113.0
ip_app_channel              13206832.0
os_device_app                  32952.0
os_device_channel              44789.0
os_app_channel                 43287.0
device_app_channel             19606.0
ip_os_device_app            24990688.0
ip_os_device_app_channel    58691622.0
dtype: float64

In [34]:
df = df.astype("uint32")

In [35]:
for c in df_max[df_max < 60000].index:
    logger.info(c)
    logger.info(df[c].max())
    df[c] = df[c].astype("uint16")

[I 180429 12:44:44 <ipython-input-35-a94b67c351ee>:2] os_device
[I 180429 12:44:46 <ipython-input-35-a94b67c351ee>:3] 5897
[I 180429 12:44:53 <ipython-input-35-a94b67c351ee>:2] os_app
[I 180429 12:44:54 <ipython-input-35-a94b67c351ee>:3] 14638
[I 180429 12:45:02 <ipython-input-35-a94b67c351ee>:2] os_channel
[I 180429 12:45:03 <ipython-input-35-a94b67c351ee>:3] 19725
[I 180429 12:45:10 <ipython-input-35-a94b67c351ee>:2] device_app
[I 180429 12:45:11 <ipython-input-35-a94b67c351ee>:3] 13140
[I 180429 12:45:18 <ipython-input-35-a94b67c351ee>:2] device_channel
[I 180429 12:45:19 <ipython-input-35-a94b67c351ee>:3] 11565
[I 180429 12:45:25 <ipython-input-35-a94b67c351ee>:2] app_channel
[I 180429 12:45:27 <ipython-input-35-a94b67c351ee>:3] 1456
[I 180429 12:45:33 <ipython-input-35-a94b67c351ee>:2] os_device_app
[I 180429 12:45:34 <ipython-input-35-a94b67c351ee>:3] 32952
[I 180429 12:45:39 <ipython-input-35-a94b67c351ee>:2] os_device_channel
[I 180429 12:45:41 <ipython-input-35-a94b67c351ee>:3

In [36]:
df.dtypes

ip_os                       uint32
ip_device                   uint32
ip_app                      uint32
ip_channel                  uint32
os_device                   uint16
os_app                      uint16
os_channel                  uint16
device_app                  uint16
device_channel              uint16
app_channel                 uint16
ip_os_device                uint32
ip_os_app                   uint32
ip_os_channel               uint32
ip_device_app               uint32
ip_device_channel           uint32
ip_app_channel              uint32
os_device_app               uint16
os_device_channel           uint16
os_app_channel              uint16
device_app_channel          uint16
ip_os_device_app            uint32
ip_os_device_app_channel    uint32
dtype: object

In [37]:
df.to_feather("../data/multi_basic")