In [1]:
import gc
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
from logzero import logger

In [2]:
def do_count(df, group_cols, agg_type='uint32'):
    agg_name = "_".join(group_cols) + "_count"
    print("Aggregating by ", group_cols, '...')
    gp = df[group_cols][group_cols].groupby(group_cols).size().rename(agg_name).to_frame().reset_index()
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    print(agg_name + " max value = ", df[agg_name].max())
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return(df)


def do_countuniq(df, group_cols, counted, agg_type='uint32'):
    agg_name = "_".join(group_cols) + "_{}_countuniq".format(counted)
    print("Counting unqiue ", counted, " by ", group_cols, '...')
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].nunique().reset_index().rename(columns={counted: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    print(agg_name + " max value = ", df[agg_name].max())
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return(df)


def do_cumcount(df, group_cols, counted, agg_type='uint32'):
    agg_name = "_".join(group_cols) + "_cumcount"
    print("Cumulative count by ", group_cols, '...')
    gp = df[group_cols+[counted]].groupby(group_cols)[counted].cumcount()
    df[agg_name] = gp.values
    del gp
    print(agg_name + " max value = ", df[agg_name].max())
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return(df)


def do_mean(df, group_cols, counted, agg_type='float32'):
    agg_name = "_".join(group_cols) + "_mean"
    print("Calculating mean of ", counted, " by ", group_cols, '...')
    gp = df[group_cols + [counted]].groupby(group_cols)[counted].mean(
    ).reset_index().rename(columns={counted: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    print(agg_name + " max value = ", df[agg_name].max())
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return(df)


def do_var(df, group_cols, counted, agg_type='float32'):
    agg_name = "_".join(group_cols) + "_var"
    print("Calculating variance of ", counted, " by ", group_cols, '...')
    gp = df[group_cols + [counted]].groupby(group_cols)[counted].var(
    ).reset_index().rename(columns={counted: agg_name})
    df = df.merge(gp, on=group_cols, how='left')
    del gp
    print(agg_name + " max value = ", df[agg_name].max())
    df[agg_name] = df[agg_name].astype(agg_type)
    gc.collect()
    return(df)

In [3]:
%%time
df = pd.read_hdf("../data/merged_click_data.hdf", "merged_click_data")

CPU times: user 1.56 s, sys: 9.31 s, total: 10.9 s
Wall time: 5min 5s


In [4]:
df.dtypes

app                                              uint16
channel                                          uint16
click_id                                        float64
device                                           uint16
ip                                               uint32
is_attributed                                   float64
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [None]:
%%time
df = do_cumcount(df, ['ip', 'device', 'os'], 'app')
gc.collect()
df = do_cumcount(df, ['ip'], 'os')
gc.collect()
df = do_countuniq(df, ['ip'], 'channel', 'uint8')
gc.collect()
df = do_countuniq(df, ['ip', 'dow'], 'hour', 'uint8')
gc.collect()
df = do_countuniq(df, ['ip'], 'app', 'uint8')
gc.collect()
df = do_countuniq(df, ['ip', 'app'], 'os', 'uint8')
gc.collect()
df = do_countuniq(df, ['ip'], 'device', 'uint16')
gc.collect()
df = do_countuniq(df, ['app'], 'channel')
gc.collect()
df = do_countuniq(df, ['ip', 'device', 'os'], 'app')
gc.collect()
df = do_count(df, ['ip', 'dow', 'hour'])
gc.collect()
df = do_count(df, ['ip', 'app'])
gc.collect()
df = do_count(df, ['ip', 'app', 'os'], 'uint16')
gc.collect()
df = do_var(df, ['ip', 'dow', 'channel'], 'hour')
gc.collect()
df = do_var(df, ['ip', 'app', 'os'], 'hour')
gc.collect()
df = do_var(df, ['ip', 'app', 'channel'], 'dow')
gc.collect()
df = do_mean(df, ['ip', 'app', 'channel'], 'hour')
gc.collect()

Cumulative count by  ['ip', 'device', 'os'] ...
ip_device_os_cumcount max value =  282426
Cumulative count by  ['ip'] ...
ip_cumcount max value =  1421255
Counting unqiue  channel  by  ['ip'] ...
ip_dow_hour_countuniq max value =  24
Counting unqiue  app  by  ['ip'] ...
ip_app_countuniq max value =  277
Counting unqiue  os  by  ['ip', 'app'] ...
ip_app_os_countuniq max value =  148
Counting unqiue  device  by  ['ip'] ...
ip_device_countuniq max value =  551
Counting unqiue  channel  by  ['app'] ...
app_channel_countuniq max value =  49
Counting unqiue  app  by  ['ip', 'device', 'os'] ...
ip_device_os_app_countuniq max value =  100
Aggregating by  ['ip', 'dow', 'hour'] ...
ip_dow_hour_count max value =  44259
Aggregating by  ['ip', 'app'] ...
ip_app_count max value =  220743
Aggregating by  ['ip', 'app', 'os'] ...
ip_app_os_count max value =  55159
Calculating variance of  hour  by  ['ip', 'dow', 'channel'] ...


In [7]:
df.columns

Index(['app', 'channel', 'click_id', 'device', 'ip', 'is_attributed', 'os',
       'click_count_by_ip_os_device_dow',
       'click_count_by_ip_os_device_dow_hour', 'click_count_by_ip', 'dow',
       'hour', 'minute', 'second', 'rank_by_ip', 'rank_by_ip_os_device',
       'rank_by_ip_os_device_dow', 'click_time_interval_by_ip',
       'click_time_interval_by_ip_os_device',
       'click_time_interval_by_ip_os_device_dow',
       'click_time_interval_by_ip_os_device_dow_hour', 'ip_device_os_cumcount',
       'ip_cumcount', 'ip_channel_countuniq', 'ip_dow_hour_countuniq',
       'ip_app_countuniq', 'ip_app_os_countuniq', 'ip_device_countuniq',
       'app_channel_countuniq', 'ip_device_os_app_countuniq',
       'ip_dow_hour_count', 'ip_app_count', 'ip_app_os_count',
       'ip_dow_channel_var', 'ip_app_os_var', 'ip_app_channel_var',
       'ip_app_channel_mean'],
      dtype='object')

In [8]:
df

Unnamed: 0,app,channel,click_id,device,ip,is_attributed,os,click_count_by_ip_os_device_dow,click_count_by_ip_os_device_dow_hour,click_count_by_ip,...,ip_device_countuniq,app_channel_countuniq,ip_device_os_app_countuniq,ip_dow_hour_count,ip_app_count,ip_app_os_count,ip_dow_channel_var,ip_app_os_var,ip_app_channel_var,ip_app_channel_mean
0,3,379,,1,83230,0.0,13,434,1,28085,...,26,49,49,1,5759,1431,7.893333,36.597950,1.389350,8.521276
1,3,379,,1,17357,0.0,19,183,1,26234,...,22,49,48,1,5245,1451,9.618462,25.959045,1.103880,8.236263
2,3,379,,1,35810,0.0,13,34,1,11002,...,16,49,41,1,2156,462,15.600000,33.348476,1.012026,8.872340
3,14,478,,1,45745,0.0,13,1501,1,188741,...,99,40,58,1,10547,2186,5.947712,36.870251,1.070305,10.361702
4,3,379,,1,161007,0.0,13,28,1,1171,...,5,49,28,1,232,80,10.800000,37.221359,0.619048,11.428572
5,3,379,,1,18787,0.0,16,1,1,4029,...,2,49,20,1,884,30,2.333333,18.975863,1.937909,10.277778
6,3,379,,1,103022,0.0,23,1,1,6099,...,11,49,4,1,1256,3,1.000000,44.333332,0.976950,11.208333
7,3,379,,1,114221,0.0,19,51,1,2334,...,4,49,29,1,404,88,32.000000,29.235109,2.028571,9.466666
8,3,379,,1,165970,0.0,13,19,1,2014,...,6,49,22,1,434,46,8.000000,26.908213,0.566667,14.333333
9,64,459,,1,74544,0.0,22,5,1,3882,...,6,3,18,1,32,1,13.809524,,0.555444,11.875000


In [None]:
%%time
df.to_hdf("../data/merged_click_data.hdf", "merged_click_data")

In [2]:
%%time
df = pd.read_hdf("../data/merged_click_data.hdf", "merged_click_data")

CPU times: user 1.82 s, sys: 14.5 s, total: 16.3 s
Wall time: 7min 41s


In [3]:
df.dtypes

app                                              uint16
channel                                          uint16
click_id                                        float64
device                                           uint16
ip                                               uint32
is_attributed                                   float64
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [12]:
%%time
MOD = int(1e9+7)
df["ip_os_device_app_hash"] = (df["ip"].astype(str)+"_"+df["os"].astype(str)+"_"+df["device"].astype(str)+"_"+df["app"].astype(str)).apply(hash)%MOD

CPU times: user 12min 2s, sys: 1min 11s, total: 13min 13s
Wall time: 13min 12s


In [16]:
df["ip_os_device_app_hash"] = df["ip_os_device_app_hash"].astype("uint32")

In [17]:
df.to_hdf("../data/merged_click_data.hdf", "merged_click_data")

# Add time difference features

In [2]:
df = pd.read_hdf("../data/merged_click_data.hdf", "merged_click_data")

In [3]:
df.dtypes

app                                              uint16
channel                                          uint16
click_id                                        float64
device                                           uint16
ip                                               uint32
is_attributed                                   float64
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [4]:
df = df[["dow", "hour", "minute", "second", "ip_os_device_app_hash"]]

In [9]:
df["click_time_second"] = df["dow"].astype("uint32")*24*60*60 + df["hour"].astype("uint32")*60*60 + df["minute"].astype("uint32")*60 + df["second"].astype("uint32")

In [11]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second
0,0,14,32,21,525163057,52341
1,0,14,33,34,116105055,52414
2,0,14,34,12,4835573,52452
3,0,14,34,52,271103479,52492
4,0,14,35,8,383355941,52508
5,0,14,36,26,265346367,52586
6,0,14,37,44,451576942,52664
7,0,14,37,59,48712048,52679
8,0,14,38,10,409149612,52690
9,0,14,38,23,991191953,52703


In [12]:
%%time
df.sort_values(by=["ip_os_device_app_hash", "click_time_second"], inplace=True)

CPU times: user 4min 34s, sys: 12.2 s, total: 4min 46s
Wall time: 4min 46s


In [14]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second
121638,0,16,2,13,26,57733
1787841,0,16,43,25,26,60205
9261562,0,23,58,58,26,86338
11126756,1,0,30,26,26,88226
13377758,1,1,7,58,26,90478
13841862,1,1,16,21,26,90981
13860275,1,1,16,40,26,91000
42169678,1,10,21,31,26,123691
51061818,1,13,11,33,26,133893
52111160,1,13,31,21,26,135081


In [15]:
df["click_time_shift"] = df["click_time_second"].shift()

In [16]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second,click_time_shift
121638,0,16,2,13,26,57733,
1787841,0,16,43,25,26,60205,57733.0
9261562,0,23,58,58,26,86338,60205.0
11126756,1,0,30,26,26,88226,86338.0
13377758,1,1,7,58,26,90478,88226.0
13841862,1,1,16,21,26,90981,90478.0
13860275,1,1,16,40,26,91000,90981.0
42169678,1,10,21,31,26,123691,91000.0
51061818,1,13,11,33,26,133893,123691.0
52111160,1,13,31,21,26,135081,133893.0


In [17]:
df["click_time_interval_ip_device_os_app_hash"] = df["click_time_second"] - df["click_time_shift"]

In [19]:
df["ip_os_device_app_hash_shift"] = df["ip_os_device_app_hash"].shift()

In [22]:
%%time
df["click_time_interval_ip_device_os_app_hash"].where(df["ip_os_device_app_hash"]==df["ip_os_device_app_hash_shift"], np.nan, inplace=True)

CPU times: user 2.03 s, sys: 508 ms, total: 2.54 s
Wall time: 2.54 s


In [23]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second,click_time_shift,click_time_interval_ip_device_os_app_hash,ip_os_device_app_hash_shift
121638,0,16,2,13,26,57733,,,
1787841,0,16,43,25,26,60205,57733.0,2472.0,2.600000e+01
9261562,0,23,58,58,26,86338,60205.0,26133.0,2.600000e+01
11126756,1,0,30,26,26,88226,86338.0,1888.0,2.600000e+01
13377758,1,1,7,58,26,90478,88226.0,2252.0,2.600000e+01
13841862,1,1,16,21,26,90981,90478.0,503.0,2.600000e+01
13860275,1,1,16,40,26,91000,90981.0,19.0,2.600000e+01
42169678,1,10,21,31,26,123691,91000.0,32691.0,2.600000e+01
51061818,1,13,11,33,26,133893,123691.0,10202.0,2.600000e+01
52111160,1,13,31,21,26,135081,133893.0,1188.0,2.600000e+01


In [25]:
df["first_click_interval_ip_device_os_app_hash"] = df["click_time_interval_ip_device_os_app_hash"].isnull()

In [27]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second,click_time_shift,click_time_interval_ip_device_os_app_hash,ip_os_device_app_hash_shift,first_click_interval_ip_device_os_app_hash
121638,0,16,2,13,26,57733,,,,True
1787841,0,16,43,25,26,60205,57733.0,2472.0,2.600000e+01,False
9261562,0,23,58,58,26,86338,60205.0,26133.0,2.600000e+01,False
11126756,1,0,30,26,26,88226,86338.0,1888.0,2.600000e+01,False
13377758,1,1,7,58,26,90478,88226.0,2252.0,2.600000e+01,False
13841862,1,1,16,21,26,90981,90478.0,503.0,2.600000e+01,False
13860275,1,1,16,40,26,91000,90981.0,19.0,2.600000e+01,False
42169678,1,10,21,31,26,123691,91000.0,32691.0,2.600000e+01,False
51061818,1,13,11,33,26,133893,123691.0,10202.0,2.600000e+01,False
52111160,1,13,31,21,26,135081,133893.0,1188.0,2.600000e+01,False


In [28]:
df.drop(columns=["ip_os_device_app_hash_shift", "click_time_shift"], inplace=True)

In [29]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second,click_time_interval_ip_device_os_app_hash,first_click_interval_ip_device_os_app_hash
121638,0,16,2,13,26,57733,,True
1787841,0,16,43,25,26,60205,2472.0,False
9261562,0,23,58,58,26,86338,26133.0,False
11126756,1,0,30,26,26,88226,1888.0,False
13377758,1,1,7,58,26,90478,2252.0,False
13841862,1,1,16,21,26,90981,503.0,False
13860275,1,1,16,40,26,91000,19.0,False
42169678,1,10,21,31,26,123691,32691.0,False
51061818,1,13,11,33,26,133893,10202.0,False
52111160,1,13,31,21,26,135081,1188.0,False


In [30]:
df["click_time_interval_ip_device_os_app_hash"].max()

341856.0

In [31]:
df["click_time_interval_ip_device_os_app_hash"].fillna(1e9+7, inplace=True)

In [34]:
df["click_time_interval_ip_device_os_app_hash"] = df["click_time_interval_ip_device_os_app_hash"].astype("uint32")

In [36]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second,click_time_interval_ip_device_os_app_hash,first_click_interval_ip_device_os_app_hash
121638,0,16,2,13,26,57733,1000000007,True
1787841,0,16,43,25,26,60205,2472,False
9261562,0,23,58,58,26,86338,26133,False
11126756,1,0,30,26,26,88226,1888,False
13377758,1,1,7,58,26,90478,2252,False
13841862,1,1,16,21,26,90981,503,False
13860275,1,1,16,40,26,91000,19,False
42169678,1,10,21,31,26,123691,32691,False
51061818,1,13,11,33,26,133893,10202,False
52111160,1,13,31,21,26,135081,1188,False


In [37]:
df.sort_index(inplace=True)

In [38]:
df

Unnamed: 0,dow,hour,minute,second,ip_os_device_app_hash,click_time_second,click_time_interval_ip_device_os_app_hash,first_click_interval_ip_device_os_app_hash
0,0,14,32,21,525163057,52341,1000000007,True
1,0,14,33,34,116105055,52414,1000000007,True
2,0,14,34,12,4835573,52452,1000000007,True
3,0,14,34,52,271103479,52492,1000000007,True
4,0,14,35,8,383355941,52508,1000000007,True
5,0,14,36,26,265346367,52586,1000000007,True
6,0,14,37,44,451576942,52664,1000000007,True
7,0,14,37,59,48712048,52679,1000000007,True
8,0,14,38,10,409149612,52690,1000000007,True
9,0,14,38,23,991191953,52703,1000000007,True


In [39]:
df[["click_time_interval_ip_device_os_app_hash", "first_click_interval_ip_device_os_app_hash"]].to_hdf("../data/click_time_interval_ip_device_os_app_hash.hdf", "click_time_interval_ip_device_os_app_hash")

# Add first click features

In [1]:
import pandas as pd
df = pd.read_hdf("../data/merged_click_data.hdf", "merged_click_data")

In [4]:
df["is_attributed"].fillna(0.0, inplace=True)
df["is_attributed"] = (df["is_attributed"] == 1.0)

In [7]:
df["is_attributed"] = df["is_attributed"].astype("uint8")

In [11]:
df["click_id"].fillna(1e9+7, inplace=True)

In [12]:
df["click_id"] = df["click_id"].astype("uint32")

In [15]:
df.dtypes

app                                              uint16
channel                                          uint16
click_id                                         uint32
device                                           uint16
ip                                               uint32
is_attributed                                     uint8
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [16]:
df["click_time_interval_by_ip"].fillna(1e9+7, inplace=True)
df["click_time_interval_by_ip"] = df["click_time_interval_by_ip"].astype("uint32")
df["click_time_interval_by_ip_os_device"].fillna(1e9+7, inplace=True)
df["click_time_interval_by_ip_os_device"] = df["click_time_interval_by_ip_os_device"].astype("uint32")
df["click_time_interval_by_ip_os_device_dow"].fillna(1e9+7, inplace=True)
df["click_time_interval_by_ip_os_device_dow"] = df["click_time_interval_by_ip_os_device_dow"].astype("uint32")
df["click_time_interval_by_ip_os_device_dow_hour"].fillna(1e9+7, inplace=True)
df["click_time_interval_by_ip_os_device_dow_hour"] = df["click_time_interval_by_ip_os_device_dow_hour"].astype("uint32")

In [18]:
df.dtypes

app                                              uint16
channel                                          uint16
click_id                                         uint32
device                                           uint16
ip                                               uint32
is_attributed                                     uint8
os                                               uint16
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_count_by_ip                                uint32
dow                                               uint8
hour                                              uint8
minute                                            uint8
second                                            uint8
rank_by_ip                                       uint32
rank_by_ip_os_device                             uint32
rank_by_ip_os_device_dow                         uint32
click_time_interval_by_ip                       

In [19]:
df["first_click_by_ip"] = df["click_time_interval_by_ip"]==1e9+7
df["first_click_by_ip_os_device"] = df["click_time_interval_by_ip_os_device"]==1e9+7
df["first_click_by_ip_os_device_dow"] = df["click_time_interval_by_ip_os_device_dow"]==1e9+7
df["first_click_by_ip_os_device_dow_hour"] = df["click_time_interval_by_ip_os_device_dow_hour"]==1e9+7

In [21]:
df.to_hdf("../data/merged_click_data.hdf", "merged_click_data")

# Add categorical features

In [1]:
import pandas as pd

In [2]:
%%time
df = pd.read_feather("../data/basic_table")

CPU times: user 892 ms, sys: 768 ms, total: 1.66 s
Wall time: 1.66 s


In [5]:
%%time
df["ip_str"] = df["ip"].astype(str)

CPU times: user 3min 49s, sys: 10.1 s, total: 3min 59s
Wall time: 3min 58s


In [6]:
%%time
df["os_str"] = df["os"].astype(str)

CPU times: user 3min 40s, sys: 14.2 s, total: 3min 55s
Wall time: 3min 53s


In [7]:
%%time
df["device_str"] = df["device"].astype(str)

CPU times: user 3min 39s, sys: 11.8 s, total: 3min 51s
Wall time: 3min 49s


In [8]:
%%time
df["app_str"] = df["app"].astype(str)

CPU times: user 3min 42s, sys: 12.1 s, total: 3min 55s
Wall time: 3min 53s


In [9]:
df.dtypes

app              uint16
channel          uint16
click_id         uint32
device           uint16
ip               uint32
is_attributed     uint8
os               uint16
ip_str           object
os_str           object
device_str       object
app_str          object
dtype: object

In [10]:
%%time
df["channel_str"] = df["channel"].astype(str)

CPU times: user 3min 44s, sys: 15.5 s, total: 3min 59s
Wall time: 3min 58s


In [11]:
df["ip_os_device_str"] = df["ip_str"]+" "+df["os_str"]+" "+df["device_str"]

In [12]:
%%time
df["ip_os_device_app_str"] = (df["ip_str"]+" "
                              +df["os_str"]+" "
                              +df["device_str"]+" "
                              +df["app_str"])

CPU times: user 1min 46s, sys: 1min 40s, total: 3min 27s
Wall time: 3min 26s


In [13]:
df.dtypes

app                     uint16
channel                 uint16
click_id                uint32
device                  uint16
ip                      uint32
is_attributed            uint8
os                      uint16
ip_str                  object
os_str                  object
device_str              object
app_str                 object
channel_str             object
ip_os_device_str        object
ip_os_device_app_str    object
dtype: object

In [14]:
%%time
df["ip_os_device_categorical"] = pd.Categorical(df["ip_os_device_str"]).codes

CPU times: user 1min 1s, sys: 19 s, total: 1min 20s
Wall time: 1min 20s


In [15]:
%%time
df["ip_os_device_app_categorical"] = pd.Categorical(df["ip_os_device_app_str"]).codes

CPU times: user 3min 29s, sys: 52 s, total: 4min 21s
Wall time: 4min 21s


In [16]:
%%time
df["app_channel_str"] = df["app_str"]+" "+df["channel_str"]

CPU times: user 31.6 s, sys: 18.9 s, total: 50.4 s
Wall time: 50.1 s


In [17]:
df.dtypes

app                             uint16
channel                         uint16
click_id                        uint32
device                          uint16
ip                              uint32
is_attributed                    uint8
os                              uint16
ip_str                          object
os_str                          object
device_str                      object
app_str                         object
channel_str                     object
ip_os_device_str                object
ip_os_device_app_str            object
ip_os_device_categorical         int32
ip_os_device_app_categorical     int32
app_channel_str                 object
dtype: object

In [18]:
%%time
df["app_channel_categorical"] = pd.Categorical(df["app_channel_str"]).codes

CPU times: user 12.1 s, sys: 2.61 s, total: 14.7 s
Wall time: 14.7 s


In [19]:
df.dtypes

app                             uint16
channel                         uint16
click_id                        uint32
device                          uint16
ip                              uint32
is_attributed                    uint8
os                              uint16
ip_str                          object
os_str                          object
device_str                      object
app_str                         object
channel_str                     object
ip_os_device_str                object
ip_os_device_app_str            object
ip_os_device_categorical         int32
ip_os_device_app_categorical     int32
app_channel_str                 object
app_channel_categorical          int16
dtype: object

In [20]:
%%time
df["ip_os_channel_str"] = (df["ip_str"]+" "
                              +df["os_str"]+" "
                              +df["channel_str"])

CPU times: user 1min 4s, sys: 49.6 s, total: 1min 54s
Wall time: 1min 53s


In [21]:
%%time
df["ip_os_device_channel_str"] = (df["ip_str"]+" "
                              +df["os_str"]+" "
                              +df["device_str"]+" "
                              +df["channel_str"])

CPU times: user 1min 43s, sys: 1min 39s, total: 3min 23s
Wall time: 3min 22s


In [22]:
df.dtypes

app                             uint16
channel                         uint16
click_id                        uint32
device                          uint16
ip                              uint32
is_attributed                    uint8
os                              uint16
ip_str                          object
os_str                          object
device_str                      object
app_str                         object
channel_str                     object
ip_os_device_str                object
ip_os_device_app_str            object
ip_os_device_categorical         int32
ip_os_device_app_categorical     int32
app_channel_str                 object
app_channel_categorical          int16
ip_os_channel_str               object
ip_os_device_channel_str        object
dtype: object

In [23]:
%%time
df["ip_os_channel_categorical"] = pd.Categorical(df["ip_os_channel_str"]).codes

CPU times: user 5min 39s, sys: 2min 12s, total: 7min 52s
Wall time: 7min 51s


In [24]:
%%time
df["ip_os_device_channel_categorical"] = pd.Categorical(df["ip_os_device_channel_str"]).codes

CPU times: user 5min 56s, sys: 2min 20s, total: 8min 16s
Wall time: 8min 16s


In [25]:
df.dtypes

app                                 uint16
channel                             uint16
click_id                            uint32
device                              uint16
ip                                  uint32
is_attributed                        uint8
os                                  uint16
ip_str                              object
os_str                              object
device_str                          object
app_str                             object
channel_str                         object
ip_os_device_str                    object
ip_os_device_app_str                object
ip_os_device_categorical             int32
ip_os_device_app_categorical         int32
app_channel_str                     object
app_channel_categorical              int16
ip_os_channel_str                   object
ip_os_device_channel_str            object
ip_os_channel_categorical            int32
ip_os_device_channel_categorical     int32
dtype: object

In [27]:
df.drop(columns=["ip_os_device_str", "ip_os_device_app_str", "app_channel_str", "ip_os_channel_str", "ip_os_device_channel_str"], inplace=True)

In [28]:
df.dtypes

app                                 uint16
channel                             uint16
click_id                            uint32
device                              uint16
ip                                  uint32
is_attributed                        uint8
os                                  uint16
ip_str                              object
os_str                              object
device_str                          object
app_str                             object
channel_str                         object
ip_os_device_categorical             int32
ip_os_device_app_categorical         int32
app_channel_categorical              int16
ip_os_channel_categorical            int32
ip_os_device_channel_categorical     int32
dtype: object

In [30]:
import gc
gc.collect()

3196

In [32]:
df.dtypes

app                                 uint16
channel                             uint16
click_id                            uint32
device                              uint16
ip                                  uint32
is_attributed                        uint8
os                                  uint16
ip_str                              object
os_str                              object
device_str                          object
app_str                             object
channel_str                         object
ip_os_device_categorical             int32
ip_os_device_app_categorical         int32
app_channel_categorical              int16
ip_os_channel_categorical            int32
ip_os_device_channel_categorical     int32
dtype: object

In [33]:
%%time
df.sort_index(axis=1, inplace=True)

CPU times: user 23.8 s, sys: 28.1 s, total: 51.9 s
Wall time: 51.8 s


In [34]:
df

Unnamed: 0,app,app_channel_categorical,app_str,channel,channel_str,click_id,device,device_str,ip,ip_os_channel_categorical,ip_os_device_app_categorical,ip_os_device_categorical,ip_os_device_channel_categorical,ip_str,is_attributed,os,os_str
0,3,687,3,379,379,1000000007,1,1,83230,39340234,22588778,3373482,40610265,83230,0,13,13
1,3,687,3,379,379,1000000007,1,1,17357,12816010,7242710,1016835,13248517,17357,0,19,19
2,3,687,3,379,379,1000000007,1,1,35810,26183596,15428639,2469885,26905865,35810,0,13,13
3,14,219,14,478,478,1000000007,1,1,45745,29128018,17047141,2682681,29977839,45745,0,13,13
4,3,687,3,379,379,1000000007,1,1,161007,11347548,6370507,877172,11747871,161007,0,13,13
5,3,687,3,379,379,1000000007,1,1,18787,14589942,8287118,1181562,15053805,18787,0,16,16
6,3,687,3,379,379,1000000007,1,1,103022,858515,471631,57800,903597,103022,0,23,23
7,3,687,3,379,379,1000000007,1,1,114221,4007349,2172576,268351,4175485,114221,0,19,19
8,3,687,3,379,379,1000000007,1,1,165970,11915489,6712127,932732,12329302,165970,0,13,13
9,64,1191,64,459,459,1000000007,1,1,74544,36909068,21268889,3210709,38073857,74544,0,22,22


In [35]:
df.dtypes

app                                 uint16
app_channel_categorical              int16
app_str                             object
channel                             uint16
channel_str                         object
click_id                            uint32
device                              uint16
device_str                          object
ip                                  uint32
ip_os_channel_categorical            int32
ip_os_device_app_categorical         int32
ip_os_device_categorical             int32
ip_os_device_channel_categorical     int32
ip_str                              object
is_attributed                        uint8
os                                  uint16
os_str                              object
dtype: object

In [36]:
df["os_device_categorical"] = pd.Categorical(df["os_str"]+" "+df["device_str"]).codes

In [37]:
df["os_device_app_categorical"] =pd.Categorical(
                                    df["os_str"]+" "+
                                    df["device_str"]+" "+
                                    df["app_str"]).codes

In [38]:
df.dtypes

app                                 uint16
app_channel_categorical              int16
app_str                             object
channel                             uint16
channel_str                         object
click_id                            uint32
device                              uint16
device_str                          object
ip                                  uint32
ip_os_channel_categorical            int32
ip_os_device_app_categorical         int32
ip_os_device_categorical             int32
ip_os_device_channel_categorical     int32
ip_str                              object
is_attributed                        uint8
os                                  uint16
os_str                              object
os_device_categorical                int16
os_device_app_categorical            int32
dtype: object

columns = ["ip", "os", "device", "app", "channel"]
n = len(columns)

In [3]:
%%time
import pandas as pd
from logzero import logger
df = pd.read_feather("../data/basic_table")

CPU times: user 876 ms, sys: 736 ms, total: 1.61 s
Wall time: 1.61 s


In [4]:
df.dtypes

app              uint16
channel          uint16
click_id         uint32
device           uint16
ip               uint32
is_attributed     uint8
os               uint16
dtype: object

In [5]:
%%time
logger.info("app")
df["app"] = df["app"].astype(str)
logger.info("os")
df["os"] = df["os"].astype(str)
logger.info("device")
df["device"] = df["device"].astype(str)
logger.info("channel")
df["channel"] = df["channel"].astype(str)
logger.info("ip")
df["ip"] = df["ip"].astype(str)

[I 180425 17:11:26 <timed exec>:1] app
[I 180425 17:15:14 <timed exec>:3] os
[I 180425 17:19:01 <timed exec>:5] device
[I 180425 17:22:45 <timed exec>:7] channel
[I 180425 17:26:36 <timed exec>:9] ip


CPU times: user 18min 21s, sys: 50 s, total: 19min 11s
Wall time: 19min 6s


In [6]:
df.dtypes

app              object
channel          object
click_id         uint32
device           object
ip               object
is_attributed     uint8
os               object
dtype: object

In [7]:
columns = ["ip", "os", "device", "app", "channel"]
n = len(columns)

In [8]:
logger.info("started")
for i in range(n):
    for j in range(i+1, n):
        ci=columns[i]
        cj=columns[j]
        feature="{}_{}".format(ci, cj)
        logger.info("{} started".format(feature))
        df[feature] = pd.Categorical(df[ci]+"_"+df[cj]).codes
        logger.info("{} finished".format(feature))

[I 180425 17:30:32 <ipython-input-8-e942d1df4103>:1] started
[I 180425 17:30:32 <ipython-input-8-e942d1df4103>:7] ip_os started
[I 180425 17:32:35 <ipython-input-8-e942d1df4103>:9] ip_os finished
[I 180425 17:32:35 <ipython-input-8-e942d1df4103>:7] ip_device started
[I 180425 17:34:55 <ipython-input-8-e942d1df4103>:9] ip_device finished
[I 180425 17:34:55 <ipython-input-8-e942d1df4103>:7] ip_app started
[I 180425 17:37:32 <ipython-input-8-e942d1df4103>:9] ip_app finished
[I 180425 17:37:32 <ipython-input-8-e942d1df4103>:7] ip_channel started
[I 180425 17:40:41 <ipython-input-8-e942d1df4103>:9] ip_channel finished
[I 180425 17:40:41 <ipython-input-8-e942d1df4103>:7] os_device started
[I 180425 17:43:51 <ipython-input-8-e942d1df4103>:9] os_device finished
[I 180425 17:43:51 <ipython-input-8-e942d1df4103>:7] os_app started
[I 180425 17:47:06 <ipython-input-8-e942d1df4103>:9] os_app finished
[I 180425 17:47:06 <ipython-input-8-e942d1df4103>:7] os_channel started
[I 180425 17:50:25 <ipython

In [9]:
df.dtypes

app               object
channel           object
click_id          uint32
device            object
ip                object
is_attributed      uint8
os                object
ip_os              int32
ip_device          int32
ip_app             int32
ip_channel         int32
os_device          int16
os_app             int16
os_channel         int16
device_app         int16
device_channel     int16
app_channel        int16
dtype: object

In [12]:
import gc
gc.collect()

0

In [13]:
twos = ["{}_{}".format(columns[i], columns[j]) for i in range(n) for j in range(i+1, n)]

In [16]:
for two in twos:
    logger.info("{} {}".format(two, df[two].max()))

[I 180425 18:07:43 <ipython-input-16-80b0f64acc4d>:2] ip_os 3260003
[I 180425 18:07:44 <ipython-input-16-80b0f64acc4d>:2] ip_device 795046
[I 180425 18:07:45 <ipython-input-16-80b0f64acc4d>:2] ip_app 4135195
[I 180425 18:07:46 <ipython-input-16-80b0f64acc4d>:2] ip_channel 8419361
[I 180425 18:07:47 <ipython-input-16-80b0f64acc4d>:2] os_device 5897
[I 180425 18:07:48 <ipython-input-16-80b0f64acc4d>:2] os_app 14638
[I 180425 18:07:48 <ipython-input-16-80b0f64acc4d>:2] os_channel 19725
[I 180425 18:07:49 <ipython-input-16-80b0f64acc4d>:2] device_app 13140
[I 180425 18:07:50 <ipython-input-16-80b0f64acc4d>:2] device_channel 11565
[I 180425 18:07:51 <ipython-input-16-80b0f64acc4d>:2] app_channel 1456


In [19]:
df.drop(columns=["click_id", "is_attributed"], inplace=True)

In [20]:
gc.collect()

1960

In [23]:
df["app"] = pd.Categorical(df["app"]).codes
logger.info("app")
df["os"] = pd.Categorical(df["os"]).codes
logger.info("os")
df["device"] = pd.Categorical(df["device"]).codes
logger.info("device")
df["channel"] = pd.Categorical(df["channel"]).codes
logger.info("channel")
df["ip"] = pd.Categorical(df["ip"]).codes
logger.info("ip")

[I 180425 18:13:36 <ipython-input-23-e56391e1c60f>:2] app
[I 180425 18:14:04 <ipython-input-23-e56391e1c60f>:4] os
[I 180425 18:14:25 <ipython-input-23-e56391e1c60f>:6] device
[I 180425 18:14:43 <ipython-input-23-e56391e1c60f>:8] channel
[I 180425 18:15:11 <ipython-input-23-e56391e1c60f>:10] ip


In [26]:
df = df.astype("uint64")

In [33]:
ones = columns
n = len(ones)

In [36]:
df.max()

app                   729
channel               201
device               3798
ip                 333167
os                    855
ip_os             3260003
ip_device          795046
ip_app            4135195
ip_channel        8419361
os_device            5897
os_app              14638
os_channel          19725
device_app          13140
device_channel      11565
app_channel          1456
dtype: uint64

In [37]:
for i in range(n):
    for j in range(i+1, n):
        for k in range(j+1, n):
            feature="{}_{}_{}".format(ones[i], ones[j], ones[k])
            one = ones[i]
            two = "{}_{}".format(ones[j], ones[k])
            logger.info(feature)
            df[feature] = pd.Categorical(df[one]*10000000+df[two]).codes
            logger.info("done")

[I 180425 18:24:50 <ipython-input-37-402fd42ae267>:7] ip_os_device
[I 180425 18:25:08 <ipython-input-37-402fd42ae267>:9] done
[I 180425 18:25:08 <ipython-input-37-402fd42ae267>:7] ip_os_app
[I 180425 18:25:38 <ipython-input-37-402fd42ae267>:9] done
[I 180425 18:25:38 <ipython-input-37-402fd42ae267>:7] ip_os_channel
[I 180425 18:26:24 <ipython-input-37-402fd42ae267>:9] done
[I 180425 18:26:24 <ipython-input-37-402fd42ae267>:7] ip_device_app
[I 180425 18:26:42 <ipython-input-37-402fd42ae267>:9] done
[I 180425 18:26:42 <ipython-input-37-402fd42ae267>:7] ip_device_channel
[I 180425 18:27:04 <ipython-input-37-402fd42ae267>:9] done
[I 180425 18:27:04 <ipython-input-37-402fd42ae267>:7] ip_app_channel
[I 180425 18:27:29 <ipython-input-37-402fd42ae267>:9] done
[I 180425 18:27:29 <ipython-input-37-402fd42ae267>:7] os_device_app
[I 180425 18:27:40 <ipython-input-37-402fd42ae267>:9] done
[I 180425 18:27:40 <ipython-input-37-402fd42ae267>:7] os_device_channel
[I 180425 18:27:51 <ipython-input-37-40

In [39]:
df.max()

app                        729.0
channel                    201.0
device                    3798.0
ip                      333167.0
os                         855.0
ip_os                  3260003.0
ip_device               795046.0
ip_app                 4135195.0
ip_channel             8419361.0
os_device                 5897.0
os_app                   14638.0
os_channel               19725.0
device_app               13140.0
device_channel           11565.0
app_channel               1456.0
ip_os_device           3669417.0
ip_os_app             23835240.0
ip_os_channel         43803139.0
ip_device_app          4795604.0
ip_device_channel      9446113.0
ip_app_channel        13206832.0
os_device_app            32952.0
os_device_channel        44789.0
os_app_channel           43287.0
device_app_channel       19606.0
dtype: float64

In [41]:
%%time
df = df.astype("int64")

CPU times: user 33.9 s, sys: 1min 23s, total: 1min 57s
Wall time: 1min 57s


In [44]:
%%time
df.max()

CPU times: user 47.4 s, sys: 47.1 s, total: 1min 34s
Wall time: 1min 34s


app                        729
channel                    201
device                    3798
ip                      333167
os                         855
ip_os                  3260003
ip_device               795046
ip_app                 4135195
ip_channel             8419361
os_device                 5897
os_app                   14638
os_channel               19725
device_app               13140
device_channel           11565
app_channel               1456
ip_os_device           3669417
ip_os_app             23835240
ip_os_channel         43803139
ip_device_app          4795604
ip_device_channel      9446113
ip_app_channel        13206832
os_device_app            32952
os_device_channel        44789
os_app_channel           43287
device_app_channel       19606
dtype: int64

In [46]:
%%time
for i in range(n):
    for j in range(i+1, n):
        for k in range(j+1, n):
            for l in range(k+1, n):
                two1="{}_{}".format(ones[i], ones[j])
                two2="{}_{}".format(ones[k], ones[l])
                feature = "{}_{}".format(two1, two2)
                logger.info(feature)
                df[feature] = pd.Categorical(df[two1]*1000000000+ df[two2]).codes
                logger.info("done")

[I 180425 18:43:48 <timed exec>:8] ip_os_device_app
[I 180425 18:44:21 <timed exec>:10] done
[I 180425 18:44:21 <timed exec>:8] ip_os_device_channel
[I 180425 18:45:08 <timed exec>:10] done
[I 180425 18:45:08 <timed exec>:8] ip_os_app_channel
[I 180425 18:46:07 <timed exec>:10] done
[I 180425 18:46:07 <timed exec>:8] ip_device_app_channel
[I 180425 18:46:32 <timed exec>:10] done
[I 180425 18:46:32 <timed exec>:8] os_device_app_channel
[I 180425 18:46:44 <timed exec>:10] done


CPU times: user 2min 46s, sys: 24.6 s, total: 3min 11s
Wall time: 2min 55s


In [47]:
df.dtypes

app                      int64
channel                  int64
device                   int64
ip                       int64
os                       int64
ip_os                    int64
ip_device                int64
ip_app                   int64
ip_channel               int64
os_device                int64
os_app                   int64
os_channel               int64
device_app               int64
device_channel           int64
app_channel              int64
ip_os_device             int64
ip_os_app                int64
ip_os_channel            int64
ip_device_app            int64
ip_device_channel        int64
ip_app_channel           int64
os_device_app            int64
os_device_channel        int64
os_app_channel           int64
device_app_channel       int64
ip_os_device_app         int32
ip_os_device_channel     int32
ip_os_app_channel        int32
ip_device_app_channel    int32
os_device_app_channel    int32
dtype: object

In [49]:
df["ip_os_device_app"].max()

24990688

In [50]:
df["channel"].max()

201

In [51]:
df["ip_os_device_app"] = df["ip_os_device_app"].astype("int64")

In [52]:
df["ip_os_device_app"] = df["ip_os_device_app"] * 1000

In [53]:
%%time
df["ip_os_device_app_channel"] = pd.Categorical(df["ip_os_device_app"]*1000+df["channel"]).codes

CPU times: user 58.2 s, sys: 4.62 s, total: 1min 2s
Wall time: 1min


In [54]:
df.dtypes

app                         int64
channel                     int64
device                      int64
ip                          int64
os                          int64
ip_os                       int64
ip_device                   int64
ip_app                      int64
ip_channel                  int64
os_device                   int64
os_app                      int64
os_channel                  int64
device_app                  int64
device_channel              int64
app_channel                 int64
ip_os_device                int64
ip_os_app                   int64
ip_os_channel               int64
ip_device_app               int64
ip_device_channel           int64
ip_app_channel              int64
os_device_app               int64
os_device_channel           int64
os_app_channel              int64
device_app_channel          int64
ip_os_device_app            int64
ip_os_device_channel        int32
ip_os_app_channel           int32
ip_device_app_channel       int32
os_device_app_

In [55]:
df.drop(columns=["ip", "app", "os", "device", "channel"], inplace=True)

In [56]:
df.dtypes

ip_os                       int64
ip_device                   int64
ip_app                      int64
ip_channel                  int64
os_device                   int64
os_app                      int64
os_channel                  int64
device_app                  int64
device_channel              int64
app_channel                 int64
ip_os_device                int64
ip_os_app                   int64
ip_os_channel               int64
ip_device_app               int64
ip_device_channel           int64
ip_app_channel              int64
os_device_app               int64
os_device_channel           int64
os_app_channel              int64
device_app_channel          int64
ip_os_device_app            int64
ip_os_device_channel        int32
ip_os_app_channel           int32
ip_device_app_channel       int32
os_device_app_channel       int32
ip_os_device_app_channel    int32
dtype: object

In [57]:
df.max()

ip_os                           3260003
ip_device                        795046
ip_app                          4135195
ip_channel                      8419361
os_device                          5897
os_app                            14638
os_channel                        19725
device_app                        13140
device_channel                    11565
app_channel                        1456
ip_os_device                    3669417
ip_os_app                      23835240
ip_os_channel                  43803139
ip_device_app                   4795604
ip_device_channel               9446113
ip_app_channel                 13206832
os_device_app                     32952
os_device_channel                 44789
os_app_channel                    43287
device_app_channel                19606
ip_os_device_app            24990688000
ip_os_device_channel           45235476
ip_os_app_channel              57217358
ip_device_app_channel          14226319
os_device_app_channel             82867


In [59]:
df["ip_os_device_app"] = pd.Categorical(df["ip_os_device_app"]).codes

In [60]:
df_max = df.max()
df_max

ip_os                        3260003
ip_device                     795046
ip_app                       4135195
ip_channel                   8419361
os_device                       5897
os_app                         14638
os_channel                     19725
device_app                     13140
device_channel                 11565
app_channel                     1456
ip_os_device                 3669417
ip_os_app                   23835240
ip_os_channel               43803139
ip_device_app                4795604
ip_device_channel            9446113
ip_app_channel              13206832
os_device_app                  32952
os_device_channel              44789
os_app_channel                 43287
device_app_channel             19606
ip_os_device_app            24990688
ip_os_device_channel        45235476
ip_os_app_channel           57217358
ip_device_app_channel       14226319
os_device_app_channel          82867
ip_os_device_app_channel    58691622
dtype: int64

In [70]:
df = df.astype("uint32")

In [71]:
for c in df_max[df_max < 60000].index:
    logger.info(c)
    logger.info(df[c].max())
    df[c] = df[c].astype("uint16")

[I 180425 19:13:55 <ipython-input-71-a94b67c351ee>:2] os_device
[I 180425 19:13:56 <ipython-input-71-a94b67c351ee>:3] 5897
[I 180425 19:14:04 <ipython-input-71-a94b67c351ee>:2] os_app
[I 180425 19:14:05 <ipython-input-71-a94b67c351ee>:3] 14638
[I 180425 19:14:13 <ipython-input-71-a94b67c351ee>:2] os_channel
[I 180425 19:14:14 <ipython-input-71-a94b67c351ee>:3] 19725
[I 180425 19:14:22 <ipython-input-71-a94b67c351ee>:2] device_app
[I 180425 19:14:23 <ipython-input-71-a94b67c351ee>:3] 13140
[I 180425 19:14:31 <ipython-input-71-a94b67c351ee>:2] device_channel
[I 180425 19:14:32 <ipython-input-71-a94b67c351ee>:3] 11565
[I 180425 19:14:39 <ipython-input-71-a94b67c351ee>:2] app_channel
[I 180425 19:14:40 <ipython-input-71-a94b67c351ee>:3] 1456
[I 180425 19:14:46 <ipython-input-71-a94b67c351ee>:2] os_device_app
[I 180425 19:14:47 <ipython-input-71-a94b67c351ee>:3] 32952
[I 180425 19:14:54 <ipython-input-71-a94b67c351ee>:2] os_device_channel
[I 180425 19:14:55 <ipython-input-71-a94b67c351ee>:3

In [72]:
df.dtypes

ip_os                       uint32
ip_device                   uint32
ip_app                      uint32
ip_channel                  uint32
os_device                   uint16
os_app                      uint16
os_channel                  uint16
device_app                  uint16
device_channel              uint16
app_channel                 uint16
ip_os_device                uint32
ip_os_app                   uint32
ip_os_channel               uint32
ip_device_app               uint32
ip_device_channel           uint32
ip_app_channel              uint32
os_device_app               uint16
os_device_channel           uint16
os_app_channel              uint16
device_app_channel          uint16
ip_os_device_app            uint32
ip_os_device_channel        uint32
ip_os_app_channel           uint32
ip_device_app_channel       uint32
os_device_app_channel       uint32
ip_os_device_app_channel    uint32
dtype: object

In [73]:
df.to_feather("../data/multi_basic")