# Add prev/next channel feature

In [1]:
import pandas as pd

In [2]:
df = pd.read_feather("../data/basic_table")

In [3]:
df.drop(columns=["click_id", "is_attributed"], inplace=True)

In [4]:
df = pd.concat([df, pd.read_feather("../data/multi_basic")], axis=1)

In [5]:
df.dtypes

app                         uint16
channel                     uint16
device                      uint16
ip                          uint32
os                          uint16
ip_os                       uint32
ip_device                   uint32
ip_app                      uint32
ip_channel                  uint32
os_device                   uint16
os_app                      uint16
os_channel                  uint16
device_app                  uint16
device_channel              uint16
app_channel                 uint16
ip_os_device                uint32
ip_os_app                   uint32
ip_os_channel               uint32
ip_device_app               uint32
ip_device_channel           uint32
ip_app_channel              uint32
os_device_app               uint16
os_device_channel           uint16
os_app_channel              uint16
device_app_channel          uint16
ip_os_device_app            uint32
ip_os_device_channel        uint32
ip_os_app_channel           uint32
ip_device_app_channe

In [6]:
df["index"] = df.index

In [16]:
non_channel_columns = [c for c in df.columns if "channel" not in c and c != "index"]
non_channel_columns

['app',
 'device',
 'ip',
 'os',
 'ip_os',
 'ip_device',
 'ip_app',
 'os_device',
 'os_app',
 'device_app',
 'ip_os_device',
 'ip_os_app',
 'ip_device_app',
 'os_device_app',
 'ip_os_device_app']

In [17]:
dropping = [c for c in df.columns if "channel" in c and c != "channel"]
dropping

['ip_channel',
 'os_channel',
 'device_channel',
 'app_channel',
 'ip_os_channel',
 'ip_device_channel',
 'ip_app_channel',
 'os_device_channel',
 'os_app_channel',
 'device_app_channel',
 'ip_os_device_channel',
 'ip_os_app_channel',
 'ip_device_app_channel',
 'os_device_app_channel',
 'ip_os_device_app_channel']

In [18]:
df.drop(columns=dropping, inplace=True)

In [19]:
import gc
gc.collect()

4817

In [20]:
from logzero import logger

In [21]:
for column in non_channel_columns:
    logger.info("start {}".format(column))
    df.sort_values(by=[column, "index"], inplace=True)
    logger.info("sorted")

    next_channel_by_column = "next_channel_by_{}".format(column)
    prev_channel_by_column = "prev_channel_by_{}".format(column)
    next_column = "next_{}".format(column)
    prev_column = "prev_{}".format(column)
    
    df[next_channel_by_column] = df["channel"].shift(-1)
    df[prev_channel_by_column] = df["channel"].shift()
    df[next_column] = df[column].shift(-1)
    df[prev_column] = df[column].shift()
    logger.info("shifted")
    
    df[prev_channel_by_column].where(df[column]==df[prev_column], 1000, inplace=True)
    df[prev_channel_by_column].fillna(1000, inplace=True)
    df[prev_channel_by_column] = df[prev_channel_by_column].astype("uint16")

    df[next_channel_by_column].where(df[column]==df[next_column], 1000, inplace=True)
    df[next_channel_by_column].fillna(1000, inplace=True)
    df[next_channel_by_column] = df[next_channel_by_column].astype("uint16")
    logger.info("finished")
    df.drop(columns=[prev_column, next_column], inplace=True)
    logger.info("dropped")

[I 180425 20:03:24 <ipython-input-21-68c438bace29>:2] start app
[I 180425 20:07:27 <ipython-input-21-68c438bace29>:4] sorted
[I 180425 20:07:33 <ipython-input-21-68c438bace29>:15] shifted
[I 180425 20:07:38 <ipython-input-21-68c438bace29>:24] finished
[I 180425 20:07:55 <ipython-input-21-68c438bace29>:26] dropped
[I 180425 20:07:55 <ipython-input-21-68c438bace29>:2] start device
[I 180425 20:11:23 <ipython-input-21-68c438bace29>:4] sorted
[I 180425 20:11:30 <ipython-input-21-68c438bace29>:15] shifted
[I 180425 20:11:35 <ipython-input-21-68c438bace29>:24] finished
[I 180425 20:11:53 <ipython-input-21-68c438bace29>:26] dropped
[I 180425 20:11:53 <ipython-input-21-68c438bace29>:2] start ip
[I 180425 20:17:55 <ipython-input-21-68c438bace29>:4] sorted
[I 180425 20:18:02 <ipython-input-21-68c438bace29>:15] shifted
[I 180425 20:18:07 <ipython-input-21-68c438bace29>:24] finished
[I 180425 20:18:26 <ipython-input-21-68c438bace29>:26] dropped
[I 180425 20:18:26 <ipython-input-21-68c438bace29>:2]

In [22]:
gc.collect()

2139

In [24]:
next_prev_columns = [c for c in df.columns if "next" in c or "prev" in c]
next_prev_columns

['next_channel_by_app',
 'prev_channel_by_app',
 'next_channel_by_device',
 'prev_channel_by_device',
 'next_channel_by_ip',
 'prev_channel_by_ip',
 'next_channel_by_os',
 'prev_channel_by_os',
 'next_channel_by_ip_os',
 'prev_channel_by_ip_os',
 'next_channel_by_ip_device',
 'prev_channel_by_ip_device',
 'next_channel_by_ip_app',
 'prev_channel_by_ip_app',
 'next_channel_by_os_device',
 'prev_channel_by_os_device',
 'next_channel_by_os_app',
 'prev_channel_by_os_app',
 'next_channel_by_device_app',
 'prev_channel_by_device_app',
 'next_channel_by_ip_os_device',
 'prev_channel_by_ip_os_device',
 'next_channel_by_ip_os_app',
 'prev_channel_by_ip_os_app',
 'next_channel_by_ip_device_app',
 'prev_channel_by_ip_device_app',
 'next_channel_by_os_device_app',
 'prev_channel_by_os_device_app',
 'next_channel_by_ip_os_device_app',
 'prev_channel_by_ip_os_device_app']

In [25]:
next_prev_channels = df[next_prev_columns]

In [26]:
next_prev_channels.dtypes

next_channel_by_app                 uint16
prev_channel_by_app                 uint16
next_channel_by_device              uint16
prev_channel_by_device              uint16
next_channel_by_ip                  uint16
prev_channel_by_ip                  uint16
next_channel_by_os                  uint16
prev_channel_by_os                  uint16
next_channel_by_ip_os               uint16
prev_channel_by_ip_os               uint16
next_channel_by_ip_device           uint16
prev_channel_by_ip_device           uint16
next_channel_by_ip_app              uint16
prev_channel_by_ip_app              uint16
next_channel_by_os_device           uint16
prev_channel_by_os_device           uint16
next_channel_by_os_app              uint16
prev_channel_by_os_app              uint16
next_channel_by_device_app          uint16
prev_channel_by_device_app          uint16
next_channel_by_ip_os_device        uint16
prev_channel_by_ip_os_device        uint16
next_channel_by_ip_os_app           uint16
prev_channe

In [29]:
next_prev_channels.sort_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [30]:
next_prev_channels

Unnamed: 0,next_channel_by_app,prev_channel_by_app,next_channel_by_device,prev_channel_by_device,next_channel_by_ip,prev_channel_by_ip,next_channel_by_os,prev_channel_by_os,next_channel_by_ip_os,prev_channel_by_ip_os,...,next_channel_by_ip_os_device,prev_channel_by_ip_os_device,next_channel_by_ip_os_app,prev_channel_by_ip_os_app,next_channel_by_ip_device_app,prev_channel_by_ip_device_app,next_channel_by_os_device_app,prev_channel_by_os_device_app,next_channel_by_ip_os_device_app,prev_channel_by_ip_os_device_app
0,379,1000,379,1000,477,1000,379,1000,178,1000,...,178,1000,135,1000,135,1000,379,1000,135,1000
1,379,379,379,379,379,1000,379,1000,121,1000,...,121,1000,280,1000,379,1000,379,1000,280,1000
2,379,379,478,379,409,1000,478,379,107,1000,...,107,1000,409,1000,409,1000,379,379,409,1000
3,478,1000,379,379,442,1000,379,379,442,1000,...,442,1000,379,1000,379,1000,478,1000,379,1000
4,379,379,379,478,480,1000,379,478,205,1000,...,205,1000,205,1000,480,1000,379,379,205,1000
5,379,379,379,379,379,1000,379,1000,280,1000,...,280,1000,280,1000,379,1000,379,1000,280,1000
6,379,379,379,379,122,1000,379,1000,280,1000,...,280,1000,280,1000,489,1000,379,1000,280,1000
7,379,379,379,379,125,1000,379,379,236,1000,...,236,1000,452,1000,115,1000,379,379,452,1000
8,379,379,459,379,459,1000,379,379,234,1000,...,234,1000,442,1000,452,1000,379,379,442,1000
9,459,1000,379,379,128,1000,459,1000,265,1000,...,265,1000,1000,1000,459,1000,459,1000,1000,1000


In [31]:
next_prev_channels.to_feather("../data/next_prev_channels")