In [1]:
import gc
import time
import numpy as np
import pandas as pd
import xgboost as xgb
from xgboost import plot_importance
import matplotlib.pyplot as plt
import click
from logzero import logger

In [2]:
%%time
click_data = pd.concat([
    pd.read_hdf("../data/basic_table.hdf", "basic_table"),
    pd.read_hdf("../data/click_count_table.hdf", "click_count_table"),
    pd.read_hdf("../data/click_time_interval_ip_device_os_app_hash.hdf", "click_time_interval_ip_device_os_app_hash"),
    pd.read_hdf("../data/intervals_table.hdf", "intervals_table"),
    pd.read_hdf("../data/misc_counts.hdf", "misc_counts"),
    pd.read_hdf("../data/ranks_table.hdf", "ranks_table"),
    pd.read_hdf("../data/time_table.hdf", "time_table"),
], axis=1)

CPU times: user 30.7 s, sys: 19.2 s, total: 49.9 s
Wall time: 3min 36s


In [3]:
click_data.dtypes

app                                              uint16
channel                                          uint16
click_id                                         uint32
device                                           uint16
ip                                               uint32
is_attributed                                     uint8
os                                               uint16
click_count_by_ip                                uint32
click_count_by_ip_os_device_dow                  uint32
click_count_by_ip_os_device_dow_hour             uint16
click_time_interval_ip_device_os_app_hash        uint32
first_click_interval_ip_device_os_app_hash         bool
click_time_interval_by_ip                        uint32
click_time_interval_by_ip_os_device              uint32
click_time_interval_by_ip_os_device_dow          uint32
click_time_interval_by_ip_os_device_dow_hour     uint32
first_click_by_ip                                  bool
first_click_by_ip_os_device                     

In [19]:
df = click_data[["channel", "is_attributed", "dow"]]

In [21]:
train_df = df[df["dow"] < 4]

In [24]:
train_df.drop(columns=["dow"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [26]:
clicks = train_df.groupby(["channel"]).count()

In [28]:
clicks["attribute"] = train_df.groupby(["channel"])['is_attributed'].sum()

In [30]:
clicks.columns = ["click", "attribute"]

In [31]:
clicks["attribute"] = clicks["attribute"].astype(int)

In [33]:
clicks["attribute_ratio"] = clicks["attribute"] / clicks["click"]

In [35]:
clicks.sort_values(by=["attribute_ratio"])

Unnamed: 0_level_0,click,attribute,attribute_ratio
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
500,1,0,0.000000
422,1,0,0.000000
420,6751,0,0.000000
404,14379,0,0.000000
394,1,0,0.000000
221,1,0,0.000000
354,1,0,0.000000
142,2,0,0.000000
146,1,0,0.000000
149,2,0,0.000000


In [40]:
channel419 = click_data[(click_data["channel"]==419) & (click_data["dow"]<4)]

In [46]:
ip_attibuted_419 = channel419.groupby(["ip", "is_attributed"]).count()[["app"]]

In [50]:
ip_attibuted_419.sort_values(by=["app"], inplace=True, ascending=False)

In [51]:
ip_attibuted_419

Unnamed: 0_level_0,Unnamed: 1_level_0,app
ip,is_attributed,Unnamed: 2_level_1
110300,0,655
110300,1,339
275169,0,154
148276,0,123
20212,0,112
46391,1,95
148276,1,70
20212,1,62
275169,1,61
164198,1,41


In [54]:
attribute_sum = clicks["attribute"].sum()

In [57]:
clicks["total_ratio"] = clicks["attribute"] / attribute_sum

In [59]:
clicks.sort_values(by=["total_ratio"], ascending=False, inplace=True)

In [63]:
clicks["cumsum"] = clicks["total_ratio"].cumsum()

In [67]:
click_sum = clicks["click"].sum()

In [68]:
click_sum, attribute_sum, attribute_sum/click_sum

(184903890, 456846, 0.002470721410998979)

In [71]:
clicks

Unnamed: 0_level_0,click,attribute,attribute_ratio,total_ratio,cumsum
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
213,785038,133070,0.169508,0.291280,0.291280
113,474288,55866,0.117789,0.122286,0.413566
21,303235,42469,0.140053,0.092961,0.506527
101,2147174,33176,0.015451,0.072620,0.579147
274,38472,25888,0.672905,0.056667,0.635814
347,966958,19113,0.019766,0.041837,0.677651
343,426332,10272,0.024094,0.022485,0.700135
333,37862,9477,0.250304,0.020744,0.720880
282,33356,6699,0.200833,0.014664,0.735543
419,10371,6675,0.643622,0.014611,0.750154


In [78]:
clicks[:35]

Unnamed: 0_level_0,click,attribute,attribute_ratio,total_ratio,cumsum
channel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
213,785038,133070,0.169508,0.29128,0.29128
113,474288,55866,0.117789,0.122286,0.413566
21,303235,42469,0.140053,0.092961,0.506527
101,2147174,33176,0.015451,0.07262,0.579147
274,38472,25888,0.672905,0.056667,0.635814
347,966958,19113,0.019766,0.041837,0.677651
343,426332,10272,0.024094,0.022485,0.700135
333,37862,9477,0.250304,0.020744,0.72088
282,33356,6699,0.200833,0.014664,0.735543
419,10371,6675,0.643622,0.014611,0.750154


In [81]:
top35channels = clicks[:35].index

In [84]:
channel_df = click_data[["channel"]]

In [86]:
for channel in top35channels:
    feature = "channel_is_{}".format(channel)
    channel_df[feature] = channel_df["channel"]==channel
    print(feature)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


channel_is_213
channel_is_113
channel_is_21
channel_is_101
channel_is_274
channel_is_347
channel_is_343
channel_is_333
channel_is_282
channel_is_419
channel_is_5
channel_is_145
channel_is_243
channel_is_107
channel_is_280
channel_is_171
channel_is_377
channel_is_134
channel_is_466
channel_is_203
channel_is_210
channel_is_465
channel_is_259
channel_is_244
channel_is_334
channel_is_478
channel_is_121
channel_is_317
channel_is_268
channel_is_330
channel_is_232
channel_is_320
channel_is_215
channel_is_489
channel_is_376


In [87]:
channel_df

Unnamed: 0,channel,channel_is_213,channel_is_113,channel_is_21,channel_is_101,channel_is_274,channel_is_347,channel_is_343,channel_is_333,channel_is_282,...,channel_is_478,channel_is_121,channel_is_317,channel_is_268,channel_is_330,channel_is_232,channel_is_320,channel_is_215,channel_is_489,channel_is_376
0,379,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,379,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,379,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,478,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
4,379,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,379,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,379,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,379,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,379,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,459,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [88]:
channel_df.drop(columns=["channel"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [89]:
channel_df.dtypes

channel_is_213    bool
channel_is_113    bool
channel_is_21     bool
channel_is_101    bool
channel_is_274    bool
channel_is_347    bool
channel_is_343    bool
channel_is_333    bool
channel_is_282    bool
channel_is_419    bool
channel_is_5      bool
channel_is_145    bool
channel_is_243    bool
channel_is_107    bool
channel_is_280    bool
channel_is_171    bool
channel_is_377    bool
channel_is_134    bool
channel_is_466    bool
channel_is_203    bool
channel_is_210    bool
channel_is_465    bool
channel_is_259    bool
channel_is_244    bool
channel_is_334    bool
channel_is_478    bool
channel_is_121    bool
channel_is_317    bool
channel_is_268    bool
channel_is_330    bool
channel_is_232    bool
channel_is_320    bool
channel_is_215    bool
channel_is_489    bool
channel_is_376    bool
dtype: object

In [90]:
channel_df.to_hdf("../data/channel_categorical.hdf", "channel_categorical")