In [27]:
import gc
import os
import operator
from glob import glob 
import copy

import numpy as np
import pandas as pd
import xgboost 
from sklearn.model_selection import train_test_split
import matplotlib as plt


In [7]:
cd "/Users/lli2/Git/kaggle/TalkingData_AdTracking"

/Users/lli2/Git/kaggle/TalkingData_AdTracking


#### Import training data

In [9]:
mycols = ['ip','app','device','os','channel','click_time','is_attributed']

mytypes = {'ip':'uint32',
           'app':'uint16',
           'device':'uint16',
           'os':'uint16',
           'channel':'uint16',
           'is_attributed':'uint16'}

mydate = ['click_time']

df_train = pd.read_csv('/Users/lli2/Git/kaggle_data/train.csv', 
                       nrows=10000000, usecols=mycols, dtype=mytypes, parse_dates=mydate)

Make sure non-null target:

In [210]:
df_train = df_train[df_train['is_attributed'].notnull()]

In [211]:
df_train.head(3)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,0
1,17357,3,1,19,379,2017-11-06 14:33:34,0
2,35810,3,1,13,379,2017-11-06 14:34:12,0


In [212]:
df_train.dtypes

ip                       uint32
app                      uint16
device                   uint16
os                       uint16
channel                  uint16
click_time       datetime64[ns]
is_attributed            uint16
dtype: object

In [213]:
test = df_train.head(10)
test['click_time'].dt.dayofweek

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: click_time, dtype: int64

In [39]:
set(df_train.columns)

{'app', 'channel', 'click_time', 'device', 'ip', 'is_attributed', 'os'}

#### Data Engineering

Helper functions

In [214]:
def list_combinations(combo, data_lst, collector):
    
    for i in range(len(data_lst)):
        
        # deep cope for later update
        new_combo = copy.copy(combo) 
        new_data_lst = copy.copy(data_lst)
        
        # populate combo & reduce rest candidates
        new_combo.append(data_lst[i])
        new_data_lst = data_lst[i+1:]
        collector.append(new_combo)
        
        # go deeper by recursion
        combinations(new_combo, new_data_lst, collector)
        
    return collector
        


def remove_sparse_columns(df, cut_threshold=0.99999):
    nrow = test.shape[0]
    same_pct = dict(test.apply(lambda x: x.value_counts().max()/nrow, axis=0))
    cols_to_remove = [k for k, v in same_pct.items() if v > cut_threshold]
    
    df = df.drop(cols_to_remove, axis=1)
    
    return df
    
    
    
def clicks_creation(new_data, target_var):
    
    if target_var not in new_data.columns:
        print("Error: The given target variable does not exist!")
        return None, None
    
    else:
        # click_time
        new_data['clk_date'] = new_data['click_time'].dt.day
        new_data['clk_hour'] = new_data['click_time'].dt.hour
        new_data['clk_month'] = new_data['click_time'].dt.month
        new_data['clk_dayofweek'] = new_data['click_time'].dt.dayofweek

        new_data = new_data.drop(['click_time'], axis=1)

        # number of clicks by different vars
        data_lst = list(set(new_data.columns)-{target_var})
        var_combo = list_combinations([], data_lst, [])
        var_combo = [x for x in var_combo if len(x)<len(data_lst)]
        var_names = []
        
        for i in range(len(var_combo)):
            var_c = var_combo[i]

            if len(var_c)>1:
                var_n = 'clk_by_' + '_'.join(var_c)
            else:
                var_n = 'clk_by_' + var_c[0]
                
            var_names.append(var_n)

            var_groupby = new_data.groupby(var_c).size().reset_index(name=var_n)
            new_data = new_data.merge(var_groupby, on=var_c, how='left')
    
        return new_data, var_names

    

def dense_dummies(df, cat_cols, cut_threshold=0.99, na_to_miss=True):
    # To create some dummies for categorical variables
    # By default, if there are more than 99% 1s or 0s, then drop the dummy.
    
    df_cat = df[cat_cols]
    df_cat = df.replace(r'\s+', np.nan, regex=True).replace('', np.nan)
    
    if na_to_miss:
        df_cat = df_cat.fillna('miss_cat')
    
    df_dummies = pd.get_dummies(df_cat)  # generate dummies
    df_dummies = remove_sparse_columns(df_dummies, cut_threshold) # filter out sparse dummies
    
    df = pd.concat(df.drop[cat_cols, axis=1], df_dummies, axis=1)
    
    return df


    
def woe_monotonic(df, mynum):
    return None

In [277]:
df = pd.DataFrame([['a1', 'b1', 'c1', 6],
                   ['a2', 'b2', '', 1],
                   ['a3', np.nan, np.nan, 5],
                   ['a4', 'b4', 'c4', 4]],
                  columns=list('ABCD'))
print(df.dtypes)
df = df.replace(r'\s+', np.nan, regex=True).replace('', np.nan).fillna('miss_cat')
print(df.dtypes)
pd.get_dummies(df)

A    object
B    object
C    object
D     int64
dtype: object
A    object
B    object
C    object
D     int64
dtype: object


Unnamed: 0,D,A_a1,A_a2,A_a3,A_a4,B_b1,B_b2,B_b4,B_miss_cat,C_c1,C_c4,C_miss_cat
0,6,1,0,0,0,1,0,0,0,1,0,0
1,1,0,1,0,0,0,1,0,0,0,0,1
2,5,0,0,1,0,0,0,0,1,0,0,1
3,4,0,0,0,1,0,0,1,0,0,1,0


Create the number of clicks by different combination of cats

In [215]:
test = df_train[1:1000]
target_var = 'is_attributed'

test, test_combo_num = clicks_creation(test, target_var)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [196]:
print(test.shape)
test.head()

(999, 520)


Unnamed: 0,ip,app,device,os,channel,is_attributed,clk_date,clk_hour,clk_month,clk_dayofweek,...,clk_by_channel_ip,clk_by_channel_ip_os,clk_by_channel_os,clk_by_clk_month,clk_by_clk_month_ip,clk_by_clk_month_ip_os,clk_by_clk_month_os,clk_by_ip,clk_by_ip_os,clk_by_os
0,17357,3,1,19,379,0,6,14,11,0,...,2,1,132,999,2,1,281,2,1,281
1,35810,3,1,13,379,0,6,14,11,0,...,1,1,149,999,1,1,281,1,1,281
2,45745,14,1,13,478,0,6,14,11,0,...,1,1,4,999,3,3,281,3,3,281
3,161007,3,1,13,379,0,6,14,11,0,...,1,1,149,999,1,1,281,1,1,281
4,18787,3,1,16,379,0,6,14,11,0,...,2,1,7,999,2,1,11,2,1,11


Encode categorical variables to dummies

In [219]:
target_var = ['is_attributed' ]
mycat = list(set(test.columns)-set(target_var + test_combo_num))
mycat

['device',
 'clk_hour',
 'clk_date',
 'app',
 'clk_dayofweek',
 'clk_month',
 'channel',
 'ip',
 'os']

In [None]:
test = dense_dummies(test, mycat)

#### Dimension Reduction

#### Modeling with XGBOOST

In [None]:
def lgb_modelfit_nocv(params, dtrain, dvalid, predictors, target='target', objective='binary', metrics='auc',
                      feval=None, early_stopping_rounds=20, num_boost_round=3000, 
                      verbose_eval=10, categorical_features=None):
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric':metrics,
        'learning_rate': 0.01,
        #'is_unbalance': 'true',  #because training data is unbalance (replaced with scale_pos_weight)
        'num_leaves': 31,  # we should let it be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # Minimum number of data need in a child(min_data_in_leaf)
        'max_bin': 255,  # Number of bucketed bin for feature values
        'subsample': 0.6,  # Subsample ratio of the training instance.
        'subsample_freq': 0,  # frequence of subsample, <=0 means no enable
        'colsample_bytree': 0.3,  # Subsample ratio of columns when constructing each tree.
        'min_child_weight': 5,  # Minimum sum of instance weight(hessian) needed in a child(leaf)
        'subsample_for_bin': 200000,  # Number of samples for constructing bin
        'min_split_gain': 0,  # lambda_l1, lambda_l2 and min_gain_to_split to regularization
        'reg_alpha': 0,  # L1 regularization term on weights
        'reg_lambda': 0,  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
        'metric':metrics
    }

    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=dtrain[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=dvalid[target].values,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    evals_results = {}

    bst1 = lgb.train(lgb_params, 
                     xgtrain, 
                     valid_sets=[xgtrain, xgvalid], 
                     valid_names=['train','valid'], 
                     evals_result=evals_results, 
                     num_boost_round=num_boost_round,
                     early_stopping_rounds=early_stopping_rounds,
                     verbose_eval=10, 
                     feval=feval)

    n_estimators = bst1.best_iteration
    print("\nModel Report")
    print("n_estimators : ", n_estimators)
    print(metrics+":", evals_results['valid'][metrics][n_estimators-1])

    return bst1