In [27]:
import gc
import os
import operator
from glob import glob 
import copy

import numpy as np
import pandas as pd
import xgboost 
from sklearn.model_selection import train_test_split
import matplotlib as plt


In [7]:
cd "/Users/lli2/Git/kaggle/TalkingData_AdTracking"

/Users/lli2/Git/kaggle/TalkingData_AdTracking


#### Import training data

In [9]:
mycols = ['ip','app','device','os','channel','click_time','is_attributed']

mytypes = {'ip':'uint32',
           'app':'uint16',
           'device':'uint16',
           'os':'uint16',
           'channel':'uint16',
           'is_attributed':'uint16'}

mydate = ['click_time']

df_train = pd.read_csv('/Users/lli2/Git/kaggle_data/train.csv', 
                       nrows=10000000, usecols=mycols, dtype=mytypes, parse_dates=mydate)

Make sure non-null target:

In [210]:
df_train = df_train[df_train['is_attributed'].notnull()]

In [211]:
df_train.head(3)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,0
1,17357,3,1,19,379,2017-11-06 14:33:34,0
2,35810,3,1,13,379,2017-11-06 14:34:12,0


In [212]:
df_train.dtypes

ip                       uint32
app                      uint16
device                   uint16
os                       uint16
channel                  uint16
click_time       datetime64[ns]
is_attributed            uint16
dtype: object

In [213]:
test = df_train.head(10)
test['click_time'].dt.dayofweek

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: click_time, dtype: int64

In [39]:
set(df_train.columns)

{'app', 'channel', 'click_time', 'device', 'ip', 'is_attributed', 'os'}

#### Data Engineering

Helper functions

In [408]:
def list_combinations(combo, data_lst, collector):
    
    for i in range(len(data_lst)):
        
        # deep cope for later update
        new_combo = copy.copy(combo) 
        new_data_lst = copy.copy(data_lst)
        
        # populate combo & reduce rest candidates
        new_combo.append(data_lst[i])
        new_data_lst = data_lst[i+1:]
        collector.append(new_combo)
        
        # go deeper by recursion
        combinations(new_combo, new_data_lst, collector)
        
    return collector
        


def remove_flat_columns(df, cut_threshold=0.99999):
    # If the top frequent value/NA counts for x% of the column, remove the column.
    nrow = df.shape[0]
    same_pct = dict(df.apply(lambda x: x.value_counts(dropna=False).max()/nrow, axis=0))
    cols_to_remove = [k for k, v in same_pct.items() if v > cut_threshold]
    
    df = df.drop(cols_to_remove, axis=1)
    
    return df
    
    
    
def clicks_creation(new_data, target_var):
    
    if target_var not in new_data.columns:
        print("Error: The given target variable does not exist!")
        return None, None
    
    else:
        # click_time
        new_data['clk_date'] = new_data['click_time'].dt.day
        new_data['clk_hour'] = new_data['click_time'].dt.hour
        new_data['clk_month'] = new_data['click_time'].dt.month
        new_data['clk_dayofweek'] = new_data['click_time'].dt.dayofweek

        new_data = new_data.drop(['click_time'], axis=1)

        # number of clicks by different vars
        data_lst = list(set(new_data.columns)-{target_var})
        var_combo = list_combinations([], data_lst, [])
        var_combo = [x for x in var_combo if len(x)<len(data_lst)]
        var_names = []
        
        for i in range(len(var_combo)):
            var_c = var_combo[i]

            if len(var_c)>1:
                var_n = 'clk_by_' + '_'.join(var_c)
            else:
                var_n = 'clk_by_' + var_c[0]
                
            var_names.append(var_n)

            var_groupby = new_data.groupby(var_c).size().reset_index(name=var_n)
            new_data = new_data.merge(var_groupby, on=var_c, how='left')
    
        return new_data, var_names

    

def denseDummies(df, cat_cols, cut_threshold=0.99, na_to_miss=True):
    # To create some dummies for categorical variables
    # By default, if there are more than 99% 1s or 0s, then drop the dummy.
    
    df_cat = df[cat_cols].astype('str')
    df_cat = df_cat.replace(r'\s+', np.nan, regex=True).replace('', np.nan)
    
    if na_to_miss:
        df_cat = df_cat.fillna('miss_cat')
    
    df_dummies = pd.get_dummies(df_cat)  # generate dummies
    df_dummies = remove_flat_columns(df_dummies, cut_threshold) # filter out sparse dummies
    
    df_num = df.drop(cat_cols, axis=1)
    df_out = pd.concat([df_num, df_dummies], axis=1)
    
    return df_out



def addMissingFlag(df, mynum, cutoff=0.03):
    # For numeric columns, if missing% > 3%, create a flag variable for it.
    nrow = df.shape[0]
    df_miss_pct = df[mynum].isnull().sum()/nrow
    df_to_flag = df_miss_pct.index[df_miss_pct > cutoff].tolist()
    
    # Add to output data frame
    for f in df_to_flag:
        var_name = 'missing_' + f
        df[var_name] = df[f].isnull().astype('int')
    
    return df



def imputationDict(df, mynum=False, mycat=False):
    # Prepare the imputation dictionary for a given data set
    # --> get median for numeric columns
    # --> get mode for categorical columns
    # df = df.fillna(value=impute_dict)
    
    impute_dict = {}
    
    if mynum:
        dict_num = dict(df[mynum].mean().round(2))
        impute_dict = {**impute_dict, **dict_num}
        
    # get mode for categorical columns 
    if mycat:
        dict_cat = df[mycat].mode().iloc[[0]].to_dict(orient='records')[0]
        impute_dict = {**impute_dict, **dict_cat}
    
    return impute_dict



def woe_monotonic(df, mynum):
    # PyWoE
    
    return None



def createBins(df, mynum):
    
    
    return None



def highCorrelations(df, num_cols, target_variable, cutoff=0.95, verbose=False):
    # Remove variables with high pairwise correlations 
    # and keep columns with highest correlation to dependent variable
    # --> use the function after converting all cat to num
    
    x_num = list[set(num_cols).intersection(set(df.select_dtypes(exclude=['object']).columns.values))]
    df_cor = df[x_num].corr() 
    item_remove = df_cor.columns[df_cor.isna().any()].tolist() # check missing
    df_cor = df_cor.drop(columns=item_remove, index=item_remove)
    
    # Remove the lower triangle, including diagonals as this is symmetric
    df_cor = df_cor.where(np.triu(np.ones(df_cor.shape)).astype(np.bool))
    if verbose:
        print('Completed calculation of all independent variable correlations.')
        
    # Grab the rows that are above the cutoff
    df_cut_cor = df_cor[df_cor>cutoff][df_cor<1].dropna(axis=1, how='all').dropna(axis=0, how='all')
    row_names = list(df_cut_cor.index)
    combs_above_cutoff = []
    for i, j in enumerate(df_cut_cor):
        combs_above_cutoff.append([row_names[i], j])
    if verbose:
        print('Completed applying cutoff.')
    
    # Absolute correlation with dependent variable for all variables in combs
    combs_vars = list(set(sum(combs_above_cutoff, [])))
    dep_var_corr = [df[cv].corr(df[target_variable]) for cv in combs_vars]
    dep_var_corr_dict = dict(zip(combs_vars, dep_var_corr))
    if verbose:
        print('Completed calculation of all correlations.')
    
    # Pick the var have higher dep_var_corr in combs
    var_to_keep = []
    var_to_drop = []
    for cv in combs_above_cutoff:
        cv.sort(key=dep_var_corr_dict.get, reverse=True)
        var_to_keep.append(cv[0])
        var_to_drop.append(cv[1])
    
    var_to_keep = list(set(var_to_keep))
    var_to_drop = list(set(var_to_drop))
    
    return var_to_keep, var_to_drop



def cleanColumnNames(df):
    # Replace all non-alphanumeric characters with underscore
    df.columns = df.columns.str.replace('[^0-9a-zA-Z]+','_')
    return df


Create the number of clicks by different combination of cats

In [322]:
test = df_train[1:1000]
target_var = 'is_attributed'

test, test_combo_num = clicks_creation(test, target_var)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [323]:
print(test.shape)
test.head()

(999, 520)


Unnamed: 0,ip,app,device,os,channel,is_attributed,clk_date,clk_hour,clk_month,clk_dayofweek,...,clk_by_channel_ip,clk_by_channel_ip_os,clk_by_channel_os,clk_by_clk_month,clk_by_clk_month_ip,clk_by_clk_month_ip_os,clk_by_clk_month_os,clk_by_ip,clk_by_ip_os,clk_by_os
0,17357,3,1,19,379,0,6,14,11,0,...,2,1,132,999,2,1,281,2,1,281
1,35810,3,1,13,379,0,6,14,11,0,...,1,1,149,999,1,1,281,1,1,281
2,45745,14,1,13,478,0,6,14,11,0,...,1,1,4,999,3,3,281,3,3,281
3,161007,3,1,13,379,0,6,14,11,0,...,1,1,149,999,1,1,281,1,1,281
4,18787,3,1,16,379,0,6,14,11,0,...,2,1,7,999,2,1,11,2,1,11


Encode categorical variables to dummies

In [324]:
target_var = ['is_attributed' ]
mycat = list(set(test.columns)-set(target_var + test_combo_num))
mycat

['device',
 'clk_hour',
 'clk_date',
 'app',
 'clk_dayofweek',
 'clk_month',
 'channel',
 'ip',
 'os']

In [325]:
print(test.shape)
test = denseDummies(test, mycat)
test.shape

(999, 520)


(999, 559)

In [326]:
del df_cat, df_dummies, df_num, df_out
gc.collect()

733

#### Dimension Reduction

Statistics:
* Correlation among variables

Unsupervised Methods:
* PCA: converts our original variables to a new set of variables, which are a linear combination of the original set of variables. 
* SVD
* LDA

Supervised Methods:
* Boostrapping

In [327]:
from sklearn.decomposition import PCA, TruncatedSVD

In [None]:
pca = PCA(n_components=100, withen=True)
x_pca = pca.fit(df_x).transform(df_x)

In [None]:
svd = TruncatedSVD(n_components=100)
x_svd = svd.fit(df_x).transform(df_x)

#### Split into Train and Test 

In [None]:
# Create binary training and validation files for XGBoost
x1, y1 = X[:train_size], y.iloc[:train_size]
dm1 = xgb.DMatrix(x1, y1, feature_names=feature_names)
dm1.save_binary('train.bin')
del dm1, x1, y1
gc.collect()

x2, y2 = X[train_size:], y.iloc[train_size:]
dm2 = xgb.DMatrix(x2, y2, feature_names=feature_names)
dm2.save_binary('validate.bin')
del dm2, x2, y2
del X, y, train_sparse
gc.collect()

#### Baseline with Logistic Regression

In [328]:
from sklearn.linear_model import LogisticRegression 

In [None]:
lgreg = LogisticRegression()
lgreg.fit(x_train, y_train)

In [None]:
lgreg.score(x_test, y_test)

#### Modeling with XGBOOST

With cross-validation

In [None]:
params = {
    'eta': 0.3,
    'tree_method': "hist",
    'grow_policy': "lossguide",
    'max_leaves': 1000,  
    'max_depth': 0, 
    'subsample': 0.9, 
    'alpha':1,
    'objective': 'binary:logistic', 
    'scale_pos_weight':100,
    'eval_metric': 'auc', 
    'nthread':4,
    'silent': 1
}