In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score, precision_recall_curve, precision_score, roc_curve
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.preprocessing import QuantileTransformer, LabelEncoder, OneHotEncoder
from sklearn.model_selection import cross_val_predict
from lightgbm import LGBMClassifier
from tqdm import tnrange, tqdm
from scipy.stats import kurtosis, skew
import gc
import argparse
import matplotlib.pyplot as plt
import seaborn as sns
import catboost
import warnings
warnings.filterwarnings('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
seed = 11
data_dir = '../input/'
np.random.seed(seed)

In [3]:
def train_model(data_, data_val_, y_, y_val, features,early_stopping_rounds,categorical_features, **clf_kwargs):

    oof_preds = np.zeros(data_val_.shape[0])
    n_fold = 0
    
    feature_importance_df = pd.DataFrame()
    
    feats = features
    
#     print(feats)
#     print([feats[x] for x in categorical_features_indices])
    trn_x, trn_y = data_[feats], y_
    val_x, val_y = data_val_[feats], y_val
    clf = catboost.CatBoostClassifier(**clf_kwargs)
#     trn_x = trn_x.T.drop_duplicates().T
#     val_x = val_x.T.drop_duplicates().T
#     print(trn_x.columns)
#     print([trn_x.columns[x] for x in categorical_features])
    categorical_features_indices = [i for i, x in enumerate(trn_x.columns) if x in categorical_features]
    clf.fit(trn_x, trn_y, 
            eval_set= [(trn_x, trn_y), (val_x, val_y)], 
            use_best_model=True,verbose=100, early_stopping_rounds=early_stopping_rounds,
            cat_features=categorical_features_indices#30
           )

    oof_preds = clf.predict_proba(val_x)[:, 1]
        
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds)))
    del trn_x, trn_y, val_x, val_y
    gc.collect()
        
    return oof_preds, clf


def display_importances(feature_importance_df_):
    # Plot feature importances
    cols = feature_importance_df_[["feature", "importance"]].groupby("feature").mean().sort_values(
        by="importance", ascending=False)[:50].index
    
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]
    
    plt.figure(figsize=(8,10))
    sns.barplot(x="importance", y="feature", 
                data=best_features.sort_values(by="importance", ascending=False))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.savefig('lgbm_importances.png')

def display_roc_curve(y_, oof_preds_, folds_idx_):
    # Plot ROC curves
    plt.figure(figsize=(6,6))
    scores = [] 
    for n_fold, (_, val_idx) in enumerate(folds_idx_):  
        # Plot the roc curve
        fpr, tpr, thresholds = roc_curve(y_.iloc[val_idx], oof_preds_[val_idx])
        score = roc_auc_score(y_.iloc[val_idx], oof_preds_[val_idx])
        scores.append(score)
        plt.plot(fpr, tpr, lw=1, alpha=0.3, label='ROC fold %d (AUC = %0.4f)' % (n_fold + 1, score))
    
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Luck', alpha=.8)
    fpr, tpr, thresholds = roc_curve(y_, oof_preds_)
    score = roc_auc_score(y_, oof_preds_)
    plt.plot(fpr, tpr, color='b',
             label='Avg ROC (AUC = %0.4f $\pm$ %0.4f)' % (score, np.std(scores)),
             lw=2, alpha=.8)
    
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('LightGBM ROC Curve')
    plt.legend(loc="lower right")
    plt.tight_layout()
    
    plt.savefig('roc_curve.png')

def feature_loader_helper(df_features):
    df_features.fillna(0, inplace=True)
#     for col in df_features.columns:
#         if 'server_date_<lambda_0>' in col:
#             df_features[col] = df_features[col].apply(lambda x: str(x).split()[0]).astype(np.float16)
    return df_features

def load_features(features_dir= '../output/', days_from = [-1], days_to = [-8], weights=[]):
    train_features, val_features, test_features = None, None, None
    feats = []
    for id, (i, j) in enumerate(zip(days_from, days_to)):
        print('Loading {} and {}'.format(i, j))
        if train_features is None:
            train_features = pd.read_csv(os.path.join(features_dir
                                                  , 'train_features_{}_{}.csv'.format(i, j)))
            train_features = feature_loader_helper(train_features)
        else:
            df_features = pd.read_csv(os.path.join(features_dir
                                                  , 'train_features_{}_{}.csv'.format(i, j)))

            df_features = feature_loader_helper(df_features)
            if len(weights) == 0:
                train_features = train_features.join(df_features. \
                                                 set_index('impression_id')
                                                 , on = 'impression_id'
                                                 , how='left'
                                                 , rsuffix = '{}_{}'.format(i, j))
            else:
                if len(feats) == 0:
                    feats = [c for c in train_features.columns if c != 'impression_id']
                train_features[feats] = train_features[feats] + df_features[feats]*weights[id]/weights[0]

        if val_features is None:
            val_features = pd.read_csv(os.path.join(features_dir
                                                  , 'val_features_{}_{}.csv'.format(i, j)))
            
            val_features = feature_loader_helper(val_features)

        else:
            df_features = pd.read_csv(os.path.join(features_dir
                                                  , 'val_features_{}_{}.csv'.format(i, j)))

            df_features = feature_loader_helper(df_features)
            if len(weights) == 0:
                val_features = val_features.join(df_features. \
                                             set_index('impression_id')
                                             , on = 'impression_id'
                                             , how='left'
                                             , rsuffix = '{}_{}'.format(i, j))
            else:
                val_features[feats] = val_features[feats] + df_features[feats]*weights[id]/weights[0]

        if test_features is None:
            test_features = pd.read_csv(os.path.join(features_dir
                                                  , 'test_features_{}_{}.csv'.format(i, j)))

            test_features = feature_loader_helper(test_features)                                    
        else:
            df_features = pd.read_csv(os.path.join(features_dir
                                                  , 'test_features_{}_{}.csv'.format(i, j)))

            df_features = feature_loader_helper(df_features)
            if len(weights) == 0:
                test_features = test_features.join(df_features. \
                                              set_index('impression_id')
                                             , on = 'impression_id'
                                             , how='left'
                                             , rsuffix = '{}_{}'.format(i, j))
            else:
                test_features[feats] = test_features[feats] + df_features[feats]*weights[id]/weights[0]
    
    if len(weights) > 0:
        train_features[feats] = train_features[feats]*weights[0]
        test_features[feats] = test_features[feats]*weights[0]
        val_features[feats] = val_features[feats]*weights[0]

    return train_features, val_features, test_features

In [4]:
def load_extra_feats(df):
    
    df_unq_users_app = pd.read_csv('../new_output/unique_users_per_app.csv')
    df = df.join(df_unq_users_app.set_index('app_code'), on = 'app_code', how='left')
    df['unique_users_per_app'] = np.log1p(df['unique_users_per_app'])
    
    df_recent_items = pd.read_csv('../new_output/recent_items.csv')
    df = df.join(df_recent_items.set_index('impression_id'), on='impression_id', how='left', rsuffix = '_recent')
    
    df['number_of_uq_items_per_session'] = df['item_id_nunique'].divide(df['session_id_nunique'])
    
    return df

In [5]:
df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

df_train['impression_time'] = pd.to_datetime(df_train['impression_time'])
df_test['impression_time'] = pd.to_datetime(df_test['impression_time'])

df_train['impression_date'] = df_train['impression_time'].dt.date.astype(str)
df_test['impression_date'] = df_test['impression_time'].dt.date.astype(str)

df_val = df_train.loc[df_train['impression_date'] > '2018-12-05'].reset_index(drop=True)
df_train = df_train.loc[df_train['impression_date'] < '2018-12-06'].reset_index(drop=True)

train_features, val_features, test_features = load_features(features_dir= '../new_output/'
, days_from = [-1,  -8, -15, -22, -1]
, days_to = [-8,  -15, -22, -29, -35]
, weights = [])

df_train = df_train.join(train_features.set_index('impression_id'), on='impression_id', how='left')
df_val = df_val.join(val_features.set_index('impression_id'), on='impression_id', how='left')
df_test = df_test.join(test_features.set_index('impression_id'), on='impression_id', how='left')

Loading -1 and -8
Loading -8 and -15
Loading -15 and -22
Loading -22 and -29
Loading -1 and -35


In [6]:
for col in df_train.columns:
    if 'count' in col:
        df_train[col] = np.log1p(1+df_train[col])
        df_val[col] = np.log1p(1+df_val[col])
        df_test[col] = np.log1p(1+df_test[col])

In [7]:
server_time_max = [c for c in df_train.columns if 'server_time_max' in c]
for s in server_time_max:
    df_train[s] = pd.to_datetime(df_train[s])
    df_train['days_difference_{}'.format(s)] = (df_train['impression_time'] - df_train[s]).apply(lambda x: x.value/10**12)
    
    df_val[s] = pd.to_datetime(df_val[s])
    df_val['days_difference_{}'.format(s)] = (df_val['impression_time'] - df_val[s]).apply(lambda x: x.value/10**12)
    
    df_test[s] = pd.to_datetime(df_test[s])
    df_test['days_difference_{}'.format(s)] = (df_test['impression_time'] - df_test[s]).apply(lambda x: x.value/10**12)

In [8]:
import numba
from numba import prange
from collections import defaultdict

In [9]:
@numba.jit
def get_splits(a):
    m = np.concatenate([[True], a[1:] != a[:-1], [True]])
    m = np.flatnonzero(m)
    return m


@numba.jit
def get_expanding_count(user, time, col=None):
    '''
    user: user_id
    time: DateTime col (converted to int)
    col: attribute col (product/webpage)
    '''
    out = np.zeros((len(user), ))
    if col is not None:
        col_unq = np.unique(col)
        for col_val in col_unq:
            col_val_idx = np.where(col == col_val)[0]
            col_user = user[col_val_idx]
            col_time = time[col_val_idx]
            col_out = np.zeros((len(col_val_idx), ))
            m = get_splits(col_user)
            n = len(m) -1
            for i in range(n):
                j = m[i]
                k = m[i+1]
                sub_time = col_time[j:k]
                oo = get_splits(sub_time)
                pp = len(oo) - 1
                for ii in range(pp):
                    col_out[j+oo[ii]:j+oo[ii+1]] = oo[ii] + 1
            out[col_val_idx] = col_out[:]
            
    else:
        m = get_splits(user)
        n = len(m) -1
        for i in range(n):
            j = m[i]
            k = m[i+1]
            sub_time = time[j:k]
            oo = get_splits(sub_time)
            pp = len(oo) - 1
            for ii in range(pp):
                out[j+oo[ii]:j+oo[ii+1]] = oo[ii] + 1
    return np.log1p(out)

In [10]:
@numba.jit
def get_prev_view(user, time, col=None):
    '''
    user: user_id
    time: DateTime col (converted to int)
    col: attribute col (product/webpage)
    '''
    out = -1*np.ones((len(user), ))
    if col is not None:
        col_unq = np.unique(col)
        for col_val in col_unq:
            col_val_idx = np.where(col == col_val)[0]
            col_user = user[col_val_idx]
            col_time = time[col_val_idx]
            col_out = -1*np.ones((len(col_val_idx), ))
            m = get_splits(col_user)
            n = len(m) -1
            for i in range(n):
                j = m[i]
                k = m[i+1]
                sub_time = col_time[j:k]
                oo = get_splits(sub_time)
                pp = len(oo) - 1
                if pp == 0:
                    col_out[j] = 0
                else:
                    for ii in range(1, pp):
                        col_out[j+oo[ii]:j+oo[ii+1]] = np.log1p(col_time[j+oo[ii]] - col_time[j+oo[ii-1]])
            out[col_val_idx] = col_out[:]
            
    else:
        m = get_splits(user)
        n = len(m) -1
        for i in range(n):
            j = m[i]
            k = m[i+1]
            sub_time = time[j:k]
            oo = get_splits(sub_time)
            pp = len(oo) - 1
            if pp == 0:
                out[j] = -1
            else:
                for ii in range(1, pp):
                    out[j+oo[ii]:j+oo[ii+1]] = np.log1p(time[j+oo[ii]] - time[j+oo[ii-1]])
    return out



In [11]:
@numba.jit
def get_click_counts(user, time, click, col=None):
    '''
    user: user_id
    time: DateTime col (converted to int)
    click: Click column
    col: col (product/webpage)
    '''
    out = np.zeros((len(user), ))
    if col is not None:
        col_unq = np.unique(col)
        for col_val in col_unq:
            col_val_idx = np.where(col == col_val)[0]
            col_user = user[col_val_idx]
            col_time = time[col_val_idx]
            col_click = click[col_val_idx]
            col_out = np.zeros((len(col_val_idx), ))
            m = get_splits(col_user)
            n = len(m) -1
            for i in range(n):
                cnt = 0
                j = m[i]
                k = m[i+1]
                sub_time = col_time[j:k]
                oo = get_splits(sub_time)
                pp = len(oo) - 1
                for ii in range(pp):
                    col_out[j+oo[ii]:j+oo[ii+1]] = cnt
                    cnt += np.sum(col_click[j+oo[ii]:j+oo[ii+1]])
            out[col_val_idx] = col_out[:]
            
    else:
        m = get_splits(user)
        n = len(m) -1
        for i in range(n):
            cnt = 0
            j = m[i]
            k = m[i+1]
            sub_time = time[j:k]
            oo = get_splits(sub_time)
            pp = len(oo) - 1
            for ii in range(pp):
                out[j+oo[ii]:j+oo[ii+1]] = cnt
                cnt += np.sum(click[j+oo[ii]:j+oo[ii+1]])
    return np.log1p(out)

In [12]:
import pandas as pd
import numpy as np 
from collections import Counter
from sklearn.base import BaseEstimator, TransformerMixin

        
class TargetEncoderWithThresh(BaseEstimator, TransformerMixin):
    """
    A utlity class to help encode categorical variables using different methods.
    
    Inputs:
    cols: (List or str) Can be either a string or list of strings with column names
    targetcol: (str) Target column to encode column/group of columns with
    thresh: (int) Minimum count of grouping to encode (Acts as smoothing). Currently not implemented TODO
    func: (str or callable) Function to be applied on column/ group of columns to encode. 
          If str is provided, it should be a attribute of pandas Series
    cname: (str) Column name for new string
    func_kwargs: (dict) Additional arguments to be passed to function 
    add_to_orig: (bool) Whether to return dataframe with added feature or just the feature as series
    
    Output:
    pandas DataFrame/Series
    
    """
    def __init__(self, cols=None, targetcol=None, cname=None, thresh=0, func=np.mean,  add_to_orig=False,
                 func_kwargs={}, use_prior=False, alpha=0.5):                                                 
        self.cols = cols #Can be either a string or list of strings with column names
        self.targetcol = targetcol #Target column to encode column/group of columns with
        self.thresh = thresh  #Minimum count of grouping to encode (Acts as smoothing)
        self.func = func #Function to be applied on column/ group of columns to encode 
        self.add_to_orig = add_to_orig #Whether return a dataframe with added feature or just a series of feature
        self.cname = cname #Column to new feature generated
        self.func_kwargs = func_kwargs  #Additional key word arguments to be applied to func
        self.alpha = alpha #smoothing factor
        self.use_prior = use_prior
                
    def fit(self, X, y=None):
            
        if isinstance(self.func, str):
            if hasattr(pd.Series, self.func):
                #print("here")
                vals = getattr(X.groupby(self.cols)[self.targetcol], self.func)
                self.dictmap = vals(**self.func_kwargs)
                prior = getattr(X[self.targetcol], self.func)(**self.func_kwargs)
                
        else:
            self.dictmap = X.groupby(self.cols)[self.targetcol].apply(lambda x: self.func(x, **self.func_kwargs))
            prior = X[[self.targetcol]].apply(lambda x: self.func(x, **self.func_kwargs)).values[0]
        self.counts = Counter(zip(*[X[col].tolist() for col in self.cols]))
        if len(self.cols) == 1:
            counts_greater_than_thresh = [k[0] for k, v in self.counts.items() if v >= self.thresh ]
        else:
            counts_greater_than_thresh = [k for k, v in self.counts.items() if v >= self.thresh ]
        
        
        self.dictmap = self.dictmap.loc[self.dictmap.index.isin(counts_greater_than_thresh)]
        if self.use_prior:
            self.dictmap = {k: ((self.counts[k] * v + prior * self.alpha)/(self.counts[k] + self.alpha))
                            for k, v in self.dictmap.items()}
            self.dictmap = pd.Series(self.dictmap)
            self.dictmap.index.names = self.cols
        
        if self.cname:
            self.dictmap.name = self.cname
        else:
            cname = ''
            cname = [cname + '_' +str(col) for col in self.cols]
            self.cname = '_'.join(cname) + "_" + str(self.func)
            self.dictmap.name = self.cname
            
        return self
    
    def transform(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            X_transformed = X[self.cols]
            
            X_transformed = X_transformed.join(self.dictmap, on=self.cols, how='left')[self.cname]

            if self.add_to_orig:
                return pd.concat([X, X_transformed], axis=1, copy=False)
            else:
                return X_transformed.values

        else:
            raise TypeError("Input should be a pandas DataFrame")


In [13]:
@numba.jit
def get_prev_click(user, time, click, col=None):
    '''
    user: user_id
    time: DateTime col (converted to int)
    click: Click column
    col: col (product/webpage)
    '''
    out = -1* np.ones((len(user), ))
    if col is not None:
        col_unq = np.unique(col)
        for col_val in col_unq:
            col_val_idx = np.where(col == col_val)[0]
            col_user = user[col_val_idx]
            col_time = time[col_val_idx]
            col_click = click[col_val_idx]
            col_out = -1 * np.ones((len(col_val_idx), ))
            m = get_splits(col_user)
            n = len(m) -1
            for i in range(n):
                prev_time =-1
                j = m[i]
                k = m[i+1]
                sub_time = col_time[j:k]
                oo = get_splits(sub_time)
                pp = len(oo) - 1
                for ii in range(pp):
                    if prev_time != -1:
                        col_out[j+oo[ii]:j+oo[ii+1]] = np.log1p(col_time[j+oo[ii]] - prev_time)
                    if np.sum(col_click[j+oo[ii]:j+oo[ii+1]]) >= 1:
                        prev_time = col_time[j+oo[ii]] 
            out[col_val_idx] = col_out[:]
            
    else:
        m = get_splits(user)
        n = len(m) -1
        for i in range(n):
            prev_time = -1
            j = m[i]
            k = m[i+1]
            sub_time = time[j:k]
            oo = get_splits(sub_time)
            pp = len(oo) - 1
            for ii in range(pp):
                if prev_time != -1:
                    out[j+oo[ii]:j+oo[ii+1]] = np.log1p(time[j+oo[ii]] - prev_time)
                if np.sum(click[j+oo[ii]:j+oo[ii+1]]) >= 1:
                    prev_time = time[j+oo[ii]]
    return out

In [14]:
def woe(X, y, cont=True):
    try:
        tmp = pd.DataFrame()
        tmp["variable"] = X
        if cont:
            tmp["variable"] = pd.qcut(tmp["variable"], 255, duplicates="drop")
        tmp["target"] = y
        var_counts = tmp.groupby("variable")["target"].count()
        var_events = tmp.groupby("variable")["target"].sum()
        var_nonevents = var_counts - var_events
        tmp["var_counts"] = tmp.variable.map(var_counts)
        tmp["var_events"] = tmp.variable.map(var_events)
        tmp["var_nonevents"] = tmp.variable.map(var_nonevents)
        events = sum(tmp["target"] == 1)
        nonevents = sum(tmp["target"] == 0)
        div_series = (tmp["var_nonevents"].astype(float)/nonevents).divide(tmp["var_events"].astype(float)/events)
        tmp["woe"] = np.log(div_series)
        tmp["woe"] = tmp["woe"].replace(np.inf, 0).replace(-np.inf, 0)
        tmp["iv"] = (tmp["var_nonevents"].astype(float)/nonevents - \
                     tmp["var_events"].astype(float)/events) * tmp["woe"].astype(float)
        iv = tmp.groupby("variable")["iv"].last().sum()
        return tmp["woe"], tmp["iv"], iv
    except:
        return 0

In [15]:
def get_view_feats(df):
    df = df.copy()
    users = df["user_id"].values
    times = df["impression_time"].astype(int).values/10**12
    app_code = df["app_code"].values
    os_version = df["os_version"].values
    is_4G = df["is_4G"].values
    df["prev_view"] = get_prev_view(users, times)
    
    df["prev_app_view"] = get_prev_view(users, times, app_code)
    df["prev_os_view"] = get_prev_view(users, times, os_version)
    
    df["prev_4G_view"] = get_prev_view(users, times, is_4G)
    return df

In [16]:
def get_count_feats(df):
    df = df.copy()
    users = df["user_id"].values
    times = df["impression_time"].astype(int).values/10**12
    app_code = df["app_code"].values
    os_version = df["os_version"].values
    is_4G = df["is_4G"].values
    
    df["view_counts"] = get_expanding_count(users, times)
    df["user_app_counts"] = get_expanding_count(users, times, app_code)
    df["user_os_counts"] = get_expanding_count(users, times, os_version)
    df["user_4g_counts"] = get_expanding_count(users, times, is_4G)

    return df


In [17]:
def get_click_feats(tr, val):
    tr = tr.copy()
    uids = tr["user_id"].values
    app_code = tr["app_code"].values
    os_version = tr["os_version"].values
    datetime = tr["impression_time"].astype(int).values/10**12
    clicks = tr["is_click"].values
    
    tr["prev_click"] = get_prev_click(uids, datetime, clicks)
    tr["prev_app_click"] = get_prev_click(uids, datetime, clicks, app_code)
    tr["prev_os_click"] = get_prev_click(uids, datetime, clicks, os_version)
    
    tr["click_counts"] = get_click_counts(uids, datetime, clicks)
    tr["app_click_counts"] = get_click_counts(uids, datetime, clicks, app_code)
    tr["os_click_counts"] = get_click_counts(uids, datetime, clicks, os_version)
    
    tr_clicks = tr.loc[tr.is_click == 1].groupby("user_id")["impression_time"].max()
    val["prev_click_time"] = val.user_id.map(tr_clicks)
    val["prev_click"] = np.log1p((val["impression_time"] - val["prev_click_time"]).astype(int)/10**12)
    val["prev_click"] = val["prev_click"].fillna(-1)
    del val["prev_click_time"]
    
    tr_clicks = tr.loc[tr.is_click == 1].groupby(["user_id", "app_code"])["impression_time"].max()
    tr_clicks.name = "prev_app_click_time"
    val = val.join(tr_clicks, on=["user_id", "app_code"], how="left")
    val["prev_app_click"] = np.log1p((val["impression_time"] - val["prev_app_click_time"]).astype(int)/10**12)
    val["prev_app_click"] = val["prev_app_click"].fillna(-1)
    del val["prev_app_click_time"]
                   
    tr_clicks = tr.loc[tr.is_click == 1].groupby(["user_id", "os_version"])["impression_time"].max()
    tr_clicks.name = "prev_os_click_time"
    val = val.join(tr_clicks, on=["user_id", "os_version"], how="left")
    val["prev_os_click"] = np.log1p((val["impression_time"] - val["prev_os_click_time"]).astype(int)/10**12)
    val["prev_os_click"] = val["prev_os_click"].fillna(-1)
    del val["prev_os_click_time"]
    
    val["click_counts"] = val["user_id"].map(np.log1p(tr.groupby("user_id")["is_click"].sum())).fillna(0)
    
    tmp = np.log1p(tr.groupby(["user_id", "app_code"])["is_click"].sum())
    tmp.name = "app_click_counts"
    val = val.join(tmp, on=["user_id", "app_code"], how="left").fillna(0)
    
    tmp = np.log1p(tr.groupby(["user_id", "os_version"])["is_click"].sum())
    tmp.name = "os_click_counts"
    val = val.join(tmp, on=["user_id", "os_version"], how="left").fillna(0)
    
    
    return tr, val

In [18]:
df_train['flag1'] = 'train'
df_val['flag1'] = 'val'
df_test['flag1'] = 'test'

df_test['is_click'] = 0

df_data_all = pd.concat([df_train, df_val, df_test], axis = 0)

In [19]:
df_data_all = load_extra_feats(df_data_all)

In [20]:
df_data_all.head()

Unnamed: 0,app_code,category_1_-1_sum,category_1_-1_sum-15_-22,category_1_-1_sum-1_-35,category_1_-1_sum-22_-29,category_1_-1_sum-8_-15,category_1_0_sum,category_1_0_sum-15_-22,category_1_0_sum-1_-35,category_1_0_sum-22_-29,...,session_id,user_id_recent,item_id,item_price,category_1,category_2,category_3,product_type,session_time,number_of_uq_items_per_session
0,422,,,,,,,,,,...,190710.0,87862.0,43886.0,2350.0,11.0,35.0,20.0,5622.0,0.0,
1,467,,,,,,,,,,...,343558.0,63410.0,43209.0,3421.0,4.0,74.0,292.0,577.0,3.0,
2,259,,,,,,,,,,...,658178.0,71748.0,122348.0,973.0,13.0,67.0,170.0,2874.0,0.0,
3,244,0.0,,0.0,,,0.0,,0.0,,...,358781.0,69209.0,1528.0,7320.0,9.0,44.0,114.0,3528.0,17.0,1.0
4,473,0.0,,0.0,0.0,,0.0,,0.0,0.0,...,809094.0,62873.0,4069.0,4556.0,7.0,24.0,100.0,9215.0,0.0,1.666667


In [21]:
df_train.sort_values(by=["user_id", "impression_time"], inplace=True)
df_data_all.sort_values(by=["user_id", "impression_time"], inplace=True)

In [22]:
df_train = get_view_feats(df_train)
df_data_all = get_view_feats(df_data_all)

In [23]:
df_train = get_count_feats(df_train)
df_data_all = get_count_feats(df_data_all)

In [24]:
df_train["hour"] = df_train["impression_time"].dt.hour
df_train["dayofweek"] = df_train["impression_time"].dt.dayofweek

df_data_all["hour"] = df_data_all["impression_time"].dt.hour
df_data_all["dayofweek"] = df_data_all["impression_time"].dt.dayofweek

In [25]:
def get_overall_count_feats(df):
    df = df.copy()
    enc = TargetEncoderWithThresh(cols=["user_id"], targetcol="is_click", func='count')
    df["all_counts"] = np.log1p(enc.fit_transform(df))
    
    enc = TargetEncoderWithThresh(cols=["app_code"], targetcol="is_click", func='count')
    df["app_counts"] = np.log1p(enc.fit_transform(df))

    enc = TargetEncoderWithThresh(cols=["os_version"], targetcol="is_click", func='count')
    df["os_counts"] = np.log1p(enc.fit_transform(df))

    enc = TargetEncoderWithThresh(cols=["user_id", "app_code"], targetcol="is_click", func='count')
    df["all_usr_app_counts"] = np.log1p(enc.fit_transform(df))

    enc = TargetEncoderWithThresh(cols=["user_id", "os_version"], targetcol="is_click", func='count')
    df["all_usr_os_counts"] = np.log1p(enc.fit_transform(df))

    enc = TargetEncoderWithThresh(cols=["user_id", "app_code", "os_version"], targetcol="is_click", func='count')
    df["all_usr_app_os_counts"] = np.log1p(enc.fit_transform(df))
    
    enc = TargetEncoderWithThresh(cols=["user_id", "app_code", "os_version", 'impression_date'], targetcol="is_click", func='count')
    df["all_usr_app_os_date_counts"] = np.log1p(enc.fit_transform(df))
    
    enc = TargetEncoderWithThresh(cols=["user_id", "hour"], targetcol="is_click", func='count')
    df["usr_hour_counts"] = np.log1p(enc.fit_transform(df))

    enc = TargetEncoderWithThresh(cols=["user_id", "impression_date"], targetcol="is_click", func='count')
    df["usr_date_counts"] = np.log1p(enc.fit_transform(df))

    enc = TargetEncoderWithThresh(cols=["user_id", "app_code", "impression_date"], targetcol="is_click", func='count')
    df["usr_app_date_counts"] = np.log1p(enc.fit_transform(df))
    
    enc = TargetEncoderWithThresh(cols=["user_id"], targetcol="app_code", func='nunique')
    df["usr_nunq_app"] = enc.fit_transform(df)
    
    enc = TargetEncoderWithThresh(cols=["user_id", "impression_date"], targetcol="app_code", func='nunique')
    df["usr_date_nunq_app"] = enc.fit_transform(df)
    
    enc = TargetEncoderWithThresh(cols=["item_id"], targetcol="app_code", func='count')
    df["recent_item_id_cnt"] = enc.fit_transform(df)
    
    enc = TargetEncoderWithThresh(cols=["category_2"], targetcol="app_code", func='count')
    df["recent_category_2_cnt"] = enc.fit_transform(df)
    
    enc = TargetEncoderWithThresh(cols=["category_3"], targetcol="app_code", func='count')
    df["recent_category_3_cnt"] = enc.fit_transform(df)
    
    enc = TargetEncoderWithThresh(cols=["product_type"], targetcol="app_code", func='count')
    df["recent_prod_type_cnt"] = enc.fit_transform(df)
        
    return df

In [26]:
import math
def entropy2(labels, base=None):
    """ Computes entropy of label distribution. """

    n_labels = len(labels)

    if n_labels <= 1:
        return 0

    value,counts = np.unique(labels, return_counts=True)
    probs = counts / n_labels
    n_classes = np.count_nonzero(probs)

    if n_classes <= 1:
        return 0

    ent = 0.

    # Compute entropy
    base = math.e if base is None else base
    for i in probs:
        ent -= i * math.log(i, base)

    return ent

In [27]:
def get_target_encoding(tr, val, y_tr):
    cvlist2 = list(StratifiedKFold(10, shuffle=True, random_state=12345786).split(tr, y_tr))
    
    print("Likelihood encoding app_code user_id")
    enc = TargetEncoderWithThresh(cols=["user_id", "app_code"], targetcol="is_click", use_prior=True, func='mean')
    tr["user_app_tmean"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["user_app_tmean"] = enc.fit(tr).transform(val)
    val["user_app_tmean"].fillna(tr['is_click'].mean())

    print("Likelihood encoding user_id hour")
    enc = TargetEncoderWithThresh(cols=["user_id", "hour"], targetcol="is_click", use_prior=True, func='mean')
    tr["user_hour_tmean"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["user_hour_tmean"] = enc.fit(tr).transform(val)
    val["user_hour_tmean"].fillna(tr['is_click'].mean())

    print("Likelihood encoding user_id dow")
    enc = TargetEncoderWithThresh(cols=["user_id", "dayofweek"], targetcol="is_click", use_prior=True, func='mean')
    tr["user_dow_tmean"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["user_dow_tmean"] = enc.fit(tr).transform(val)
    val["user_dow_tmean"].fillna(tr['is_click'].mean())
    
    print("Likelihood encoding user_id os_version")
    enc = TargetEncoderWithThresh(cols=["user_id", "os_version"], targetcol="is_click", use_prior=True, func='mean')
    tr["user_os_tmean"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["user_os_tmean"] = enc.fit(tr).transform(val)
    val["user_os_tmean"].fillna(tr['is_click'].mean())
    
    print("Likelihood encoding app_code user_id os_version")
    enc = TargetEncoderWithThresh(cols=["user_id", "app_code", 'os_version'], targetcol="is_click", use_prior=True, func='mean')
    tr["user_app_os_tmean"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["user_app_os_tmean"] = enc.fit(tr).transform(val)
    val["user_app_os_tmean"].fillna(tr['is_click'].mean())
    
    print("Likelihood encoding os_version app_os_entropy")
    enc = TargetEncoderWithThresh(cols=["app_code", "os_version"], targetcol="user_id", use_prior=True, func=entropy2)
    tr["app_os_entropy"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["app_os_entropy"] = enc.fit(tr).transform(val)
    
    print("Likelihood encoding app_entropy")
    enc = TargetEncoderWithThresh(cols=["app_code"], targetcol="user_id", use_prior=True, func=entropy2)
    tr["app_entropy"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["app_entropy"] = enc.fit(tr).transform(val)

    print("Likelihood encoding app_hour_entropy")
    enc = TargetEncoderWithThresh(cols=["app_code", 'hour'], targetcol="user_id", use_prior=True, func=entropy2)
    tr["app_hour_entropy"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["app_hour_entropy"] = enc.fit(tr).transform(val)

    print("Likelihood encoding user id")
    enc = TargetEncoderWithThresh(cols=["user_id"], targetcol="is_click", use_prior=True, func='mean', alpha=5)
    tr["usr_tmean"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["usr_tmean"] = enc.fit(tr).transform(val)
    val["usr_tmean"].fillna(tr['is_click'].mean())
    
    print("Likelihood encoding app_code")
    enc = TargetEncoderWithThresh(cols=["app_code"], targetcol="is_click", use_prior=True, func='mean', alpha=5)
    tr["app_tmean"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["app_tmean"] = enc.fit(tr).transform(val)
    val["app_tmean"].fillna(tr['is_click'].mean())
    
    print("Likelihood encoding recent item_id")
    enc = TargetEncoderWithThresh(cols=["item_id"], targetcol="is_click", use_prior=True, func='mean', alpha=5)
    tr["item_id_tmean"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["item_id_tmean"] = enc.fit(tr).transform(val)
    val["item_id_tmean"].fillna(tr['is_click'].mean())

    print("Likelihood encoding recent product")
    enc = TargetEncoderWithThresh(cols=["product_type"], targetcol="is_click", use_prior=True, func='mean', alpha=5)
    tr["prod_tmean"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["prod_tmean"] = enc.fit(tr).transform(val)
    val["prod_tmean"].fillna(tr['is_click'].mean())

    print("Likelihood encoding recent category 3")
    enc = TargetEncoderWithThresh(cols=["category_3"], targetcol="is_click", use_prior=True, func='mean', alpha=5)
    tr["category_3_tmean"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["category_3_tmean"] = enc.fit(tr).transform(val)
    val["category_3_tmean"].fillna(tr['is_click'].mean())

    print("Likelihood encoding recent category 2")
    enc = TargetEncoderWithThresh(cols=["category_2"], targetcol="is_click", use_prior=True, func='mean', alpha=5)
    tr["category_2_tmean"] = cross_val_predict(enc, tr, y_tr, cv=cvlist2, method="transform", n_jobs=-1)
    val["category_2_tmean"] = enc.fit(tr).transform(val)
    val["category_2_tmean"].fillna(tr['is_click'].mean())

    return tr, val

In [28]:
def load_factorizations(df_, case = 0):
    df = df_.copy()
#     df = df.join(pd.read_csv('../new_output/X_user_category_2_tsvd_5factors.csv').set_index('user_id'),
#                 on = 'user_id', how='left')
    
#     df = df.join(pd.read_csv('../new_output/X_user_category_2_nmf_5factors.csv').set_index('user_id'),
#                 on = 'user_id', how='left')
    
#     df = df.join(pd.read_csv('../new_output/X_user_category_3_nmf_5factors.csv').set_index('user_id'),
#                 on = 'user_id', how='left')
    
#     df = df.join(pd.read_csv('../new_output/X_user_category_3_tsvd_5factors.csv').set_index('user_id'),
#                 on = 'user_id', how='left')
    
    
#     df = df.join(pd.read_csv('../new_output/X_user_category_1_nmf_5factors.csv').set_index('user_id'),
#                 on = 'user_id', how='left')
    
#     df = df.join(pd.read_csv('../new_output/X_user_category_1_tsvd_5factors.csv').set_index('user_id'),
#                 on = 'user_id', how='left')
    
#     df = df.join(pd.read_csv('../new_output/X_user_product_nmf_5factors.csv').set_index('user_id'),
#                 on = 'user_id', how='left')
    
#     df = df.join(pd.read_csv('../new_output/X_user_product_tsvd_5factors.csv').set_index('user_id'),
#                 on = 'user_id', how='left')

    df = df.join(pd.read_csv('../new_output/usr_itm_tsvd5.csv').set_index('user_id'),
                on = 'user_id', how='left', rsuffix='_itm_tsvd5')
    
    df = df.join(pd.read_csv('../new_output/usr_category_1_nnmf5_v2.csv').set_index('user_id'),
                on = 'user_id', how='left', rsuffix='_cat1_tsvd5')
    
    df = df.join(pd.read_csv('../new_output/usr_category_1_tsvd5_v2.csv').set_index('user_id'),
                on = 'user_id', how='left', rsuffix='_cat1_nnmf5')
    
    df = df.join(pd.read_csv('../new_output/usr_category_2_nnmf5_v2.csv').set_index('user_id'),
                on = 'user_id', how='left', rsuffix='_cat2_nnmf5')
    
    df = df.join(pd.read_csv('../new_output/usr_category_2_tsvd5_v2.csv').set_index('user_id'),
                on = 'user_id', how='left', rsuffix='_cat2_tsvd5')
    
    df = df.join(pd.read_csv('../new_output/usr_category_3_nnmf5_v2.csv').set_index('user_id'),
                on = 'user_id', how='left', rsuffix='_cat3_nnmf5')
    
    df = df.join(pd.read_csv('../new_output/usr_category_3_tsvd5_v2.csv').set_index('user_id'),
                on = 'user_id', how='left', rsuffix='_cat3_tsvd5')
    
    df = df.join(pd.read_csv('../new_output/usr_product_type_nnmf5_v2.csv').set_index('user_id'),
                on = 'user_id', how='left', rsuffix='_prodtype_nnmf5')
    
    df = df.join(pd.read_csv('../new_output/usr_product_type_tsvd5_v2.csv').set_index('user_id'),
                on = 'user_id', how='left', rsuffix='_prodtype_tsvd5')

    df = df.join(pd.read_csv('../new_output/usr_app_code_nnmf5_v2.csv').set_index('user_id'),
                on = 'user_id', how='left', rsuffix='_appcode_nnmf5')

    df = df.join(pd.read_csv('../new_output/usr_app_code_tsvd5_v2.csv').set_index('user_id'),
                on = 'user_id', how='left', rsuffix='_appcode_tsvd5')
    
    if case == 0:
    
        df = df.join(pd.read_csv('../new_output/user_id_app_FM.csv.gz', compression='gzip').set_index(['user_id'
                                                                                                  , 'app_code'])
                , on = ['user_id', 'app_code'], how = 'left')
        df.drop(columns=['num'], inplace=True)
    elif case == 1:
        
        df = df.join(pd.read_csv('../new_output/usr_os_app_fm.csv.gz', compression='gzip').set_index(['user_id'
                                                                                                      , 'os_version'
                                                                                                      , 'app_code'])
                    , on = ['user_id', 'os_version','app_code'], how = 'left', rsuffix='_uoa')
        df.drop(columns=['num'], inplace=True)
    else:
        df['dow'] = pd.to_datetime(df['impression_time']).dt.dayofweek

        df = df.join(pd.read_csv('../new_output/usr_dow_app_fm.csv.gz', compression='gzip').set_index(['user_id'
                                                                                                       , 'dow'
                                                                                                      , 'app_code'])
                    , on = ['user_id', 'dow','app_code'], how = 'left', rsuffix = '_uda')
        df.drop(columns=['num'], inplace=True)
    return df

In [29]:
def get_ratio_feats(df_train, df_val):
    
    features = ['app_counts', 'app_code_fm_bias'
               , 'app_code_fm_factor_0', 'app_hour_entropy', 'app_code_fm_factor_1'
               , 'app_os_entropy', 'prev_os_click']
    
    for i, f in enumerate(features):
        for j, f_ in enumerate(features):
            temp = df_train[f].divide(df_train[f_])
            try:
                _, _, iv = woe(temp.fillna(-1), df_train["is_click"])
            except:
                iv = 0
            if iv > 0.005:
                df_val['ratio_{}_{}'.format(i, j)] = df_val[f].divide(df_val[f_])
                df_train['ratio_{}_{}'.format(i, j)] = df_train[f].divide(df_train[f_])
    return df_train, df_val

In [55]:
from scipy.stats import gmean
def get_test_preds(df, feats,categorical_features, flag = 0, **clf_kwargs):
    if flag:
        train = df.loc[df['flag1'].isin(['train', 'val'])].reset_index()
        train = train.loc[train['impression_time'] >= '2018-11-22'].reset_index()
        test = df.loc[df['flag1'] == 'test'].reset_index()
    else:
        train = df.loc[df['flag1'].isin(['train', 'val'])].reset_index()
        test = df.loc[df['flag1'] == 'test'].reset_index()
    
    print(np.sum(train['category_2_count'].isna()))
    print(train.shape, test.shape)
    y_tr = train["is_click"].values
    
    train, test = get_click_feats(train, test)
    train, test = get_target_encoding(train, test, y_tr)
    train, test = get_ratio_feats(train, test)
    
    
#     train = get_ratio_feats(train)
#     test = get_ratio_feats(test)
        
    X_tr, X_test = train[feats], test[feats]
#     X_tr = X_tr.fillna(-1)
#     X_test = X_test.fillna(-1)
    for f in X_tr.columns:
        if 'skew' in f:
            X_tr[f] = X_tr[f].fillna(0)
            X_test[f] = X_test[f].fillna(0)
        else:
            X_tr[f] = X_tr[f].fillna(-1)
            X_test[f] = X_test[f].fillna(-1)
#     for f in feats:
#         plt.figure()
#         sns.distplot(X_tr[f])
#         sns.distplot(X_test[f])
#         plt.show()
    
    test_preds = []
    categorical_features_indices = [i for i, x in enumerate(X_tr.columns) if x in categorical_features]

    model = catboost.CatBoostClassifier(**clf_kwargs)
    if flag:
        model.set_params(**{'n_estimators': 1450})
    model.fit(X_tr, y_tr, eval_set=[(X_tr, y_tr)], verbose=50,cat_features=categorical_features_indices)
    test_preds.append(model.predict_proba(X_test)[:, 1])
    test_preds = gmean(test_preds, axis=0)
    
    to_sub = test[['impression_id']]
    to_sub['is_click'] = test_preds
    return model, to_sub

In [31]:
df_data_all = get_overall_count_feats(df_data_all)

In [32]:
df_data_all.columns.values

array(['app_code', 'category_1_-1_sum', 'category_1_-1_sum-15_-22',
       'category_1_-1_sum-1_-35', 'category_1_-1_sum-22_-29',
       'category_1_-1_sum-8_-15', 'category_1_0_sum',
       'category_1_0_sum-15_-22', 'category_1_0_sum-1_-35',
       'category_1_0_sum-22_-29', 'category_1_0_sum-8_-15',
       'category_1_10_sum', 'category_1_10_sum-15_-22',
       'category_1_10_sum-1_-35', 'category_1_10_sum-22_-29',
       'category_1_10_sum-8_-15', 'category_1_11_sum',
       'category_1_11_sum-15_-22', 'category_1_11_sum-1_-35',
       'category_1_11_sum-22_-29', 'category_1_11_sum-8_-15',
       'category_1_12_sum', 'category_1_12_sum-15_-22',
       'category_1_12_sum-1_-35', 'category_1_12_sum-22_-29',
       'category_1_12_sum-8_-15', 'category_1_13_sum',
       'category_1_13_sum-15_-22', 'category_1_13_sum-1_-35',
       'category_1_13_sum-22_-29', 'category_1_13_sum-8_-15',
       'category_1_14_sum', 'category_1_14_sum-15_-22',
       'category_1_14_sum-1_-35', 'category_1_

In [33]:
case = 0
cat_params = {
    'learning_rate': 0.02, 
    'max_depth': 5, 
    'l2_leaf_reg': 10, 
    'iterations': 5000,
    'loss_function':'Logloss',
    'use_best_model': True,
    'eval_metric': 'AUC',
    'bagging_temperature': 1,
}



In [34]:

df_data_all_0 = load_factorizations(df_data_all,case = case)

df_train = df_data_all_0.loc[df_data_all['flag1'] == 'train']
df_val = df_data_all_0.loc[df_data_all['flag1'] == 'val']
df_test = df_data_all_0.loc[df_data_all['flag1'] == 'test']

df_train, df_val = get_target_encoding(df_train, df_val, df_train['is_click'])

df_train, df_val = get_click_feats(df_train, df_val)
df_train, df_val = get_ratio_feats(df_train, df_val)

df_val_lb = df_val.loc[df_val['impression_date'] < '2018-12-08'].reset_index(drop=True)
df_val = df_val.loc[df_val['impression_date'] < '2018-12-13'].reset_index(drop=True)

feats = df_train.columns
to_exclude_columns = ['flag1',
                      'impression_id',
                      'impression_time',
                      'is_click',

    'user_id',
    'category_1_0_sum-8_-15',
'category_1_10_sum-15_-22',
'category_1_11_sum-15_-22',
'category_1_10_sum-1_-35',
'category_1_10_sum',
'category_1_11_sum',
'category_1_11_sum-22_-29',
'category_1_12_sum',
'category_1_12_sum-15_-22',
'category_1_12_sum-8_-15',
'category_1_13_sum',
'category_1_13_sum-22_-29',
'category_1_14_sum',
'category_1_14_sum-15_-22',
'category_1_14_sum-8_-15',
'category_1_15_sum-1_-35',
'category_1_16_sum',
'category_1_16_sum-1_-35',
'category_1_17_sum-15_-22',
'category_1_17_sum-1_-35',
'category_1_17_sum-22_-29',
'category_1_1_sum-15_-22',
'category_1_4_sum',
'category_1_4_sum-15_-22',
'category_1_4_sum-22_-29',
'category_1_4_sum-8_-15',
'category_1_7_sum',
'category_1_7_sum-8_-15',
'category_1_8_sum',
'category_1_8_sum-1_-35',
'category_1_8_sum-8_-15',
'category_1_9_sum',
'category_1_9_sum-22_-29',
'category_3_nunique-1_-35',
'item_id_nunique-22_-29',
'item_id_nunique-8_-15',
'item_price_kurtosis-1_-35',
'item_price_kurtosis-22_-29',
'item_price_max-15_-22',
'item_price_max-22_-29',
'item_price_median',
'item_price_median-1_-35',
'item_price_median-22_-29',
'item_price_median-8_-15',
'item_price_min',
'item_price_min-1_-35',
'item_price_min-22_-29',
'item_price_min-8_-15',
#'session_id_nunique',
#'number_of_uq_items_per_session',
'count_vector_sum_mean',
'user_id_recent',
'session_id',
                      'app_code',
                     'ratio_0_2',
'ratio_0_4',
'ratio_1_2',
'ratio_1_4',
'ratio_3_2',
'ratio_3_4',
'ratio_4_2',
'ratio_5_2',
'ratio_5_4',
'ratio_6_2',
'ratio_6_4',
'recent_item_id_cnt',
'recent_item_id_cnt',
   'recent_category_2_cnt', 'recent_category_3_cnt',
   'recent_prod_type_cnt']
# 'svdt_countvect_0'
# 'svdt_countvect_1',
# 'svdt_countvect_2',
# 'svdt_countvect_3',
# 'svdt_countvect_4',
# 'svdt_countvect_5',
# 'svdt_countvect_6',
# 'svdt_countvect_7',
# 'svdt_countvect_8',
# 'svdt_countvect_9']

for f in feats:
    try:
        _, _, iv = woe(df_train[f].fillna(-1), df_train["is_click"])
    except:
        iv = 0
    if iv < 0.005:
        to_exclude_columns.append(f)
        print(f"IV value for {f} is {iv} - DISCARD THIS")
        print("------")
    else:
        print(f"=====IV value for {f} is {iv}=====")
        print("------")

print(len(feats))
feats = [f for f in feats if f not in set(to_exclude_columns + ['impression_time', 'user_id'])]+['category_1',
                      'category_2',
                      'category_3',
                      'product_type',
                      'item_id']


print(len(feats))

X_tr, X_val = df_train[feats], df_val[feats]
#means = X_tr.fillna(-999)
for f in X_tr.columns:
    if 'skew' in f:
        X_tr[f] = X_tr[f].fillna(0)
        X_val[f] = X_val[f].fillna(0)
    else:
        X_tr[f] = X_tr[f].fillna(-1)
        X_val[f] = X_val[f].fillna(-1)




Likelihood encoding app_code user_id
Likelihood encoding user_id hour
Likelihood encoding user_id dow
Likelihood encoding user_id os_version
Likelihood encoding app_code user_id os_version
Likelihood encoding os_version app_os_entropy
Likelihood encoding app_entropy
Likelihood encoding app_hour_entropy
Likelihood encoding user id
Likelihood encoding app_code
Likelihood encoding recent item_id
Likelihood encoding recent product
Likelihood encoding recent category 3
Likelihood encoding recent category 2
=====IV value for app_code is 0.5418294608146091=====
------
IV value for category_1_-1_sum is 0.0006869569719740611 - DISCARD THIS
------
IV value for category_1_-1_sum-15_-22 is 0.0006332643676553119 - DISCARD THIS
------
IV value for category_1_-1_sum-1_-35 is 0.0013672538873907097 - DISCARD THIS
------
IV value for category_1_-1_sum-22_-29 is 0.00022934341920544669 - DISCARD THIS
------
IV value for category_1_-1_sum-8_-15 is 0.00024045022530183485 - DISCARD THIS
------
=====IV value 

=====IV value for category_3_count is 0.06479020511246975=====
------
=====IV value for category_3_count-15_-22 is 0.02498154737676499=====
------
=====IV value for category_3_count-1_-35 is 0.05303157548941419=====
------
=====IV value for category_3_count-22_-29 is 0.022137598950535044=====
------
=====IV value for category_3_count-8_-15 is 0.02326109280908792=====
------
=====IV value for category_3_nunique is 0.0623945307659262=====
------
=====IV value for category_3_nunique-15_-22 is 0.022394730078022528=====
------
=====IV value for category_3_nunique-1_-35 is 0.04925785799082453=====
------
=====IV value for category_3_nunique-22_-29 is 0.019145322745824114=====
------
=====IV value for category_3_nunique-8_-15 is 0.023539460634458755=====
------
=====IV value for days_difference_server_time_max is 0.05758881173351069=====
------
=====IV value for days_difference_server_time_max-15_-22 is 0.030219527567398635=====
------
=====IV value for days_difference_server_time_max-1_-35 i

=====IV value for prev_os_view is 0.02671496890814718=====
------
=====IV value for prev_4G_view is 0.022506487946761817=====
------
=====IV value for view_counts is 0.005899542175846733=====
------
=====IV value for user_app_counts is 0.007315782088386979=====
------
=====IV value for user_os_counts is 0.00571431612150885=====
------
IV value for user_4g_counts is 0.003912512177948009 - DISCARD THIS
------
=====IV value for hour is 0.007193267095142691=====
------
IV value for dayofweek is 0.0015429918233183836 - DISCARD THIS
------
=====IV value for all_counts is 0.026306500338932177=====
------
=====IV value for app_counts is 0.5506845027146015=====
------
=====IV value for os_counts is 0.010337629631925065=====
------
=====IV value for all_usr_app_counts is 0.0213314403024287=====
------
=====IV value for all_usr_os_counts is 0.026255592734237693=====
------
=====IV value for all_usr_app_os_counts is 0.02128922832408675=====
------
=====IV value for all_usr_app_os_date_counts is 0.

=====IV value for ratio_2_6 is 0.5842948318236649=====
------
=====IV value for ratio_3_0 is 0.2998708799369755=====
------
=====IV value for ratio_3_1 is 0.4209063752672827=====
------
=====IV value for ratio_3_2 is 0.46347053744188516=====
------
=====IV value for ratio_3_4 is 0.5052566932797418=====
------
=====IV value for ratio_3_5 is 0.12728719437420447=====
------
=====IV value for ratio_3_6 is 0.5202551202010296=====
------
=====IV value for ratio_4_0 is 0.534503930977192=====
------
=====IV value for ratio_4_1 is 0.5212047797345165=====
------
=====IV value for ratio_4_2 is 0.5311417023856982=====
------
=====IV value for ratio_4_3 is 0.5077244609226874=====
------
=====IV value for ratio_4_5 is 0.5440863987981093=====
------
=====IV value for ratio_4_6 is 0.5268379700569511=====
------
=====IV value for ratio_5_0 is 0.48689431550080053=====
------
=====IV value for ratio_5_1 is 0.5345392946915739=====
------
=====IV value for ratio_5_2 is 0.5241427074924576=====
------
=====I

In [35]:
feats

['category_1_0_sum',
 'category_1_0_sum-15_-22',
 'category_1_0_sum-1_-35',
 'category_1_0_sum-22_-29',
 'category_1_10_sum-22_-29',
 'category_1_10_sum-8_-15',
 'category_1_11_sum-1_-35',
 'category_1_11_sum-8_-15',
 'category_1_12_sum-1_-35',
 'category_1_12_sum-22_-29',
 'category_1_13_sum-15_-22',
 'category_1_13_sum-1_-35',
 'category_1_13_sum-8_-15',
 'category_1_14_sum-1_-35',
 'category_1_14_sum-22_-29',
 'category_1_15_sum',
 'category_1_16_sum-15_-22',
 'category_1_16_sum-22_-29',
 'category_1_16_sum-8_-15',
 'category_1_17_sum',
 'category_1_17_sum-8_-15',
 'category_1_1_sum',
 'category_1_1_sum-1_-35',
 'category_1_1_sum-22_-29',
 'category_1_1_sum-8_-15',
 'category_1_4_sum-1_-35',
 'category_1_6_sum-1_-35',
 'category_1_7_sum-15_-22',
 'category_1_7_sum-1_-35',
 'category_1_8_sum-15_-22',
 'category_1_8_sum-22_-29',
 'category_1_9_sum-15_-22',
 'category_1_9_sum-1_-35',
 'category_1_9_sum-8_-15',
 'category_2_count',
 'category_2_count-15_-22',
 'category_2_count-1_-35',


In [39]:
feats = list(set(feats))
categorical_features = ['category_1',
 'category_2',
 'category_3',
 'product_type',
 'item_id']

In [78]:
oof_preds, clf = train_model(X_tr, X_val, df_train['is_click'], df_val['is_click']
                                     , feats, early_stopping_rounds=500,
                                    categorical_features=  ['category_1','category_2', 'category_3',
                                                            'product_type','item_id'],
                                     **cat_params)


0:	test: 0.5779992	test1: 0.5737573	best: 0.5737573 (0)	total: 501ms	remaining: 41m 46s
100:	test: 0.7279759	test1: 0.7113995	best: 0.7113995 (100)	total: 32.2s	remaining: 26m 1s
200:	test: 0.7709499	test1: 0.7316835	best: 0.7316835 (200)	total: 1m 5s	remaining: 25m 52s
300:	test: 0.7756638	test1: 0.7349479	best: 0.7349479 (300)	total: 1m 36s	remaining: 25m 13s
400:	test: 0.7788104	test1: 0.7365733	best: 0.7365733 (400)	total: 2m 9s	remaining: 24m 49s
500:	test: 0.7815491	test1: 0.7380767	best: 0.7380767 (500)	total: 2m 46s	remaining: 24m 55s
600:	test: 0.7832865	test1: 0.7392069	best: 0.7392069 (600)	total: 3m 24s	remaining: 24m 57s
700:	test: 0.7852249	test1: 0.7401192	best: 0.7401212 (699)	total: 4m 3s	remaining: 24m 51s
800:	test: 0.7869013	test1: 0.7409797	best: 0.7409797 (800)	total: 4m 43s	remaining: 24m 44s
900:	test: 0.7884486	test1: 0.7414945	best: 0.7415106 (897)	total: 5m 20s	remaining: 24m 17s
1000:	test: 0.7904362	test1: 0.7419173	best: 0.7419173 (1000)	total: 5m 54s	rema

ValueError: Length of values does not match length of index

In [83]:


X_tr, X_val = df_train[feats], df_val_lb[feats]
#means = X_tr.fillna(-999)
X_tr = X_tr.fillna(-1)
X_val = X_val.fillna(-1)

oof_preds_lb, _ = train_model(X_tr, X_val, df_train['is_click']
                                        , df_val_lb['is_click'], feats
                                        , early_stopping_rounds=1000
                                        ,categorical_features=  ['category_1','category_2', 'category_3',
                                                            'product_type','item_id']
                                        , **cat_params)
display_importances(importances)

#     cat_params['n_estimators'] = 

_, test_preds1 = get_test_preds(df_data_all_0, feats,categorical_features=  ['category_1','category_2', 'category_3',
                                                            'product_type','item_id'], **cat_params)

return oof_preds, oof_preds_lb, test_preds1

KeyboardInterrupt: 

In [None]:


X_tr, X_val = df_train[feats], df_val_lb[feats]
#means = X_tr.fillna(-999)
X_tr = X_tr.fillna(-1)
X_val = X_val.fillna(-1)

cat_params['iterations'] = 8000

_, test_preds1 = get_test_preds(df_data_all_0, feats,categorical_features=  ['category_1','category_2', 'category_3',
                                                            'product_type','item_id'], **cat_params)



75305
(237609, 298) (90675, 298)
Likelihood encoding app_code user_id
Likelihood encoding user_id hour
Likelihood encoding user_id dow
Likelihood encoding user_id os_version
Likelihood encoding app_code user_id os_version
Likelihood encoding os_version app_os_entropy
Likelihood encoding app_entropy
Likelihood encoding app_hour_entropy
Likelihood encoding user id
Likelihood encoding app_code
Likelihood encoding recent item_id


In [44]:
df_sub = df_test[['impression_id']]
df_sub['is_click'] = test_preds1#*0.7 + test_preds2*0.15 + test_preds3*0.15
df_sub.head()

ValueError: Wrong number of items passed 2, placement implies 1

In [45]:
test_preds1.head()

Unnamed: 0,impression_id,is_click
0,ccf6d380a63293580f2247d840fca638,0.011883
1,2a1b2179f709dc95fb4d819a8f3eb80d,0.007444
2,159e11927eab144e1d24e3255978f111,0.0069
3,e5e233880ea8d2a06943790dc3d37463,0.012237
4,c80ea471e47bbbd551543ca8c0e102ea,0.023395


In [46]:
test_preds1.to_csv('../submissions/10_v2_cat.csv', index=False)