In [1]:
import numpy as np
from numpy.core.fromnumeric import _all_dispatcher
import pandas as pd
import joblib
np.random.seed(2021)
import warnings
warnings.filterwarnings('ignore')

# df --> whole training set

In [2]:
df = pd.read_csv("train_modified.gz", compression='gzip', header='infer')
Y = df['click']
# discard some columns
# unused_cols = ["id", 'site_id', 'app_id']
# df.drop(unused_cols, axis=1, inplace=True)

In [3]:
df

Unnamed: 0,click,hour,C1,banner_pos,site_id,site_domain,site_category,device_id,device_ip,device_model,...,C19,C20,C21,day_of_week,device_ip_count,device_id_count,hour_count,user,hourly_user_count,click_history
0,0,0,5,0,078d3465,dd641cc7,8fd0aea4,a99f214a,ddd2926e,44956a24,...,3,-1,67,1,7647,2533255,140117,ddd2926e44956a24,4.0,first string
1,0,0,5,0,078d3465,dd641cc7,8fd0aea4,a99f214a,96809ac8,711ee120,...,3,85,67,1,7,2533255,140117,96809ac8711ee120,3.0,first string
2,0,0,5,0,078d3465,dd641cc7,8fd0aea4,a99f214a,b3cf8def,8a4875bd,...,3,85,67,1,2,2533255,140117,b3cf8def8a4875bd,2.0,first string
3,0,0,5,0,078d3465,dd641cc7,8fd0aea4,a99f214a,e8275b8f,6332421a,...,3,85,67,1,6,2533255,140117,e8275b8f6332421a,2.0,first string
4,0,0,5,1,a2af7bee,cbee4b41,72722551,a99f214a,9644d0bf,779d90c2,...,3,-1,145,1,31,2533255,140117,9644d0bf779d90c2,15.0,first string
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683782,0,1,5,1,c12ebe86,c1aa3c04,74073276,a99f214a,8546df25,67fb3069,...,7,-1,145,2,229,2533255,147294,8546df2567fb3069,5.0,1110
2683783,0,1,5,1,c12ebe86,c1aa3c04,74073276,a99f214a,98e4ada3,b314d7b9,...,131,-1,49,2,18,2533255,147294,98e4ada3b314d7b9,4.0,010
2683784,1,1,5,0,5bb07e04,b256a9bc,f66779e6,a99f214a,Unknown,76dc4769,...,135,-1,11,2,378614,2533255,147294,dc38aa0776dc4769,1.0,first string
2683785,0,1,5,1,c12ebe86,c1aa3c04,74073276,a99f214a,e5693fd8,7120e05e,...,7,-1,11,2,9,2533255,147294,e5693fd87120e05e,4.0,0000010


### df_train --> our own training set

In [116]:
df_train = pd.read_csv("train_df.gz", compression='gzip', header='infer')
Y_train = df_train['click']
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051084 entries, 0 to 2051083
Data columns (total 27 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   click              int64 
 1   hour               int64 
 2   C1                 int64 
 3   banner_pos         int64 
 4   site_id            object
 5   site_domain        object
 6   site_category      object
 7   device_id          object
 8   device_ip          object
 9   device_model       object
 10  device_type        int64 
 11  device_conn_type   int64 
 12  C14                int64 
 13  C15                int64 
 14  C16                int64 
 15  C17                int64 
 16  C18                int64 
 17  C19                int64 
 18  C20                int64 
 19  C21                int64 
 20  day_of_week        int64 
 21  device_ip_count    int64 
 22  device_id_count    int64 
 23  hour_count         int64 
 24  user               object
 25  hourly_user_count  int64 
 26  click_history 

In [5]:
Y_train[:1000]

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: click, Length: 1000, dtype: int64

### Optimal catboost

In [15]:
from sklearn.model_selection import train_test_split

df_cat = df_train.copy()
unused_cols = ['site_id', 'click']
df_cat.drop(unused_cols, axis=1, inplace=True)
cut_off = int(len(df_cat) * 0.7)
X_train_cat = df_cat.iloc[:cut_off, :]
X_test_cat = df_cat.iloc[cut_off:, :]
y_train_cat = Y_train[:cut_off]
y_test_cat = Y_train[cut_off:]

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import make_scorer

categorical_f = ['C1', 'banner_pos','site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                 'day_of_week', 'user', 'click_history']

cat1 = CatBoostClassifier(iterations=20,learning_rate=0.1,depth=7,loss_function='Logloss', 
                          cat_features=categorical_f,verbose=False)

param = {
    'iterations': Integer(10, 1000),
    'depth': Integer(1, 8),
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'random_strength': Real(1e-9, 10, 'log-uniform'),
    'bagging_temperature': Real(0.0, 1.0),
    'border_count': Integer(1, 255),
    'l2_leaf_reg': Integer(2, 30),
    'scale_pos_weight':Real(0.01, 1.0, 'uniform')
}

LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

# log-uniform: understand as search over p = exp(x) by varying x
opt_c = BayesSearchCV(
    cat1,
    param,
    scoring = LogLoss,
    n_iter=64,
    cv=5,
    random_state=42
)

# executes bayesian optimization
opt_c.fit(X_train_cat, y_train_cat)

In [None]:
opt_c.best_score_

In [None]:
opt_c.best_params_

### optimal xgboost

In [6]:
df_xgb = df_train.copy()

def convert_obj_to_int(self):
    
    object_list_columns = self.columns
    object_list_dtypes = self.dtypes
    new_col_suffix = '_int'
    for index in range(0,len(object_list_columns)):
        if object_list_dtypes[index] == object :
            self[object_list_columns[index]+new_col_suffix] = self[object_list_columns[index]].map( lambda  x: hash(x))
            self.drop([object_list_columns[index]],inplace=True,axis=1)
    return self

df_xgb = convert_obj_to_int(df_xgb)
cut_off = int(len(df_xgb) * 0.7)
X_train_xgb = df_xgb.iloc[:cut_off, :]
X_test_xgb = df_xgb.iloc[cut_off:, :]
y_train_xgb = Y_train[:cut_off]
y_test_xgb = Y_train[cut_off:]

In [11]:
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import make_scorer

xgb1 = XGBClassifier(max_depth=3,learning_rate=0.1, alpha=0, colsample_bytree = 0.5,
                     subsample=0.1,n_estimators=100,gamma=0)

param = {
    'iterations': Integer(10, 400),
    'max_depth': Integer(3, 8, 'uniform'),
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'alpha': Real(0, 1.0, 'uniform'),
    'colsample_bytree' : Real(0.5,1.0, 'uniform'),
    'subsample': Real(0.1, 1.0, 'uniform'),
    'n_estimators': Integer(100, 1000, 'uniform'),
    'gamma': Real(0, 1.0, 'uniform')
}

LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

# log-uniform: understand as search over p = exp(x) by varying x
opt_x = BayesSearchCV(
    xgb1,
    param,
    scoring = LogLoss,
    n_iter=40,
    cv=5,
    random_state=42
)

# executes bayesian optimization
opt_x.fit(df_xgb, Y_train)

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations, n_estimnators } might not be used.

  This may not be accurate due to s

BayesSearchCV(cv=5,
              estimator=XGBClassifier(alpha=0, base_score=None, booster=None,
                                      colsample_bylevel=None,
                                      colsample_bynode=None,
                                      colsample_bytree=0.5, gamma=0,
                                      gpu_id=None, importance_type='gain',
                                      interaction_constraints=None,
                                      learning_rate=0.1, max_delta_step=None,
                                      max_depth=3, min_child_weight=None,
                                      missing=nan, monotone_constraints=None,
                                      n_estimators=100, n_job...
                             'iterations': Integer(low=10, high=400, prior='uniform', transform='identity'),
                             'learning_rate': Real(low=0.01, high=1.0, prior='log-uniform', transform='identity'),
                             'max_depth': Intege

In [13]:
opt_x.best_score_

-1.5349627176835216e-06

In [14]:
opt_x.best_params_

OrderedDict([('alpha', 0.0),
             ('colsample_bytree', 1.0),
             ('gamma', 0.0),
             ('iterations', 400),
             ('learning_rate', 0.27950642975302614),
             ('max_depth', 5),
             ('n_estimnators', 100),
             ('subsample', 1.0)])

### optimal lightgbm

In [None]:
df_lgb = df_train.copy()

def convert_obj_to_int(self):
    
    object_list_columns = self.columns
    object_list_dtypes = self.dtypes
    new_col_suffix = '_int'
    for index in range(0,len(object_list_columns)):
        if object_list_dtypes[index] == object :
            self[object_list_columns[index]+new_col_suffix] = self[object_list_columns[index]].map( lambda  x: hash(x))
            self.drop([object_list_columns[index]],inplace=True,axis=1)
    return self

df_lgb = convert_obj_to_int(df_lgb)
cut_off = int(len(df_lgb) * 0.7)
X_train_lgb = df_lgb.iloc[:cut_off, :]
X_test_lgb = df_lgb.iloc[cut_off:, :]
y_train_lgb = Y_train[:cut_off]
y_test_lgb = Y_train[cut_off:]

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import make_scorer, auc, log_loss, roc_auc_score

import lightgbm as lgb

# categorical_f = [ca for ca in X_train.columns if X_train[ca].dtype == 'object']
lgb_model = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metric='binary_logloss', bagging_freq=5)

param = {
    'max_depth': Integer(3, 7),
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'feature_fraction': Real(0.2, 0.9, 'uniform'),
    'bagging_fraction': Real(0.2, 0.9, 'log-uniform'),
    'max_bin': Integer(20, 255, 'uniform'),
    'n_estimators': Integer(100, 1000, 'uniform'),
    'num_leaves': Integer(24, 80, 'uniform'),
    'min_sum_hessian_in_leaf':Integer(0,100, 'uniform'),
}

# log-uniform: understand as search over p = exp(x) by varying x
opt_l = BayesSearchCV(
    lgb_model,
    param,
    scoring = LogLoss,
    n_iter=32,
    cv=5,
    random_state=42,
    verbose=5
)

# executes bayesian optimization
opt_l.fit(df_lgb, Y_train)

## K-fold

In [75]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import log_loss
from sklearn.metrics import make_scorer
def convert_obj_to_int(self):
    
    object_list_columns = self.columns
    object_list_dtypes = self.dtypes
    new_col_suffix = '_int'
    for index in range(0,len(object_list_columns)):
        if object_list_dtypes[index] == object :
            self[object_list_columns[index]+new_col_suffix] = self[object_list_columns[index]].map( lambda  x: hash(x))
            self.drop([object_list_columns[index]],inplace=True,axis=1)
    return self
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

In [69]:
df_opt = df_train.copy()
df_opt_2 = df_train.copy()
unused_cols = ['site_id', 'click']
df_opt.drop(unused_cols, axis=1, inplace=True)
df_opt_2.drop(unused_cols, axis=1, inplace=True)

In [None]:
best_p_c = {
    'iterations': 150,
    'depth': 4,
    'learning_rate': 0.2,
    'random_strength': 10,
    'bagging_temperature': 1.0,
    'border_count': 255,
    'l2_leaf_reg': 30,
    'scale_pos_weight':0.86438264586532
}

best_p_x = {
    'alpha':0.0,
    'colsample_bytree':1.0,
    'gamma':0.0,
    'iterations':400,
    'learning_rate':0.27950642975302614,
    'max_depth':5,
    'n_estimators':100,
    'subsample':1.0
}

best_p_l = {
    'boosting_type':'gbdt', 
    'objective': 'binary', 
    'metric':'binary_logloss', 
    'bagging_freq': 5, 
    'max_depth': 7,
    'learning_rate': 0.11304216699488043,
    'feature_fraction': 0.5066204305086464,
    'bagging_fraction': 0.6657456066570288,
    'max_bin': 188,
    'n_estimators': 482,
    'num_leaves': 60,
    'min_sum_hessian_in_leaf':72
}

In [85]:
from sklearn.model_selection import KFold

# preds = np.zeros(test.shape[0])
logloss_c=[]
logloss_x=[]  # list contains rmse for each fold
logloss_l=[]
proba_cat_li=[]
proba_xgb_li=[]
proba_lgb_li=[]
y_li = []
n=0
df_opt = convert_obj_to_int(df_opt)

categorical_f = ['C1', 'banner_pos','site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                 'day_of_week', 'user', 'click_history']

kf = KFold(n_splits=10,random_state=42,shuffle=False)
#     preds+=xgb.predict(test[columns])/kf.n_splits
for train_idx, test_idx in kf.split(df_opt,Y_train):
    X_tr,X_val=df_opt_2.iloc[train_idx],df_opt_2.iloc[test_idx]
    y_tr,y_val=Y_train.iloc[train_idx],Y_train.iloc[test_idx]
    
    X_tr_hash,X_val_hash=df_opt.iloc[train_idx],df_opt.iloc[test_idx]
    
    cat = CatBoostClassifier(**best_p_c,od_type='Iter', loss_function='Logloss', cat_features=categorical_f)
    cat.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False)
    proba_cat = cat.predict_proba(X_val.values)[:, 1]
    proba_cat_li.append(proba_cat)
    logloss_c.append(log_loss(y_val, proba_cat))
    print(n+1,logloss_c[n])
    
    xgb = XGBClassifier(**best_p_x, loss_function='Logloss')
    xgb.fit(X_tr_hash,y_tr,eval_set=[(X_val_hash,y_val)],early_stopping_rounds=100,verbose=False)
    proba_xgb = xgb.predict_proba(X_val_hash.values)[:, 1]
    proba_xgb_li.append(proba_xgb)
    logloss_x.append(log_loss(y_val, proba_xgb))
    print(n+1,logloss_x[n])
    
    import lightgbm as lgb
    lgb = lgb.LGBMClassifier(**best_p_l, loss_function='Logloss')
    lgb.fit(X_tr_hash,y_tr,eval_set=[(X_val_hash,y_val)],early_stopping_rounds=100, verbose=False)
    proba_lgb = lgb.predict_proba(X_val_hash.values)[:, 1]
    proba_lgb_li.append(proba_lgb)
    logloss_l.append(log_loss(y_val, proba_lgb))
    print(n+1,logloss_l[n])
    n+=1
    
    y_li.append(y_val)

1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1 0.42108933834823564
1 0.4175592359191733
1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1 0.42108933834823564
1 0.4175592359191733
1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters

1 0.42108933834823564
1 0.4175592359191733
1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1 0.42108933834823564
1 0.4175592359191733
1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1 0.42108933834823564
1 0.4175592359191733
1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passe

In [None]:
from sklearn.model_selection import KFold

# preds = np.zeros(test.shape[0])
logloss_l=[]
proba_lgb_li=[]
y_li = []
n=0
df_opt = convert_obj_to_int(df_opt)

categorical_f = ['C1', 'banner_pos','site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                 'day_of_week', 'user', 'click_history']

kf = KFold(n_splits=10,random_state=42,shuffle=False)
#     preds+=xgb.predict(test[columns])/kf.n_splits
for train_idx, test_idx in kf.split(df_opt,Y_train):
    X_tr,X_val=df_opt_2.iloc[train_idx],df_opt_2.iloc[test_idx]
    y_tr,y_val=Y_train.iloc[train_idx],Y_train.iloc[test_idx]
    
    X_tr_hash,X_val_hash=df_opt.iloc[train_idx],df_opt.iloc[test_idx]
    
    import lightgbm as lgb
    lgb = lgb.LGBMClassifier(**best_p_l, loss_function='Logloss')
    lgb.fit(X_tr_hash,y_tr,eval_set=[(X_val_hash,y_val)],early_stopping_rounds=100, verbose=False)
    proba_lgb = lgb.predict_proba(X_val_hash.values)[:, 1]
    proba_lgb_li.append(proba_lgb)
    logloss_l.append(log_loss(y_val, proba_lgb))
    print(n+1,logloss_l[n])
    n+=1
    
    y_li.append(y_val)

1 0.4223801857487906
2 0.4130685394531392
3 0.39848498070879224
4 0.39233600311870004
5 0.4142523146386964
6 0.40700336273515536


In [None]:
logloss_l

In [40]:
weights_range = [0.001, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
       0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.999]

In [None]:
%%timeit
# bag of models in Catboost, xgboost, lightgbm
model_num = 3
# from the previous 10 folds
cv_num = 10
# create placeholder for results table
output_wts_li = []
for i in range(cv_num):
    output_wts = np.zeros((len(X_val),model_num+1))
    output_wts_li.append(output_wts)

# getting the possible weights for three models
import itertools
for i in range(cv_num):
    j=0
    for a,b,c in itertools.product(weights_range, repeat=model_num):
        #get combination of weights, sum to 100%
        sum_w = np.array([a,b,c]).sum()
        wts = np.array([a,b,c]) / sum_w
        
        final_proba = np.zeros((len(X_val), ))
        #get oof combination for weighted final_probability
        final_proba+=proba_cat_li[i] * wts[0]
        final_proba+=proba_xgb_li[i] * wts[1]
        final_proba+=proba_lgb_li[i] * wts[2]

        #get the logloss of weighted probability for i-fold
        output_wts_li[i][j,model_num] = log_loss(y_li[i], final_proba[i])

        #record the associated weights
        output_wts[i][j,0:model_num] = wts

        j+=1

### SJ_test (mock)

In [143]:
df_sj = pd.read_csv("sj_test.gz", compression='gzip', header='infer')
Y_test = df_sj['click']

In [144]:
unused_cols_2 = ['site_id', 'click', 'user', 'click_history', 'day_of_week', 'device_ip_count', 'hour_count', 
                 'hourly_user_count']
df_sj.drop(unused_cols_2, axis=1, inplace=True)
categorical_f_2 = ['C1', 'banner_pos','site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                     ]

In [145]:
df_copy = df_train.copy()
df_copy.drop(unused_cols_2, axis=1, inplace=True)

In [146]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051084 entries, 0 to 2051083
Data columns (total 19 columns):
 #   Column            Dtype 
---  ------            ----- 
 0   hour              int64 
 1   C1                int64 
 2   banner_pos        int64 
 3   site_domain       object
 4   site_category     object
 5   device_id         object
 6   device_ip         object
 7   device_model      object
 8   device_type       int64 
 9   device_conn_type  int64 
 10  C14               int64 
 11  C15               int64 
 12  C16               int64 
 13  C17               int64 
 14  C18               int64 
 15  C19               int64 
 16  C20               int64 
 17  C21               int64 
 18  device_id_count   int64 
dtypes: int64(14), object(5)
memory usage: 297.3+ MB


In [154]:
cat_sj = CatBoostClassifier(**best_p_c, od_type='Iter', eval_metric='Logloss')
cat_sj.fit(df_copy, Y_train, cat_features=categorical_f_2, verbose=True)
proba_ = cat_sj.predict_proba(df_sj.values)[:, 1]

print('logloss of sj_test is: %0.5f'% log_loss(Y_test, proba_))

0:	learn: 0.4475024	total: 550ms	remaining: 1m 21s
1:	learn: 0.4276334	total: 962ms	remaining: 1m 11s
2:	learn: 0.4212526	total: 1.47s	remaining: 1m 12s
3:	learn: 0.4157165	total: 1.82s	remaining: 1m 6s
4:	learn: 0.4148646	total: 2.17s	remaining: 1m 3s
5:	learn: 0.4134438	total: 2.44s	remaining: 58.7s
6:	learn: 0.4128724	total: 2.64s	remaining: 53.9s
7:	learn: 0.4121286	total: 2.92s	remaining: 51.9s
8:	learn: 0.4116849	total: 3.28s	remaining: 51.4s
9:	learn: 0.4106989	total: 3.61s	remaining: 50.5s
10:	learn: 0.4102331	total: 3.97s	remaining: 50.2s
11:	learn: 0.4099835	total: 4.28s	remaining: 49.2s
12:	learn: 0.4095667	total: 4.67s	remaining: 49.2s
13:	learn: 0.4091209	total: 4.93s	remaining: 47.9s
14:	learn: 0.4088399	total: 5.19s	remaining: 46.7s
15:	learn: 0.4085670	total: 5.48s	remaining: 45.9s
16:	learn: 0.4082972	total: 5.8s	remaining: 45.4s
17:	learn: 0.4078205	total: 6.04s	remaining: 44.3s
18:	learn: 0.4075886	total: 6.31s	remaining: 43.5s
19:	learn: 0.4073697	total: 6.63s	remai

In [150]:
cat_test = CatBoostClassifier(iterations=100,learning_rate=0.1,depth=7, eval_metric='Logloss')
cat_test.fit(df_copy, Y_train, verbose=True, cat_features=categorical_f_2)
y_pred_cat = cat_test.predict_proba(df_sj.values)[:, 1]
print("model logloss: %.5f" % log_loss(Y_test, y_pred_cat))

0:	learn: 0.6531283	total: 566ms	remaining: 56s
1:	learn: 0.6203634	total: 945ms	remaining: 46.3s
2:	learn: 0.5932915	total: 1.55s	remaining: 50.1s
3:	learn: 0.5707390	total: 1.92s	remaining: 46.1s
4:	learn: 0.5520063	total: 2.31s	remaining: 43.9s
5:	learn: 0.5361472	total: 2.68s	remaining: 42s
6:	learn: 0.5228734	total: 3.05s	remaining: 40.5s
7:	learn: 0.5116808	total: 3.43s	remaining: 39.4s
8:	learn: 0.5020503	total: 3.88s	remaining: 39.3s
9:	learn: 0.4938029	total: 4.28s	remaining: 38.5s
10:	learn: 0.4866742	total: 4.72s	remaining: 38.2s
11:	learn: 0.4806740	total: 5.2s	remaining: 38.1s
12:	learn: 0.4755052	total: 5.66s	remaining: 37.9s
13:	learn: 0.4711380	total: 6.14s	remaining: 37.7s
14:	learn: 0.4673230	total: 6.54s	remaining: 37.1s
15:	learn: 0.4640916	total: 6.93s	remaining: 36.4s
16:	learn: 0.4613016	total: 7.32s	remaining: 35.7s
17:	learn: 0.4589544	total: 7.68s	remaining: 35s
18:	learn: 0.4569079	total: 8.04s	remaining: 34.3s
19:	learn: 0.4550533	total: 8.41s	remaining: 33.

In [107]:
df_copy = convert_obj_to_int(df_copy)
df_sj = convert_obj_to_int(df_sj)
import lightgbm as lgb
lgb_sj = lgb.LGBMClassifier(**best_p_l, loss_function='Logloss')
lgb_sj.fit(df_copy, Y_train, verbose=True)
proba_ = lgb_sj.predict_proba(df_sj.values)[:, 1]

print('logloss of sj_test is: %0.5f'% log_loss(Y_test, proba_))

logloss of sj_test is: 0.45645


In [None]:
train_df(train_df_train, train_df_val), sj_test, test_modified