In [1]:
import numpy as np
from numpy.core.fromnumeric import _all_dispatcher
import pandas as pd
import joblib
np.random.seed(2021)
import warnings
warnings.filterwarnings('ignore')

# df --> whole training set

In [2]:
df = pd.read_csv("train_modified.gz", compression='gzip', header='infer')
Y = df['click']
# discard some columns
# unused_cols = ["id", 'site_id', 'app_id']
# df.drop(unused_cols, axis=1, inplace=True)

In [3]:
df

Unnamed: 0,click,hour,C1,banner_pos,site_id,site_domain,site_category,device_id,device_ip,device_model,...,C19,C20,C21,day_of_week,device_ip_count,device_id_count,hour_count,user,hourly_user_count,click_history
0,0,0,5,0,078d3465,dd641cc7,8fd0aea4,a99f214a,ddd2926e,44956a24,...,3,-1,67,1,7647,2533255,140117,ddd2926e44956a24,4.0,first string
1,0,0,5,0,078d3465,dd641cc7,8fd0aea4,a99f214a,96809ac8,711ee120,...,3,85,67,1,7,2533255,140117,96809ac8711ee120,3.0,first string
2,0,0,5,0,078d3465,dd641cc7,8fd0aea4,a99f214a,b3cf8def,8a4875bd,...,3,85,67,1,2,2533255,140117,b3cf8def8a4875bd,2.0,first string
3,0,0,5,0,078d3465,dd641cc7,8fd0aea4,a99f214a,e8275b8f,6332421a,...,3,85,67,1,6,2533255,140117,e8275b8f6332421a,2.0,first string
4,0,0,5,1,a2af7bee,cbee4b41,72722551,a99f214a,9644d0bf,779d90c2,...,3,-1,145,1,31,2533255,140117,9644d0bf779d90c2,15.0,first string
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2683782,0,1,5,1,c12ebe86,c1aa3c04,74073276,a99f214a,8546df25,67fb3069,...,7,-1,145,2,229,2533255,147294,8546df2567fb3069,5.0,1110
2683783,0,1,5,1,c12ebe86,c1aa3c04,74073276,a99f214a,98e4ada3,b314d7b9,...,131,-1,49,2,18,2533255,147294,98e4ada3b314d7b9,4.0,010
2683784,1,1,5,0,5bb07e04,b256a9bc,f66779e6,a99f214a,Unknown,76dc4769,...,135,-1,11,2,378614,2533255,147294,dc38aa0776dc4769,1.0,first string
2683785,0,1,5,1,c12ebe86,c1aa3c04,74073276,a99f214a,e5693fd8,7120e05e,...,7,-1,11,2,9,2533255,147294,e5693fd87120e05e,4.0,0000010


### df_train --> our own training set

In [116]:
df_train = pd.read_csv("train_df.gz", compression='gzip', header='infer')
Y_train = df_train['click']
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051084 entries, 0 to 2051083
Data columns (total 27 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   click              int64 
 1   hour               int64 
 2   C1                 int64 
 3   banner_pos         int64 
 4   site_id            object
 5   site_domain        object
 6   site_category      object
 7   device_id          object
 8   device_ip          object
 9   device_model       object
 10  device_type        int64 
 11  device_conn_type   int64 
 12  C14                int64 
 13  C15                int64 
 14  C16                int64 
 15  C17                int64 
 16  C18                int64 
 17  C19                int64 
 18  C20                int64 
 19  C21                int64 
 20  day_of_week        int64 
 21  device_ip_count    int64 
 22  device_id_count    int64 
 23  hour_count         int64 
 24  user               object
 25  hourly_user_count  int64 
 26  click_history 

In [5]:
Y_train[:1000]

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: click, Length: 1000, dtype: int64

### Optimal catboost

In [15]:
from sklearn.model_selection import train_test_split

df_cat = df_train.copy()
unused_cols = ['click']
df_cat.drop(unused_cols, axis=1, inplace=True)
cut_off = int(len(df_cat) * 0.7)
X_train_cat = df_cat.iloc[:cut_off, :]
X_test_cat = df_cat.iloc[cut_off:, :]
y_train_cat = Y_train[:cut_off]
y_test_cat = Y_train[cut_off:]

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import make_scorer

categorical_f = ['C1', 'banner_pos','site_id', 'site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                 'day_of_week', 'user', 'click_history']

cat1 = CatBoostClassifier(iterations=20,learning_rate=0.1,depth=7,loss_function='Logloss', 
                          cat_features=categorical_f,verbose=False)

param = {
    'iterations': Integer(10, 300),
    'depth': Integer(1, 8),
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'random_strength': Real(1e-9, 10, 'log-uniform'),
    'bagging_temperature': Real(0.0, 1.0),
    'border_count': Integer(1, 255),
    'l2_leaf_reg': Integer(2, 30),
    'scale_pos_weight':Real(0.01, 1.0, 'uniform')
}

LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

# log-uniform: understand as search over p = exp(x) by varying x
opt_c = BayesSearchCV(
    cat1,
    param,
    scoring = LogLoss,
    n_iter=64,
    cv=5,
    random_state=42
)

# executes bayesian optimization
opt_c.fit(X_train_cat, y_train_cat)

In [None]:
opt_c.best_score_

In [None]:
opt_c.best_params_

### optimal xgboost

In [165]:
df_xgb = df_train.copy()

def convert_obj_to_int(self):
    
    object_list_columns = self.columns
    object_list_dtypes = self.dtypes
    new_col_suffix = '_int'
    for index in range(0,len(object_list_columns)):
        if object_list_dtypes[index] == object :
            self[object_list_columns[index]+new_col_suffix] = self[object_list_columns[index]].map( lambda  x: hash(x))
            self.drop([object_list_columns[index]],inplace=True,axis=1)
    return self

df_xgb = convert_obj_to_int(df_xgb)

In [167]:
from xgboost import XGBClassifier
from sklearn.metrics import log_loss
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import make_scorer

xgb1 = XGBClassifier(max_depth=3,learning_rate=0.1, alpha=0, colsample_bytree = 0.5,
                     subsample=0.1,n_estimators=100,gamma=0)

param = {
    'iterations': Integer(10, 400),uuuuuuu
    'max_depth': Integer(3, 8, 'uniform'),
    'learning_rate': Real(0.01, 0.3, 'log-uniform'),
    'alpha': Real(0, 10.0, 'uniform'),
    'colsample_bytree' : Real(0.5,1.0, 'uniform'),
    'subsample': Real(0.1, 1.0, 'uniform'),
    'n_estimators': Integer(100, 300, 'uniform'),
    'gamma': Real(0, 10.0, 'uniform')
}

LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

# log-uniform: understand as search over p = exp(x) by varying x
opt_x = BayesSearchCV(
    xgb1,
    param,
    scoring = LogLoss,
    n_iter=40,
    cv=5,
    random_state=42
)

# executes bayesian optimization
opt_x.fit(df_xgb, Y_train)

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { iterations } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

BayesSearchCV(cv=5,
              estimator=XGBClassifier(alpha=0, base_score=None, booster=None,
                                      colsample_bylevel=None,
                                      colsample_bynode=None,
                                      colsample_bytree=0.5, gamma=0,
                                      gpu_id=None, importance_type='gain',
                                      interaction_constraints=None,
                                      learning_rate=0.1, max_delta_step=None,
                                      max_depth=3, min_child_weight=None,
                                      missing=nan, monotone_constraints=None,
                                      n_estimators=100, n_job...
                             'iterations': Integer(low=10, high=400, prior='uniform', transform='identity'),
                             'learning_rate': Real(low=0.01, high=1.0, prior='log-uniform', transform='identity'),
                             'max_depth': Intege

In [13]:
opt_x.best_score_

-1.5349627176835216e-06

In [14]:
opt_x.best_params_

OrderedDict([('alpha', 0.0),
             ('colsample_bytree', 1.0),
             ('gamma', 0.0),
             ('iterations', 400),
             ('learning_rate', 0.27950642975302614),
             ('max_depth', 5),
             ('n_estimnators', 100),
             ('subsample', 1.0)])

### optimal lightgbm

In [None]:
df_lgb = df_train.copy()

def convert_obj_to_int(self):
    
    object_list_columns = self.columns
    object_list_dtypes = self.dtypes
    new_col_suffix = '_int'
    for index in range(0,len(object_list_columns)):
        if object_list_dtypes[index] == object :
            self[object_list_columns[index]+new_col_suffix] = self[object_list_columns[index]].map( lambda  x: hash(x))
            self.drop([object_list_columns[index]],inplace=True,axis=1)
    return self

df_lgb = convert_obj_to_int(df_lgb)
cut_off = int(len(df_lgb) * 0.7)
X_train_lgb = df_lgb.iloc[:cut_off, :]
X_test_lgb = df_lgb.iloc[cut_off:, :]
y_train_lgb = Y_train[:cut_off]
y_test_lgb = Y_train[cut_off:]

In [None]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import make_scorer, auc, log_loss, roc_auc_score

import lightgbm as lgb

# categorical_f = [ca for ca in X_train.columns if X_train[ca].dtype == 'object']
lgb_model = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', metric='binary_logloss', bagging_freq=5)

param = {
    'max_depth': Integer(3, 7),
    'learning_rate': Real(0.01, 1.0, 'log-uniform'),
    'feature_fraction': Real(0.2, 0.9, 'uniform'),
    'bagging_fraction': Real(0.2, 0.9, 'log-uniform'),
    'max_bin': Integer(20, 255, 'uniform'),
    'n_estimators': Integer(100, 1000, 'uniform'),
    'num_leaves': Integer(24, 80, 'uniform'),
    'min_sum_hessian_in_leaf':Integer(0,100, 'uniform'),
}

# log-uniform: understand as search over p = exp(x) by varying x
opt_l = BayesSearchCV(
    lgb_model,
    param,
    scoring = LogLoss,
    n_iter=32,
    cv=5,
    random_state=42,
    verbose=5
)

# executes bayesian optimization
opt_l.fit(df_lgb, Y_train)

## K-fold

In [279]:
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import log_loss
from sklearn.metrics import make_scorer
def convert_obj_to_int(self):
    
    object_list_columns = self.columns
    object_list_dtypes = self.dtypes
    new_col_suffix = '_int'
    for index in range(0,len(object_list_columns)):
        if object_list_dtypes[index] == object :
            self[object_list_columns[index]+new_col_suffix] = self[object_list_columns[index]].map( lambda  x: hash(x))
            self.drop([object_list_columns[index]],inplace=True,axis=1)
    return self
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

In [280]:
df_opt = df_train.copy()
df_opt_2 = df_train.copy()
unused_cols = ['site_id', 'click']
df_opt.drop(unused_cols, axis=1, inplace=True)
df_opt_2.drop(unused_cols, axis=1, inplace=True)

In [159]:
best_p_c = {
    'iterations': 400,
    'depth': 8,
    'learning_rate': 0.03,
    'random_strength': 1e-09,
    'bagging_temperature': 0.0,
    'border_count': 48,
    'l2_leaf_reg': 3,
    'scale_pos_weight':1.0
}

best_p_x = {
    'alpha':0.0,
    'colsample_bytree':1.0,
    'gamma':0.0,
    'iterations':400,
    'learning_rate':0.27950642975302614,
    'max_depth':5,
    'n_estimators':100,
    'subsample':1.0
}


best_p_l = {
    'boosting_type':'gbdt', 
    'objective': 'binary', 
    'metric':'binary_logloss', 
    'bagging_freq': 5, 
    'max_depth': 7,
    'learning_rate': 0.11304216699488043,
    'feature_fraction': 0.5066204305086464,
    'bagging_fraction': 0.6657456066570288,
    'max_bin': 188,
    'n_estimators': 482,
    'num_leaves': 60,
    'min_sum_hessian_in_leaf':72
}

In [85]:
from sklearn.model_selection import KFold

# preds = np.zeros(test.shape[0])
logloss_c=[]
logloss_x=[]  # list contains rmse for each fold
logloss_l=[]
proba_cat_li=[]
proba_xgb_li=[]
proba_lgb_li=[]
y_li = []
n=0
df_opt = convert_obj_to_int(df_opt)

categorical_f = ['C1', 'banner_pos','site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                 'day_of_week', 'user', 'click_history']

kf = KFold(n_splits=10,random_state=42,shuffle=False)
#     preds+=xgb.predict(test[columns])/kf.n_splits
for train_idx, test_idx in kf.split(df_opt,Y_train):
    X_tr,X_val=df_opt_2.iloc[train_idx],df_opt_2.iloc[test_idx]
    y_tr,y_val=Y_train.iloc[train_idx],Y_train.iloc[test_idx]
    
    X_tr_hash,X_val_hash=df_opt.iloc[train_idx],df_opt.iloc[test_idx]
    
    cat = CatBoostClassifier(**best_p_c,od_type='Iter', loss_function='Logloss', cat_features=categorical_f)
    cat.fit(X_tr,y_tr,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=False)
    proba_cat = cat.predict_proba(X_val.values)[:, 1]
    proba_cat_li.append(proba_cat)
    logloss_c.append(log_loss(y_val, proba_cat))
    print(n+1,logloss_c[n])
    
    xgb = XGBClassifier(**best_p_x, loss_function='Logloss')
    xgb.fit(X_tr_hash,y_tr,eval_set=[(X_val_hash,y_val)],early_stopping_rounds=100,verbose=False)
    proba_xgb = xgb.predict_proba(X_val_hash.values)[:, 1]
    proba_xgb_li.append(proba_xgb)
    logloss_x.append(log_loss(y_val, proba_xgb))
    print(n+1,logloss_x[n])
    
    import lightgbm as lgb
    lgb = lgb.LGBMClassifier(**best_p_l, loss_function='Logloss')
    lgb.fit(X_tr_hash,y_tr,eval_set=[(X_val_hash,y_val)],early_stopping_rounds=100, verbose=False)
    proba_lgb = lgb.predict_proba(X_val_hash.values)[:, 1]
    proba_lgb_li.append(proba_lgb)
    logloss_l.append(log_loss(y_val, proba_lgb))
    print(n+1,logloss_l[n])
    n+=1
    
    y_li.append(y_val)

1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1 0.42108933834823564
1 0.4175592359191733
1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1 0.42108933834823564
1 0.4175592359191733
1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters

1 0.42108933834823564
1 0.4175592359191733
1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1 0.42108933834823564
1 0.4175592359191733
1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1 0.42108933834823564
1 0.4175592359191733
1 0.40328000650182544
Parameters: { iterations, loss_function, n_estimnators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passe

In [282]:
from sklearn.model_selection import KFold

# preds = np.zeros(test.shape[0])
logloss_l=[]
proba_lgb_li=[]
y_li = []
n=0
df_opt = convert_obj_to_int(df_opt)

categorical_f = ['C1', 'banner_pos','site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                 'day_of_week', 'user', 'click_history']

kf = KFold(n_splits=10,random_state=42,shuffle=False)
#     preds+=xgb.predict(test[columns])/kf.n_splits
for train_idx, test_idx in kf.split(df_opt,Y_train):
    X_tr,X_val=df_opt_2.iloc[train_idx],df_opt_2.iloc[test_idx]
    y_tr,y_val=Y_train.iloc[train_idx],Y_train.iloc[test_idx]
    
    X_tr_hash,X_val_hash=df_opt.iloc[train_idx],df_opt.iloc[test_idx]
    
    import lightgbm as lgb
    lgb = lgb.LGBMClassifier(**best_p_l_1, loss_function='Logloss')
    lgb.fit(X_tr_hash,y_tr,eval_set=[(X_val_hash,y_val)],early_stopping_rounds=100, verbose=False)
    proba_lgb = lgb.predict_proba(X_val_hash.values)[:, 1]
    proba_lgb_li.append(proba_lgb)
    logloss_l.append(log_loss(y_val, proba_lgb))
    print(n+1,logloss_l[n])
    n+=1
    
    y_li.append(y_val)

1 0.4223209824421238
2 0.4133231589949397
3 0.3980092164888491
4 0.3923824522567316
5 0.4145008562357522
6 0.4071268200114227
7 0.40979773100012995
8 0.44364494685921424


9 0.45178972523917776
10 0.45605539870490774


In [281]:
logloss_l

[0.4223801857487906,
 0.4130685394531392,
 0.39848498070879224,
 0.39233600311870004,
 0.4142523146386964,
 0.40700336273515536,
 0.4097768056092878,
 0.4436535687588471,
 0.45154139665317516,
 0.4565230681876022]

In [216]:
for i in range(10):
    if len(proba_cat_li[i]) == 205108:
        print('cat', str(i))
    if len(proba_xgb_li[i]) == 205108:
        print('xgb', str(i))
    if len(proba_lgb_li[i]) == 205108:
        print('lgb', str(i))
    if len(y_li[i]) == 205108:
        print('y', str(i))

cat 4
xgb 4
lgb 4
y 4
cat 5
xgb 5
lgb 5
y 5
cat 6
xgb 6
lgb 6
y 6
cat 7
xgb 7
lgb 7
y 7
cat 8
xgb 8
lgb 8
y 8
cat 9
xgb 9
lgb 9
y 9


In [171]:
weights_range = [0.001, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
       0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 0.999]

In [222]:
# bag of models in Catboost, xgboost, lightgbm
model_num = 3
# from the previous 10 folds
cv_num = 10
# create placeholder for results table
output_wts_li = []
for i in range(cv_num):
    output_wts = np.zeros((len(X_val)+1, model_num+1))
    output_wts_li.append(output_wts)

# getting the possible weights for three models
import itertools
for i in range(cv_num):
    j=0
    for a,b,c in itertools.product(weights_range, repeat=model_num):
        #get combination of weights, sum to 100%
        sum_w = np.array([a,b,c]).sum()
        wts = np.array([a,b,c]) / sum_w
        if i>=4 and i<=9:
            final_proba = np.zeros((len(X_val), ))
        else:
            final_proba = np.zeros((len(X_val)+1, ))
        #get oof combination for weighted final_probability
        final_proba+=proba_cat_li[i] * wts[0]
        final_proba+=proba_xgb_li[i] * wts[1]
        final_proba+=proba_lgb_li[i] * wts[2]

        #get the logloss of weighted probability for i-fold
        output_wts_li[i][j,model_num] = log_loss(y_li[i], final_proba)

        #record the associated weights
        output_wts_li[i][j,0:model_num] = wts

        j+=1

In [245]:
def take_third(elem):
    return elem[3]
sorted(output_wts_li[0], key=take_third, reverse=True)[9250:9261]

[array([0.00199203, 0.00199203, 0.99601594, 0.42239416]),
 array([0.00181159, 0.00181159, 0.99637681, 0.42239251]),
 array([0.00166113, 0.00166113, 0.99667774, 0.42239119]),
 array([0.00153374, 0.00153374, 0.99693252, 0.42239012]),
 array([0.0014245 , 0.0014245 , 0.997151  , 0.42238922]),
 array([0.00132979, 0.00132979, 0.99734043, 0.42238847]),
 array([0.00124688, 0.00124688, 0.99750623, 0.42238783]),
 array([0.00117371, 0.00117371, 0.99765258, 0.42238728]),
 array([0.00110865, 0.00110865, 0.99778271, 0.4223868 ]),
 array([0.00105042, 0.00105042, 0.99789916, 0.42238638]),
 array([0.000999  , 0.000999  , 0.998002  , 0.42238601])]

In [253]:
sorted(output_wts_li[7], key=take_third, reverse=True)[9250:9262]

[array([0.00199203, 0.00199203, 0.99601594, 0.44369035]),
 array([0.00181159, 0.00181159, 0.99637681, 0.44368664]),
 array([0.00166113, 0.00166113, 0.99667774, 0.44368361]),
 array([0.00153374, 0.00153374, 0.99693252, 0.44368107]),
 array([0.0014245 , 0.0014245 , 0.997151  , 0.44367893]),
 array([0.00132979, 0.00132979, 0.99734043, 0.44367709]),
 array([0.00124688, 0.00124688, 0.99750623, 0.4436755 ]),
 array([0.00117371, 0.00117371, 0.99765258, 0.44367411]),
 array([0.00110865, 0.00110865, 0.99778271, 0.44367288]),
 array([0.00105042, 0.00105042, 0.99789916, 0.44367179]),
 array([0.000999  , 0.000999  , 0.998002  , 0.44367084]),
 array([0., 0., 0., 0.])]

### SJ_test (mock)

In [160]:
df_sj = pd.read_csv("sj_test.gz", compression='gzip', header='infer')
Y_test = df_sj['click']

In [161]:
unused_cols_2 = ['site_id', 'click', 'user', 'click_history', 'day_of_week', 'device_ip_count', 'hour_count', 
                 'hourly_user_count']
df_sj.drop(unused_cols, axis=1, inplace=True)
categorical_f_2 = ['C1', 'banner_pos','site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                     ]
unused_cols = ['site_id', 'click']
categorical_f = ['C1', 'banner_pos','site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                 'day_of_week', 'user', 'click_history']

In [162]:
df_copy = df_train.copy()
df_copy.drop(unused_cols, axis=1, inplace=True)

In [163]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051084 entries, 0 to 2051083
Data columns (total 25 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   hour               int64 
 1   C1                 int64 
 2   banner_pos         int64 
 3   site_domain        object
 4   site_category      object
 5   device_id          object
 6   device_ip          object
 7   device_model       object
 8   device_type        int64 
 9   device_conn_type   int64 
 10  C14                int64 
 11  C15                int64 
 12  C16                int64 
 13  C17                int64 
 14  C18                int64 
 15  C19                int64 
 16  C20                int64 
 17  C21                int64 
 18  day_of_week        int64 
 19  device_ip_count    int64 
 20  device_id_count    int64 
 21  hour_count         int64 
 22  user               object
 23  hourly_user_count  int64 
 24  click_history      object
dtypes: int64(18), object(7)
memory usage: 391.2+ 

In [164]:
cat_sj = CatBoostClassifier(**best_p_c, od_type='Iter', eval_metric='Logloss')
cat_sj.fit(df_copy, Y_train, cat_features=categorical_f, verbose=True)
proba_ = cat_sj.predict_proba(df_sj.values)[:, 1]

print('logloss of sj_test is: %0.5f'% log_loss(Y_test, proba_))

0:	learn: 0.6869958	total: 6.44s	remaining: 53m 32s
1:	learn: 0.6809918	total: 12.8s	remaining: 53m 2s
2:	learn: 0.6751979	total: 18s	remaining: 49m 34s
3:	learn: 0.6695498	total: 22.8s	remaining: 47m 5s
4:	learn: 0.6639845	total: 28.4s	remaining: 46m 52s
5:	learn: 0.6586140	total: 33.5s	remaining: 45m 54s
6:	learn: 0.6533110	total: 39.1s	remaining: 45m 54s
7:	learn: 0.6483074	total: 45.6s	remaining: 46m 45s
8:	learn: 0.6433872	total: 50.8s	remaining: 46m 8s
9:	learn: 0.6384648	total: 56.4s	remaining: 46m 4s
10:	learn: 0.6338574	total: 1m 1s	remaining: 45m 55s
11:	learn: 0.6290912	total: 1m 7s	remaining: 45m 26s
12:	learn: 0.6246012	total: 1m 12s	remaining: 45m 33s
13:	learn: 0.6200531	total: 1m 17s	remaining: 44m 43s
14:	learn: 0.6157896	total: 1m 23s	remaining: 44m 59s
15:	learn: 0.6116219	total: 1m 28s	remaining: 44m 43s
16:	learn: 0.6074336	total: 1m 33s	remaining: 44m 26s
17:	learn: 0.6033532	total: 1m 38s	remaining: 44m 9s
18:	learn: 0.5993782	total: 1m 43s	remaining: 43m 50s
19:

152:	learn: 0.4342519	total: 14m 41s	remaining: 33m 20s
153:	learn: 0.4339809	total: 14m 50s	remaining: 33m 19s
154:	learn: 0.4337345	total: 14m 56s	remaining: 33m 16s
155:	learn: 0.4335050	total: 15m 4s	remaining: 33m 14s
156:	learn: 0.4332431	total: 15m 11s	remaining: 33m 11s
157:	learn: 0.4330109	total: 15m 18s	remaining: 33m 7s
158:	learn: 0.4327719	total: 15m 25s	remaining: 33m 5s
159:	learn: 0.4325607	total: 15m 32s	remaining: 33m 1s
160:	learn: 0.4323352	total: 15m 39s	remaining: 32m 58s
161:	learn: 0.4321260	total: 15m 46s	remaining: 32m 54s
162:	learn: 0.4319216	total: 15m 53s	remaining: 32m 50s
163:	learn: 0.4316911	total: 15m 59s	remaining: 32m 46s
164:	learn: 0.4314821	total: 16m 6s	remaining: 32m 41s
165:	learn: 0.4312149	total: 16m 14s	remaining: 32m 40s
166:	learn: 0.4309646	total: 16m 22s	remaining: 32m 38s
167:	learn: 0.4307197	total: 16m 30s	remaining: 32m 37s
168:	learn: 0.4304570	total: 16m 38s	remaining: 32m 35s
169:	learn: 0.4302221	total: 16m 46s	remaining: 32m 3

300:	learn: 0.4169271	total: 37m 18s	remaining: 24m 40s
301:	learn: 0.4168884	total: 37m 28s	remaining: 24m 33s
302:	learn: 0.4168543	total: 37m 39s	remaining: 24m 28s
303:	learn: 0.4168199	total: 37m 51s	remaining: 24m 24s
304:	learn: 0.4167836	total: 38m	remaining: 24m 18s
305:	learn: 0.4167464	total: 38m 10s	remaining: 24m 12s
306:	learn: 0.4167142	total: 38m 19s	remaining: 24m 5s
307:	learn: 0.4166733	total: 38m 29s	remaining: 23m 59s
308:	learn: 0.4166395	total: 38m 39s	remaining: 23m 54s
309:	learn: 0.4165972	total: 38m 49s	remaining: 23m 47s
310:	learn: 0.4165632	total: 38m 58s	remaining: 23m 41s
311:	learn: 0.4165234	total: 39m 7s	remaining: 23m 34s
312:	learn: 0.4164876	total: 39m 20s	remaining: 23m 30s
313:	learn: 0.4164523	total: 39m 29s	remaining: 23m 23s
314:	learn: 0.4164134	total: 39m 38s	remaining: 23m 17s
315:	learn: 0.4163714	total: 39m 48s	remaining: 23m 10s
316:	learn: 0.4163307	total: 40m 2s	remaining: 23m 6s
317:	learn: 0.4163005	total: 40m 13s	remaining: 23m 1s
3

448:	learn: 0.4133759	total: 1h 55s	remaining: 6m 55s
449:	learn: 0.4133588	total: 1h 1m 4s	remaining: 6m 47s
450:	learn: 0.4133422	total: 1h 1m 15s	remaining: 6m 39s
451:	learn: 0.4133253	total: 1h 1m 26s	remaining: 6m 31s
452:	learn: 0.4133069	total: 1h 1m 38s	remaining: 6m 23s
453:	learn: 0.4132960	total: 1h 1m 48s	remaining: 6m 15s
454:	learn: 0.4132794	total: 1h 1m 56s	remaining: 6m 7s
455:	learn: 0.4132679	total: 1h 2m 6s	remaining: 5m 59s
456:	learn: 0.4132515	total: 1h 2m 17s	remaining: 5m 51s
457:	learn: 0.4132348	total: 1h 2m 30s	remaining: 5m 43s
458:	learn: 0.4132241	total: 1h 2m 40s	remaining: 5m 35s
459:	learn: 0.4132054	total: 1h 2m 50s	remaining: 5m 27s
460:	learn: 0.4131892	total: 1h 2m 59s	remaining: 5m 19s
461:	learn: 0.4131733	total: 1h 3m 12s	remaining: 5m 11s
462:	learn: 0.4131574	total: 1h 3m 23s	remaining: 5m 3s
463:	learn: 0.4131420	total: 1h 3m 34s	remaining: 4m 55s
464:	learn: 0.4131261	total: 1h 3m 44s	remaining: 4m 47s
465:	learn: 0.4131102	total: 1h 3m 52s

In [150]:
cat_test = CatBoostClassifier(iterations=100,learning_rate=0.1,depth=7, eval_metric='Logloss')
cat_test.fit(df_copy, Y_train, verbose=True, cat_features=categorical_f_2)
y_pred_cat = cat_test.predict_proba(df_sj.values)[:, 1]
print("model logloss: %.5f" % log_loss(Y_test, y_pred_cat))

0:	learn: 0.6531283	total: 566ms	remaining: 56s
1:	learn: 0.6203634	total: 945ms	remaining: 46.3s
2:	learn: 0.5932915	total: 1.55s	remaining: 50.1s
3:	learn: 0.5707390	total: 1.92s	remaining: 46.1s
4:	learn: 0.5520063	total: 2.31s	remaining: 43.9s
5:	learn: 0.5361472	total: 2.68s	remaining: 42s
6:	learn: 0.5228734	total: 3.05s	remaining: 40.5s
7:	learn: 0.5116808	total: 3.43s	remaining: 39.4s
8:	learn: 0.5020503	total: 3.88s	remaining: 39.3s
9:	learn: 0.4938029	total: 4.28s	remaining: 38.5s
10:	learn: 0.4866742	total: 4.72s	remaining: 38.2s
11:	learn: 0.4806740	total: 5.2s	remaining: 38.1s
12:	learn: 0.4755052	total: 5.66s	remaining: 37.9s
13:	learn: 0.4711380	total: 6.14s	remaining: 37.7s
14:	learn: 0.4673230	total: 6.54s	remaining: 37.1s
15:	learn: 0.4640916	total: 6.93s	remaining: 36.4s
16:	learn: 0.4613016	total: 7.32s	remaining: 35.7s
17:	learn: 0.4589544	total: 7.68s	remaining: 35s
18:	learn: 0.4569079	total: 8.04s	remaining: 34.3s
19:	learn: 0.4550533	total: 8.41s	remaining: 33.

In [None]:
df_copy = convert_obj_to_int(df_copy)
df_sj = convert_obj_to_int(df_sj)
import lightgbm as lgb
lgb_sj = lgb.LGBMClassifier(**best_p_l, loss_function='Logloss')
lgb_sj.fit(df_copy, Y_train, verbose=True)
proba_2 = lgb_sj.predict_proba(df_sj.values)[:, 1]

print('logloss of sj_test is: %0.5f'% log_loss(Y_test, proba_2))

In [258]:
from xgboost import XGBClassifier
df_copy = convert_obj_to_int(df_copy)
df_sj = convert_obj_to_int(df_sj)
xgb_sj = XGBClassifier(**best_p_c, loss_function='Logloss')
xgb_sj.fit(df_copy, Y_train, verbose=True)
proba_3 = xgb_sj.predict_proba(df_sj.values)[:, 1]
print("model logloss: %.5f" % log_loss(Y_test, proba_3))

Parameters: { bagging_temperature, border_count, depth, iterations, l2_leaf_reg, loss_function, random_strength } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




NameError: name 'prob1_3' is not defined

In [259]:
print("model logloss: %.5f" % log_loss(Y_test, proba_3))

model logloss: 0.52193


In [263]:
best_weight = [0.000999  , 0.000999  , 0.998002]
proba_combined = proba_ * best_weight[0]+proba_3 * best_weight[1] +proba_2 * best_weight[2]
print("model logloss: %.5f" % log_loss(Y_test, proba_combined))

model logloss: 0.45646


In [264]:
df_test = pd.read_csv("test_modified.gz", compression='gzip', header='infer')

In [274]:
df_test

Unnamed: 0,hour,C1,banner_pos,device_type,device_conn_type,C14,C15,C16,C17,C18,...,device_id_count,hour_count,hourly_user_count,site_domain_int,site_category_int,device_id_int,device_ip_int,device_model_int,user_int,click_history_int
0,1,5,0,1,0,21694,201,31,2075,3,...,2974153,167769,3.0,-7922621779461760780,-6768992271165658330,630059432763922342,-3266082943187982762,-8936484730165669725,8369042709171464262,-1270970892774514273
1,1,5,1,1,0,16858,201,31,1465,3,...,2974153,167769,7.0,-8693261441984224658,3112977932537050345,630059432763922342,-7631505241619422866,-3956929300217769292,3265855767023884082,-7818156423557073224
2,1,5,0,1,0,21759,201,31,2080,0,...,2974153,167769,2.0,-8688569517418306699,-4131573997570822417,630059432763922342,-784420524355329646,3217593002617818802,-5556256507546026358,-8962407145868498340
3,1,5,1,1,0,19950,201,31,1378,3,...,2974153,167769,19.0,-64485646154536219,3112977932537050345,630059432763922342,-6344371719326813974,2169643850284159377,-3045071717250669269,7695420523787087916
4,1,5,1,1,0,19950,201,31,1378,3,...,2974153,167769,1.0,8089224346688236314,-5321392483428830522,630059432763922342,8988571162419365100,-6038351737313936492,-5606010157720562442,-8962407145868498340
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
473605,5,5,0,1,0,8330,201,31,339,3,...,2974153,324162,2.0,-1698749137548403830,-5321392483428830522,630059432763922342,8988571162419365100,-6038351737313936492,5653339731379379104,-8962407145868498340
473606,5,5,1,1,0,6616,201,31,154,2,...,2974153,324162,7.0,-4017769775047782938,3112977932537050345,630059432763922342,-7838162323056977293,5436563076860855470,-3190853662056000661,-8962407145868498340
473607,5,5,0,1,0,21763,201,31,2080,0,...,2974153,324162,2.0,-8688569517418306699,-4131573997570822417,630059432763922342,2413919770247425263,9222402813130808339,3437200189065378977,-8962407145868498340
473608,5,5,1,1,0,15705,201,31,1300,0,...,2974153,324162,270.0,-3903014210613918693,-5321392483428830522,630059432763922342,8988571162419365100,-7506620169117425748,-3885195663964554779,-8962407145868498340


In [269]:
unused_cols_2 = ['site_id', 'user', 'click_history', 'day_of_week', 'device_ip_count', 'hour_count', 
                 'hourly_user_count']
unused_cols = ['site_id']
df_test.drop(unused_cols, axis=1, inplace=True)
categorical_f_2 = ['C1', 'banner_pos','site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                     ]
categorical_f = ['C1', 'banner_pos','site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                 'day_of_week', 'user', 'click_history']

In [270]:
proba_1t = cat_sj.predict_proba(df_test.values)[:, 1]
df_test = convert_obj_to_int(df_test)
proba_2t = lgb_sj.predict_proba(df_test.values)[:, 1]
proba_3t = xgb_sj.predict_proba(df_test.values)[:, 1]
best_weight = [0.000999  , 0.000999  , 0.998002]
proba_combined_t = proba_1t * best_weight[0]+proba_3t * best_weight[1] +proba_2t * best_weight[2]

In [271]:
proba_combined_t

array([0.00400319, 0.02942782, 0.236705  , ..., 0.3060573 , 0.10958013,
       0.07735443])

In [275]:
df_test_prep = pd.read_csv("test.gz", compression='gzip', header='infer')

In [276]:
all_id = df_test_prep['id']
df_out = pd.DataFrame({'id': all_id, 'ctr': proba_combined_t})
df_out.to_csv('Submission.csv', index=False)

In [278]:
best_p_l = {
    'boosting_type':'gbdt', 
    'objective': 'binary', 
    'metric':'binary_logloss', 
    'bagging_freq': 5, 
    'max_depth': 7,
    'learning_rate': 0.11304216699488043,
    'feature_fraction': 0.5066204305086464,
    'bagging_fraction': 0.6657456066570288,
    'max_bin': 188,
    'n_estimators': 482,
    'num_leaves': 60,
    'min_sum_hessian_in_leaf':72
}

best_p_l_1 = {
    'boosting_type':'gbdt', 
    'objective': 'binary', 
    'metric':'binary_logloss', 
    'bagging_freq': 2, 
    'max_depth': 9,
    'learning_rate': 0.037681961372348104,
    'feature_fraction': 0.6186329542584896,
    'bagging_fraction': 0.7686771918501543,
    'max_bin': 198,
    'n_estimators': 854,
    'num_leaves': 58,
    'min_sum_hessian_in_leaf':34,
    'lambda_l1': 8.027647813535458,
    'lambda_l2': 5.230523285313312,
    'min_data_in_leaf': 93,
    'min_split_gain': 0.03929273115755069
}

best_p_l_2 = {
    'boosting_type':'gbdt', 
    'objective': 'binary', 
    'metric':'binary_logloss', 
    'bagging_freq': 7, 
    'max_depth': 10,
    'learning_rate': 0.3,
    'feature_fraction': 0.8999999999999999,
    'bagging_fraction': 0.8999999999999999,
    'max_bin': 59,
    'n_estimators': 218,
    'num_leaves': 80,
    'min_sum_hessian_in_leaf':0,
    'lambda_l1': 1e-08,
    'lambda_l2': 3.3855221440653636,
    'min_data_in_leaf':26,
    'min_split_gain': 0.03132393135883699
}

best_p_c = {
    'iterations': 300,
    'depth': 8,
    'l2_leaf_reg': 30,
    'random_strength': 9.725165337630147,
    'scale_pos_weight':1.0,
    'bagging_temperature':1.0,
    'border_count':1.0,
    'learning_rate': 0.23151807420324574
}

best_p_c1 = {
    'colsample_bylevel':1.0,
    'iterations': 250,
    'depth': 10,
    'l2_leaf_reg': 1000,
    'leaf_estimation_iterations': 5
    'model_size_reg': 0.001
    'random_strength': 10.0,
    'scale_pos_weight':1.0,
    'subsample':1.0
}

best_p_c2 = {
    'colsample_bylevel':0.6,
    'iterations': 250,
    'depth': 10,
    'l2_leaf_reg': 1000,
    'leaf_estimation_iterations': 1
    'model_size_reg': 0.001
    'random_strength': 1e-09,
    'scale_pos_weight':1.0,
    'subsample':0.6
}