In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.metrics import make_scorer
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv("train_df.gz", compression="gzip")

In [3]:
categorical_f = ['hour', 'C1', 'banner_pos', 'site_id', 'site_domain','site_category','device_id','device_ip','device_model',
                 'device_type','device_conn_type', 'C14','C15','C16','C17','C18', 'C19','C20','C21',
                 'day_of_week', 'user', 'click_history']

In [4]:
def convert_obj_to_int(self):
    """Replace every object-dtype column of a DataFrame with a hashed copy.

    For each column ``name`` whose dtype is ``object``, a new column
    ``name_int`` holding ``hash(value)`` for every cell is appended and the
    original column is dropped. Columns of any other dtype are left untouched.

    Parameters
    ----------
    self : pandas.DataFrame
        Frame to transform. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in.

    Notes
    -----
    Python salts ``hash`` for ``str`` values per interpreter process unless
    ``PYTHONHASHSEED`` is fixed, so hashed string columns are only comparable
    between frames converted in the same session.
    """
    new_col_suffix = '_int'
    # Snapshot the object columns up front: the loop below adds and drops
    # columns, and label-based iteration avoids integer-position lookups on
    # the label-indexed dtypes Series.
    object_columns = [name for name, dtype in self.dtypes.items() if dtype == object]
    for name in object_columns:
        self[name + new_col_suffix] = self[name].map(lambda x: hash(x))
        self.drop([name], inplace=True, axis=1)
    return self

# Separate the label from the predictors of the training frame.
y_train = df["click"]
X_train = df.drop(columns=["click"])

# Columns cast to object so that convert_obj_to_int hashes them as well.
column_list = [
    'C1', 'banner_pos', 'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

# astype returns a new frame, so X_train itself keeps its original columns.
X_train_hash = convert_obj_to_int(
    X_train.astype({name: 'object' for name in column_list})
)

In [5]:
# df_bootstrapped = df.sample(n=len(df), replace=True, random_state=101)
# X_boot = df_bootstrapped.drop(["click"],axis=1)
# y_boot = df_bootstrapped["click"]
# X_boot_hash = X_boot.copy()
# X_boot_hash[column_list] = X_boot_hash[column_list].astype('object')
# X_boot_hash = convert_obj_to_int(X_boot_hash)

In [8]:
# df_bootstrapped_2 = df.sample(n=len(df), replace=True, random_state=36)
# X_boot_2 = df_bootstrapped_2.drop(["click"],axis=1)
# y_boot_2 = df_bootstrapped_2["click"]
# X_boot_hash_2 = X_boot_2.copy()
# X_boot_hash_2[column_list] = X_boot_hash_2[column_list].astype('object')
# X_boot_hash_2 = convert_obj_to_int(X_boot_hash_2)

In [9]:
# df_bootstrapped_3 = df.sample(n=len(df), replace=True, random_state=70)
# X_boot_3 = df_bootstrapped_3.drop(["click"],axis=1)
# y_boot_3 = df_bootstrapped_3["click"]
# X_boot_hash_3 = X_boot_3.copy()
# X_boot_hash_3[column_list] = X_boot_hash_3[column_list].astype('object')
# X_boot_hash_3 = convert_obj_to_int(X_boot_hash_3)

In [5]:
# Held-out data: same label/predictor split and hashing as the training frame.
test = pd.read_csv("sj_test.gz", compression="gzip")
y_test = test["click"]
X_test = test.drop(columns=["click"])

# Columns cast to object so that convert_obj_to_int hashes them as well.
column_list = [
    'C1', 'banner_pos', 'device_type', 'device_conn_type',
    'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
]

# astype returns a new frame, so X_test itself keeps its original columns.
X_test_hash = convert_obj_to_int(
    X_test.astype({name: 'object' for name in column_list})
)

In [19]:
# optimal hyperparameters

# Settings common to every LightGBM configuration in this cell.
_lgb_shared = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
}

# One dict per tuned LightGBM configuration; each starts from the shared
# settings and adds its own values.
# NOTE(review): the numeric suffixes presumably identify the tuning run — confirm.
best_p_l = dict(
    _lgb_shared,
    bagging_freq=5,
    max_depth=7,
    learning_rate=0.11304216699488043,
    feature_fraction=0.5066204305086464,
    bagging_fraction=0.6657456066570288,
    max_bin=188,
    n_estimators=482,
    num_leaves=60,
    min_sum_hessian_in_leaf=72,
)

best_p_l_8 = dict(
    _lgb_shared,
    bagging_freq=2,
    max_depth=9,
    learning_rate=0.037681961372348104,
    feature_fraction=0.6186329542584896,
    bagging_fraction=0.7686771918501543,
    max_bin=198,
    n_estimators=854,
    num_leaves=58,
    min_sum_hessian_in_leaf=34,
    lambda_l1=8.027647813535458,
    lambda_l2=5.230523285313312,
    min_data_in_leaf=93,
    min_split_gain=0.03929273115755069,
)

best_p_l_28 = dict(
    _lgb_shared,
    bagging_freq=7,
    max_depth=10,
    learning_rate=0.3,
    feature_fraction=0.8999999999999999,
    bagging_fraction=0.8999999999999999,
    max_bin=59,
    n_estimators=218,
    num_leaves=80,
    min_sum_hessian_in_leaf=0,
    lambda_l1=1e-08,
    lambda_l2=3.3855221440653636,
    min_data_in_leaf=26,
    min_split_gain=0.03132393135883699,
)

best_p_l_4812 = dict(
    _lgb_shared,
    bagging_freq=1,
    max_depth=4,
    learning_rate=0.17830921881011944,
    feature_fraction=0.5866476844693964,
    bagging_fraction=0.8331767682764499,
    max_bin=255,
    n_estimators=283,
    num_leaves=24,
    min_sum_hessian_in_leaf=4,
    lambda_l1=4.227555053091508,
    lambda_l2=3.7698218608912613,
    min_data_in_leaf=65,
    min_split_gain=0.024848679158260303,
)

best_p_l_19 = dict(
    _lgb_shared,
    bagging_freq=7,
    max_depth=10,
    learning_rate=0.3,
    feature_fraction=0.8999999999999999,
    bagging_fraction=0.8999999999999999,
    max_bin=255,
    n_estimators=1000,
    num_leaves=80,
    min_sum_hessian_in_leaf=0,
    lambda_l1=10.0,
    lambda_l2=1e-08,
    min_data_in_leaf=20,
    min_split_gain=0.1,
)

best_p_l_48128 = dict(
    _lgb_shared,
    bagging_freq=1,
    max_depth=10,
    learning_rate=0.29287582078739416,
    feature_fraction=0.8999999999999999,
    bagging_fraction=0.8999999999999999,
    max_bin=255,
    n_estimators=625,
    num_leaves=80,
    min_sum_hessian_in_leaf=0,
    lambda_l1=10.0,
    lambda_l2=1e-08,
    min_data_in_leaf=20,
    min_split_gain=0.001,
)

# Tuned XGBoost hyperparameters.
# The former 'iterations' entry was removed: it is a CatBoost argument that
# XGBoost reports as unused, so the number of boosting rounds is (and already
# was) controlled by 'n_estimators'.
best_p_x = {
    'alpha': 0.0,
    'colsample_bytree': 1.0,
    'gamma': 0.0,
    'learning_rate': 0.27950642975302614,
    'max_depth': 5,
    'n_estimators': 100,
    'subsample': 1.0,
}

# Tuned CatBoost configurations, one dict per configuration.
# NOTE(review): the numeric suffixes presumably identify the tuning run — confirm.
best_p_c_2 = dict(
    colsample_bylevel=1.0,
    iterations=250,
    depth=10,
    l2_leaf_reg=1000,
    leaf_estimation_iterations=5,
    model_size_reg=0.001,
    random_strength=10.0,
    scale_pos_weight=1.0,
    subsample=1.0,
)

best_p_c_240 = dict(
    colsample_bylevel=0.6,
    iterations=250,
    depth=10,
    l2_leaf_reg=1000,
    leaf_estimation_iterations=1,
    model_size_reg=0.001,
    random_strength=1e-09,
    scale_pos_weight=1.0,
    subsample=0.6,
)

best_p_c_101 = dict(
    colsample_bylevel=0.5,
    iterations=250,
    depth=10,
    l2_leaf_reg=0.001,
    leaf_estimation_iterations=1,
    model_size_reg=0.01,
    random_strength=10,
    scale_pos_weight=1.0,
    subsample=1.0,
)

best_p_c_16 = dict(
    colsample_bylevel=1.0,
    iterations=250,
    depth=10,
    l2_leaf_reg=10000.0,
    leaf_estimation_iterations=1,
    model_size_reg=0.01,
    random_strength=1e-9,
    scale_pos_weight=1.0,
    subsample=1.0,
)

best_p_c_24 = dict(
    colsample_bylevel=1.0,
    iterations=250,
    depth=10,
    l2_leaf_reg=0.001,
    leaf_estimation_iterations=5,
    model_size_reg=0.01,
    random_strength=1e-9,
    scale_pos_weight=1.0,
    subsample=1.0,
)

### Train the tuned models on train_df to obtain the optimal ensemble weights

In [7]:
# Negated log-loss scorer (greater_is_better=False) based on predicted probabilities.
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
logloss_l = []
proba_lgb_li = []

import lightgbm as lgb

# The estimator gets its own name so the `lgb` module alias is not overwritten,
# and `loss_function` (a CatBoost argument LightGBM does not recognise) is
# omitted; the objective comes from best_p_l['objective'].
# NOTE(review): the test split serves as the early-stopping set and as the
# scoring set, so the recorded log loss is measured on data that influenced
# the stopping point.
# NOTE(review): `early_stopping_rounds` / `verbose` as fit() arguments depend on
# the installed LightGBM version — confirm before upgrading.
lgb_model = lgb.LGBMClassifier(**best_p_l)
lgb_model.fit(
    X_train_hash,
    y_train,
    eval_set=[(X_test_hash, y_test)],
    early_stopping_rounds=100,
    verbose=False,
)
# Column 1 of predict_proba: probability of the second class for each test row.
proba_lgb = lgb_model.predict_proba(X_test_hash.values)[:, 1]
proba_lgb_li.append(proba_lgb)
logloss_l.append(log_loss(y_test, proba_lgb))

# import lightgbm as lgb
# lgb = lgb.LGBMClassifier(**best_p_l, loss_function='Logloss')
# lgb.fit(X_boot_hash,y_boot,eval_set=[(X_test_hash,y_test)],early_stopping_rounds=100, verbose=False)
# proba_lgb = lgb.predict_proba(X_test_hash.values)[:, 1]
# proba_lgb_li.append(proba_lgb)
# logloss_l.append(log_loss(y_test, proba_lgb))

# import lightgbm as lgb
# lgb = lgb.LGBMClassifier(**best_p_l, loss_function='Logloss')
# lgb.fit(X_boot_hash_2,y_boot_2,eval_set=[(X_test_hash,y_test)],early_stopping_rounds=100, verbose=False)
# proba_lgb = lgb.predict_proba(X_test_hash.values)[:, 1]
# proba_lgb_li.append(proba_lgb)
# logloss_l.append(log_loss(y_test, proba_lgb))

# import lightgbm as lgb
# lgb = lgb.LGBMClassifier(**best_p_l, loss_function='Logloss')
# lgb.fit(X_boot_hash_3,y_boot_3,eval_set=[(X_test_hash,y_test)],early_stopping_rounds=100, verbose=False)
# proba_lgb = lgb.predict_proba(X_test_hash.values)[:, 1]
# proba_lgb_li.append(proba_lgb)
# logloss_l.append(log_loss(y_test, proba_lgb))



In [8]:
logloss_l

[0.4549539661539116]

In [9]:
logloss_l_8 = []
proba_lgb_li_8 = []

import lightgbm as lgb

# The estimator gets its own name so the `lgb` module alias is not overwritten,
# and `loss_function` (a CatBoost argument LightGBM does not recognise) is
# omitted; the objective comes from best_p_l_8['objective'].
# NOTE(review): the test split serves as the early-stopping set and as the scoring set.
lgb_model = lgb.LGBMClassifier(**best_p_l_8)
lgb_model.fit(
    X_train_hash,
    y_train,
    eval_set=[(X_test_hash, y_test)],
    early_stopping_rounds=100,
    verbose=False,
)
# Column 1 of predict_proba: probability of the second class for each test row.
proba_lgb = lgb_model.predict_proba(X_test_hash.values)[:, 1]
proba_lgb_li_8.append(proba_lgb)
logloss_l_8.append(log_loss(y_test, proba_lgb))

# import lightgbm as lgb
# lgb = lgb.LGBMClassifier(**best_p_l_8, loss_function='Logloss')
# lgb.fit(X_boot_hash,y_boot,eval_set=[(X_test_hash,y_test)],early_stopping_rounds=100, verbose=False)
# proba_lgb = lgb.predict_proba(X_test_hash.values)[:, 1]
# proba_lgb_li_8.append(proba_lgb)
# logloss_l_8.append(log_loss(y_test, proba_lgb))

# import lightgbm as lgb
# lgb = lgb.LGBMClassifier(**best_p_l_8, loss_function='Logloss')
# lgb.fit(X_boot_hash_2,y_boot_2,eval_set=[(X_test_hash,y_test)],early_stopping_rounds=100, verbose=False)
# proba_lgb = lgb.predict_proba(X_test_hash.values)[:, 1]
# proba_lgb_li_8.append(proba_lgb)
# logloss_l_8.append(log_loss(y_test, proba_lgb))

# import lightgbm as lgb
# lgb = lgb.LGBMClassifier(**best_p_l_8, loss_function='Logloss')
# lgb.fit(X_boot_hash_3,y_boot_3,eval_set=[(X_test_hash,y_test)],early_stopping_rounds=100, verbose=False)
# proba_lgb = lgb.predict_proba(X_test_hash.values)[:, 1]
# proba_lgb_li_8.append(proba_lgb)
# logloss_l_8.append(log_loss(y_test, proba_lgb))



In [10]:
logloss_l_28 = []
proba_lgb_li_28 = []

import lightgbm as lgb

# The estimator gets its own name so the `lgb` module alias is not overwritten,
# and `loss_function` (a CatBoost argument LightGBM does not recognise) is
# omitted; the objective comes from best_p_l_28['objective'].
# NOTE(review): the test split serves as the early-stopping set and as the scoring set.
lgb_model = lgb.LGBMClassifier(**best_p_l_28)
lgb_model.fit(
    X_train_hash,
    y_train,
    eval_set=[(X_test_hash, y_test)],
    early_stopping_rounds=100,
    verbose=False,
)
# Column 1 of predict_proba: probability of the second class for each test row.
proba_lgb = lgb_model.predict_proba(X_test_hash.values)[:, 1]
proba_lgb_li_28.append(proba_lgb)
logloss_l_28.append(log_loss(y_test, proba_lgb))

# import lightgbm as lgb
# lgb = lgb.LGBMClassifier(**best_p_l_28, loss_function='Logloss')
# lgb.fit(X_boot_hash,y_boot,eval_set=[(X_test_hash,y_test)],early_stopping_rounds=100, verbose=False)
# proba_lgb = lgb.predict_proba(X_test_hash.values)[:, 1]
# proba_lgb_li_28.append(proba_lgb)
# logloss_l_28.append(log_loss(y_test, proba_lgb))

# import lightgbm as lgb
# lgb = lgb.LGBMClassifier(**best_p_l_28, loss_function='Logloss')
# lgb.fit(X_boot_hash_2,y_boot_2,eval_set=[(X_test_hash,y_test)],early_stopping_rounds=100, verbose=False)
# proba_lgb = lgb.predict_proba(X_test_hash.values)[:, 1]
# proba_lgb_li_28.append(proba_lgb)
# logloss_l_28.append(log_loss(y_test, proba_lgb))

# import lightgbm as lgb
# lgb = lgb.LGBMClassifier(**best_p_l_28, loss_function='Logloss')
# lgb.fit(X_boot_hash_3,y_boot_3,eval_set=[(X_test_hash,y_test)],early_stopping_rounds=100, verbose=False)
# proba_lgb = lgb.predict_proba(X_test_hash.values)[:, 1]
# proba_lgb_li_28.append(proba_lgb)
# logloss_l_28.append(log_loss(y_test, proba_lgb))



In [11]:
logloss_l_4812 = []
proba_lgb_li_4812 = []

import lightgbm as lgb

# The estimator gets its own name so the `lgb` module alias is not overwritten,
# and `loss_function` (a CatBoost argument LightGBM does not recognise) is
# omitted; the objective comes from best_p_l_4812['objective'].
# NOTE(review): the test split serves as the early-stopping set and as the scoring set.
lgb_model = lgb.LGBMClassifier(**best_p_l_4812)
lgb_model.fit(
    X_train_hash,
    y_train,
    eval_set=[(X_test_hash, y_test)],
    early_stopping_rounds=100,
    verbose=False,
)
# Column 1 of predict_proba: probability of the second class for each test row.
proba_lgb = lgb_model.predict_proba(X_test_hash.values)[:, 1]
proba_lgb_li_4812.append(proba_lgb)
logloss_l_4812.append(log_loss(y_test, proba_lgb))



In [20]:
logloss_l_19 = []
proba_lgb_li_19 = []

import lightgbm as lgb

# The estimator gets its own name so the `lgb` module alias is not overwritten,
# and `loss_function` (a CatBoost argument LightGBM does not recognise) is
# omitted; the objective comes from best_p_l_19['objective'].
# NOTE(review): the test split serves as the early-stopping set and as the scoring set.
lgb_model = lgb.LGBMClassifier(**best_p_l_19)
lgb_model.fit(
    X_train_hash,
    y_train,
    eval_set=[(X_test_hash, y_test)],
    early_stopping_rounds=100,
    verbose=False,
)
# Column 1 of predict_proba: probability of the second class for each test row.
proba_lgb = lgb_model.predict_proba(X_test_hash.values)[:, 1]
proba_lgb_li_19.append(proba_lgb)
logloss_l_19.append(log_loss(y_test, proba_lgb))



In [26]:
logloss_l_19

[0.4553069584127081]

In [12]:
logloss_x = []
proba_xgb_li = []

# `loss_function` is omitted: it is a CatBoost argument, and XGBoost only
# reports it as a parameter that "might not be used".
# NOTE(review): the test split serves as the early-stopping set and as the scoring set.
xgb = XGBClassifier(**best_p_x)
xgb.fit(
    X_train_hash,
    y_train,
    eval_set=[(X_test_hash, y_test)],
    early_stopping_rounds=100,
    verbose=False,
)
# Column 1 of predict_proba: probability of the second class for each test row.
proba_xgb = xgb.predict_proba(X_test_hash.values)[:, 1]
proba_xgb_li.append(proba_xgb)
logloss_x.append(log_loss(y_test, proba_xgb))

# xgb = XGBClassifier(**best_p_x, loss_function='Logloss')
# xgb.fit(X_boot_hash_2,y_boot_2,eval_set=[(X_test_hash,y_test)],early_stopping_rounds=100, verbose=False)
# proba_xgb = xgb.predict_proba(X_test_hash.values)[:, 1]
# proba_xgb_li.append(proba_xgb)
# logloss_x.append(log_loss(y_test, proba_xgb))

# xgb = XGBClassifier(**best_p_x, loss_function='Logloss')
# xgb.fit(X_boot_hash_3,y_boot_3,eval_set=[(X_test_hash,y_test)],early_stopping_rounds=100, verbose=False)
# proba_xgb = xgb.predict_proba(X_test_hash.values)[:, 1]
# proba_xgb_li.append(proba_xgb)
# logloss_x.append(log_loss(y_test, proba_xgb))



Parameters: { iterations, loss_function } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [13]:
logloss_x

[0.4590530070074255]

In [14]:
# CatBoost with the best_p_c_2 settings, fitted on the un-hashed features with
# the columns named in categorical_f declared as categorical.
cat = CatBoostClassifier(
    cat_features=categorical_f,
    loss_function='Logloss',
    od_type='Iter',
    **best_p_c_2,
)
cat.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=100,
    verbose=False,
)
# Column 1 of predict_proba: probability of the second class for each test row.
proba_cat = cat.predict_proba(X_test.values)[:, 1]

# Single-element lists holding this model's test probabilities and log loss.
proba_cat_li_2 = [proba_cat]
logloss_c_2 = [log_loss(y_test, proba_cat)]

In [15]:
# CatBoost with the best_p_c_240 settings, fitted on the un-hashed features with
# the columns named in categorical_f declared as categorical.
cat = CatBoostClassifier(
    cat_features=categorical_f,
    loss_function='Logloss',
    od_type='Iter',
    **best_p_c_240,
)
cat.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=100,
    verbose=False,
)
# Column 1 of predict_proba: probability of the second class for each test row.
proba_cat = cat.predict_proba(X_test.values)[:, 1]

# Single-element lists holding this model's test probabilities and log loss.
proba_cat_li_240 = [proba_cat]
logloss_c_240 = [log_loss(y_test, proba_cat)]

In [16]:
# CatBoost with the best_p_c_101 settings, fitted on the un-hashed features with
# the columns named in categorical_f declared as categorical.
cat = CatBoostClassifier(
    cat_features=categorical_f,
    loss_function='Logloss',
    od_type='Iter',
    **best_p_c_101,
)
cat.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=100,
    verbose=False,
)
# Column 1 of predict_proba: probability of the second class for each test row.
proba_cat = cat.predict_proba(X_test.values)[:, 1]

# Single-element lists holding this model's test probabilities and log loss.
proba_cat_li_101 = [proba_cat]
logloss_c_101 = [log_loss(y_test, proba_cat)]

In [17]:
# CatBoost with the best_p_c_16 settings, fitted on the un-hashed features with
# the columns named in categorical_f declared as categorical.
cat = CatBoostClassifier(
    cat_features=categorical_f,
    loss_function='Logloss',
    od_type='Iter',
    **best_p_c_16,
)
cat.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=100,
    verbose=False,
)
# Column 1 of predict_proba: probability of the second class for each test row.
proba_cat = cat.predict_proba(X_test.values)[:, 1]

# Single-element lists holding this model's test probabilities and log loss.
proba_cat_li_16 = [proba_cat]
logloss_c_16 = [log_loss(y_test, proba_cat)]

In [18]:
# CatBoost with the best_p_c_24 settings, fitted on the un-hashed features with
# the columns named in categorical_f declared as categorical.
cat = CatBoostClassifier(
    cat_features=categorical_f,
    loss_function='Logloss',
    od_type='Iter',
    **best_p_c_24,
)
cat.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=100,
    verbose=False,
)
# Column 1 of predict_proba: probability of the second class for each test row.
proba_cat = cat.predict_proba(X_test.values)[:, 1]

# Single-element lists holding this model's test probabilities and log loss.
proba_cat_li_24 = [proba_cat]
logloss_c_24 = [log_loss(y_test, proba_cat)]

In [56]:
# Load predictions that were saved to disk rather than computed in this notebook.
# NOTE(review): presumably produced by a model trained with best_p_l_48128 — confirm the file's provenance.
lgb48128 = pd.read_csv('lgb48128_pred.csv')

In [57]:
# Take the column labelled '0' as a plain NumPy array. Unlike the other
# proba_*_li_* names this is the array itself, not a one-element list.
proba_lgb_li_48128 = np.array(lgb48128['0'])

In [58]:
# One column of test-set probabilities per base model. Column order matters:
# the blending cells further down apply their weights positionally in this order.
prob_dict = dict(
    lgbm=proba_lgb_li[0],
    lgbm_8=proba_lgb_li_8[0],
    lgbm_28=proba_lgb_li_28[0],
    lgbm_4812=proba_lgb_li_4812[0],
    lgbm_19=proba_lgb_li_19[0],
    lgbm_48128=proba_lgb_li_48128,
    xgb=proba_xgb_li[0],
    cat_2=proba_cat_li_2[0],
    cat_240=proba_cat_li_240[0],
    cat_101=proba_cat_li_101[0],
    cat_16=proba_cat_li_16[0],
    cat_24=proba_cat_li_24[0],
)
prob_df = pd.DataFrame(prob_dict)

# Start from here

In [59]:
prob_df

Unnamed: 0,lgbm,lgbm_8,lgbm_28,lgbm_4812,lgbm_19,lgbm_48128,xgb,cat_2,cat_240,cat_101,cat_16,cat_24
0,0.720769,0.708838,0.743060,0.666438,0.731289,0.634651,0.742674,0.557389,0.520828,0.510867,0.522435,0.344089
1,0.259261,0.195213,0.246929,0.211735,0.303465,0.135107,0.134947,0.160330,0.149793,0.155188,0.180702,0.171974
2,0.178526,0.163285,0.161672,0.174675,0.151295,0.128034,0.177632,0.164874,0.152439,0.156001,0.167668,0.150325
3,0.258672,0.279451,0.285455,0.258013,0.194119,0.529545,0.269195,0.267091,0.235199,0.255933,0.249617,0.219441
4,0.224993,0.241033,0.228025,0.198196,0.246160,0.264212,0.181404,0.292495,0.254902,0.287250,0.256893,0.255608
...,...,...,...,...,...,...,...,...,...,...,...,...
632698,0.230936,0.248003,0.257737,0.249031,0.282837,0.226353,0.274747,0.245429,0.240252,0.254258,0.257292,0.218767
632699,0.167198,0.192693,0.133136,0.180714,0.142688,0.212325,0.138745,0.168712,0.177035,0.164318,0.214233,0.141854
632700,0.154964,0.171299,0.151572,0.169375,0.135997,0.167799,0.163699,0.178346,0.162533,0.182450,0.158352,0.170247
632701,0.262074,0.276528,0.274755,0.264907,0.285704,0.311325,0.289288,0.301740,0.280562,0.294510,0.298746,0.277619


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer,log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.svm import l1_min_c

# Negated log-loss scorer (greater_is_better=False) based on predicted probabilities.
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)

# Candidate C values: 20 log-spaced multiples (10**0 .. 10**10) of the value
# returned by l1_min_c for this data.
min_c = l1_min_c(prob_df, y_test, loss='log')
cs = min_c * np.logspace(0, 10, 20)
param_grid = {'C': list(cs)}

# L1-penalised logistic regression used as the stacking model over the
# base-model probabilities.
# NOTE(review): the stacker is tuned on the test labels (y_test).
clf = LogisticRegression(penalty='l1', solver='liblinear')

grid = GridSearchCV(clf, param_grid, verbose=10, cv=3, scoring=LogLoss)
grid.fit(prob_df, y_test)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] C=5.362030922832332e-06 .........................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ............ C=5.362030922832332e-06, score=-0.693, total=   0.4s
[CV] C=5.362030922832332e-06 .........................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] ............ C=5.362030922832332e-06, score=-0.693, total=   0.4s
[CV] C=5.362030922832332e-06 .........................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV] ............ C=5.362030922832332e-06, score=-0.693, total=   0.4s
[CV] C=1.801544954615117e-05 .........................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.2s remaining:    0.0s


[CV] ............ C=1.801544954615117e-05, score=-0.550, total=   0.5s
[CV] C=1.801544954615117e-05 .........................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.8s remaining:    0.0s


[CV] ............ C=1.801544954615117e-05, score=-0.550, total=   0.5s
[CV] C=1.801544954615117e-05 .........................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.3s remaining:    0.0s


[CV] ............ C=1.801544954615117e-05, score=-0.550, total=   0.5s
[CV] C=6.052863682078157e-05 .........................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    2.8s remaining:    0.0s


[CV] ............ C=6.052863682078157e-05, score=-0.512, total=   0.6s
[CV] C=6.052863682078157e-05 .........................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    3.4s remaining:    0.0s


[CV] ............ C=6.052863682078157e-05, score=-0.512, total=   0.5s
[CV] C=6.052863682078157e-05 .........................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    3.9s remaining:    0.0s


[CV] ............ C=6.052863682078157e-05, score=-0.512, total=   0.6s
[CV] C=0.00020336522083429175 ........................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    4.5s remaining:    0.0s


[CV] ........... C=0.00020336522083429175, score=-0.462, total=   0.9s
[CV] C=0.00020336522083429175 ........................................
[CV] ........... C=0.00020336522083429175, score=-0.460, total=   0.9s
[CV] C=0.00020336522083429175 ........................................
[CV] ........... C=0.00020336522083429175, score=-0.464, total=   1.0s
[CV] C=0.0006832701877531929 .........................................
[CV] ............ C=0.0006832701877531929, score=-0.439, total=   1.3s
[CV] C=0.0006832701877531929 .........................................
[CV] ............ C=0.0006832701877531929, score=-0.431, total=   1.2s
[CV] C=0.0006832701877531929 .........................................
[CV] ............ C=0.0006832701877531929, score=-0.445, total=   1.2s
[CV] C=0.0022956636712857296 .........................................
[CV] ............ C=0.0022956636712857296, score=-0.437, total=   1.4s
[CV] C=0.0022956636712857296 .........................................
[CV] .

In [None]:
grid.best_estimator_

In [68]:
# Fit the L1-penalised logistic-regression stacker on all base-model probabilities.
# NOTE(review): C is hard-coded; presumably copied from grid.best_estimator_ — confirm.
clf_opt = LogisticRegression(C=0.7288911003040416, penalty='l1', solver='liblinear')
# NOTE(review): fitted on the test labels, i.e. the same rows later used to score the blend.
clf_opt.fit(prob_df, y_test)

LogisticRegression(C=0.7288911003040416, penalty='l1', solver='liblinear')

In [75]:
# Rescale the stacker's coefficients so they sum to 1 and use them as blend
# weights, one per prob_df column in column order.
# NOTE(review): the recorded weights include negative values, so the blend is
# not a convex combination and is not guaranteed to stay within [0, 1].
wts = clf_opt.coef_[0] / clf_opt.coef_[0].sum()

In [76]:
wts

array([ 0.        , -0.09637551,  0.04561797, -0.19434204,  0.03252338,
        1.13736628, -0.01169298,  0.5342642 ,  0.4742598 , -0.64531503,
       -0.49830387,  0.22199779])

In [77]:
# Base-model probabilities in the same order as the prob_df columns, so that
# position k lines up with wts[k].
blend_members = [
    proba_lgb_li[0],
    proba_lgb_li_8[0],
    proba_lgb_li_28[0],
    proba_lgb_li_4812[0],
    proba_lgb_li_19[0],
    proba_lgb_li_48128,
    proba_xgb_li[0],
    proba_cat_li_2[0],
    proba_cat_li_240[0],
    proba_cat_li_101[0],
    proba_cat_li_16[0],
    proba_cat_li_24[0],
]

# Weighted sum of the members, accumulated one model at a time.
final_proba = np.zeros((len(prob_df), ))
for member, weight in zip(blend_members, wts):
    final_proba += member * weight

# Log loss of the blended prediction on the test labels.
log_loss(y_test, final_proba)

0.44779935493713907

In [60]:
# Hand-picked blend: (probabilities, weight) pairs, in the order they are summed.
manual_blend = [
    (proba_lgb_li[0], 0.15),
    (proba_lgb_li_8[0], 0.15),
    (proba_lgb_li_28[0], 0.15),
    (proba_lgb_li_4812[0], 0.15),
    (proba_lgb_li_19[0], 0.15),
    (proba_lgb_li_48128, 0.15),
    (proba_xgb_li[0], 0.001),
    (proba_cat_li_2[0], 0.001),
    (proba_cat_li_240[0], 0.001),
    (proba_cat_li_101[0], 0.001),
    (proba_cat_li_16[0], 0.001),
    (proba_cat_li_24[0], 0.095),
]

# Weighted sum of the members, accumulated one model at a time.
final_proba = np.zeros((len(prob_df), ))
for member, weight in manual_blend:
    final_proba += member * weight

# Log loss of the blended prediction on the test labels.
log_loss(y_test, final_proba)

0.4487179307386303

In [47]:
final_proba

array([0.60432742, 0.20919254, 0.13900652, ..., 0.15464651, 0.25881689,
       0.28517411])

In [25]:
# Persist the base-model probabilities (one column per model).
# NOTE(review): the row index is written as well, so reading the file back
# yields an extra unnamed first column unless index_col=0 is passed.
prob_df.to_csv('prob_df.csv')

In [63]:
# Scratch cell: list every ordering of the three candidate weights.
# NOTE(review): the TypeError recorded under this cell mentions `repeat`, which
# this call does not pass — the output looks stale; re-run or delete the cell.
import itertools
list(itertools.permutations([1,5,10],))

TypeError: 'repeat' is an invalid keyword argument for permutations()

In [45]:
import itertools

# Base models taking part in the weight grid search. The order matches the
# weight columns of the results table built from output_wts
# (lgbm, lgbm_8, lgbm_28, xgb, cat_2, cat_240, cat_101, cat_16, cat_24).
# proba_lgb_li_4812 is deliberately left out: the search has nine weight slots,
# so the former `wts[9]` term indexed past the end of the weight vector.
blend_members = [
    proba_lgb_li[0],
    proba_lgb_li_8[0],
    proba_lgb_li_28[0],
    proba_xgb_li[0],
    proba_cat_li_2[0],
    proba_cat_li_240[0],
    proba_cat_li_101[0],
    proba_cat_li_16[0],
    proba_cat_li_24[0],
]
model_num = len(blend_members)

# Stack once so each candidate blend is a single matrix-vector product.
member_matrix = np.column_stack(blend_members)

# Raw weight levels tried for every model; each combination is normalised to sum to 1.
candidate_wts = [1, 5, 10]

# One row per weight combination: model_num weights followed by the log loss.
# Sized to the number of combinations so no all-zero placeholder rows remain.
output_wts = np.zeros((len(candidate_wts) ** model_num, model_num + 1))

for j, raw_wts in enumerate(itertools.product(candidate_wts, repeat=model_num)):
    wts = np.array(raw_wts) / np.sum(raw_wts)

    # Weighted blend of the base-model probabilities for this combination.
    final_proba = member_matrix @ wts

    # Record the weights and the log loss of the blend on the test labels.
    output_wts[j, 0:model_num] = wts
    output_wts[j, model_num] = log_loss(y_test, final_proba)

In [46]:
# Label the grid-search results: one weight column per model, then the log loss.
wts_columns = [
    'lgbm', 'lgbm_8', 'lgbm_28', 'xgb', 'cat_2',
    'cat_240', 'cat_101', 'cat_16', 'cat_24', 'logloss',
]
df_wts = pd.DataFrame(output_wts, columns=wts_columns)

In [54]:
# Three best (lowest log loss) weight combinations, listed from worst to best.
# Rows whose logloss is 0 are never-filled placeholder rows of the results
# array, so they are filtered out instead of relying on hard-coded row
# positions that only hold for 3**9 combinations.
(
    df_wts[df_wts['logloss'] > 0]
    .sort_values('logloss', ascending=False)
    .tail(3)
)

Unnamed: 0,lgbm,lgbm_8,lgbm_28,xgb,cat_2,cat_240,cat_101,cat_16,cat_24,logloss
16768,0.285714,0.142857,0.285714,0.028571,0.028571,0.028571,0.028571,0.028571,0.142857,0.454346
18956,0.222222,0.222222,0.222222,0.022222,0.022222,0.022222,0.022222,0.022222,0.222222,0.454303
18955,0.25,0.25,0.25,0.025,0.025,0.025,0.025,0.025,0.125,0.454274
