In [1]:
import pandas as pd
import numpy as np
import pickle, time, math
from datetime import datetime
import os 
import csv

In [2]:
with open('train_data.pickle','rb') as f:
    data=pickle.load(f)
data_test=data[data.isTrain==False]
data=data[data.isTrain==True]
x=data.copy()
if 'totals_transactionRevenue' in data.columns:
    y=x.pop('totals_transactionRevenue').to_frame()
#x=data.copy()

In [3]:
def columns_index(x):
    features=x.columns.tolist()
    category_features=[]
    category_features_idx=[]
    for col in features:
        if str(x[col].dtype)=='category':
            #print(col,np.object)
            category_features.append(col)
            category_features_idx.append(features.index(col))
        if x[col].dtype==np.bool:
            #print(col,np.bool)
            category_features.append(col)
            category_features_idx.append(features.index(col))
    return features, category_features, category_features_idx

In [4]:
x=x.drop(['date','visitStartTime','sessionId','isTrain','visitId'],axis=1)
x['fullVisitorId']=x['fullVisitorId'].astype('category')
features, category_features, category_features_idx=columns_index(x)

In [5]:
from catboost import CatBoostClassifier,Pool
from sklearn.model_selection import StratifiedKFold, train_test_split,GroupKFold

In [6]:
#xtrain,xtest,ytrain,ytest=train_test_split(x, y, test_size=0.1, random_state=42, shuffle=True, stratify=y)
_y=y.copy()
_y.loc[(_y['totals_transactionRevenue']>0),'totals_transactionRevenue']=1
_y.loc[(_y['totals_transactionRevenue']==0),'totals_transactionRevenue']=0

In [7]:
from hyperopt import fmin, tpe, Trials, STATUS_OK, STATUS_FAIL, hp, pyll

## binary classification

In [23]:
fullVisitorIds=np.unique(x.fullVisitorId)

In [25]:
cv_split=[]
gF=GroupKFold(n_splits=2)
for train_idx,test_idx in gF.split(x,_y,groups=x.fullVisitorId):
    pool_train=Pool(x.iloc[train_idx],_y.iloc[train_idx],cat_features=category_features_idx)
    pool_test=Pool(x.iloc[test_idx],_y.iloc[test_idx],cat_features=category_features_idx)
    cv_split.append((pool_train,pool_test))

In [26]:
param_space = {
            'depth': hp.choice('depth', [4,6]),
            'border_count': hp.choice('border_count', [32,64,128]),
            'learning_rate': hp.loguniform('learning_rate', -5, 0),
            'random_strength': hp.choice('random_strength', [1, 5, 10, 20]),
            'one_hot_max_size': hp.choice('one_hot_max_size', [5, 25, 225]),
            'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)),
            'bagging_temperature': hp.uniform('bagging_temperature', 0, 1),
            'leaf_estimation_iterations':hp.choice('leaf_estimation_iterations',[1,3,5,7,10]),
            'max_ctr_complexity':hp.quniform('max_ctr_complexity',1,5,1),
            #'leaf_estimation_method':hp.choice('leaf_estimation_method',['Newton','Gradient']),
            'rsm':hp.uniform('rsm',0,1),
            #'fold_len_multiplier':hp.choice('fold_len_multiplier',[2,3,4])
            #'class_weights': (hp.choice('non_class_weights_ratio',[1]), hp.uniform('class_weights_ratio',1,20))
        }

In [27]:
def fit(params=None, dtrain=None, dtest=None, n_estimators=None, seed=0, run_time=None, run_cv_id=0, eval_no=0, verbose=False):
    global metric,column_names
    #print(run_cv_id, eval_no)
    path="./cv_run/"+str(run_time)
    if not os.path.isdir(path):
        os.mkdir(path)
    fpath=path+"/"+str(eval_no)+"."+str(run_cv_id)
    if not os.path.isdir(fpath):
        os.mkdir(fpath)    
    params.update({"iterations": n_estimators})
    params.update({"eval_metric": metric})
    params.update({"logging_level": 'Verbose'})
    params.update({"metric_period": 100})
    params.update({"random_seed": seed})
    params.update({"leaf_estimation_method": "Newton"})
    #params.update({"leaf_estimation_iterations" : 10})
    params.update({"rsm" : 1})
    params.update({"thread_count" : 8})
    params.update({"fold_len_multiplier": 2})
    #params.update({"max_ctr_complexity":5})
    params.update({"train_dir": fpath})
    params.update({"calc_feature_importance" : True})
    params.update({'od_type':'Iter'})
    params.update({'od_wait':30})
    params.update({'od_pval':1e-3})
    
    bst = CatBoostClassifier(**params)
    bst.fit(dtrain, eval_set=dtest, use_best_model=True)
    with open(fpath + "/test_error.tsv", "r") as f:
        reader=np.array(list(csv.reader(f,delimiter='\t'))).squeeze()
    header=reader[0]
    feature=dict()
    for col, val in zip(features,bst.__dict__['_feature_importance']):
        feature.update({col:val})
    #pd.to_pickle(bst.__dict__['_feature_importance'],path+"/feature_importance."+str(eval_no)+"."+str(run_cv_id))
    pd.to_pickle(feature,path+"/feature_importance."+str(eval_no)+"."+str(run_cv_id))
    idx=(header==metric).argmax()
    #print('idx',idx, metric)
    results=(reader[1:reader.shape[0],idx]).astype(np.float)
    
    if metric=='AUC' or metric=='Accuracy':
        #print("metric",metric)
        results=1-results
    #print('results',results)
    return bst, results

In [28]:
def mean_each_iter(results):
    #global metric
    lengs=[]
    _=[lengs.append(len(res)) for res in results]
    #if metric in ['AUC','Accuracy']:
    mlen=np.max(lengs)
    #else:
    #    mlen=np.min(lengs)
    #print(lengs,mlen)
    a=[]
    for run in results:
        a.append((np.pad(run,(0,mlen-len(run)),'constant')).tolist())
    x=np.array(a)
    #print(x.shape)
    means=[]
    for i in range(x.shape[1]):
        mean=0
        count=0
        for j in range(x.shape[0]):
            if x[j,i] > 0:
                count=count+1
                mean=mean+x[j,i]
        means.append(mean/count)
    return means

In [29]:
def run_cv(cv_pairs, params=None, n_est=None, verbose=False, run_time=None):
    global default_params,n_estimators,best_loss,hyperopt_eval_num,metric,hyperopt_evals,metric
    params = params or default_params
    n_estimators = n_est or n_estimators
    #print('run_cv',hyperopt_eval_num)
    evals_results, start_time = [], time.time()
    _loss=[]
    i=0
    for dtrain, dtest in cv_pairs:
        _, evals_result = fit(params, dtrain, dtest, n_estimators, run_time=run_time, run_cv_id=i, eval_no=hyperopt_eval_num+1)
        #evals_results.append(np.mean(evals_result,axis=0))
        evals_results.append(evals_result)
        _loss.append(np.min(evals_result))
        i=i+1
    
    mean_evals_results = mean_each_iter(evals_results)
    best_n_estimators = np.argmin(mean_evals_results) + 1
    eval_time = time.time() - start_time
    
    
    cv_result = {'loss': mean_evals_results[best_n_estimators - 1] ,
                 'best_n_estimators': best_n_estimators, 
                 'eval_time': eval_time,
                 'status': STATUS_FAIL if np.isnan(mean_evals_results[best_n_estimators - 1]) else STATUS_OK,
                 'params': params.copy(),
                 'losses': _loss
                }
    best_loss = min(best_loss, cv_result['loss'])
    hyperopt_eval_num += 1
    cv_result.update({'hyperopt_eval_num': hyperopt_eval_num, 'best_loss': best_loss})
        
    if verbose:
        print ('[{0}/{1}]\teval_time={2:.2f} sec\tcurrent_{3}={4:.6f}\tmin_{3}={5:.6f}'.format(
                    hyperopt_eval_num, hyperopt_evals, eval_time,
                    metric, cv_result['loss'], best_loss))
    return cv_result

In [None]:
n_estimators=1000
max_evals = 20
hyperopt_evals=max_evals
metric="AUC"

this_trials = Trials()
run_time=(datetime.now()).strftime('%Y%m%d%H%M')
hyperopt_eval_num, best_loss, split_pair_data = 0, np.inf, None
args=param_space
_ = fmin(fn=lambda args: run_cv(cv_split, params=args, n_est=n_estimators, verbose=True,run_time=run_time), 
         space=args, algo=tpe.suggest, max_evals=max_evals, trials=this_trials)

with open('./cv_run/'+run_time+'/trails.pickle','wb') as f:
    pickle.dump(this_trials,f)

0:	learn: 0.9776258	test: 0.9774847	best: 0.9774847 (0)	total: 1.49s	remaining: 24m 45s
100:	learn: 0.9888997	test: 0.9879778	best: 0.9879778 (100)	total: 2m 32s	remaining: 22m 34s
200:	learn: 0.9895575	test: 0.9883451	best: 0.9883451 (200)	total: 5m 9s	remaining: 20m 28s


In [16]:
import lightgbm as lgb
vis=x['fullVisitorId'].tolist()
gF=GroupKFold(n_splits=5)
cv_split=[]
for train_idx,test_idx in gF.split(x,_y,groups=vis):
    cv_split.append(((x.iloc[train_idx],_y.iloc[train_idx]),(x.iloc[test_idx],_y.iloc[test_idx])))

params={'learning_rate':0.1,
        'objective':'binary',
        'metric':'auc',
        'num_leaves':4096,
        #'bagging_freq': 3,
        #"bagging_fraction": 0.6,
        "feature_fraction": 0.6,
        'max_depth':-1,
        'random_state':42
       }

clf = lgb.LGBMClassifier(**params)

#sub_clf_preds = np.zeros(X_test.shape[0])
oof_clf_preds=[]
#sub_clf_preds=[]
for fold, ((xtrain,ytrain),(xtest,ytest)) in enumerate(cv_split):
    clf = lgb.LGBMClassifier(**params, n_estimators=1000)
    clf.fit(xtrain, ytrain, eval_set=[(xtest, ytest)], eval_metric='auc',
        early_stopping_rounds=20, verbose=100,catego=category_features_idx)    
    prob=clf.predict_proba(xtest,clf.best_iteration_)
    oof_clf_preds.append((ytest,prob))

In [25]:
#this_trials.trials[0]['result']['params']
#
t=dict({'bagging_temperature': 0.08698568470862977,
 'border_count': 32,
 'depth': 4,
 'fold_len_multiplier': 2,
 'l2_leaf_reg': 6.511032253810222,
 'leaf_estimation_iterations': 3,
 'leaf_estimation_method': 'Gradient',
 'learning_rate': 0.2774845333553973,
 'max_ctr_complexity': 4.0,
 'one_hot_max_size': 225,
 'random_strength': 5,
 'rsm': 1,
 'iterations': 1000,
 'eval_metric': 'Accuracy',
 'logging_level': 'Silent',
 'metric_period': 100,
 'random_seed': 0,
 'thread_count': 8,
 'train_dir': './cv_run/201810100842/1.4',
 'calc_feature_importance': True,
 'od_type': 'Iter',
 'od_wait': 30})

{'bagging_temperature': 0.08698568470862977,
 'border_count': 32,
 'depth': 4,
 'fold_len_multiplier': 2,
 'l2_leaf_reg': 6.511032253810222,
 'leaf_estimation_iterations': 3,
 'leaf_estimation_method': 'Gradient',
 'learning_rate': 0.2774845333553973,
 'max_ctr_complexity': 4.0,
 'one_hot_max_size': 225,
 'random_strength': 5,
 'rsm': 1,
 'iterations': 1000,
 'eval_metric': 'Accuracy',
 'logging_level': 'Silent',
 'metric_period': 100,
 'random_seed': 0,
 'thread_count': 8,
 'train_dir': './cv_run/201810100842/1.4',
 'calc_feature_importance': True,
 'od_type': 'Iter',
 'od_wait': 30}

In [108]:
acc=[]
for t in this_trials.trials:
    print(1-np.mean(t['result']['losses']))
    acc.append(1-np.mean(t['result']['losses']))

0.98725725808
0.98838970014
0.98725725808
0.98745829444
0.98838232212
0.98725725808
0.98827903776
0.98767592952
0.9882845714
0.98816837548
0.98843396492
0.98818497516
0.98773310524
0.98747489382
0.98725725808
0.98725725808
0.98762613164
0.98754313528
0.98785852128
0.98765195284


In [109]:
print(np.max(acc),np.argmax(acc))

0.98843396492 10


In [110]:
acc[10]

0.98843396492

In [15]:
with open('cv_run/201810091006/trails.pickle', 'rb') as f:
    this_trials=pickle.load(f)

In [20]:
metric='Accuracy'
arg=this_trials.trials[4]['result']['params']
run_time=(datetime.now()).strftime('%Y%m%d%H%M')
trainP=Pool(xtrain,ytrain,cat_features=category_features_idx)
testP=Pool(xtest,ytest,cat_features=category_features_idx)
bst,res=fit(params=arg,dtrain=trainP,dtest=testP,n_estimators=2000,seed=13,run_time=run_time)

In [25]:
acc=bst.eval_metrics(testP,metrics=['Accuracy'])['Accuracy']
print(np.max(acc),np.argmax(acc))

0.9882006960621034 149


In [114]:
with open('./model/model_class.param','wb') as f:
    pickle.dump(this_trials.trials[10]['result']['params'],f)

In [111]:
metric='Accuracy'
arg=this_trials.trials[10]['result']['params']
run_time=(datetime.now()).strftime('%Y%m%d%H%M')
#trainP=Pool(xtrain,np.log1p(ytrain),cat_features=category_features_idx)
#testP=Pool(xtest,np.log1p(ytest),cat_features=category_features_idx)
bst2,res=fit(params=arg,dtrain=trainP,dtest=testP,n_estimators=2000,seed=13,run_time=run_time)

In [112]:
acc=bst2.eval_metrics(testP,metrics=['Accuracy'])['Accuracy']
print(np.max(acc),np.argmax(acc))

0.9882726261681726 137


In [4]:
bst2.save_model('./model/model_class.cbm')

NameError: name 'bst2' is not defined

In [19]:
model=CatBoostClassifier()
loadbst=model.load_model('./model/model_class.cbm')

In [89]:
losses2=[]
xt=xtest.copy()
for col in xtest.columns:
    #col_val=xtest[col]
    xtest[col]=((xtest[col]).sample(frac=1)).tolist()
    test_P=None
    #print(col,(xt[col]==xtest[col]).sum())
    test_P=Pool(xtest,ytest,cat_features=category_features_idx)
    loss=bst.eval_metrics(test_P,['Logloss'])
    print(col,loss['Logloss'][-1])
    losses2.append(loss['Logloss'][-1])
    xtest[col]=xt[col]
    #print(col,(xt[col]==xtest[col]).sum())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


channelGrouping 0.07554811397530913


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


date_day 0.07555293826046953
date_dayofweek 0.07554596397910972
date_dayofyear 0.0755599804346336
date_month 0.07555863155786963
date_quarter 0.07555076945290962
date_year 0.07555294311768711
device_browser 0.07555314591442984
device_deviceCategory 0.07555761931161024
device_isMobile 0.07555323766331212
device_operatingSystem 0.07555669679744326
geoNetwork_city 0.07555741348301494
geoNetwork_city_is_revenue_above_mean 0.07555294311768711
geoNetwork_city_trans_view_above_1_per_mile 0.07555294311768711
geoNetwork_continent 0.07554823129803746
geoNetwork_country 0.07558033242436385
geoNetwork_metro 0.07555747190565947
geoNetwork_metro_is_revenue_above_mean 0.07555521715237493
geoNetwork_metro_trans_view_above_1_per_mile 0.07555681786598091
geoNetwork_networkDomain 0.07555294311768711
geoNetwork_region 0.07555103934890954
geoNetwork_subContinent 0.07557322545653955
totals_bounces 0.07555294311768711
totals_bounces_imputed 0.07555294311768711
totals_hits_between_10_30 0.07554509407508869
to

In [116]:
bst2.eval_metrics(testP,['Logloss'])['Logloss'][-1]

0.03410769551448483

In [115]:
bst2.eval_metrics(test_P,['Logloss'])['Logloss'][-1]

0.07472148442650761

In [97]:
(losses2-np.min(losses2))/np.std(losses2)

array([0.74578566, 0.99341327, 0.63542768, 1.35488375, 1.28564673,
       0.8820897 , 0.99366258, 1.00407202, 1.23368875, 1.00878143,
       1.18633667, 1.2231237 , 0.99366258, 0.99366258, 0.75180777,
       2.39953892, 1.2261225 , 1.11038739, 1.19255104, 0.99366258,
       0.8959433 , 2.03474261, 0.99366258, 0.99366258, 0.59077604,
       1.07228749, 1.96768036, 0.76818741, 1.20240297, 0.99366258,
       0.        , 0.93807924, 0.84211263, 1.94139502, 1.5498493 ,
       2.68165906, 0.90912425, 0.99366258, 7.62194042, 0.99366258,
       0.99366258, 0.99366258, 0.99110128, 0.99366258, 0.99479806,
       0.99626633, 1.12676183, 0.87117461, 1.46201532, 0.98333206,
       1.2074246 , 0.92248918, 0.80360015, 0.12619504, 0.80695979,
       0.92208752, 3.2970018 ])

In [117]:
res=pd.DataFrame({'column_name':xtrain.columns.tolist()
                  ,'importance1':list(bst.feature_importances_)
                  ,'importance2':list(bst2.feature_importances_) 
                  , 'perturbance':(losses2-np.min(losses2))/np.std(losses2)})

In [118]:
res

Unnamed: 0,column_name,importance1,importance2,perturbance
0,channelGrouping,1.174018,2.015042,0.745786
1,date_day,4.3e-05,0.002703,0.993413
2,date_dayofweek,0.004169,0.033153,0.635428
3,date_dayofyear,0.118266,0.154129,1.354884
4,date_month,0.180886,0.574437,1.285647
5,date_quarter,0.083781,0.811319,0.88209
6,date_year,0.0,0.455684,0.993663
7,device_browser,0.012908,0.088229,1.004072
8,device_deviceCategory,0.173013,0.385318,1.233689
9,device_isMobile,0.390305,1.02155,1.008781


## Regression

In [48]:
from catboost import CatBoostRegressor

In [12]:
xtrain,xtest,ytrain,ytest=train_test_split(x, y, test_size=0.4, random_state=42, shuffle=True, stratify=_y)

In [41]:
y_t=_y.totals_transactionRevenue.loc[ytrain.index.to_series()]
sF=StratifiedKFold(n_splits=5,shuffle=True)
cv_split=[]
for train_idx,test_idx in sF.split(xtrain,y_t):
    pool_train_log=Pool(xtrain.iloc[train_idx],np.log1p(ytrain.iloc[train_idx]),cat_features=category_features_idx)
    pool_test_log=Pool(xtrain.iloc[test_idx],np.log1p(ytrain.iloc[test_idx]),cat_features=category_features_idx)
    cv_split.append((pool_train_log,pool_test_log))

In [42]:
param=dict(
{'bagging_temperature': 0.334998678341902,
 'border_count': 128,
 'depth': 4,
 'l2_leaf_reg': 7.201514933380306,
 'leaf_estimation_iterations': 5,
 'leaf_estimation_method': 'Gradient',
 'learning_rate': 0.8915171912524651,
 'max_ctr_complexity': 4.0,
 'one_hot_max_size': 225,
 'random_strength': 1,
 'iterations': 2001,
 "logging_level": 'Verbose',
 "metric_period": 100,
 #'eval_metric': 'Accuracy',
 'logging_level': 'Silent',
 'random_seed': 0,
 'rsm': 1,
 'thread_count': 8,
 'fold_len_multiplier': 2,
 'calc_feature_importance': True,
 'od_type': 'Iter',
 'od_wait': 30})

In [63]:
param_space = {
            'depth': hp.choice('depth', [4,6,8]),
            'border_count': hp.choice('border_count', [32,64,128]),
            'learning_rate': hp.loguniform('learning_rate', -5, 0),
            'random_strength': hp.choice('random_strength', [1, 5, 10, 20]),
            'one_hot_max_size': hp.choice('one_hot_max_size', [5, 25, 225]),
            'l2_leaf_reg': hp.loguniform('l2_leaf_reg', 0, np.log(10)),
            'bagging_temperature': hp.uniform('bagging_temperature', 0, 1),
            'leaf_estimation_iterations':hp.choice('leaf_estimation_iterations',[1,3,5,7,10]),
            'max_ctr_complexity':hp.quniform('max_ctr_complexity',1,5,1),
            'leaf_estimation_method':hp.choice('leaf_estimation_method',['Newton','Gradient']),
            'rsm':hp.uniform('rsm',0,1),
            'fold_len_multiplier':hp.choice('fold_len_multiplier',[2,3,4])
            #'class_weights': (hp.choice('non_class_weights_ratio',[1]), hp.uniform('class_weights_ratio',1,20))
        }

In [66]:
def fit(params=None, dtrain=None, dtest=None, n_estimators=None, seed=0, run_time=None, run_cv_id=0, eval_no=0, verbose=False):
    global metric,column_names
    #print(run_cv_id, eval_no)
    path="./cv_run/"+str(run_time)
    if not os.path.isdir(path):
        os.mkdir(path)
    fpath=path+"/"+str(eval_no)+"."+str(run_cv_id)
    if not os.path.isdir(fpath):
        os.mkdir(fpath)    
    params.update({"iterations": n_estimators})
    params.update({"eval_metric": metric})
    params.update({"logging_level": 'Silent'})
    params.update({"metric_period": 100})
    params.update({"random_seed": seed})
    #params.update({"rsm" : 1})
    params.update({"thread_count" : 8})
    #params.update({"fold_len_multiplier": 2})
    params.update({"train_dir": fpath})
    params.update({"calc_feature_importance" : True})
    params.update({'od_type':'Iter'})
    params.update({'od_wait':30})
    bst = CatBoostRegressor(**params)
    bst.fit(dtrain, eval_set=dtest, use_best_model=True)
    with open(fpath + "/test_error.tsv", "r") as f:
        reader=np.array(list(csv.reader(f,delimiter='\t'))).squeeze()
    header=reader[0]
    feature=dict()
    for col, val in zip(features,bst.__dict__['_feature_importance']):
        feature.update({col:val})
    #pd.to_pickle(bst.__dict__['_feature_importance'],path+"/feature_importance."+str(eval_no)+"."+str(run_cv_id))
    pd.to_pickle(feature,path+"/feature_importance."+str(eval_no)+"."+str(run_cv_id))
    idx=(header==metric).argmax()
    #print('idx',idx, metric)
    results=(reader[1:reader.shape[0],idx]).astype(np.float)
    
    if metric=='AUC' or metric=='Accuracy':
        #print("metric",metric)
        results=1-results
    #print('results',results)
    return bst, results

In [67]:
n_estimators=10
max_evals = 2
hyperopt_evals=max_evals
metric="RMSE"

this_trials = Trials()
run_time=(datetime.now()).strftime('%Y%m%d%H%M')
hyperopt_eval_num, best_loss, split_pair_data = 0, np.inf, None
args=param_space
_ = fmin(fn=lambda args: run_cv(cv_split, params=args, n_est=n_estimators, verbose=True,run_time=run_time), 
         space=args, algo=tpe.suggest, max_evals=max_evals, trials=this_trials)

with open('./cv_run/'+run_time+'/trails.pickle','wb') as f:
    pickle.dump(this_trials,f)

[1/2]	eval_time=18.84 sec	current_RMSE=1.812583	min_RMSE=1.812583
[2/2]	eval_time=27.98 sec	current_RMSE=1.726956	min_RMSE=1.726956


In [12]:
trainP=Pool(xtrain,np.log1p(ytrain),cat_features=category_features_idx)
testP=Pool(xtest,np.log1p(ytest),cat_features=category_features_idx)