In [None]:
# https://github.com/catboost/tutorials
# https://github.com/hyperopt/hyperopt/wiki/FMin
# https://www.kaggle.com/felipeleiteantunes/xgboost-hyperopt-cv-via-python-api
# http://fastml.com/optimizing-hyperparams-with-hyperopt/

In [6]:
%reload_ext autoreload
%autoreload 2

In [7]:
import sys; sys.path.append('scripts')
from common import *
from global_common import *
from load_and_proccess_data import *

from catboost import CatBoostClassifier
from catboost import Pool
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from tqdm import tqdm_notebook

In [3]:
# Load the processed frames under the names the later cells read
# (`train_df` / `test_df`); previously they were bound only to `train` / `test`,
# so the split and submission cells failed on a fresh kernel.
train_df = pd.read_hdf('../data/train_processed.h5')
test_df = pd.read_hdf('../data/test_processed.h5')
# Short aliases kept so any cell written against the old names still runs.
train, test = train_df, test_df

# pickle can execute arbitrary code on load: only open files produced by this project.
with open('../data/features_dtypes.pkl', 'rb') as f:
    columns_type_dict = pickle.load(f)

# Bar chart: how many columns there are of each dtype.
ax = train_df.dtypes.value_counts().plot.bar(edgecolor='k')
ax.set(title='Variable Type Distribution', xlabel='dtype', ylabel='Number of columns');

Из целочисленных — 4 бинарных.
Целочисленных переменных: 232
Переменных с плавающей точкой: 225
Переменным присвоен соответствующий тип.


In [None]:
# Feature matrix / label for training; the test frame only loses its `id` column.
X = train_df.drop(labels=['id', 'target'], axis='columns')
y = train_df['target']
X_test = test_df.drop(labels=['id'], axis='columns')

# Columns listed as categorical, and the position of each one inside X
# (first match; raises IndexError if a name is missing from X).
cat_features = [
    'os_category',
    'device_type',
    'service_7_flag_m1',
    'service_7_flag_m2',
    'service_7_flag_m3',
    'manufacturer_category',
]
cat_features_idxs = [
    int(np.flatnonzero(X.columns.values == col)[0]) for col in cat_features
]

# 70/30 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [None]:
# Number of hyperopt evaluations and the search algorithm (Tree-structured Parzen Estimator).
N_HYPEROPT_PROBES = 500
HYPEROPT_ALGO = tpe.suggest

# CatBoost datasets for fitting and for early stopping / best-model selection.
# The hold-out split is named X_valid / y_valid by train_test_split above
# (the previous X_val / y_val names were never defined).
# NOTE(review): cat_features_idxs is computed earlier but not passed to the pools —
# confirm whether those columns should be declared categorical here.
D_train = Pool(X_train, y_train)
D_val = Pool(X_valid, y_valid)

def get_catboost_params(space):
    """Pick the tuned CatBoost settings out of a hyperopt sample.

    Args:
        space: mapping that contains at least the keys 'learning_rate',
            'depth', 'l2_leaf_reg' and 'rsm'. Extra keys are ignored.

    Returns:
        A new dict with exactly those four keys; 'depth' is converted to
        ``int``, the other values are passed through unchanged.

    Raises:
        KeyError: if one of the four keys is missing from ``space``.
    """
    return {
        'learning_rate': space['learning_rate'],
        'depth': int(space['depth']),
        'l2_leaf_reg': space['l2_leaf_reg'],
        'rsm': space['rsm'],
    }

# Bookkeeping shared by all hyperopt probes; reset whenever this cell is (re)run.
obj_call_count = 0
cur_best_loss = float('inf')


def objective(space):
    """Hyperopt objective: fit CatBoost with the sampled settings and score it.

    The model is trained on ``D_train`` with ``D_val`` as the evaluation set
    (used for early stopping and best-iteration selection) and scored with
    log-loss on the hold-out split ``X_valid`` / ``y_valid``.

    Args:
        space: dict sampled by hyperopt; must contain the keys read by
            ``get_catboost_params``.

    Returns:
        ``{'loss': <hold-out log-loss>, 'status': STATUS_OK}``.

    Side effects:
        Increments the global ``obj_call_count``, lowers the global
        ``cur_best_loss`` when this probe improves on it, and prints progress.
    """
    global obj_call_count, cur_best_loss
    obj_call_count += 1
    print('\nCatBoost objective call #{} cur_best_loss={:7.5f}'.format(obj_call_count, cur_best_loss))

    params = get_catboost_params(space)
    # dict.items() works on Python 2 and 3; iteritems() exists only on Python 2.
    sorted_params = sorted(space.items(), key=lambda z: z[0])
    params_str = ' '.join('{}={}'.format(k, v) for k, v in sorted_params)
    print('Params: {}'.format(params_str))

    model = CatBoostClassifier(iterations=5000,
                               learning_rate=params['learning_rate'],
                               depth=params['depth'],  # already an int
                               l2_leaf_reg=params['l2_leaf_reg'],
                               rsm=params['rsm'],  # was sampled but never passed before
                               loss_function='MultiClass',
                               eval_metric='MultiClass',
                               use_best_model=True,
                               auto_stop_pval=1e-3,
                               random_seed=123456,
                               verbose=False)
    # No verbose override here: per-iteration logs for every probe would flood the output.
    model.fit(D_train, eval_set=D_val)
    nb_trees = model.get_tree_count()
    print('nb_trees={}'.format(nb_trees))

    # Score on the labelled hold-out split; the test frame has no target column,
    # so the former X_test / y_test scoring could not run.
    y_pred = model.predict_proba(X_valid)
    # NOTE(review): labels assume 10 classes encoded 0..9 — confirm against the target.
    valid_loss = sklearn.metrics.log_loss(y_valid, y_pred, labels=list(range(10)))

    # Keep the running best so the progress line above is meaningful.
    cur_best_loss = min(cur_best_loss, valid_loss)
    return {'loss': valid_loss, 'status': STATUS_OK}

# Search space sampled by hyperopt for each probe.
space = {
    # quniform yields floats on a step-1 grid between 4 and 7; cast to int before use.
    'depth': hp.quniform("depth", 4, 7, 1),
    'rsm': hp.uniform('rsm', 0.75, 1.0),
    # loguniform samples exp(uniform(-3.0, -0.7)), i.e. roughly 0.05 .. 0.50.
    'learning_rate': hp.loguniform('learning_rate', -3.0, -0.7),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
}

# `trials` records every probe (parameters, loss, status) for later inspection.
trials = Trials()
# Call the imported `fmin` directly: the bare module name `hyperopt` is never imported.
best = fmin(fn=objective,
            space=space,
            algo=HYPEROPT_ALGO,
            max_evals=N_HYPEROPT_PROBES,
            trials=trials,
            verbose=1)

print('-'*50)
print('The best params:')
print( best )
print('\n\n')

In [78]:
# Prepare submission: one row per test id with its prediction, written as CSV.
# NOTE(review): `preds` is not defined in the visible cells — confirm it is
# produced before this cell runs.
subm = pd.DataFrame({'id': test_df.id.values}).assign(prediction=preds)
subm.to_csv('submissions/CATBoost.csv', index=False)