In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import KFold
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score

In [15]:
class Model_ctbst:
    """Thin wrapper around ``CatBoostClassifier`` with a train/predict interface.

    Every non-float column of the feature frame is passed to CatBoost as a
    categorical feature; float columns are treated as numeric.
    """

    def __init__(self, objective='CrossEntropy'):
        """Store the configuration; the classifier itself is built in ``train``.

        Args:
            objective: CatBoost loss function name. ``_set_model_`` accepts
                only ``'CrossEntropy'`` or ``'Logloss'``.
        """
        self.objective = objective
        self.model = None
        # Hyper-parameters handed to the classifier -- candidates for tuning.
        self.model_params = dict(
            thread_count=8,
            iterations=2000,
            depth=8,
            # bagging_temperature=0.33,
            # learning_rate=0.1,
            # l2_leaf_reg=3,
            # random_strength=0.7,
            loss_function=self.objective
            )
        # Keyword arguments forwarded to ``fit``.
        self.training_params = dict(
            use_best_model=True,
            early_stopping_rounds=50,
            verbose=10
            )

    @staticmethod
    def _cat_feature_indices_(X):
        """Return the positional indices of the non-float columns of ``X``.

        ``np.float`` (used here previously) was removed in NumPy 1.24, so the
        check relies on ``pandas.api.types.is_float_dtype`` instead. As a
        consequence every float width (float16/32/64) counts as numeric, not
        only float64.

        Args:
            X: pandas DataFrame of features.

        Returns:
            1-D integer ``numpy.ndarray`` of column positions.
        """
        is_float = np.array(
            [pd.api.types.is_float_dtype(dtype) for dtype in X.dtypes],
            dtype=bool,
        )
        return np.flatnonzero(~is_float)

    def _set_model_(self):
        """Create a fresh ``CatBoostClassifier`` configured with ``model_params``.

        Raises:
            AssertionError: if ``self.objective`` is not a supported loss.
        """
        # Validate before building anything so a bad objective fails fast.
        assert self.objective in ['CrossEntropy', 'Logloss']
        self.model = CatBoostClassifier()
        self.model.set_params(**self.model_params)

    def train(self, X_train, y_train, X_valid, y_valid):
        """Fit the classifier on the training set, evaluating on the validation set.

        Prints the data shapes and, after fitting, the feature importances
        sorted from highest to lowest.

        Args:
            X_train: training features (pandas DataFrame).
            y_train: training labels.
            X_valid: validation features (pandas DataFrame).
            y_valid: validation labels.
        """
        train_cat_features_indices = self._cat_feature_indices_(X_train)
        valid_cat_features_indices = self._cat_feature_indices_(X_valid)
        # Pools are built without sample weights.
        ctbst_train_pool = Pool(data=X_train, label=y_train, cat_features=train_cat_features_indices)
        ctbst_val_pool = Pool(data=X_valid, label=y_valid, cat_features=valid_cat_features_indices)
        # Logging.
        print('Training Model CatBoost')
        print('X_train = %s Y_train = %s' % (X_train.shape, y_train.shape))
        print('X_valid = %s Y_valid = %s' % (X_valid.shape, y_valid.shape))
        print()
        # Training: a new classifier is created on every call.
        self._set_model_()
        self.model = self.model.fit(ctbst_train_pool,
                                    eval_set=ctbst_val_pool,
                                    **self.training_params)
        # Feature importances, highest score first.
        print('Top features')
        feature_importances = self.model.get_feature_importance(ctbst_train_pool)
        feature_names = X_train.columns
        for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
            print('{}: {}'.format(name, score))

    def predict(self, X):
        """Predict the probability of the positive class for every row of ``X``.

        Args:
            X: features (pandas DataFrame) with the same columns used in training.

        Returns:
            pandas Series indexed like ``X`` holding column 1 of the predicted
            probability matrix.

        Raises:
            Exception: if ``train`` has not been called yet.
        """
        if self.model is None:
            raise Exception('Train your model before')
        print('Predicting Model CatBoost')
        print('X = %s' % (X.shape,))
        print()
        X_cat_features_indices = self._cat_feature_indices_(X)
        ctbst_data_pool = Pool(data=X, cat_features=X_cat_features_indices)
        prediction = self.model.predict(ctbst_data_pool, prediction_type='Probability')
        # Keep only the probability of class 1.
        prediction = pd.Series(prediction[:, 1], index=X.index)
        return prediction


In [21]:
class Trainer:
    """Runs hold-out or K-fold training of a model and reports a metric.

    The model only needs ``train(X_train, y_train, X_valid, y_valid)`` and
    ``predict(X)`` methods.
    """

    def __init__(self, model, train_type, target, features, eval='roc-auc', n_folds=4, seed=42):
        """Store the training configuration.

        Args:
            model: object exposing ``train`` and ``predict``.
            train_type: ``'validation'`` or ``'cross-validation'``.
            target: name of the label column.
            features: list of feature column names.
            eval: metric name, ``'logloss'`` or ``'roc-auc'``.
            n_folds: number of folds; ``1`` means a single random hold-out split.
            seed: random seed used when generating the folds.
        """
        self.model = model
        self.train_type = train_type
        self.target = target
        self.features = features
        self.eval = eval
        self.n_folds = n_folds
        self.seed = seed

    def _generate_folds_(self, data, val_ratio=0.2):
        """Build a list of ``(train_positions, valid_positions)`` pairs.

        With ``n_folds == 1`` a single random hold-out split is drawn, the
        validation part holding ``floor(n * val_ratio)`` rows. Otherwise a
        shuffled ``KFold`` with ``n_folds`` splits is used and ``val_ratio``
        is ignored.

        Args:
            data: DataFrame to split; only its length is used.
            val_ratio: validation share for the single hold-out split.

        Returns:
            list of tuples of positional index arrays.
        """
        n = np.asarray(data.index).size
        if self.n_folds == 1:
            random_state = np.random.RandomState(self.seed)
            positions = np.arange(n)
            i_valid = random_state.choice(positions, size=int(np.floor(n * val_ratio)), replace=False)
            i_train = np.setdiff1d(positions, i_valid, assume_unique=True)
            return [(i_train, i_valid)]
        fold = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.seed)
        return [(i_train, i_valid) for i_train, i_valid in fold.split(np.arange(n))]

    def _get_fold_(self, data, fltr):
        """Slice ``data`` into train/valid frames for one fold.

        Args:
            data: full DataFrame.
            fltr: ``(train_positions, valid_positions)`` pair.

        Returns:
            ``(train, valid)`` DataFrames, each with a fresh 0..n-1 index.
        """
        train = data.iloc[fltr[0]].reset_index(drop=True)
        valid = data.iloc[fltr[1]].reset_index(drop=True)
        return train, valid

    def _get_error_(self, Y, P):
        """Compute the configured metric between labels ``Y`` and predictions ``P``.

        Raises:
            AssertionError: if ``Y`` and ``P`` differ in length.
            Exception: if ``self.eval`` is not a known metric name.
        """
        assert Y.shape[0] == P.shape[0]
        if self.eval == 'logloss':
            error = log_loss(Y, P)
        elif self.eval == 'roc-auc':
            error = roc_auc_score(Y, P)
        else:
            raise Exception('Error: unknown eval = %s' % (self.eval,))
        return error

    def train(self, data):
        """Dispatch to the training routine selected by ``train_type``.

        Raises:
            Exception: if ``train_type`` is not recognised.
        """
        print('Training with %s' % (self.train_type,))
        print()
        if self.train_type == 'validation':
            self._train_with_validation_(data)
        elif self.train_type == 'cross-validation':
            self._train_with_cross_validation_(data)
        else:
            raise Exception('Error: unknown train type = %s' % (self.train_type,))

    def _train_basic_(self, train, valid):
        """Train on one split, predict on its validation part and score it.

        Returns:
            ``(predictions, error)`` for the validation frame.
        """
        X_train, y_train = train[self.features], train[self.target]
        X_valid, y_valid = valid[self.features], valid[self.target]
        # Use the model given to the constructor, not a notebook-level global.
        self.model.train(X_train, y_train, X_valid, y_valid)
        pred_df = self.model.predict(X_valid)
        error = self._get_error_(y_valid, pred_df)
        print('Error %s: %s' % (self.eval, error))
        print()
        return pred_df, error

    def _train_with_validation_(self, data):
        """Train once, on the first split produced by ``_generate_folds_``.

        Note: when ``n_folds > 1`` that first split is the first KFold fold,
        not the ``val_ratio`` hold-out split.
        """
        print('Train with validation...')
        print()
        folds = self._generate_folds_(data)
        train, valid = self._get_fold_(data, folds[0])
        self._train_basic_(train, valid)

    def _train_with_cross_validation_(self, data):
        """Train on every fold and print the mean metric across folds."""
        errors = []
        print('Train with cross-validation...')
        print()
        folds = self._generate_folds_(data)
        print('Cross-validation %d folds' % (self.n_folds,))
        print()
        for i_fold in range(self.n_folds):
            print("Fold = %d / %d" % (i_fold + 1, self.n_folds))
            print()
            train, valid = self._get_fold_(data, folds[i_fold])
            _, error = self._train_basic_(train, valid)
            errors.append(error)
        print('Mean %s error on CV: %s' % (self.eval, np.mean(errors)))
        print()
        print()

In [4]:
from catboost import datasets

In [5]:
# Load the Titanic sample that ships with CatBoost, keep only the label and
# the modelling columns, and drop every row with a missing value.
train_df, test_df = datasets.titanic()
train_df = train_df.loc[:, [
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'Fare',
    'SibSp',
    'Parch',
    'Embarked',
]].dropna()


In [6]:
# Encode Sex as an integer code: male -> 1, female -> 0.
train_df.loc[train_df['Sex'].eq('male'), 'Sex'] = 1
train_df.loc[train_df['Sex'].eq('female'), 'Sex'] = 0

In [7]:
# Encode the port of embarkation as an integer code: S -> 0, C -> 1, Q -> 2.
train_df.loc[train_df['Embarked'].eq('S'), 'Embarked'] = 0
train_df.loc[train_df['Embarked'].eq('C'), 'Embarked'] = 1
train_df.loc[train_df['Embarked'].eq('Q'), 'Embarked'] = 2

In [16]:
# CatBoost wrapper with its default objective spelled out.
model = Model_ctbst(objective='CrossEntropy')

In [22]:
# 4-fold cross-validation (the Trainer defaults for n_folds and seed), scored with log-loss.
trainer = Trainer(
    model=model,
    train_type='cross-validation',
    target='Survived',
    features=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'],
    eval='logloss',
    n_folds=4,
    seed=42,
)

In [23]:
# Run the configured training procedure on the prepared frame.
trainer.train(data=train_df)

Training with cross-validation

Train with cross-validation...

Cross-validation 4 folds

Fold = 1 / 4

Training Model CatBoost
X_train = (534, 7) Y_train = (534,)
X_valid = (178, 7) Y_valid = (178,)

Learning rate set to 0.0543
0:	learn: 0.6563465	test: 0.6603741	best: 0.6603741 (0)	total: 6.58ms	remaining: 13.2s
10:	learn: 0.4912380	test: 0.5360970	best: 0.5360970 (10)	total: 46.9ms	remaining: 8.47s
20:	learn: 0.4280683	test: 0.5077968	best: 0.5077968 (20)	total: 84.8ms	remaining: 7.99s
30:	learn: 0.4051452	test: 0.4999363	best: 0.4999363 (30)	total: 123ms	remaining: 7.78s
40:	learn: 0.3891748	test: 0.5014700	best: 0.4982983 (34)	total: 163ms	remaining: 7.76s
50:	learn: 0.3797452	test: 0.4982742	best: 0.4982742 (50)	total: 191ms	remaining: 7.3s
60:	learn: 0.3742697	test: 0.4966061	best: 0.4957227 (58)	total: 228ms	remaining: 7.25s
70:	learn: 0.3675097	test: 0.4942653	best: 0.4942653 (70)	total: 272ms	remaining: 7.4s
80:	learn: 0.3565565	test: 0.4973546	best: 0.4942653 (70)	total: 321

120:	learn: 0.3592994	test: 0.4395647	best: 0.4386190 (111)	total: 475ms	remaining: 7.37s
130:	learn: 0.3539192	test: 0.4397818	best: 0.4385812 (122)	total: 524ms	remaining: 7.47s
140:	learn: 0.3495202	test: 0.4395009	best: 0.4385812 (122)	total: 562ms	remaining: 7.4s
150:	learn: 0.3400480	test: 0.4357783	best: 0.4357783 (150)	total: 625ms	remaining: 7.66s
160:	learn: 0.3318381	test: 0.4349315	best: 0.4344150 (154)	total: 701ms	remaining: 8s
170:	learn: 0.3241674	test: 0.4361286	best: 0.4344150 (154)	total: 788ms	remaining: 8.42s
180:	learn: 0.3192465	test: 0.4347804	best: 0.4344150 (154)	total: 855ms	remaining: 8.59s
190:	learn: 0.3116260	test: 0.4409144	best: 0.4344150 (154)	total: 944ms	remaining: 8.94s
200:	learn: 0.3044845	test: 0.4419501	best: 0.4344150 (154)	total: 1.03s	remaining: 9.22s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.434414974
bestIteration = 154

Shrink model to first 155 iterations.
Top features
Sex: 42.88482583382378
Pclass: 15.4173025189