<img src='otus.png'>

# Применение ансамблей моделей 

https://statweb.stanford.edu/~jhf/ftp/trebst.pdf  
https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3885826/  
Интерактив - http://arogozhnikov.github.io/2016/07/05/gradient_boosting_playground.html

http://xgboost.readthedocs.io/en/latest/  
http://xgboost.readthedocs.io/en/latest/model.html  
https://lightgbm.readthedocs.io/  
https://lightgbm.readthedocs.io/en/latest/    
https://tech.yandex.com/catboost/doc/dg/concepts/about-docpage/   
http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/gbm.html#defining-a-gbm-model  

In [3]:
import time
import re
from __future__ import print_function
from collections import defaultdict

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_union, make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, LabelEncoder, MinMaxScaler,  Imputer, LabelBinarizer, OneHotEncoder
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV

# Ансамбли

import xgboost as xgb
import lightgbm as lgb

%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format

In [4]:
df_train = pd.read_csv('train.csv')
df_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.46,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.86,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.07,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.13,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.07,,C


In [5]:
df_test = pd.read_csv('test.csv')
df_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.83,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.69,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.66,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.29,,S


In [6]:
# move target to the right
survived = df_train['Survived']
df_train.drop(labels=['Survived'], axis=1, inplace=True)
df_train['Survived'] = survived
df_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.28,C85,C,1
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.92,,S,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,1
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [7]:
class LabelEncoderPipelineFriendly(LabelEncoder):
    
    def fit(self, X, y=None):
        """this would allow us to fit the model based on the X input."""
        super(LabelEncoderPipelineFriendly, self).fit(X)
        
    def transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).transform(X).reshape(-1, 1)

    def fit_transform(self, X, y=None):
        return super(LabelEncoderPipelineFriendly, self).fit(X).transform(X).reshape(-1, 1)
    

class FeaturesSum(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        return np.sum(X, axis=1).reshape(-1, 1)

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
    

class AgeFeature(BaseEstimator, TransformerMixin):
    # works with df only
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X, y=None):
        # sex, name
        X['Initial'] = 0
        for i in X:
            X['Initial'] = X.Name.str.extract('([A-Za-z]+)\.') #lets extract the Salutations
    
        X['Initial'].replace(
            ['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
            ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],
            inplace=True
        )
        X.groupby('Initial')['Age'].mean() # lets check the average age by Initials

        ## Assigning the NaN Values with the Ceil values of the mean ages
        X.loc[(X.Age.isnull()) & (X.Initial=='Mr'), 'Age'] = 33
        X.loc[(X.Age.isnull()) & (X.Initial=='Mrs'), 'Age'] = 36
        X.loc[(X.Age.isnull()) & (X.Initial=='Master'), 'Age'] = 5
        X.loc[(X.Age.isnull()) & (X.Initial=='Miss'), 'Age'] = 22
        X.loc[(X.Age.isnull()) & (X.Initial=='Other'), 'Age'] = 46
        return X['Age'].as_matrix().reshape(-1, 1)

    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)

In [8]:
def get_sex_col(df):
    return df[['Sex']]

def get_age_name_cols(df):
    return df[['Age', 'Name']]

def get_pclass_col(df):
    return df[['Pclass']]

def get_sum_cols(df):
    return df[['Age', 'Fare']]

def get_num_cols(df):
    return df[['Fare', 'SibSp', 'Parch']]

vec = make_union(*[
    make_pipeline(FunctionTransformer(get_pclass_col, validate=False),  OneHotEncoder(sparse=False)),
    make_pipeline(FunctionTransformer(get_sex_col, validate=False),  LabelEncoderPipelineFriendly()),
    make_pipeline(FunctionTransformer(get_num_cols, validate=False), Imputer(strategy='mean'), MinMaxScaler()),
    make_pipeline(FunctionTransformer(get_age_name_cols, validate=False),  AgeFeature()),
])

In [9]:
x_train = vec.fit_transform(df_train)
x_train.shape

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#

(891, 8)

In [10]:
x_train

array([[  0.        ,   0.        ,   1.        , ...,   0.125     ,
          0.        ,  22.        ],
       [  1.        ,   0.        ,   0.        , ...,   0.125     ,
          0.        ,  38.        ],
       [  0.        ,   0.        ,   1.        , ...,   0.        ,
          0.        ,  26.        ],
       ..., 
       [  0.        ,   0.        ,   1.        , ...,   0.125     ,
          0.33333333,  22.        ],
       [  1.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,  26.        ],
       [  0.        ,   0.        ,   1.        , ...,   0.        ,
          0.        ,  32.        ]])

In [11]:
y_train = df_train['Survived']
y_train.shape

(891,)

In [12]:
lr = LogisticRegressionCV(cv=10)
lr.fit(x_train, y_train)
lr

LogisticRegressionCV(Cs=10, class_weight=None, cv=10, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [13]:
accuracy_score(y_train, lr.predict(x_train))

0.80359147025813693

# Применение модели

In [14]:
def apply_model(model, submission_name):
    x_test = vec.fit_transform(df_test) 
    print('shape of x_test is {}'.format(x_test.shape))
    y_test = model.predict(x_test)
    print('shape of y_test is {}'.format(y_test.shape))
    df_predicted = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Survived': y_test})
    df_predicted.to_csv(submission_name + '.csv', sep=',', index=False)

In [15]:
apply_model(lr, 'linear_regression_cv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


shape of x_test is (418, 8)
shape of y_test is (418,)


# Обучение ансамблей

In [16]:
def randomized_cv(model, param_grid, x_train=x_train, y_train=y_train):
    grid_search = RandomizedSearchCV(model, param_grid, cv=5, scoring='accuracy')
    t_start = time.time()
    grid_search.fit(x_train, y_train)
    t_end = time.time()
    print('model {} best accuracy score is {}'.format(model.__class__.__name__, grid_search.best_score_))
    print('time for training is {} seconds'.format(t_end - t_start))
    return grid_search.best_estimator_

# XGBoost

In [17]:
import xgboost as xgb

param_grid = {
    'max_depth': [2, 3, 4, 5],
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.02, 0.05]
}
xgb = randomized_cv(xgb.XGBClassifier(), param_grid)

model XGBClassifier best accuracy score is 0.830527497194
time for training is 2.16097903252 seconds


In [18]:
apply_model(xgb, 'xgb_cv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


shape of x_test is (418, 8)
shape of y_test is (418,)


Особенности XGBoost
* есть регуляризация
* распараллеливание
* возможность кастомизации
* обработка отсуствующих значений
* встроенная кросс-валидация
* возможность архивировать и восстанавливать модель

# LightGBM 

In [19]:
import lightgbm as lgb

param_grid = {
    'max_depth': [2, 3, 4, 5],
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [0.01, 0.02, 0.05]
}
model = randomized_cv(lgb.LGBMClassifier(), param_grid)

  'Please use {0} argument of the Dataset constructor to pass this parameter.'.format(key))


model LGBMClassifier best accuracy score is 0.83164983165
time for training is 2.31503987312 seconds


Особенности

* использование гистограмм для всех признаков (уже тоже есть в xgboost)
* то же самое, но быстрее (см выше)

# H2O GBM 

In [19]:
import h2o
import numpy as np
import math
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch
h2o.init(nthreads=-1, strict_version_check=True)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.7.0_151"; OpenJDK Runtime Environment (IcedTea 2.6.11) (7u151-2.6.11-1~deb8u1); OpenJDK 64-Bit Server VM (build 24.151-b01, mixed mode)
  Starting server from /usr/local/lib/python2.7/dist-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpwh3A5d
  JVM stdout: /tmp/tmpwh3A5d/h2o_stroykova_started_from_python.out
  JVM stderr: /tmp/tmpwh3A5d/h2o_stroykova_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,07 secs
H2O cluster version:,3.16.0.2
H2O cluster version age:,20 days
H2O cluster name:,H2O_from_python_stroykova_gl11fm
H2O cluster total nodes:,1
H2O cluster free memory:,3.490 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [52]:
data = np.c_[x_train, y_train]
data = pd.DataFrame(data, columns=['C{}'.format(idx + 1) for idx in range(data.shape[-1])])

train_df_h2o = h2o.H2OFrame(python_obj=data)
train_df_h2o['C9'] = train_df_h2o['C9'].asfactor()

train_df_h2o.show()

Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,C2,C3,C4,C5,C6,C7,C8,C9
0,0,1,1,0.0141511,0.125,0.0,22,0
1,0,0,0,0.139136,0.125,0.0,38,1
0,0,1,0,0.0154686,0.0,0.0,26,1
1,0,0,0,0.103644,0.125,0.0,35,1
0,0,1,1,0.0157126,0.0,0.0,35,0
0,0,1,1,0.0165095,0.0,0.0,33,0
1,0,0,1,0.101229,0.0,0.0,54,0
0,0,1,1,0.0411357,0.375,0.166667,2,0
0,0,1,0,0.0217308,0.0,0.333333,27,1
0,1,0,0,0.0586943,0.125,0.0,14,1


In [64]:
x_test = vec.fit_transform(df_test)
data_test = pd.DataFrame(x_test, columns=['C{}'.format(idx + 1) for idx in range(x_test.shape[-1])])

test_df_h2o = h2o.H2OFrame(python_obj=data_test)
test_df_h2o.show()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,C2,C3,C4,C5,C6,C7,C8
0,0,1,1,0.0152816,0.0,0.0,34.5
0,0,1,0,0.0136631,0.125,0.0,47.0
0,1,0,1,0.0189087,0.0,0.0,62.0
0,0,1,1,0.0169081,0.0,0.0,27.0
0,0,1,0,0.0239836,0.125,0.111111,22.0
0,0,1,1,0.018006,0.0,0.0,14.0
0,0,1,0,0.0148912,0.0,0.0,30.0
0,1,0,1,0.0566042,0.125,0.111111,26.0
0,0,1,0,0.0141105,0.0,0.0,18.0
0,0,1,1,0.0471377,0.25,0.0,21.0


In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
C1    891 non-null float64
C2    891 non-null float64
C3    891 non-null float64
C4    891 non-null float64
C5    891 non-null float64
C6    891 non-null float64
C7    891 non-null float64
C8    891 non-null float64
C9    891 non-null category
dtypes: category(1), float64(8)
memory usage: 56.7 KB


In [55]:
gbm = H2OGradientBoostingEstimator()
gbm.train(x=['C{}'.format(idx + 1) for idx in range(data.shape[-1] - 1)], y='C9', training_frame=train_df_h2o)
print(gbm)

gbm Model Build progress: |███████████████████████████████████████████████| 100%
Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1513807303246_113


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.0893923322822
RMSE: 0.298985505137
LogLoss: 0.304490577516
Mean Per-Class Error: 0.118301217525
AUC: 0.941251504596
Gini: 0.882503009193
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.479021024668: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,520.0,29.0,0.0528,(29.0/549.0)
1,63.0,279.0,0.1842,(63.0/342.0)
Total,583.0,308.0,0.1033,(92.0/891.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4790210,0.8584615,180.0
max f2,0.2425732,0.8619718,252.0
max f0point5,0.6237757,0.8961593,147.0
max accuracy,0.4790210,0.8967452,180.0
max precision,0.9855009,1.0,0.0
max recall,0.0835401,1.0,352.0
max specificity,0.9855009,1.0,0.0
max absolute_mcc,0.4790210,0.7801801,180.0
max min_per_class_accuracy,0.3418507,0.8684211,226.0


Gains/Lift Table: Avg response rate: 38.38 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0101010,0.9733939,2.6052632,2.6052632,1.0,1.0,0.0263158,0.0263158,160.5263158,160.5263158
,2,0.0258137,0.9703743,2.6052632,2.6052632,1.0,1.0,0.0409357,0.0672515,160.5263158,160.5263158
,3,0.0325477,0.9703438,2.6052632,2.6052632,1.0,1.0,0.0175439,0.0847953,160.5263158,160.5263158
,4,0.0426487,0.9694137,2.6052632,2.6052632,1.0,1.0,0.0263158,0.1111111,160.5263158,160.5263158
,5,0.0505051,0.9681475,2.6052632,2.6052632,1.0,1.0,0.0204678,0.1315789,160.5263158,160.5263158
,6,0.1010101,0.9559318,2.6052632,2.6052632,1.0,1.0,0.1315789,0.2631579,160.5263158,160.5263158
,7,0.1503928,0.9279993,2.6052632,2.6052632,1.0,1.0,0.1286550,0.3918129,160.5263158,160.5263158
,8,0.2008979,0.8388662,2.5473684,2.5907086,0.9777778,0.9944134,0.1286550,0.5204678,154.7368421,159.0708615
,9,0.3007856,0.6096997,2.1954465,2.4594462,0.8426966,0.9440299,0.2192982,0.7397661,119.5446481,145.9446190



Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2017-12-21 01:35:31,0.001 sec,0.0,0.4863193,0.6659120,0.5,1.0,0.6161616
,2017-12-21 01:35:31,0.016 sec,1.0,0.4611267,0.6158597,0.9040680,2.6052632,0.1773288
,2017-12-21 01:35:31,0.022 sec,2.0,0.4407085,0.5774328,0.9021799,2.6052632,0.1739618
,2017-12-21 01:35:31,0.027 sec,3.0,0.4234627,0.5459271,0.9014556,2.6052632,0.1739618
,2017-12-21 01:35:31,0.034 sec,4.0,0.4079521,0.5179697,0.9038097,2.6052632,0.1694725
---,---,---,---,---,---,---,---,---
,2017-12-21 01:35:32,0.378 sec,46.0,0.3013569,0.3085071,0.9393182,2.6052632,0.1066218
,2017-12-21 01:35:32,0.385 sec,47.0,0.3006887,0.3074628,0.9398055,2.6052632,0.1077441
,2017-12-21 01:35:32,0.393 sec,48.0,0.2995177,0.3053450,0.9408041,2.6052632,0.1066218



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
C4,330.3993835,1.0,0.4805600
C8,105.9970093,0.3208148,0.1541708
C5,91.3197327,0.2763920,0.1328229
C3,83.0104141,0.2512426,0.1207372
C6,46.7274857,0.1414273,0.0679643
C1,16.5053711,0.0499558,0.0240068
C2,8.2734900,0.0250409,0.0120336
C7,5.2970357,0.0160322,0.0077044





In [60]:
## Depth 10 is usually plenty of depth for most datasets, but you never know
hyper_params = {'max_depth' : range(1,30,2)}
#hyper_params = {max_depth = [4,6,8,12,16,20]} ##faster for larger datasets

#Build initial GBM Model
gbm_grid = H2OGradientBoostingEstimator(
        ## more trees is better if the learning rate is small enough 
        ## here, use "more than enough" trees - we have early stopping
        ntrees=10000,
        ## smaller learning rate is better
        ## since we have learning_rate_annealing, we can afford to start with a 
        #bigger learning rate
        learn_rate=0.05,
        ## learning rate annealing: learning_rate shrinks by 1% after every tree 
        ## (use 1.00 to disable, but then lower the learning_rate)
        learn_rate_annealing = 0.99,
        ## sample 80% of rows per tree
        sample_rate = 0.8,
        ## sample 80% of columns per split
        col_sample_rate = 0.8,
        ## fix a random number generator seed for reproducibility
        seed = 1234,
        ## score every 10 trees to make early stopping reproducible 
        #(it depends on the scoring interval)
        score_tree_interval = 10, 
        ## early stopping once the validation AUC doesn't improve by at least 0.01% for 
        #5 consecutive scoring events
        stopping_rounds = 5,
        stopping_metric = "misclassification",
        stopping_tolerance = 1e-4)

#Build grid search with previously made GBM and hyper parameters
grid = H2OGridSearch(gbm_grid, hyper_params,
                         grid_id = 'depth_grid',
                         search_criteria = {'strategy': "RandomDiscrete"})


#Train grid search
grid.train(x=['C{}'.format(idx + 1) for idx in range(data.shape[-1] - 1)], y='C9', training_frame=train_df_h2o)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [61]:
print(grid)

     max_depth            model_ids              logloss
0           21  depth_grid_model_10  0.20246105669136633
1           25  depth_grid_model_12  0.20261911154719406
2           27  depth_grid_model_13  0.20262775998296126
3           29  depth_grid_model_14  0.20262775998296126
4           23  depth_grid_model_11  0.20273249664306414
5           19   depth_grid_model_9   0.2029692834528766
6           27  depth_grid_model_17  0.20299158583991428
7           29  depth_grid_model_27  0.20299158583991428
8           23  depth_grid_model_21   0.2039713947108158
9           25  depth_grid_model_28  0.20398177460482617
10          21  depth_grid_model_25  0.20431413668073337
11          19  depth_grid_model_18  0.20493492849948586
12          17   depth_grid_model_8  0.20495308387684755
13          17  depth_grid_model_24  0.20495308387684755
14          15   depth_grid_model_7   0.2069009862618864
15          13   depth_grid_model_6  0.21216094436338703
16          13  depth_grid_mode

In [62]:
best_model = h2o.get_model(grid.sorted_metric_table()['model_ids'][0])
best_model


Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  depth_grid_model_10


ModelMetricsBinomial: gbm
** Reported on train data. **

MSE: 0.0559740905509
RMSE: 0.236588441288
LogLoss: 0.202461056691
Mean Per-Class Error: 0.0737225577605
AUC: 0.982093972028
Gini: 0.964187944056
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.400063705216: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,513.0,36.0,0.0656,(36.0/549.0)
1,28.0,314.0,0.0819,(28.0/342.0)
Total,541.0,350.0,0.0718,(64.0/891.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.4000637,0.9075145,191.0
max f2,0.2541906,0.9396163,227.0
max f0point5,0.6050285,0.9313725,153.0
max accuracy,0.4855104,0.9304153,172.0
max precision,0.9930546,1.0,0.0
max recall,0.0694526,1.0,331.0
max specificity,0.9930546,1.0,0.0
max absolute_mcc,0.4855104,0.8522752,172.0
max min_per_class_accuracy,0.3561496,0.9234973,196.0


Gains/Lift Table: Avg response rate: 38.38 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0101010,0.9901383,2.6052632,2.6052632,1.0,1.0,0.0263158,0.0263158,160.5263158,160.5263158
,2,0.0202020,0.9884602,2.6052632,2.6052632,1.0,1.0,0.0263158,0.0526316,160.5263158,160.5263158
,3,0.0303030,0.9873287,2.6052632,2.6052632,1.0,1.0,0.0263158,0.0789474,160.5263158,160.5263158
,4,0.0404040,0.9860989,2.6052632,2.6052632,1.0,1.0,0.0263158,0.1052632,160.5263158,160.5263158
,5,0.0505051,0.9842767,2.6052632,2.6052632,1.0,1.0,0.0263158,0.1315789,160.5263158,160.5263158
,6,0.1010101,0.9739394,2.6052632,2.6052632,1.0,1.0,0.1315789,0.2631579,160.5263158,160.5263158
,7,0.1515152,0.9339860,2.6052632,2.6052632,1.0,1.0,0.1315789,0.3947368,160.5263158,160.5263158
,8,0.2008979,0.8779594,2.6052632,2.6052632,1.0,1.0,0.1286550,0.5233918,160.5263158,160.5263158
,9,0.3007856,0.6849581,2.4589001,2.5566575,0.9438202,0.9813433,0.2456140,0.7690058,145.8900059,155.6657502



Scoring History: 


0,1,2,3,4,5,6,7,8
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error
,2017-12-21 01:38:56,15.755 sec,0.0,0.4863193,0.6659120,0.5,1.0,0.6161616
,2017-12-21 01:38:56,15.787 sec,10.0,0.3926348,0.4903584,0.9330441,2.6052632,0.1257015
,2017-12-21 01:38:56,15.821 sec,20.0,0.3474203,0.4087974,0.9397815,2.6052632,0.1178451
,2017-12-21 01:38:56,15.858 sec,30.0,0.3229680,0.3619419,0.9462420,2.6052632,0.1156004
,2017-12-21 01:38:56,15.901 sec,40.0,0.3091684,0.3332743,0.9503590,2.6052632,0.1088664
---,---,---,---,---,---,---,---,---
,2017-12-21 01:38:58,17.887 sec,500.0,0.2367757,0.2027238,0.9820540,2.6052632,0.0718294
,2017-12-21 01:38:58,17.933 sec,510.0,0.2367268,0.2026543,0.9820594,2.6052632,0.0718294
,2017-12-21 01:38:59,17.978 sec,520.0,0.2366803,0.2025861,0.9820620,2.6052632,0.0718294



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
C8,1075.0728760,1.0,0.3355807
C5,876.1934204,0.8150084,0.2735011
C4,751.6188354,0.6991329,0.2346155
C3,194.5909119,0.1810025,0.0607410
C6,143.7841797,0.1337437,0.0448818
C7,68.9221268,0.0641093,0.0215138
C1,53.2924423,0.0495710,0.0166351
C2,40.1443520,0.0373411,0.0125309




In [77]:
best_model.accuracy()

[[0.48551037759793186, 0.9304152637485971]]

In [65]:
preds = best_model.predict(test_df_h2o)
preds.head()

gbm prediction progress: |████████████████████████████████████████████████| 100%


predict,p0,p1
0,0.961502,0.0384981
0,0.842935,0.157065
0,0.832242,0.167758
1,0.486816,0.513184
1,0.588062,0.411938
0,0.918793,0.0812072
0,0.681731,0.318269
0,0.969699,0.0303009
1,0.319127,0.680873
0,0.920627,0.0793725




In [73]:
pred_df = preds.as_data_frame()

submit = pd.DataFrame()
submit['PassengerId'] = df_test['PassengerId']
submit['Survived'] = pred_df['predict']
submit.to_csv('h2o.csv', sep=',', index=False)

# CatBoost

In [34]:
from catboost import CatBoostClassifier
param_grid = {
    'iterations': [2, 3, 4, 5],
    'depth': [2, 3, 4, 5],
    'learning_rate': [1, 0.1, 0.01, 0.001]
}
cbm = randomized_cv(CatBoostClassifier(), param_grid)

0: learn: 0.6423091	total: 64.8ms	remaining: 195ms
1: learn: 0.5998185	total: 79.2ms	remaining: 79.2ms
2: learn: 0.5769975	total: 93.3ms	remaining: 31.1ms
3: learn: 0.5507308	total: 104ms	remaining: 0us
0: learn: 0.6383077	total: 15.6ms	remaining: 46.9ms
1: learn: 0.6089393	total: 32.9ms	remaining: 32.9ms
2: learn: 0.5758987	total: 62ms	remaining: 20.7ms
3: learn: 0.5469502	total: 86.2ms	remaining: 0us
0: learn: 0.6429124	total: 14.8ms	remaining: 44.3ms
1: learn: 0.6013828	total: 29.2ms	remaining: 29.2ms
2: learn: 0.5693223	total: 53ms	remaining: 17.7ms
3: learn: 0.5509628	total: 68.4ms	remaining: 0us
0: learn: 0.6475638	total: 25.3ms	remaining: 75.9ms
1: learn: 0.6032648	total: 37.8ms	remaining: 37.8ms
2: learn: 0.5770655	total: 55.7ms	remaining: 18.6ms
3: learn: 0.5557374	total: 75.6ms	remaining: 0us
0: learn: 0.6415947	total: 22.9ms	remaining: 68.7ms
1: learn: 0.6147246	total: 38.9ms	remaining: 38.9ms
2: learn: 0.5914148	total: 54ms	remaining: 18ms
3: learn: 0.5622818	total: 63.7ms	

0: learn: 0.6865968	total: 15.7ms	remaining: 15.7ms
1: learn: 0.6801698	total: 32.4ms	remaining: 0us
0: learn: 0.6877683	total: 17.8ms	remaining: 17.8ms
1: learn: 0.6817558	total: 32.8ms	remaining: 0us
0: learn: 0.6871865	total: 7.89ms	remaining: 7.89ms
1: learn: 0.6830308	total: 27.6ms	remaining: 0us
0: learn: 0.6871489	total: 15.6ms	remaining: 15.6ms
1: learn: 0.6815152	total: 48.2ms	remaining: 0us
0: learn: 0.6393512	total: 19.4ms	remaining: 19.4ms
1: learn: 0.6073634	total: 34ms	remaining: 0us
0: learn: 0.633522	total: 14.3ms	remaining: 14.3ms
1: learn: 0.5973396	total: 28.7ms	remaining: 0us
0: learn: 0.6495862	total: 20.4ms	remaining: 20.4ms
1: learn: 0.6040712	total: 34.1ms	remaining: 0us
0: learn: 0.6397715	total: 22.3ms	remaining: 22.3ms
1: learn: 0.600085	total: 29.7ms	remaining: 0us
0: learn: 0.6416142	total: 13ms	remaining: 13ms
1: learn: 0.5999317	total: 20.3ms	remaining: 0us
0: learn: 0.5047101	total: 15.2ms	remaining: 45.6ms
1: learn: 0.4504238	total: 29.2ms	remaining: 29

Параметры модели

https://tech.yandex.com/catboost/doc/dg/concepts/python-reference_parameters-list-docpage/

Настройка параметров

https://tech.yandex.com/catboost/doc/dg/concepts/parameter-tuning-docpage/

Особенности

* уменьшено (?) переобучение
* умеет обрабатывать категориальные признаки
* большое количество визуализаций
* работает лучше по бенчмаркам (но дольше)

# Подбор гиперпараметров. Общий подход.

* выбрать относительно высокий learning_rate (например, 0.05 - 0.2)
* определить необходимое количество деревьев для исключения проблема недообучения и переобучения - поставить побольше и выбрать такое, где ошибка на валидации начинает расти
* зафиксировать параметры из предыдущих пунктов и настроить параметры, связанные с деревьями.
* зафиксировать параметры деревьев и дополнительно настроить learning_rate и количество деревьев

Основные параметры, связанные с бустингом

* learning_rate
* n_estimators
* subsample
* loss

Основные параметры, связанные с деревьями

* max_depth
* max_features
* min_samples_split
* min_samples_leaf
* max_leaf_nodes
* ...