# Семинар 5: Дополнительные материалы

In [1]:
from IPython.display import Image

import warnings
warnings.simplefilter("ignore")

import numpy as np
import seaborn as sns
import pandas as pd

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

from matplotlib import pyplot as plt
from matplotlib.colors import ListedColormap
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 10, 6
%matplotlib inline

from sklearn import metrics
from sklearn import datasets
from sklearn.base import BaseEstimator
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV, train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

-----
<h1 align="center">Useful libs</h1> 

### XGBoost

<img src='pics/xgboost.png' width=100>

Особенности:

1. Базовый алгоритм приближает направление, посчитанное с учетом вторых производных функции потерь.

2. Отклонение направления, построенного базовым алгоритмом, измеряется с помощью модифицированного функционала — из него удалено деление на вторую производную, за счет чего избегаются численные проблемы.

3. Функционал регуляризуется -- добавляются штрафы за количество листьев и за норму коэффициентов.

4. При построении дерева используется критерий информативности, зависящий от оптимального вектора сдвига.

5. Критерий останова при обучении дерева также зависит от оптимального сдвига.



##### Installation

http://xgboost.readthedocs.io/en/latest/build.html


###### Building on Ubuntu/Debian

!git clone --recursive https://github.com/dmlc/xgboost
!cd xgboost
!make -j4

###### Building on MacOS

!brew install gcc5
!pip install xgboost

###### Building on Windows

git submodule init
git submodule update

alias make='mingw32-make'

cp make/mingw64.mk config.mk; make -j4

mkdir build
cd build
cmake .. -G"Visual Studio 12 2013 Win64"

##### Test XGBoost

In [2]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

param = {
    'max_depth': 3,  # максимальная глубина дерева
    'eta': 0.3,  # шаг 
    'silent': 1,  # лог
    'objective': 'multi:softprob',  # как оценивать ошибку для мультиклассовой классификации
    'num_class': 3}  # число классов
num_round = 20  # число итераций

bst = xgb.train(param, dtrain, num_round)

preds = bst.predict(dtest)
best_preds = np.asarray([np.argmax(line) for line in preds])

In [4]:
print('F1-score: {:.4f}'.format(metrics.f1_score(y_test, best_preds, average='macro')))

F1-score: 1.0000


### CatBoost

<img src='pics/catboost.png' width=200>

https://github.com/catboost/catboost

https://tech.yandex.com/catboost/doc/dg/concepts/about-docpage/

##### Installation

In [None]:
!pip install catboost

#### Examples

In [6]:
from catboost import Pool, CatBoostRegressor, CatBoostClassifier

In [None]:
# Simple example

dataset = np.array([[1,4,5,6],[4,5,6,7],[30,40,50,60],[20,15,85,60]])
train_labels = [1.2,3.4,9.5,24.5]
model = CatBoostRegressor(learning_rate=1, depth=6, loss_function='RMSE')
fit_model = model.fit(dataset, train_labels)

print(fit_model.get_params())

In [None]:
## CatBoost Classifier

# initialize data
train_data = np.random.randint(0, 100, size=(100, 10))
train_label = np.random.randint(0, 2, size=(100))
test_data = np.random.randint(0, 100, size=(50, 10))

# specify the training parameters 
model = CatBoostClassifier(iterations=2, depth=2, learning_rate=1, loss_function='Logloss', logging_level='Verbose')

#train the model
model.fit(train_data, train_label, cat_features=[0,2,5])

# make the prediction using the resulting model
preds_class = model.predict(test_data)
preds_proba = model.predict_proba(test_data)
print("class = ", preds_class)
print("proba = ", preds_proba)

In [None]:
## CatBoost Regressor

# initialize data
train_data = np.random.randint(0, 100, size=(100, 10))
train_label = np.random.randint(0, 1000, size=(100))
test_data = np.random.randint(0, 100, size=(50, 10))

# initialize Pool
train_pool = Pool(train_data, train_label, cat_features=[0,2,5])
test_pool = Pool(test_data, cat_features=[0,2,5]) 

# specify the training parameters 
model = CatBoostRegressor(iterations=2, depth=2, learning_rate=1, loss_function='RMSE')

#train the model
model.fit(train_pool)

# make the prediction using the resulting model
preds = model.predict(test_pool)
print(preds)

-----
<h1 align="center">Stacking, Blending etc.</h1> 

### Stacking & Blending

Выборку разбивают на части (фолды), затем последовательно перебирая фолды обучают базовые алгоритмы на всех фолдах, кроме одного, а на оставшемся получают ответы базовых алгоритмов и трактуют их как значения соответствующих признаков на этом фолде. Для получения метапризнаков объектов тестовой выборки базовые алгоритмы обучают на всей обучающей выборке и берут их ответы на тестовой.

<img src='pics/stacking.png'>

Простейшая схема стекинга — блендинг

1. Делим обучающую выборку на два непересекающихся подмножества
2. Обучаем несколько базовых моделей на первой части данных.
3. Тестируем базовые модели на второй части.
4. Используя предсказания из пункта три как признаки, а правильные ответы как таргеты обучаем классификатор "второго уровня"

<img src='pics/blending.png'>

Данные тут: https://www.kaggle.com/mubashir44/simple-ensemble-model-stacking/data

In [5]:
properties = pd.read_csv("properties_2016.csv")
train_df = pd.read_csv("train_2016_v2.csv")
test_df = pd.read_csv("sample_submission.csv")
test_df = test_df.rename(columns={'ParcelId': 'parcelid'})

In [6]:
train = train_df.merge(properties, how = 'left', on = 'parcelid')
test = test_df.merge(properties, on='parcelid', how='left')
y_train = train['logerror'].ravel()

##### Обработка категориальных признаков

In [7]:
from sklearn.preprocessing import LabelEncoder  

lbl = LabelEncoder()

for c in train.columns:
    train[c]=train[c].fillna(0)
    if train[c].dtype == 'object':
        lbl.fit(list(train[c].values))
        train[c] = lbl.transform(list(train[c].values))

for c in test.columns:
    test[c]=test[c].fillna(0)
    if test[c].dtype == 'object':
        lbl.fit(list(test[c].values))
        test[c] = lbl.transform(list(test[c].values))

train_X = train.drop(["parcelid", "transactiondate", "logerror"], axis=1)
test_X = test[train_X.columns]
print(train_X.shape, test_X.shape)
x_train = np.array(train_X)
x_test = np.array(test_X)

(90275, 57) (2985217, 57)


##### Обертка для удобной работы с базовыми моделями

In [8]:
# Полезности
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 10 # для воспроизводимости результатов
NFOLDS = 5 # число фолдов для out-of-fold предсказаний
kf = KFold(n_splits=NFOLDS, random_state=SEED)

# Расширим Sklearn Regressor
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

##### Строим простые модели

In [9]:
# Параметры для регрессоров
# Random Forest 
rf_params = {
    'n_jobs': -1,
    'n_estimators': 50,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees 
et_params = {
    'n_jobs': -1,
    'n_estimators':50,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost 
ada_params = {
    'n_estimators': 50,
    'learning_rate' : 0.75
}

# Gradient Boosting 
gb_params = {
    'n_estimators': 50,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

In [10]:
# 4 объекта для представления наших 4 моделей
rf = SklearnHelper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostRegressor, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingRegressor, seed=SEED, params=gb_params)

In [11]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [12]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost

print("Training is complete")


Warm-start fitting without increasing n_estimators does not fit new trees.


Warm-start fitting without increasing n_estimators does not fit new trees.


Warm-start fitting without increasing n_estimators does not fit new trees.


Warm-start fitting without increasing n_estimators does not fit new trees.



Training is complete


In [18]:
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)

[8.59250183e-03 0.00000000e+00 0.00000000e+00 2.34103618e-02
 1.41760367e-02 2.32283597e-03 5.25650999e-03 2.37606327e-02
 3.37316572e-03 1.09364402e-02 6.47641812e-02 5.26738585e-02
 6.39517320e-04 8.74961609e-03 1.45880424e-02 5.02243960e-04
 1.22890914e-04 4.26813999e-03 1.82941328e-02 7.10259273e-03
 1.93509446e-02 3.04611439e-03 1.41268730e-03 8.27984088e-02
 4.83835812e-02 4.21285031e-02 2.67682679e-03 2.57365699e-03
 2.04261261e-04 1.93119450e-04 1.77025440e-03 1.64582824e-02
 1.46680337e-02 4.43016098e-02 3.12424667e-02 2.51366536e-02
 3.53491137e-03 1.18090022e-02 5.56877200e-02 1.61811361e-02
 8.78288162e-04 6.51000725e-03 5.01893934e-04 1.01137071e-02
 8.59955051e-03 4.85056132e-05 2.08874998e-02 2.70307554e-03
 2.60709893e-04 7.22915276e-02 4.53514294e-02 0.00000000e+00
 4.15754978e-02 7.93792947e-02 2.41496741e-03 2.29774205e-03
 1.90944282e-02]



Warm-start fitting without increasing n_estimators does not fit new trees.



[2.12589444e-02 6.18210132e-04 2.08917962e-03 2.44562548e-02
 2.93484602e-02 5.72219829e-03 1.73728433e-02 1.67929407e-02
 1.48383932e-03 5.57849654e-03 4.16359657e-02 4.87151037e-02
 0.00000000e+00 7.76134608e-03 4.79168429e-03 3.09044027e-03
 2.83179384e-03 7.34720562e-03 2.40234566e-02 4.55704381e-03
 3.86450557e-03 3.73141672e-03 9.21582971e-03 3.78166481e-02
 2.50734597e-02 2.51368969e-02 2.02880029e-02 9.31087064e-06
 3.63885195e-04 2.35339003e-03 9.04909089e-03 9.96125455e-03
 1.09031619e-02 2.83182211e-02 7.79754032e-03 7.43713887e-02
 2.16150411e-03 1.12089969e-02 5.02686946e-02 5.79299527e-03
 1.90486615e-04 9.89884859e-03 1.22422903e-04 6.40685323e-03
 1.88423152e-03 3.24193296e-04 3.68324375e-02 2.87097285e-03
 0.00000000e+00 8.75375590e-02 4.22585827e-02 0.00000000e+00
 4.43745183e-02 7.98533472e-02 1.34979225e-02 7.39376988e-03
 5.93922528e-02]
[7.41245803e-04 0.00000000e+00 0.00000000e+00 1.87688296e-03
 0.00000000e+00 0.00000000e+00 0.00000000e+00 2.40323055e-03
 0.0000

In [19]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.head()

Unnamed: 0,RandomForest,ExtraTrees,AdaBoost,GradientBoost
0,0.007972,0.007753,0.01495,0.006582
1,0.012576,0.015212,0.18742,0.008531
2,0.008243,0.012145,0.016856,-0.009579
3,0.006819,0.006538,0.016856,0.008508
4,0.008599,0.005762,-0.302565,0.003702


In [20]:
data = [
    go.Heatmap(
        z= base_predictions_train.astype(float).corr().values ,
        x=base_predictions_train.columns.values,
        y= base_predictions_train.columns.values,
          colorscale='Portland',
            showscale=True,
            reversescale = True
    )
]
py.iplot(data, filename='labelled-heatmap')

##### Тренируем XGBoost на новых признаках

In [21]:
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test), axis=1)

In [22]:
X = x_train
y = y_train
y_mean = np.mean(y_train)

In [23]:
from sklearn.model_selection import train_test_split

Xtr, Xv, ytr, yv = train_test_split(X, y, test_size=0.2, random_state=2000)

dtrain = xgb.DMatrix(Xtr, label=ytr)
dvalid = xgb.DMatrix(Xv, label=yv)

watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

# Try different parameters! My favorite is random search :)
xgb_params = {
    'eta': 0.025,
    'max_depth': 7,
    'subsample': 0.80,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    'base_score': y_mean,
    'silent': 1
}

In [24]:
model_xgb = xgb.train(xgb_params, dtrain, 2000, watchlist, early_stopping_rounds=300,
                  maximize=False, verbose_eval=15)

[0]	train-mae:0.068521	valid-mae:0.067813
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 300 rounds.
[15]	train-mae:0.067744	valid-mae:0.067366
[30]	train-mae:0.067471	valid-mae:0.067441
[45]	train-mae:0.067349	valid-mae:0.0676
[60]	train-mae:0.067286	valid-mae:0.067797
[75]	train-mae:0.067226	valid-mae:0.067986
[90]	train-mae:0.06717	valid-mae:0.068163
[105]	train-mae:0.067101	valid-mae:0.068291
[120]	train-mae:0.067043	valid-mae:0.068391
[135]	train-mae:0.066976	valid-mae:0.068463
[150]	train-mae:0.066911	valid-mae:0.068516
[165]	train-mae:0.066843	valid-mae:0.068563
[180]	train-mae:0.066761	valid-mae:0.068587
[195]	train-mae:0.066671	valid-mae:0.068655
[210]	train-mae:0.066593	valid-mae:0.068687
[225]	train-mae:0.066494	valid-mae:0.068702
[240]	train-mae:0.06642	valid-mae:0.068734
[255]	train-mae:0.066313	valid-mae:0.068768
[270]	train-mae:0.066225	valid-mae:0.068795
[285]	train-mae:0.066124	valid-m

In [25]:
dtest = xgb.DMatrix(x_test)
predicted_test_xgb = model_xgb.predict(dtest)

In [26]:
sub = pd.read_csv('sample_submission.csv')
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = predicted_test_xgb

print('Writing csv ...')
sub.to_csv('xgb_stacked.csv', index=False, float_format='%.4f')

Writing csv ...


### Заключение

Ноутбук составлен по мотивам:
1. <a href="https://habrahabr.ru/company/ods/blog/327250/#postanovka-ml-zadachi"> Open Data Science, открытый курс машинного обучения. Тема 10 </a>
2. <a href="https://github.com/esokolov/ml-course-msu/tree/master/ML15-spring/lecture-notes"> Лекции Евгения Соколова </a>
3. <a href="https://alexanderdyakonov.wordpress.com/2017/03/10/cтекинг-stacking-и-блендинг-blending/"> Блог Александра Дьяконова </a>
