# Random Forest (sklearn) vs XGBoost

## sklearn.ensemble.RandomForestClassifier

In [1]:
from sklearn import ensemble , model_selection, learning_curve, metrics 

import numpy as np
import pandas as pd
import xgboost as xgb
import json



ImportError: No module named 'xgboost'

In [None]:
# NOTE: `%pylab inline` star-imports numpy and matplotlib into the global
# namespace; the `pylab` name used by the plotting cells comes from this magic.
%pylab inline

### Данные

Задача на kaggle: https://www.kaggle.com/c/bioresponse

Данные: https://www.kaggle.com/c/bioresponse/data

По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).

Признаки нормализованы.

Для демонстрации используется обучающая выборка из исходных данных train.csv, файл с данными прилагается.

In [None]:
# Load the bioresponse training table; the first CSV row holds the column names.
bioresponce = pd.read_csv('../seminar1/bioresponse.csv', header=0, sep=',')

In [None]:
# Preview the first rows to inspect the column layout.
bioresponce.head()

In [None]:
# Class labels: the `Activity` column as a NumPy array.
bioresponce_target = bioresponce['Activity'].values

In [None]:
# Feature matrix: every column except the `Activity` label. The label is
# dropped by name (not by position) so it can never leak into the features,
# whatever the column order of the CSV is.
bioresponce_data = bioresponce.drop('Activity', axis=1)

## Зависимость качества от количества деревьев

### RandomForestClassifier

In [None]:
# Ensemble sizes to evaluate: a single tree, then 10 to 50 in steps of 5.
n_trees = [1]
n_trees.extend(range(10, 55, 5))

In [None]:
%%time
scoring = []
for n_tree in n_trees:
    estimator = ensemble.RandomForestClassifier(n_estimators = n_tree, min_samples_split=5, random_state=1)
    score = model_selection.cross_val_score(
        estimator, bioresponce_data, bioresponce_target, 
        scoring = 'accuracy', cv = 3
    )    
    scoring.append(score)
scoring = np.asmatrix(scoring)

In [None]:
# Rows follow `n_trees`; columns are the per-fold accuracies from the 3-fold CV.
scoring

In [None]:
# Mean cross-validated accuracy of the random forest versus ensemble size.
fig, ax = pylab.subplots()
ax.plot(n_trees, scoring.mean(axis=1), marker='.', label='RandomForest')
ax.grid(True)
ax.set_xlabel('n_trees')
ax.set_ylabel('score')
ax.set_title('Accuracy score')
ax.legend(loc='lower right')

### XGBoost

In [None]:
%%time
xgb_scoring = []
for n_tree in n_trees:
    estimator = xgb.XGBClassifier(learning_rate=0.1, max_depth=5, n_estimators=n_tree, min_child_weight=3)
    score = model_selection.cross_val_score(
        estimator, bioresponce_data, bioresponce_target,
        scoring = 'accuracy', cv = 3
    )    
    xgb_scoring.append(score)
xgb_scoring = np.asmatrix(xgb_scoring)

In [None]:
# Rows follow `n_trees`; columns are the per-fold accuracies from the 3-fold CV.
xgb_scoring

In [None]:
# Compare both models: mean cross-validated accuracy versus ensemble size.
fig, ax = pylab.subplots()
ax.plot(n_trees, scoring.mean(axis=1), marker='.', label='RandomForest')
ax.plot(n_trees, xgb_scoring.mean(axis=1), marker='.', label='XGBoost')
ax.grid(True)
ax.set_xlabel('n_trees')
ax.set_ylabel('score')
ax.set_title('Accuracy score')
ax.legend(loc='lower right')

#### **Материалы по xgboost:**
python api: http://xgboost.readthedocs.org/en/latest/python/python_api.html

установка: http://xgboost.readthedocs.org/en/latest/python/python_intro.html#install-xgboost

#### Неплохой гайд по установке под windows

https://www.ibm.com/developerworks/community/blogs/jfp/entry/Installing_XGBoost_For_Anaconda_on_Windows?lang=ru

## Задание:

1. Поставить XGBoost так, чтобы примеры из этого блокнота у вас работали
1. Поэкспериментируйте на этом датасете с параметрами градиентного бустинга и посмотрите, как изменение отдельных параметров влияет на результат.

In [None]:
# Hyper-parameter grid for the XGBoost grid search. Only `learning_rate`
# (the so-called `eta` value) and `n_estimators` vary: 2 x 2 = 4 candidates.
parameters = dict(
    learning_rate=[0.035, 0.04],
    max_depth=[6],
    min_child_weight=[1],
    subsample=[1.],
    colsample_bytree=[1.],
    colsample_bylevel=[1.],
    n_estimators=[200, 220],
    seed=[42],
)

In [None]:
# Exhaustive search over `parameters`, scored by accuracy on 4 stratified folds.
xgb_model = xgb.XGBClassifier()
clf = model_selection.GridSearchCV(
    xgb_model, parameters, n_jobs=1,
    # A fixed random_state pins the shuffled folds; without it every run draws
    # different folds and the selected parameters are not reproducible.
    cv=model_selection.StratifiedKFold(n_splits=4, shuffle=True, random_state=42),
    scoring='accuracy',
    verbose=4, refit=True)

In [None]:
# Run the grid search on the full feature matrix and labels.
clf.fit(X=bioresponce_data, y=bioresponce_target)

In [None]:
# Show the XGBoost parameters of the best estimator found by the search.
clf.best_estimator_.get_xgb_params()

In [None]:
# Mean 3-fold CV score of the best estimator, using the estimator's default scorer.
# NOTE(review): this reuses the data the grid search was fitted on, so the
# figure is likely optimistic.
model_selection.cross_val_score(
    clf.best_estimator_, bioresponce_data, bioresponce_target, cv=3
).mean()

# Код оценки качества

#### Подробности о том, как присылать решения и в каком формате, — позже

#### Итак, ваша цель — указать значения параметров xgb, при которых качество на кросс-валидации будет наилучшим

In [None]:
from sklearn.model_selection import cross_val_score
import xgboost as xgb
import pandas
import numpy as np
import signal

# Reload the data set (presumably so that this evaluation cell is self-contained).
bioresponce = pandas.read_csv('../seminar1/bioresponse.csv', header=0, sep=',')
bioresponce_target = bioresponce.Activity.values
# Drop the label by name rather than by position so it can never leak into
# the features, whatever the column order of the CSV is.
bioresponce_data = bioresponce.drop('Activity', axis=1)

def signal_handler(signum, frame):
    """Signal handler that aborts the running computation with a timeout error.

    Both arguments follow the ``signal.signal`` handler convention and are
    not used.

    Raises
    ------
    TimeoutError
        Always. ``TimeoutError`` is a subclass of ``Exception``, so existing
        ``except Exception`` handlers keep catching it.
    """
    raise TimeoutError("Timed out!")

def estimate_params(params):
    """Score an XGBoost parameter set by 3-fold cross-validated accuracy.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``xgb.XGBClassifier``.

    Returns
    -------
    float or None
        Mean accuracy over the 3 folds, computed on the module-level
        ``bioresponce_data`` / ``bioresponce_target``. ``None`` is returned
        when cross-validation raises; a ``RuntimeWarning`` describing the
        failure is emitted so the error is not lost silently.
    """
    import warnings

    # The SIGALRM-based 60-second limit is currently disabled; `signal_handler`
    # is the handler these two lines would install.
    #signal.signal(signal.SIGALRM, signal_handler)
    #signal.alarm(60)
    estimator = xgb.XGBClassifier(**params)
    try:
        # Call the `cross_val_score` imported in this cell: going through
        # `model_selection` would depend on an import from another cell, and
        # the resulting NameError would be swallowed by the handler below.
        fold_scores = cross_val_score(
            estimator, bioresponce_data, bioresponce_target,
            scoring='accuracy', cv=3
        )
    except Exception as error:
        # Keep the best-effort contract (None on failure) but say why it failed.
        warnings.warn(
            "estimate_params failed for params {!r}: {!r}".format(params, error),
            RuntimeWarning
        )
        return None

    return np.mean(fold_scores)

In [None]:
# Baseline configuration: print its cross-validated accuracy.
print(estimate_params(dict(
    learning_rate=0.1,
    max_depth=5,
    n_estimators=100,
    min_child_weight=3,
    seed=42,
)))

In [None]:
# Tuned configuration: print its cross-validated accuracy.
print(estimate_params(dict(
    learning_rate=0.049,
    max_depth=6,
    n_estimators=225,
    min_child_weight=3,
    seed=42,
    colsample_bylevel=1.0,
)))

In [None]:
# Save the chosen hyper-parameters as JSON. The context manager closes the
# file even if serialisation or the write raises.
with open('./checkers/xgb_params.json', 'w') as f:
    json.dump({
        'learning_rate': 0.049,
        'max_depth': 6,
        'n_estimators': 225,
        'min_child_weight': 3,
        'seed': 42,
        'colsample_bylevel': 1.0,
    }, f)

In [3]:
import checkers.xgboost_params_checker as checkers

ImportError: No module named 'xgboost'

In [2]:
# `checkers` is the alias of `checkers.xgboost_params_checker`; that import
# must have succeeded before this cell can run.
print(checkers.SCRIPT_DIR)
# Run the checker on the saved parameter file.
# NOTE(review): presumably this scores the submitted parameters - confirm in the checker module.
checkers.Checker().check('./checkers/xgb_params.json')

NameError: name 'checkers' is not defined