# Обучаем первые классификаторы в sklearn

### Данные


По данным характеристикам молекулы требуется определить, будет ли дан биологический ответ (biological response).

Для демонстрации используется обучающая выборка из исходных данных bioresponse.csv, файл с данными прилагается.

### Готовим обучающую и тестовую выборки

In [26]:
import pandas as pd
import numpy as np

bioresponce = pd.read_csv('bioresponse.csv', header=0, sep=',')

In [27]:
bioresponce.head(5)

Unnamed: 0,Activity,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,D1767,D1768,D1769,D1770,D1771,D1772,D1773,D1774,D1775,D1776
0,1,0.0,0.497009,0.1,0.0,0.132956,0.678031,0.273166,0.585445,0.743663,...,0,0,0,0,0,0,0,0,0,0
1,1,0.366667,0.606291,0.05,0.0,0.111209,0.803455,0.106105,0.411754,0.836582,...,1,1,1,1,0,1,0,0,1,0
2,1,0.0333,0.480124,0.0,0.0,0.209791,0.61035,0.356453,0.51772,0.679051,...,0,0,0,0,0,0,0,0,0,0
3,1,0.0,0.538825,0.0,0.5,0.196344,0.72423,0.235606,0.288764,0.80511,...,0,0,0,0,0,0,0,0,0,0
4,0,0.1,0.517794,0.0,0.0,0.494734,0.781422,0.154361,0.303809,0.812646,...,0,0,0,0,0,0,0,0,0,0


In [28]:
y = bioresponce.Activity.values

In [29]:
X = bioresponce.iloc[:, 1:]

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Строим модель и оцениваем качество

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
model = LogisticRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

In [33]:
type(preds)

numpy.ndarray

In [34]:
10 // 9

1

In [35]:
print(sum(preds == y_test) / len(preds))

0.75605815832


In [36]:
print(sum(preds == y_test) / float(len(preds)))

0.75605815832


In [37]:
from sklearn.metrics import accuracy_score

print(accuracy_score(preds, y_test))

0.75605815832


### Качество на кросс-валидации

In [38]:
from sklearn.model_selection import cross_val_score

print(cross_val_score(model, X_train, y_train, cv=5))

[ 0.74404762  0.73956262  0.72310757  0.75099602  0.75896414]


In [39]:
print(cross_val_score(model, X_train, y_train, cv=5).mean())

0.743335594477


### Пробуем другие классификаторы

In [40]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [41]:
%%time

models = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    LinearSVC(),
    RandomForestClassifier(n_estimators=100), 
    GradientBoostingClassifier(n_estimators=100)
]

for model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(accuracy_score(preds, y_test), model)

0.718901453958 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
0.699515347334 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
0.743134087237 LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
0.785137318255 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
 

## Опциональное задание:

Попробуйте разные классификаторы с разными параметрами и постарайтесь добиться максимального качества на тестовой выборке

In [56]:
from sklearn.model_selection import GridSearchCV
from sklearn import grid_search

In [113]:
def grid_search_CV(classifier, parameters_grid):
    grid_cv = grid_search.GridSearchCV(classifier, parameters_grid, scoring='accuracy', cv=5)
    %%time
    grid_cv.fit(X_train, y_train)
    print(classifier)
    print("Grid cv")
    print("Best estimator = {}".format(grid_cv.best_estimator_))
    print("Best score = {}".format(grid_cv.best_score_))
    print("Best params = {}".format(grid_cv.best_params_))
    # print("Grid cv: best 10 scores = {}".format(grid_cv.grid_scores_[:10]))
    randomized_grid_cv = grid_search.RandomizedSearchCV(classifier, parameters_grid, scoring='accuracy', cv=5, n_iter = 20, 
                                                       random_state = 0)
    %%time
    randomized_grid_cv.fit(X_train, y_train)
    print("Randomized grid cv")
    print("Best score = {}".format(randomized_grid_cv.best_score_))
    print("Best params = {}".format(randomized_grid_cv.best_params_))
    return grid_cv, randomized_grid_cv

In [114]:
KNeighbors_parameters_grid = {
    'metric' : ['minkowski', 'manhattan'],
    'weights' : ['uniform', 'distance'],
    'leaf_size' : np.arange(27, 33),
    'n_neighbors' : np.arange(4, 9)
}

In [115]:
KNeighbors_grid_cv = grid_search_CV(models[0], KNeighbors_parameters_grid)

Wall time: 0 ns
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
Grid cv
Best estimator = KNeighborsClassifier(algorithm='auto', leaf_size=27, metric='manhattan',
           metric_params=None, n_jobs=1, n_neighbors=7, p=2,
           weights='distance')
Best score = 0.7576601671309192
Best params = {'leaf_size': 27, 'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'distance'}
Wall time: 0 ns
Randomized grid cv
Best score = 0.7576601671309192
Best params = {'weights': 'distance', 'n_neighbors': 7, 'metric': 'manhattan', 'leaf_size': 31}


In [116]:
DecisionTree_parameters_grid = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : np.arange(1, 6),
    'min_samples_split' : np.arange(2, 5),
    'min_samples_leaf' : np.arange(1, 4),
    'max_features' : ['log2', 'sqrt']
}

In [117]:
DecisionTree_grid_cv = grid_search_CV(models[1], DecisionTree_parameters_grid)

Wall time: 0 ns
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Grid cv
Best estimator = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
Best score = 0.6908077994428969
Best params = {'criterion': 'gini', 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 3}
Wall time: 0 ns
Randomized grid cv
Best score = 0.645841623557501
Best params = {'min_samples_split': 3, 

In [118]:
LinearSVC_parameters_grid = {
    'multi_class' : ['ovr', 'crammer_singer'],
    'fit_intercept' : ['True', 'False'],
    'max_iter' : np.arange(500, 1501, 250)
}

In [119]:
LinearSVC_grid_cv = grid_search_CV(models[2], LinearSVC_parameters_grid)

Wall time: 0 ns
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
Grid cv
Best estimator = LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept='True',
     intercept_scaling=1, loss='squared_hinge', max_iter=500,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
Best score = 0.7337843215280542
Best params = {'fit_intercept': 'True', 'max_iter': 500, 'multi_class': 'ovr'}
Wall time: 0 ns
Randomized grid cv
Best score = 0.7309988062077198
Best params = {'multi_class': 'ovr', 'max_iter': 1250, 'fit_intercept': 'True'}


In [120]:
RandomForest_parameters_grid = {
    'n_estimators' : np.arange(50, 301, 50),
    'criterion' : ['gini', 'entropy'],
    'max_features' : ['sqrt', 'log2'],
    'bootstrap' : ['True', 'False'],
    'min_samples_split' : np.arange(2, 5),
    'min_samples_leaf' : np.arange(1, 4)
}

In [121]:
RandomForest_grid_cv = grid_search_CV(models[3], RandomForest_parameters_grid)

Wall time: 0 ns
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Grid cv
Best estimator = RandomForestClassifier(bootstrap='False', class_weight=None,
            criterion='entropy', max_depth=None, max_features='sqrt',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=2,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            n_estimators=250, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Best score = 0.8030242737763629
Best params = {'bootstrap': 'False', 'criterion': 'entropy', 'max_features': 

In [122]:
GradientBoosting_parameters_grid = {
    'n_estimators' : np.arange(50, 301, 50),
    'loss' : ['deviance', 'exponential'],
    'max_depth' : np.arange(3, 6),
    'max_features' : ['sqrt', 'log2'],
    'criterion' : ['friedman_mse', 'mse', 'mae']
}

In [123]:
GradientBoosting_grid_cv = grid_search_CV(models[4], GradientBoosting_parameters_grid)

Wall time: 0 ns
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Grid cv
Best estimator = GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=4,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=250,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
Best score = 0.

На данной выборке наилучший результат показал алгоритм RandomForest.

In [125]:
print("Best score = {}".format(RandomForest_grid_cv[0].best_score_))

Best score = 0.8030242737763629


## GridSearchCV VS RandomizedSearchCV

GridSearchCV - позволяет подбирать наилучшие параметры по сетке (которую задаем мы сами: {model}_parameters_grid) методом полного перебора. RandomizedSearchCV - выбирает случайным образом из нашей сетки параметров N наборов параметров (в нашем случае это N = 20) и возвращает из них лучший набор параметров.

GridSearchCV подходит больше для небольшой выборки, поскольку этот метод делает полный перебор по всей сетке. В свою очередь RandomizedSearchCV подходит для выборок любого размера. 

Хотя RandomizedSearchCV не вседа дает лучший результат, этот метод дает неплохое приближение к результату, который получается в случае GridSearchCV: 

* для методов GradientBoosting и LinearSVC - отличие в третьем порядке, 
* для методов DecisionTree и RandomForest - отличие во втором порядке, 
* а в случае с KNeighbors - нам повезло и RandomizedSearchCV дал тот же результат, что и GridSearchCV.