# Sklearn

## sklearn.cross_validation

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [32]:
from sklearn import model_selection, datasets

import numpy as np

### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [33]:
iris = datasets.load_iris()

In [34]:
train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)

In [35]:
#убедимся, что тестовая выборка действительно составляет 0.3 от всех данных
float(len(test_labels))/len(iris.data)

0.3

In [36]:
print('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(len(train_data),
                                                                                            len(test_data)))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [37]:
print('Обучающая выборка:\n', train_data[:5])
print('\n')
print('Тестовая выборка:\n', test_data[:5])

Обучающая выборка:
 [[ 7.2  3.   5.8  1.6]
 [ 5.4  3.   4.5  1.5]
 [ 7.6  3.   6.6  2.1]
 [ 4.8  3.   1.4  0.1]
 [ 5.7  3.   4.2  1.2]]


Тестовая выборка:
 [[ 5.8  2.7  5.1  1.9]
 [ 6.3  2.8  5.1  1.5]
 [ 5.5  2.6  4.4  1.2]
 [ 5.   3.2  1.2  0.2]
 [ 5.   3.   1.6  0.2]]


In [38]:
print('Метки классов на обучающей выборке:\n', train_labels)
print('\n')
print('Метки классов на тестовой выборке:\n', test_labels)

Метки классов на обучающей выборке:
 [2 1 2 0 1 2 1 2 1 0 0 0 1 0 1 2 0 2 1 1 1 2 1 0 0 2 0 1 2 2 2 2 0 1 1 0 2
 1 0 1 0 0 2 2 1 1 1 2 2 0 1 0 0 0 2 1 0 2 2 1 2 0 1 2 0 1 0 1 2 2 1 0 1 1
 2 2 1 1 2 2 1 2 2 2 0 0 0 0 1 2 0 2 0 1 1 0 0 2 2 0 0 1 1 0 2]


Метки классов на тестовой выборке:
 [2 2 1 0 0 2 0 0 2 1 1 1 1 1 1 1 2 2 2 1 0 2 1 0 0 2 0 1 0 2 2 1 1 0 0 0 0
 0 1 1 0 2 0 2 2]


### Стратегии проведения кросс-валидации

#### KFold

In [39]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 5):
    print(train_indices, test_indices)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [40]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 2, shuffle = True):
    print(train_indices, test_indices)

[0 3 5 6 8] [1 2 4 7 9]
[1 2 4 7 9] [0 3 5 6 8]


In [41]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 2, shuffle = True, random_state = 1):
    print(train_indices, test_indices)

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [42]:
target = np.array([0] * 5 + [1] * 5)
print(target)
for train_indices, test_indices in cross_validation.StratifiedKFold(target, n_folds = 2, shuffle = True, random_state = 0):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [43]:
target = np.array([0, 1] * 5)
print(target)
for train_indices, test_indices in cross_validation.StratifiedKFold(target, n_folds = 2,shuffle = True):
    print(train_indices, test_indices)

[0 1 0 1 0 1 0 1 0 1]
[5 6 7 8] [0 1 2 3 4 9]
[0 1 2 3 4 9] [5 6 7 8]


#### ShuffleSplit

In [51]:
?cross_validation.cross_val_score ()

for train_indices, test_indices in cross_validation.ShuffleSplit(10, n_iter = 10, test_size = 0.2):
    print(train_indices, test_indices)

[5 9 8 0 6 3 4 1] [7 2]
[0 7 4 6 1 5 9 3] [8 2]
[2 1 9 5 3 0 8 6] [7 4]
[4 6 2 5 1 7 3 9] [8 0]
[7 9 5 6 8 2 0 3] [4 1]
[4 5 1 0 6 7 3 8] [2 9]
[6 9 8 3 1 4 5 2] [7 0]
[9 6 4 1 2 0 3 5] [7 8]
[9 4 0 3 5 8 2 7] [1 6]
[3 4 1 9 8 2 0 5] [7 6]


#### StratifiedShuffleSplit

In [45]:
target = np.array([0] * 5 + [1] * 5)
print(target)
for train_indices, test_indices in cross_validation.StratifiedShuffleSplit(target, n_iter = 4, test_size = 0.2):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[8 3 4 1 7 5 9 0] [2 6]
[1 5 9 6 2 0 8 4] [7 3]
[2 5 6 3 8 9 0 1] [7 4]
[5 7 2 3 6 9 4 1] [8 0]


#### Leave-One-Out

In [46]:
for train_indices, test_index in cross_validation.LeaveOneOut(10):
    print(train_indices, test_index)

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators