# Sklearn

## sklearn.cross_validation

документация: http://scikit-learn.org/stable/modules/cross_validation.html

In [1]:
from sklearn import cross_validation, datasets

import numpy as np



### Разовое разбиение данных на обучение и тест с помощью train_test_split

In [2]:
iris = datasets.load_iris()

In [3]:
train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(iris.data, iris.target, 
                                                                                     test_size = 0.3)
# получается разбиение на обучающую и тестовую выборку происходит с помощью модуля cross_validation

In [16]:
train_labels

array([1, 0, 2, 1, 2, 1, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 2,
       2, 1, 2, 2, 0, 0, 0, 1, 2, 2, 0, 0, 2, 1, 0, 1, 0, 2, 2, 0, 2, 2,
       0, 1, 0, 1, 2, 0, 2, 2, 2, 0, 2, 2, 1, 1, 1, 0, 1, 2, 2, 1, 2, 0,
       0, 2, 0, 1, 0, 0, 1, 2, 2, 0, 1, 1, 2, 0, 2, 2, 1, 0, 2, 1, 0, 2,
       0, 1, 2, 1, 1, 0, 1, 1, 0, 0, 0, 1, 2, 1, 1, 1, 2])

In [17]:
test_labels

array([0, 0, 2, 1, 2, 1, 0, 1, 2, 2, 0, 0, 2, 2, 0, 1, 1, 1, 0, 2, 2, 1,
       1, 0, 1, 0, 0, 1, 1, 2, 2, 1, 0, 2, 2, 2, 0, 2, 0, 2, 1, 0, 1, 2,
       1])

In [4]:
#убедимся, что тестовая выборка действительно составляет 0.3 от всех данных
float(len(test_labels))/len(iris.data)

0.3

In [14]:
float(len(test_data))/len(iris.data)

0.3

In [13]:
print ('Размер обучающей выборки: {} объектов \nРазмер тестовой выборки: {} объектов'.format(len(train_data),
                                                                                            len(test_data)))

Размер обучающей выборки: 105 объектов 
Размер тестовой выборки: 45 объектов


In [18]:
print ('Обучающая выборка:\n', train_data[:5])
print ('\n')
print ('Тестовая выборка:\n', test_data[:5])

Обучающая выборка:
 [[5.  2.3 3.3 1. ]
 [5.4 3.9 1.7 0.4]
 [7.7 3.  6.1 2.3]
 [6.  3.4 4.5 1.6]
 [6.7 3.1 5.6 2.4]]


Тестовая выборка:
 [[5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [6.3 3.4 5.6 2.4]
 [5.6 2.5 3.9 1.1]
 [6.  2.2 5.  1.5]]


In [19]:
print ('Метки классов на обучающей выборке:\n', train_labels)
print ('\n')
print ('Метки классов на тестовой выборке:\n', test_labels)

Метки классов на обучающей выборке:
 [1 0 2 1 2 1 0 1 1 0 0 2 0 0 0 1 1 0 1 1 0 2 2 1 2 2 0 0 0 1 2 2 0 0 2 1 0
 1 0 2 2 0 2 2 0 1 0 1 2 0 2 2 2 0 2 2 1 1 1 0 1 2 2 1 2 0 0 2 0 1 0 0 1 2
 2 0 1 1 2 0 2 2 1 0 2 1 0 2 0 1 2 1 1 0 1 1 0 0 0 1 2 1 1 1 2]


Метки классов на тестовой выборке:
 [0 0 2 1 2 1 0 1 2 2 0 0 2 2 0 1 1 1 0 2 2 1 1 0 1 0 0 1 1 2 2 1 0 2 2 2 0
 2 0 2 1 0 1 2 1]


### Стратегии проведения кросс-валидации

#### KFold

In [55]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 5):
    print (train_indices, test_indices)

[2 3 4 5 6 7 8 9] [0 1]
[0 1 4 5 6 7 8 9] [2 3]
[0 1 2 3 6 7 8 9] [4 5]
[0 1 2 3 4 5 8 9] [6 7]
[0 1 2 3 4 5 6 7] [8 9]


In [57]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 2, shuffle = True):
    print (train_indices, test_indices)

[0 1 4 8 9] [2 3 5 6 7]
[2 3 5 6 7] [0 1 4 8 9]


In [58]:
for train_indices, test_indices in cross_validation.KFold(10, n_folds = 2, shuffle = True, random_state = 1):
    print (train_indices, test_indices)

[1 3 5 7 8] [0 2 4 6 9]
[0 2 4 6 9] [1 3 5 7 8]


#### StratifiedKFold

In [68]:
target = np.array([0] * 5 + [1] * 5)
print (target)
for train_indices, test_indices in cross_validation.StratifiedKFold(target, n_folds = 2, shuffle = True, random_state = 0):
    print (train_indices, test_indices)
    
# first fold (with its training&test data) and second fold (with its training&test data)
# so 2-fold means there are two datasets extracted from one dataset

[0 0 0 0 0 1 1 1 1 1]
[3 4 8 9] [0 1 2 5 6 7]
[0 1 2 5 6 7] [3 4 8 9]


In [74]:
target = np.array([0, 1] * 5)
print (target)
for train_indices, test_indices in cross_validation.StratifiedKFold(target, n_folds = 2,shuffle = True):
    print (train_indices, test_indices)
# same as before but Shuffle added

[0 1 0 1 0 1 0 1 0 1]
[0 1 2 3] [4 5 6 7 8 9]
[4 5 6 7 8 9] [0 1 2 3]


#### ShuffleSplit

In [87]:
for train_indices, test_indices in cross_validation.ShuffleSplit(50, n_iter = 10, test_size = 0.2):
    print (train_indices, test_indices)
    
# first shuffles and then makes usual split to training Vs test data

[44 32 29 36 13 14 18 31  5  7  6 21 42 23  9  8 27 33 26  4 25 45 19 35
  2  3 24 12 41 11 38 48 15  1 28 46 22  0 47 37] [39 20 17 43 30 40 10 34 16 49]
[36 17  2  9 31 22 13 21 42 23  8 47 45  7 34 44 38 41 30 33 49 32 35 14
 16 48 25 20 28  1 46  3 10  6 12 15  4 19 40 18] [ 0 27 39 11 24 29 26 37 43  5]
[ 5 39  4 24 34 30 28 11 46 41  6 29 33 25 16 19 20 47 36  9 45 31 37  3
 14 23 26 10  2 49 43 21 13 15  0 22 42 44  8 12] [35 18 40  7 38 32 17 48 27  1]
[14 30 24  7 38 46 28  0 35 22 39 20 34 49 36 41 33  4  5 43 48 27 11 29
 13 25 10  8 45  9 23 32 26 21  1 12 42 16 40 47] [37  3  2 15 18 44 17 31  6 19]
[44 11 42 32 46 22  4 10 24 31 33 12 27 34 43 14 21 13 29  7 23 25 38 19
 28  0 40  2 16 37 48  9 36 49  5 30 15 26  6 17] [20  1 41 18 35  8 45 47  3 39]
[24  8 39 28  0 22 12 34 25 44 42 49 45  2  4 41  9  5 21 32 46 36 26 15
 30 48 27  1  3 29 40 38 23 13 16 31 17 47 18 10] [37 35 19 14 33  6 43 11 20  7]
[18 20 40 17 13 24  6 38 34 49 28 27 33 37 43 36 35 23  0  3  2 16 41 

#### StratifiedShuffleSplit

In [81]:
target = np.array([0] * 5 + [1] * 5)
print (target)
for train_indices, test_indices in cross_validation.StratifiedShuffleSplit(target, n_iter = 4, test_size = 0.2):
    print (train_indices, test_indices)
# variation of Shuffle split
# first shuffles data, then takes splits by preserving the same percentage for each target class as in the complete set.

[0 0 0 0 0 1 1 1 1 1]
[3 9 1 2 5 0 7 6] [8 4]
[1 8 0 6 3 7 2 5] [9 4]
[4 2 6 8 3 0 9 5] [1 7]
[7 6 5 3 4 9 2 1] [8 0]


#### Leave-One-Out

In [83]:
for train_indices, test_index in cross_validation.LeaveOneOut(10):
    print (train_indices, test_index)
# Each learning set is created by taking all the samples except one, the test set being the sample left out.

[1 2 3 4 5 6 7 8 9] [0]
[0 2 3 4 5 6 7 8 9] [1]
[0 1 3 4 5 6 7 8 9] [2]
[0 1 2 4 5 6 7 8 9] [3]
[0 1 2 3 5 6 7 8 9] [4]
[0 1 2 3 4 6 7 8 9] [5]
[0 1 2 3 4 5 7 8 9] [6]
[0 1 2 3 4 5 6 8 9] [7]
[0 1 2 3 4 5 6 7 9] [8]
[0 1 2 3 4 5 6 7 8] [9]


Больше стратегий проведения кросс-валидации доступно здесь: http://scikit-learn.org/stable/modules/cross_validation.html#cross-validation-iterators