# Cross validation strategies with sklearn
## References
- [validationの切り方いろいろ（sklearnの関数まとめ）【kaggle Advent Calendar 4日目】 - u++の備忘録](https://upura.hatenablog.com/entry/2018/12/04/224436)
    - Very helpful for understanding how each strategy splits the data!
- [3.1. Cross-validation: evaluating estimator performance — scikit-learn 0.21.2 documentation](https://scikit-learn.org/stable/modules/cross_validation.html#multimetric-cross-validation)
- [API Reference — scikit-learn 0.21.2 documentation](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection)


In [1]:
import pandas as pd
from sklearn.model_selection import (GroupKFold, KFold, ShuffleSplit,
                                     StratifiedKFold,RepeatedKFold)

# Data Preparation

In [4]:
# Six-row toy dataset: a binary target `y` plus two feature columns.
# The frame is built in a single constructor call so the cell is idempotent.
df = pd.DataFrame({
    'y': [0] * 3 + [1] * 3,          # first three rows are class 0, last three class 1
    'sex': ['male', 'female'] * 3,   # alternates row by row
    'rank': [1, 1, 2, 2, 3, 3],      # each rank value covers two consecutive rows
})

df

Unnamed: 0,y,sex,rank
0,0,male,1
1,0,female,1
2,0,male,2
3,1,female,2
4,1,male,3
5,1,female,3


In [5]:
# Features and target are both kept as DataFrames (note the list selectors),
# so `y` is a single-column frame rather than a Series.
X = df.loc[:, ['sex', 'rank']]
y = df.loc[:, ['y']]

# KFold
- 挙動
    - 前から順に切っていく
    - shuffleした場合は混ざる
    - validでの重複は**起こらない**
- 問題が起こる場合
    - クラスバランスの偏り


In [6]:
# Check the row order and the class balance of each training fold.
# KFold yields train/validation index arrays; with no shuffle argument the
# dataset is cut into k consecutive folds.
kf = KFold(n_splits=3)

for train_index, valid_index in kf.split(X=X, y=y):
    print(train_index, valid_index)
    # Show the training rows, then a separator line between folds.
    print(df.iloc[train_index], '------------------', sep='\n')

[2 3 4 5] [0 1]
   y     sex  rank
2  0    male     2
3  1  female     2
4  1    male     3
5  1  female     3
------------------
[0 1 4 5] [2 3]
   y     sex  rank
0  0    male     1
1  0  female     1
4  1    male     3
5  1  female     3
------------------
[0 1 2 3] [4 5]
   y     sex  rank
0  0    male     1
1  0  female     1
2  0    male     2
3  1  female     2
------------------


In [7]:
# Check data order again without shuffling.
# KFold yields train/validation index arrays and, without shuffling, cuts the
# dataset into k consecutive folds.
# `random_state` is deliberately NOT passed here: it only influences the
# result when shuffle=True, and newer scikit-learn releases raise a
# ValueError when it is set while shuffle is False. The folds below are
# therefore the same consecutive folds a plain KFold(n_splits=3) produces.
kf = KFold(n_splits=3)

for train_index, valid_index in kf.split(X, y):
    print(train_index, valid_index)
    print(df.iloc[train_index])
    print('------------------')

[2 3 4 5] [0 1]
   y     sex  rank
2  0    male     2
3  1  female     2
4  1    male     3
5  1  female     3
------------------
[0 1 4 5] [2 3]
   y     sex  rank
0  0    male     1
1  0  female     1
4  1    male     3
5  1  female     3
------------------
[0 1 2 3] [4 5]
   y     sex  rank
0  0    male     1
1  0  female     1
2  0    male     2
3  1  female     2
------------------


In [8]:
# Same 3-fold split, but with shuffling enabled; the fixed seed keeps the
# shuffled assignment reproducible across runs.
kf = KFold(n_splits=3, shuffle=True, random_state=0)

for train_index, valid_index in kf.split(X=X, y=y):
    print(train_index, valid_index)
    # Show the training rows, then a separator line between folds.
    print(df.iloc[train_index], '------------------', sep='\n')

[0 1 3 4] [2 5]
   y     sex  rank
0  0    male     1
1  0  female     1
3  1  female     2
4  1    male     3
------------------
[0 2 4 5] [1 3]
   y     sex  rank
0  0    male     1
2  0    male     2
4  1    male     3
5  1  female     3
------------------
[1 2 3 5] [0 4]
   y     sex  rank
1  0  female     1
2  0    male     2
3  1  female     2
5  1  female     3
------------------


# StratifiedKFold

In [9]:
# Stratified 3-fold split, shuffled with a fixed seed; `y` is passed to
# split() so the splitter can see the class labels.
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

for train_index, valid_index in skf.split(X=X, y=y):
    print(train_index, valid_index)
    # Show the training rows, then a separator line between folds.
    print(df.iloc[train_index], '------------------', sep='\n')

[0 1 3 4] [2 5]
   y     sex  rank
0  0    male     1
1  0  female     1
3  1  female     2
4  1    male     3
------------------
[0 2 4 5] [1 3]
   y     sex  rank
0  0    male     1
2  0    male     2
4  1    male     3
5  1  female     3
------------------
[1 2 3 5] [0 4]
   y     sex  rank
1  0  female     1
2  0    male     2
3  1  female     2
5  1  female     3
------------------


# GroupKFold

In [10]:
# Group labels alternate 0/1 row by row, the same pattern as the `sex`
# column (male rows get 0, female rows get 1).
groups = [0, 1] * 3

gkf = GroupKFold(n_splits=2)

for train_index, valid_index in gkf.split(X, y, groups=groups):
    print(train_index, valid_index)
    # Show the training rows, then a separator line between folds.
    print(df.iloc[train_index], '------------------', sep='\n')

[0 2 4] [1 3 5]
   y   sex  rank
0  0  male     1
2  0  male     2
4  1  male     3
------------------
[1 3 5] [0 2 4]
   y     sex  rank
1  0  female     1
3  1  female     2
5  1  female     3
------------------


In [11]:
# Display the feature frame (the `sex` and `rank` columns of `df`) for reference.
X

Unnamed: 0,sex,rank
0,male,1
1,female,1
2,male,2
3,female,2
4,male,3
5,female,3


In [12]:
# GroupKFold again, this time with three hand-written group labels.
# NOTE(review): despite its name, `rank_groups` does not mirror df['rank']
# ([1, 1, 2, 2, 3, 3]); it pairs rows 0/3, 1/4 and 2/5 instead. Confirm this
# is the intended grouping.
rank_groups = [0, 1, 2] * 2

gkf = GroupKFold(n_splits=2)

for train_index, valid_index in gkf.split(X=X, y=y, groups=rank_groups):
    print(train_index, valid_index)
    # Show the training rows, then a separator line between folds.
    print(df.iloc[train_index], '------------------', sep='\n')

[1 4] [0 2 3 5]
   y     sex  rank
1  0  female     1
4  1    male     3
------------------
[0 2 3 5] [1 4]
   y     sex  rank
0  0    male     1
2  0    male     2
3  1  female     2
5  1  female     3
------------------


# ShuffleSplit

In [13]:
# Two independent random splits, each using half of the rows for training.
# The fixed seed makes the draws reproducible.
ss = ShuffleSplit(n_splits=2, train_size=0.50, random_state=0)

for train_index, valid_index in ss.split(X=X, y=y):
    print(train_index, valid_index)
    # Show the training rows, then a separator line between splits.
    print(df.iloc[train_index], '------------------', sep='\n')

[3 0 4] [5 2 1]
   y     sex  rank
3  1  female     2
0  0    male     1
4  1    male     3
------------------
[0 2 5] [1 3 4]
   y     sex  rank
0  0    male     1
2  0    male     2
5  1  female     3
------------------


In [14]:
# Same half/half random split, but drawn three times instead of two
# (same seed as the two-split cell).
ss = ShuffleSplit(n_splits=3, train_size=0.50, random_state=0)

for train_index, valid_index in ss.split(X=X, y=y):
    print(train_index, valid_index)
    # Show the training rows, then a separator line between splits.
    print(df.iloc[train_index], '------------------', sep='\n')

[3 0 4] [5 2 1]
   y     sex  rank
3  1  female     2
0  0    male     1
4  1    male     3
------------------
[0 2 5] [1 3 4]
   y     sex  rank
0  0    male     1
2  0    male     2
5  1  female     3
------------------
[2 4 0] [3 5 1]
   y   sex  rank
2  0  male     2
4  1  male     3
0  0  male     1
------------------


In [15]:
# Two random splits with a larger training share (train_size=0.67).
ss = ShuffleSplit(n_splits=2, train_size=0.67, random_state=0)

for train_index, valid_index in ss.split(X=X, y=y):
    print(train_index, valid_index)
    # Show the training rows, then a separator line between splits.
    print(df.iloc[train_index], '------------------', sep='\n')

[1 3 0 4] [5 2]
   y     sex  rank
1  0  female     1
3  1  female     2
0  0    male     1
4  1    male     3
------------------
[4 0 2 5] [1 3]
   y     sex  rank
4  1    male     3
0  0    male     1
2  0    male     2
5  1  female     3
------------------


In [16]:
# RepeatedKFold with a single repetition: one seeded pass of 3-fold splitting.
rkf = RepeatedKFold(n_repeats=1, n_splits=3, random_state=0)

for train_index, valid_index in rkf.split(X=X, y=y):
    print(train_index, valid_index)
    # Show the training rows, then a separator line between folds.
    print(df.iloc[train_index], '------------------', sep='\n')

[0 1 3 4] [2 5]
   y     sex  rank
0  0    male     1
1  0  female     1
3  1  female     2
4  1    male     3
------------------
[0 2 4 5] [1 3]
   y     sex  rank
0  0    male     1
2  0    male     2
4  1    male     3
5  1  female     3
------------------
[1 2 3 5] [0 4]
   y     sex  rank
1  0  female     1
2  0    male     2
3  1  female     2
5  1  female     3
------------------


In [17]:
# RepeatedKFold with two repetitions: 3 folds x 2 repeats = 6 index pairs.
# Only the indices are printed here, not the training rows.
rkf = RepeatedKFold(n_repeats=2, n_splits=3, random_state=0)

for train_index, valid_index in rkf.split(X=X, y=y):
    # The separator line is emitted through `end` right after the index pair.
    print(train_index, valid_index, end='\n------------------\n')

[0 1 3 4] [2 5]
------------------
[0 2 4 5] [1 3]
------------------
[1 2 3 5] [0 4]
------------------
[0 2 4 5] [1 3]
------------------
[1 2 3 5] [0 4]
------------------
[0 1 3 4] [2 5]
------------------
