In [50]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold

In [51]:
# Synthetic, class-imbalanced toy data set.
# x1, x2: random integers in [0, 10).
# y: the first 50 rows are 1 and the remaining 150 rows are 0 (skewed positives/negatives).
# A seeded generator is used so the features are identical after "Restart & Run All".
_DATA_SEED = 0
_N_POSITIVE = 50
_N_NEGATIVE = 150
_N_ROWS = _N_POSITIVE + _N_NEGATIVE

_rng = np.random.default_rng(_DATA_SEED)
dat = pd.DataFrame(dict(
    x1=_rng.integers(0, 10, _N_ROWS),
    x2=_rng.integers(0, 10, _N_ROWS),
    y=[1] * _N_POSITIVE + [0] * _N_NEGATIVE,
))

# 方法その1 train_test_split

In [63]:
# Settings for a single hold-out split; the label column is passed as the
# stratification target.
_TEST_SIZE, _RANDOM_STATE = 0.2, 1
_SHUFFLE = True
_STRATIFY = dat["y"]

dat_learn, dat_cv = train_test_split(
    dat,
    test_size=_TEST_SIZE,
    random_state=_RANDOM_STATE,
    shuffle=_SHUFFLE,
    stratify=_STRATIFY,
)

In [64]:
# Report the hold-out split: size of each part, share of rows with y == 1 in
# each part, and the index labels that appear in both parts.
# The second ratio line is computed from dat_cv, so it is labelled dat_cv.
print(f"""
#dat_learn = {len(dat_learn)}
#dat_cv = {len(dat_cv)}

%dat_learn 1s = {np.sum(dat_learn.y == 1) / len(dat_learn):.2f}
%dat_cv 1s = {np.sum(dat_cv.y == 1) / len(dat_cv):.2f}

intersection = {np.intersect1d(dat_learn.index, dat_cv.index)}
""")


#dat_learn = 160
#dat_cv = 40

%dat_learn 1s = 0.25
%dat_learn 1s = 0.25

intersection = []



<b>train_test_split</b> を使うと、正負の割合を保った分割を行ってくれる。

# 方法その2 StratifiedShuffleSplit

In [61]:
# Settings for the StratifiedShuffleSplit splitter used in the next cell.
_N_SPLITS, _TEST_SIZE, _RANDOM_STATE = 3, 0.2, 1
sss = StratifiedShuffleSplit(
    n_splits=_N_SPLITS,
    test_size=_TEST_SIZE,
    random_state=_RANDOM_STATE,
)

In [62]:
# Per-row counters: how many times each row was placed in the learn part
# (lind) and in the cv part (cind) over all splits.
lind = np.zeros(len(dat))
cind = np.zeros(len(dat))
for learn_index, cv_index in sss.split(dat, dat.y):
    # split() yields positional indices, so rows are selected with .iloc;
    # .loc would only be equivalent while dat keeps its default RangeIndex.
    dat_learn = dat.iloc[learn_index]
    dat_cv = dat.iloc[cv_index]
    print(f"""
    len(dat_learn): {len(dat_learn)}
    len(dat_cv): {len(dat_cv)}
    # 1 in learn: {np.sum(dat_learn.y==1)}
    # 1 in cv: {np.sum(dat_cv.y==1)}
    """)
    lind[learn_index] += 1
    cind[cv_index] += 1
print(lind)
print(cind)


    len(dat_learn): 160
    len(dat_cv): 40
    # 1 in learn: 40
    # 1 in cv: 10
    

    len(dat_learn): 160
    len(dat_cv): 40
    # 1 in learn: 40
    # 1 in cv: 10
    

    len(dat_learn): 160
    len(dat_cv): 40
    # 1 in learn: 40
    # 1 in cv: 10
    
[3. 3. 3. 3. 3. 2. 3. 1. 3. 3. 1. 3. 2. 3. 2. 2. 3. 1. 3. 1. 2. 2. 3. 3.
 3. 2. 3. 2. 3. 2. 3. 3. 1. 3. 3. 3. 3. 3. 2. 2. 3. 3. 2. 1. 3. 2. 2. 1.
 2. 2. 2. 1. 3. 3. 3. 3. 2. 0. 2. 3. 3. 2. 3. 2. 3. 2. 2. 3. 2. 2. 2. 3.
 2. 3. 3. 2. 3. 2. 3. 2. 3. 3. 1. 3. 3. 3. 3. 0. 2. 2. 3. 2. 2. 2. 3. 3.
 2. 1. 3. 3. 0. 3. 1. 3. 3. 3. 3. 2. 2. 3. 2. 1. 3. 2. 2. 3. 3. 3. 2. 2.
 3. 2. 2. 3. 3. 3. 3. 3. 3. 2. 3. 2. 3. 3. 3. 2. 2. 2. 2. 2. 3. 2. 3. 3.
 3. 3. 1. 1. 2. 3. 2. 2. 2. 2. 3. 3. 3. 3. 3. 3. 2. 3. 2. 3. 3. 2. 3. 3.
 3. 3. 3. 2. 1. 2. 3. 3. 2. 1. 2. 1. 3. 3. 3. 2. 2. 2. 2. 2. 3. 2. 2. 3.
 2. 3. 2. 3. 2. 3. 2. 2.]
[0. 0. 0. 0. 0. 1. 0. 2. 0. 0. 2. 0. 1. 0. 1. 1. 0. 2. 0. 2. 1. 1. 0. 0.
 0. 1. 0. 1. 0. 1. 0. 0. 2. 0. 0. 0. 0. 0. 1. 1. 0

StratifiedShuffleSplitを使うと、毎回、分割を新しく(独立に)実行する。そのため、上の出力のように、あるデータは複数回 cv 側に入り、あるデータは一度も cv 側に入らない、ということが起こる。

# 方法その3 StratifiedKFold

In [56]:
# Settings for the StratifiedKFold splitter used in the next cell.
_N_SPLITS, _RANDOM_STATE = 3, 1
_SHUFFLE = True
kf = StratifiedKFold(
    n_splits=_N_SPLITS,
    random_state=_RANDOM_STATE,
    shuffle=_SHUFFLE,
)

In [60]:
# Per-row counters: how many times each row was placed in the learn part
# (lind) and in the cv part (cind) over all folds.
lind = np.zeros(len(dat))
cind = np.zeros(len(dat))
for learn_index, cv_index in kf.split(dat, dat.y):
    # split() yields positional indices, so rows are selected with .iloc;
    # .loc would only be equivalent while dat keeps its default RangeIndex.
    dat_learn = dat.iloc[learn_index]
    dat_cv = dat.iloc[cv_index]
    print(f"""
    len(dat_learn): {len(dat_learn)}
    len(dat_cv): {len(dat_cv)}
    # 1 in learn: {np.sum(dat_learn.y==1)}
    # 1 in cv: {np.sum(dat_cv.y==1)}
    """)
    lind[learn_index] += 1
    cind[cv_index] += 1
print(lind)
print(cind)


    len(dat_learn): 133
    len(dat_cv): 67
    # 1 in learn: 33
    # 1 in cv: 17
    

    len(dat_learn): 133
    len(dat_cv): 67
    # 1 in learn: 33
    # 1 in cv: 17
    

    len(dat_learn): 134
    len(dat_cv): 66
    # 1 in learn: 34
    # 1 in cv: 16
    
[2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1

StratifiedKFoldを使うと、データをきれいに分割し、各データが、均等な回数実験に現れるようになる。

# まとめ
- train_test_split: 1回だけ、学習用/検証用の分割(ホールドアウト検証)を実施したいときに使う
- StratifiedShuffleSplit: train_test_split を複数回、実施する時に使う
- StratifiedKFold: KFoldを実施したいときに使う。

StratifiedShuffleSplit/StratifiedKFoldでn_splitsの意味が違うので注意。前者は、n_splitsというより、n_timesとした方が意味が分かりやすい気がする。