# 【範例】學習使用 sklearn 中的 train_test_split, K fold 等套件，進行資料的切分

In [1]:
from sklearn.model_selection import train_test_split, KFold
import numpy as np

In [2]:
x = np.arange(50).reshape(10,5)
y = np.zeros(10)
y[:5] = 1
print('shape of x : ',x.shape)
print('shape of y : ',y.shape)

shape of x :  (10, 5)
shape of y :  (10,)


In [3]:
print('shape of x : ',x.shape)
print(x)
print(' ')
print('shape of y : ',y.shape)
print(y)

shape of x :  (10, 5)
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]
 [25 26 27 28 29]
 [30 31 32 33 34]
 [35 36 37 38 39]
 [40 41 42 43 44]
 [45 46 47 48 49]]
 
shape of y :  (10,)
[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]


# 使用 train_test_split 函數進行切分

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [6]:
x_train

array([[35, 36, 37, 38, 39],
       [10, 11, 12, 13, 14],
       [45, 46, 47, 48, 49],
       [20, 21, 22, 23, 24],
       [15, 16, 17, 18, 19],
       [30, 31, 32, 33, 34]])

In [7]:
y_train

array([0., 1., 0., 1., 1., 0.])

# 使用 K-fold Cross-validation 來切分資料

In [8]:
kf = KFold(n_splits = 5)
i = 0
for train_index, test_index in kf.split(x):
    i += 1
    x_train, y_train = x[train_index], y[train_index]
    x_test, y_test = x[test_index],y[test_index]
    print('FOLD {}'.format(i))
    print('x_test',x_test)
    print('y_test',y_test)
    print('x_train',x_train)
    print('y_train',y_train)
    print('-'*30)

FOLD 1
x_test [[0 1 2 3 4]
 [5 6 7 8 9]]
y_test [1. 1.]
x_train [[10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]
 [25 26 27 28 29]
 [30 31 32 33 34]
 [35 36 37 38 39]
 [40 41 42 43 44]
 [45 46 47 48 49]]
y_train [1. 1. 1. 0. 0. 0. 0. 0.]
------------------------------
FOLD 2
x_test [[10 11 12 13 14]
 [15 16 17 18 19]]
y_test [1. 1.]
x_train [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [20 21 22 23 24]
 [25 26 27 28 29]
 [30 31 32 33 34]
 [35 36 37 38 39]
 [40 41 42 43 44]
 [45 46 47 48 49]]
y_train [1. 1. 1. 0. 0. 0. 0. 0.]
------------------------------
FOLD 3
x_test [[20 21 22 23 24]
 [25 26 27 28 29]]
y_test [1. 0.]
x_train [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [30 31 32 33 34]
 [35 36 37 38 39]
 [40 41 42 43 44]
 [45 46 47 48 49]]
y_train [1. 1. 1. 1. 0. 0. 0. 0.]
------------------------------
FOLD 4
x_test [[30 31 32 33 34]
 [35 36 37 38 39]]
y_test [0. 0.]
x_train [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 2

# 作業
假設我們資料中類別的數量並不均衡，在評估準確率時可能會有所偏頗，試著切分出 y_test 中，0 類別與 1 類別的數量是一樣的 (亦即 y_test 的類別是均衡的)

In [14]:
X = np.arange(1000).reshape(200, 5)
Y = np.zeros(200)
Y[:40] = 1

In [15]:
Y

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [16]:
# train_test_split + shuffle = True
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42,shuffle = True)

In [18]:
print('y_test',Y_test)
print('y_train',Y_train)

y_test [0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0.]
y_train [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0.]


In [20]:
# Kfold + shuffle = True
kf = KFold(n_splits = 5, shuffle = True)
i = 0
for train_index, test_index in kf.split(X):
    i +=1 
    train_X, train_Y = X[train_index], Y[train_index]
    test_X, test_Y = X[test_index], Y[test_index]
    print("FOLD {}: ".format(i))
    print("Y_test: ", Y_test)
    print("-"*30)

FOLD 1: 
Y_test:  [0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0.]
------------------------------
FOLD 2: 
Y_test:  [0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0.]
------------------------------
FOLD 3: 
Y_test:  [0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0.]
------------------------------
FOLD 4: 
Y_test:  [0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 0.]
---------------------------

使用shaffle已經可以蠻平均的分配了

接下來嘗試拆開資料分組後合併

In [28]:
# train_test_split + 分組處理
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X[:40], Y[:40], test_size=0.33, random_state=42)
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X[40:], Y[40:], test_size=0.33, random_state=42)

In [34]:
X_train = np.vstack((X_train1,X_train2))
Y_train = np.hstack((Y_train1,Y_train2))
X_test = np.vstack((X_test1,X_test2))
Y_test = np.hstack((Y_test1,Y_test2))

In [36]:
print('Y_test : ',Y_test)
print('Y_train : ',Y_train)

Y_test :  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Y_train :  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [37]:
# train_test_split + 分組處理
y_1_index, y_0_index = np.where(Y==1)[0],np.where(Y==0)[0] ## 選出 y 等於 1 的 index 與 y 等於 0 的 index
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X[y_1_index], Y[y_1_index], test_size=0.33, random_state=42)
X_train0, X_test0, Y_train0, Y_test0 = train_test_split(X[y_0_index], Y[y_0_index], test_size=0.33, random_state=42)

In [45]:
#不同的np合併法
X_train = np.concatenate((X_train1,X_train2),axis = 0)
Y_train = np.concatenate((Y_train1,Y_train2),axis = 0)
X_test = np.concatenate((X_test1,X_test2))#axis默認為0
Y_test = np.concatenate((Y_test1,Y_test2))

In [46]:
print('Y_test : ',Y_test)
print('Y_train : ',Y_train)

Y_test :  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Y_train :  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
