## 作業目標：運用scikit-learn API 實現K-fold分割資料

---

### 讀入資料

In [1]:
import pandas as pd
dataset = pd.read_csv(r'Social_Network_Ads.csv')

In [2]:
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0
...,...,...,...,...,...
395,15691863,Female,46.0,41000.0,1
396,15706071,Male,51.0,23000.0,1
397,15654296,Female,50.0,20000.0,1
398,15755018,Male,36.0,33000.0,0


### 取出訓練特徵與標註

In [3]:
#X訓練 Y驗證
X = dataset[['User ID', 'Gender', 'Age', 'EstimatedSalary']].values
Y = dataset['Purchased'].values

---

In [4]:
import numpy as np
from sklearn.model_selection import KFold

### 將訓練資料按照順序切割成10等分

In [10]:
len(dataset)

400

In [11]:
#shuffle在每次划分时，是否进行洗牌，shuffle=False其效果等同于random_state等于整数，每次划分的结果相同
kf = KFold(n_splits=10)
kf.get_n_splits(X)

print(kf)


KFold(n_splits=10, random_state=None, shuffle=False)


### 將訓練資料隨機切割成10等分

In [12]:
#shuffle在每次划分时，是否进行洗牌，shuffle=True每次划分的结果都不一样，表示经过洗牌，随机取样的
kf = KFold(n_splits=10, shuffle=True)
kf.get_n_splits(X)

print(kf)

KFold(n_splits=10, random_state=None, shuffle=True)


---

### 取出 切割資料對應位置

In [13]:
train_split = kf.split(X)
next(train_split)

(array([  0,   1,   2,   3,   4,   5,   6,   7,  10,  11,  12,  13,  14,
         15,  16,  17,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
         29,  30,  31,  32,  34,  35,  36,  37,  38,  39,  40,  41,  42,
         43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         57,  58,  59,  60,  61,  62,  63,  64,  66,  68,  69,  70,  71,
         72,  74,  75,  77,  78,  79,  81,  82,  83,  84,  87,  88,  89,
         90,  91,  92,  94,  95,  96,  97,  99, 100, 101, 102, 103, 105,
        106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 117, 118, 119,
        120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 133,
        135, 136, 137, 138, 140, 141, 142, 143, 144, 145, 146, 147, 148,
        149, 150, 151, 153, 154, 156, 158, 159, 160, 161, 162, 164, 165,
        166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 179,
        180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192,
        193, 194, 195, 196, 197, 198, 199, 200, 201

### Or

In [14]:
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

TRAIN: [  0   2   3   4   6   7   8   9  10  11  12  13  14  15  16  17  18  19
  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37
  38  39  40  41  42  43  45  46  48  49  50  51  52  53  54  55  56  57
  58  59  60  61  62  63  64  65  66  68  69  70  71  72  73  74  75  76
  77  79  80  81  82  83  84  85  86  87  89  90  91  92  93  94  95  96
  97  98  99 100 101 102 103 104 105 106 107 108 109 110 112 113 114 115
 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133
 134 136 139 140 141 142 143 145 146 147 148 152 153 154 155 156 157 158
 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 176 177
 178 179 180 182 183 184 185 186 187 188 189 190 192 193 195 196 197 198
 199 201 202 203 204 205 206 207 209 210 211 212 214 215 216 217 218 219
 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237
 238 239 240 241 242 243 244 245 246 248 249 250 251 252 253 255 256 257
 258 259 260 261 262 263 264 265 266 267 268

### 取出切割資料：trainset / testset 特徵(x_train/x_test)/標註(y_train/y_test)

In [18]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

In [21]:
X_train

array([[15624510, 'Male', 19.0, 19000.0],
       [15810944, 'Male', 35.0, 20000.0],
       [15668575, 'Female', 26.0, 43000.0],
       ...,
       [15654296, 'Female', 50.0, 20000.0],
       [15755018, 'Male', 36.0, 33000.0],
       [15594041, 'Female', 49.0, 36000.0]], dtype=object)

In [22]:
X_test

array([[15598044, 'Female', 27.0, 84000.0],
       [15581198, 'Male', 31.0, 74000.0],
       [15591915, 'Female', 33.0, 51000.0],
       [15792008, 'Male', 30.0, 15000.0],
       [15649136, 'Female', 24.0, 55000.0],
       [15595324, 'Female', 31.0, 68000.0],
       [15782530, 'Female', 33.0, 113000.0],
       [15631912, 'Female', 28.0, 85000.0],
       [15582492, 'Male', 28.0, 123000.0],
       [15739160, 'Male', 42.0, 80000.0],
       [15811613, 'Female', 36.0, 75000.0],
       [15622833, 'Female', 34.0, 25000.0],
       [15767871, 'Male', 20.0, 74000.0],
       [15578738, 'Female', 18.0, 86000.0],
       [15587013, 'Male', 21.0, 88000.0],
       [15603319, 'Male', 33.0, 43000.0],
       [15708228, 'Male', 35.0, 22000.0],
       [15708196, 'Male', 49.0, 74000.0],
       [15809347, 'Female', 41.0, 71000.0],
       [15766609, 'Female', 47.0, 47000.0],
       [15732987, 'Male', 59.0, 143000.0],
       [15593715, 'Male', 60.0, 102000.0],
       [15795224, 'Female', 36.0, 126000.0],
     

In [23]:
Y_train

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,

In [24]:
Y_test

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0], dtype=int64)