In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [2]:
# Dataset layout: number of label classes and number of cross-validation folds.
NUM_CLASSES = 8
N_FOLD = 5
# NOTE(review): machine-specific absolute paths; adjust when running elsewhere.
# ROOT_PATH holds train.csv, SAVE_PATH receives the per-fold CSV files.
ROOT_PATH = '/home/ryan/Machine_Learning/AI4VN'
SAVE_PATH = '/home/ryan/Machine_Learning/AI4VN/fold'
# Fixed seed so the shuffled splits are identical on every Restart & Run All;
# without it each run would produce (and overwrite) different fold files.
SEED = 42
kf = KFold(n_splits=N_FOLD, shuffle=True, random_state=SEED)

In [3]:
# Read the annotation table; later cells use its 'image_id' and 'label' columns.
train_csv_path = "{}/{}".format(ROOT_PATH, "train.csv")
train_df = pd.read_csv(train_csv_path)

In [4]:
# One empty bucket per class label, keyed "Class_<label>"; a later cell fills
# each bucket with the image ids belonging to that class.
class_dict = {"Class_{}".format(class_id): [] for class_id in range(NUM_CLASSES)}

In [5]:
# Route every image id into the bucket that matches its label, in row order.
for image_name, image_label in zip(train_df['image_id'], train_df['label']):
    bucket_key = "Class_{}".format(image_label)
    class_dict[bucket_key].append(image_name)

# Report how many images ended up in each class bucket.
for class_id in range(NUM_CLASSES):
    class_size = len(class_dict["Class_{}".format(class_id)])
    print("Number image of class {} : {}".format(class_id, class_size))

Number image of class 0 : 6094
Number image of class 1 : 3265
Number image of class 2 : 4064
Number image of class 3 : 3743
Number image of class 4 : 3035
Number image of class 5 : 4375
Number image of class 6 : 3264
Number image of class 7 : 4250


In [6]:
# Per-fold accumulators keyed "Fold_<k>": image ids and their labels, kept as
# parallel lists for the training part and the validation part of each fold.
fold_keys = ["Fold_{}".format(fold_id) for fold_id in range(N_FOLD)]
train_folds = {fold_key: [] for fold_key in fold_keys}
train_label_folds = {fold_key: [] for fold_key in fold_keys}
valid_folds = {fold_key: [] for fold_key in fold_keys}
valid_label_folds = {fold_key: [] for fold_key in fold_keys}

# Split each class on its own and merge the k-th split of every class into
# fold k, so every fold receives a share of every class.
for class_id in range(NUM_CLASSES):
    class_images = class_dict["Class_{}".format(class_id)]
    for fold_id, (train_index, valid_index) in enumerate(kf.split(class_images)):
        fold_key = "Fold_{}".format(fold_id)
        # kf.split yields positions into class_images, not the ids themselves.
        train_folds[fold_key].extend(class_images[pos] for pos in train_index)
        train_label_folds[fold_key].extend([class_id] * len(train_index))
        valid_folds[fold_key].extend(class_images[pos] for pos in valid_index)
        valid_label_folds[fold_key].extend([class_id] * len(valid_index))

In [7]:
# Sanity check: for each fold, print the sizes of the image and label lists so
# mismatched lengths between a split and its labels are visible at a glance.
for fold in range(N_FOLD):
    fold_key = "Fold_{}".format(fold)
    size_report = (
        ("Length fold {} (train set) = {}", train_folds),
        ("Length fold {} (label train set) = {}", train_label_folds),
        ("Length fold {} (valid set) = {}", valid_folds),
        ("Length fold {} (label valid set) = {}", valid_label_folds),
    )
    for template, per_fold in size_report:
        print(template.format(fold, len(per_fold[fold_key])))

Length fold 0 (train set) = 25671
Length fold 0 (label train set) = 25671
Length fold 0 (valid set) = 6419
Length fold 0 (label valid set) = 6419
Length fold 1 (train set) = 25671
Length fold 1 (label train set) = 25671
Length fold 1 (valid set) = 6419
Length fold 1 (label valid set) = 6419
Length fold 2 (train set) = 25671
Length fold 2 (label train set) = 25671
Length fold 2 (valid set) = 6419
Length fold 2 (label valid set) = 6419
Length fold 3 (train set) = 25672
Length fold 3 (label train set) = 25672
Length fold 3 (valid set) = 6418
Length fold 3 (label valid set) = 6418
Length fold 4 (train set) = 25675
Length fold 4 (label train set) = 25675
Length fold 4 (valid set) = 6415
Length fold 4 (label valid set) = 6415


In [8]:
# Write one training CSV and one validation CSV per fold into SAVE_PATH, each
# with the columns 'image_id' and 'label'.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(SAVE_PATH, exist_ok=True)

for fold in range(N_FOLD):
    fold_name = "Fold_" + str(fold)
    splits = (
        ("train", train_folds[fold_name], train_label_folds[fold_name]),
        ("valid", valid_folds[fold_name], valid_label_folds[fold_name]),
    )
    for split_name, image_ids, labels in splits:
        split_df = pd.DataFrame({'image_id': image_ids, 'label': labels})
        csv_path = os.path.join(SAVE_PATH, "{}_fold_{}.csv".format(split_name, fold))
        # index=False keeps the file to the two data columns only.
        split_df.to_csv(csv_path, index=False)