# Split training-validation set

The original file is part of the semantic segmentation tutorial project.

Code source: https://github.com/TommyZihao/Label2Everything.git

File Path: labelme2mask/【C】划分训练-测试集.ipynb

Translated and modified by Mingyu Bi.

## Import packages

In [None]:
import os
import shutil
import random
from tqdm import tqdm

## Dataset path

In [None]:
# Root folder of the dataset. Later cells chdir into it and read the
# `img_dir` and `ann_dir` sub-folders relative to it.
Dataset_Path = 'dataset_split'

## View the dataset directory structure

In [None]:
# !pip install seedir

In [None]:
# !pip install emoji

In [None]:
import seedir as sd

In [None]:
# Show the directory tree under Dataset_Path in seedir's emoji style,
# limited to a depth of 1.
sd.seedir(Dataset_Path, style='emoji', depthlimit=1)

## Create folders

In [None]:
# Enter the dataset root so every relative path used below resolves against
# it, then create the two temporary staging folders. os.mkdir raises
# FileExistsError if a folder with that name is already present.
os.chdir(Dataset_Path)
for split_name in ('train', 'val'):
    os.mkdir(split_name)

In [None]:
# Number of entries in the image folder (relative to the dataset root).
len(os.listdir('img_dir'))

In [None]:
# Number of entries in the annotation folder (relative to the dataset root).
len(os.listdir('ann_dir'))

## Delete redundant files automatically generated by the system

### View redundant files to be deleted

In [None]:
# List every path below the current directory named `__MACOSX` (case-insensitive).
!find . -iname '__MACOSX'

In [None]:
# List every path below the current directory named `.DS_Store` (case-insensitive).
!find . -iname '.DS_Store'

In [None]:
# List every path below the current directory named `.ipynb_checkpoints` (case-insensitive).
!find . -iname '.ipynb_checkpoints'

### Delete redundant files

In [None]:
# Delete every `__MACOSX` entry below the current directory.
# -prune stops find from descending into a directory that is about to be removed;
# -print0 / xargs -0 pass paths NUL-separated, so paths containing spaces are
# handled correctly, and the command has no `$name` text for IPython to interpolate.
!find . -iname '__MACOSX' -prune -print0 | xargs -0 rm -rf

In [None]:
# Delete every `.DS_Store` entry below the current directory.
# -print0 / xargs -0 pass paths NUL-separated, so paths containing spaces are
# handled correctly, and the command has no `$name` text for IPython to interpolate.
!find . -iname '.DS_Store' -prune -print0 | xargs -0 rm -rf

In [None]:
# Delete every `.ipynb_checkpoints` entry below the current directory.
# -prune stops find from descending into a directory that is about to be removed;
# -print0 / xargs -0 pass paths NUL-separated, so paths containing spaces are
# handled correctly, and the command has no `$name` text for IPython to interpolate.
!find . -iname '.ipynb_checkpoints' -prune -print0 | xargs -0 rm -rf

### Verify that the extra files have been deleted

In [None]:
# Repeat the search; no output means nothing named `__MACOSX` remains.
!find . -iname '__MACOSX'

In [None]:
# Repeat the search; no output means nothing named `.DS_Store` remains.
!find . -iname '.DS_Store'

In [None]:
# Repeat the search; no output means nothing named `.ipynb_checkpoints` remains.
!find . -iname '.ipynb_checkpoints'

## Split the image folder into a training set and a validation set

In [None]:
# Fraction of the files that goes to the validation set. Despite the name
# "test", the split cell uses it to size `val_files`.
test_frac = 0.2  # Validation set ratio
# Seed Python's `random` module so the later random.shuffle is repeatable.
random.seed(123) # Random number seed for reproduction

In [None]:
# Source folder for the image split; the listing and move cells below read `folder`.
folder = 'img_dir'

In [None]:
# Build the shuffled file list and cut it into validation / training parts.
#
# os.listdir returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted before shuffling; otherwise the fixed random seed would not
# yield the same split on every machine. Only regular files are kept so that a
# sub-directory inside `folder` is never treated as an image.
img_paths = sorted(
    name for name in os.listdir(folder)
    if os.path.isfile(os.path.join(folder, name))
)
random.shuffle(img_paths)  # Random shuffle (in place)

# The first `val_number` shuffled names form the validation set; the rest train.
val_number = int(len(img_paths) * test_frac)  # Number of validation set files
train_files = img_paths[val_number:]          # List of training set file names
val_files = img_paths[:val_number]            # List of validation set file names

print('Total number of dataset files', len(img_paths))
print('Number of training set files', len(train_files))
print('Number of validation set files', len(val_files))

## Move the training set images to the `train` directory

In [None]:
# Relocate every training image from `folder` into the staging `train`
# directory, keeping its file name; tqdm shows progress.
for file_name in tqdm(train_files):
    shutil.move(os.path.join(folder, file_name),
                os.path.join('train', file_name))

## Move the validation set images to the `val` directory

In [None]:
# Relocate every validation image from `folder` into the staging `val`
# directory, keeping its file name; tqdm shows progress.
for file_name in tqdm(val_files):
    shutil.move(os.path.join(folder, file_name),
                os.path.join('val', file_name))

In [None]:
# Sanity check: the combined entry count of the two staging folders is
# expected to equal the total printed by the split cell.
len(os.listdir('train')) + len(os.listdir('val'))

## Move `train` and `val` into `img_dir`

In [None]:
# Move the two staging folders into the image folder, giving
# img_dir/train and img_dir/val.
shutil.move('train', 'img_dir/train')
shutil.move('val', 'img_dir/val')

## Split the annotation folder into a training set and a validation set

In [None]:
# Switch the source folder to the annotations. The move loops below reuse
# train_files / val_files from the image split, so each annotation lands in
# the same subset as its image.
folder = 'ann_dir'

In [None]:
# Create fresh staging folders for the annotation split. os.mkdir raises
# FileExistsError if `train` or `val` is still present in the dataset root.
for split_name in ('train', 'val'):
    os.mkdir(split_name)

## Move the training set annotations to the `train` directory

In [None]:
# Move the annotation that belongs to each training image into the staging
# `train` directory. The annotation shares the image's stem and has a .png
# extension.
for each in tqdm(train_files):
    # splitext removes only the final extension, so a stem that itself contains
    # dots (e.g. 'scan.v2.jpg') maps to 'scan.v2.png' instead of 'scan.png'.
    mask_name = os.path.splitext(each)[0] + '.png'
    src_path = os.path.join(folder, mask_name)
    dst_path = os.path.join('train', mask_name)
    shutil.move(src_path, dst_path)

## Move the validation set annotations to the `val` directory

In [None]:
# Move the annotation that belongs to each validation image into the staging
# `val` directory. The annotation shares the image's stem and has a .png
# extension.
for each in tqdm(val_files):
    # splitext removes only the final extension, so a stem that itself contains
    # dots (e.g. 'scan.v2.jpg') maps to 'scan.v2.png' instead of 'scan.png'.
    mask_name = os.path.splitext(each)[0] + '.png'
    src_path = os.path.join(folder, mask_name)
    dst_path = os.path.join('val', mask_name)
    shutil.move(src_path, dst_path)

In [None]:
# Sanity check: the combined entry count of the two staging folders is
# expected to match the number of images that were split.
len(os.listdir('train')) + len(os.listdir('val'))

## Move `train` and `val` into `ann_dir`

In [None]:
# Move the two staging folders into the annotation folder, giving
# ann_dir/train and ann_dir/val.
shutil.move('train', 'ann_dir/train')
shutil.move('val', 'ann_dir/val')

## Delete redundant files automatically generated by the system

In [None]:
# Step back up one level, out of the dataset folder entered by the earlier
# os.chdir(Dataset_Path).
os.chdir('../')

### View redundant files to be deleted

In [None]:
# List every path below the (new) current directory named `__MACOSX` (case-insensitive).
!find . -iname '__MACOSX'

In [None]:
# List every path below the (new) current directory named `.DS_Store` (case-insensitive).
!find . -iname '.DS_Store'

In [None]:
# List every path below the (new) current directory named `.ipynb_checkpoints` (case-insensitive).
!find . -iname '.ipynb_checkpoints'

### Delete redundant files

In [None]:
# Delete every `__MACOSX` entry below the current directory.
# -prune stops find from descending into a directory that is about to be removed;
# -print0 / xargs -0 pass paths NUL-separated, so paths containing spaces are
# handled correctly, and the command has no `$name` text for IPython to interpolate.
!find . -iname '__MACOSX' -prune -print0 | xargs -0 rm -rf

In [None]:
# Delete every `.DS_Store` entry below the current directory.
# -print0 / xargs -0 pass paths NUL-separated, so paths containing spaces are
# handled correctly, and the command has no `$name` text for IPython to interpolate.
!find . -iname '.DS_Store' -prune -print0 | xargs -0 rm -rf

In [None]:
# Delete every `.ipynb_checkpoints` entry below the current directory.
# -prune stops find from descending into a directory that is about to be removed;
# -print0 / xargs -0 pass paths NUL-separated, so paths containing spaces are
# handled correctly, and the command has no `$name` text for IPython to interpolate.
!find . -iname '.ipynb_checkpoints' -prune -print0 | xargs -0 rm -rf

### Verify that the extra files have been deleted

In [None]:
# Repeat the search; no output means nothing named `__MACOSX` remains.
!find . -iname '__MACOSX'

In [None]:
# Repeat the search; no output means nothing named `.DS_Store` remains.
!find . -iname '.DS_Store'

In [None]:
# Repeat the search; no output means nothing named `.ipynb_checkpoints` remains.
!find . -iname '.ipynb_checkpoints'