# 划分训练-测试集

张子豪 2023-2-22 6-9

## 导入工具包

In [1]:
import os
import shutil
import random

from tqdm import tqdm

## 指定数据集路径

In [2]:
# Root folder of the dataset, given relative to the current working directory.
Dataset_Path = 'Watermelon87_Semantic_Seg_Labelme'

## 查看数据集目录结构

In [3]:
# Print the dataset's directory tree, limited to one level, using emoji icons.
import seedir as sd
sd.seedir(Dataset_Path, style='emoji', depthlimit=1)

📁 Watermelon87_Semantic_Seg_Labelme/
├─📁 img_dir/
└─📁 ann_dir/


## 创建文件夹

In [4]:
# Work from inside the dataset root so every later path is relative to it.
os.chdir(Dataset_Path)

# Staging folders for the split. exist_ok=True means a folder left over from
# an interrupted earlier run does not raise FileExistsError.
for split in ('train', 'val'):
    os.makedirs(split, exist_ok=True)

In [5]:
# Number of entries currently in the image folder.
len(os.listdir('img_dir'))

87

In [6]:
# Number of entries currently in the annotation folder.
len(os.listdir('ann_dir'))

87

## 删除系统自动生成的多余文件

### 查看待删除的多余文件

In [7]:
# List every entry named __MACOSX (case-insensitive) under the current directory.
!find . -iname '__MACOSX'

In [8]:
# List every entry named .DS_Store (case-insensitive) under the current directory.
!find . -iname '.DS_Store'

In [9]:
# List every entry named .ipynb_checkpoints (case-insensitive) under the current directory.
!find . -iname '.ipynb_checkpoints'

### 删除多余文件

In [10]:
!for i in `find . -iname '__MACOSX'`; do rm -rf $i;done

In [11]:
!for i in `find . -iname '.DS_Store'`; do rm -rf $i;done

In [12]:
!for i in `find . -iname '.ipynb_checkpoints'`; do rm -rf $i;done

### 验证多余文件已删除

In [13]:
# Search again for __MACOSX entries; empty output means none are left.
!find . -iname '__MACOSX'

In [14]:
# Search again for .DS_Store entries; empty output means none are left.
!find . -iname '.DS_Store'

In [15]:
# Search again for .ipynb_checkpoints entries; empty output means none are left.
!find . -iname '.ipynb_checkpoints'

## 在图像文件夹中，划分训练集和测试集

In [16]:
test_frac = 0.2  # fraction of the files assigned to the test (val) split
random.seed(123) # fixed random seed so the shuffle can be reproduced

In [17]:
# Source folder for the following cells: the images.
folder = 'img_dir'

In [18]:
# os.listdir returns entries in arbitrary order, so sort them first; otherwise
# the seeded shuffle would start from a different ordering on another machine
# or filesystem and the "reproducible" split would differ.
img_paths = sorted(os.listdir(folder))
random.shuffle(img_paths)  # shuffle in place

val_number = int(len(img_paths) * test_frac)  # number of test files (truncated toward zero)
train_files = img_paths[val_number:]          # file names of the training split
val_files = img_paths[:val_number]            # file names of the test split

print('数据集文件总数', len(img_paths))
print('训练集文件个数', len(train_files))
print('测试集文件个数', len(val_files))

数据集文件总数 87
训练集文件个数 70
测试集文件个数 17


## 将训练集图像移动至`train`目录

In [19]:
# Relocate each training image from the source folder into the 'train' staging folder.
for filename in tqdm(train_files):
    shutil.move(os.path.join(folder, filename), os.path.join('train', filename))

100%|██████████| 70/70 [00:00<00:00, 881.71it/s]


## 将测试集图像移动至`val`目录

In [20]:
# Relocate each test image from the source folder into the 'val' staging folder.
for filename in tqdm(val_files):
    shutil.move(os.path.join(folder, filename), os.path.join('val', filename))

100%|██████████| 17/17 [00:00<00:00, 909.07it/s]


In [21]:
# Combined number of entries in the two staging folders.
sum(len(os.listdir(split_dir)) for split_dir in ('train', 'val'))

87

## 将`train`和`val`剪切至`img_dir`

In [22]:
# Move the populated staging folders inside img_dir, giving img_dir/train and img_dir/val.
shutil.move('train', 'img_dir/train')
shutil.move('val', 'img_dir/val')

'img_dir/val'

## 在标注文件夹中，划分训练集和测试集

In [24]:
# Source folder for the following cells: the annotations.
folder = 'ann_dir'

In [25]:
# Staging folders for the annotation split. exist_ok=True means a folder left
# over from an interrupted earlier run does not raise FileExistsError.
for split in ('train', 'val'):
    os.makedirs(split, exist_ok=True)

## 将训练集标注移动至`train`目录

In [26]:
# Move the annotation that belongs to each training image into 'train'.
for each in tqdm(train_files):
    # The annotation file name is the image's base name plus '.png'.
    # os.path.splitext strips only the final extension, so a name containing
    # extra dots (e.g. 'a.b.jpg') maps to 'a.b.png' rather than being
    # truncated at the first dot ('a.png').
    mask_name = os.path.splitext(each)[0] + '.png'
    shutil.move(os.path.join(folder, mask_name), os.path.join('train', mask_name))

100%|██████████| 70/70 [00:00<00:00, 1349.30it/s]


## 将测试集标注移动至`val`目录

In [27]:
# Move the annotation that belongs to each test image into 'val'.
for each in tqdm(val_files):
    # The annotation file name is the image's base name plus '.png'.
    # os.path.splitext strips only the final extension, so a name containing
    # extra dots (e.g. 'a.b.jpg') maps to 'a.b.png' rather than being
    # truncated at the first dot ('a.png').
    mask_name = os.path.splitext(each)[0] + '.png'
    shutil.move(os.path.join(folder, mask_name), os.path.join('val', mask_name))

100%|██████████| 17/17 [00:00<00:00, 1537.04it/s]


In [28]:
# Combined number of entries in the two staging folders.
sum(len(os.listdir(split_dir)) for split_dir in ('train', 'val'))

87

## 将`train`和`val`剪切至`ann_dir`

In [29]:
# Move the populated staging folders inside ann_dir, giving ann_dir/train and ann_dir/val.
shutil.move('train', 'ann_dir/train')
shutil.move('val', 'ann_dir/val')

'ann_dir/val'

## 删除系统自动生成的多余文件

In [49]:
# Go up one directory level, undoing the earlier chdir into the dataset folder.
os.chdir('../')

### 查看待删除的多余文件

In [60]:
# List every entry named __MACOSX (case-insensitive) under the current directory.
!find . -iname '__MACOSX'

In [61]:
# List every entry named .DS_Store (case-insensitive) under the current directory.
!find . -iname '.DS_Store'

In [62]:
# List every entry named .ipynb_checkpoints (case-insensitive) under the current directory.
!find . -iname '.ipynb_checkpoints'

### 删除多余文件

In [63]:
!for i in `find . -iname '__MACOSX'`; do rm -rf $i;done

In [64]:
!for i in `find . -iname '.DS_Store'`; do rm -rf $i;done

In [65]:
!for i in `find . -iname '.ipynb_checkpoints'`; do rm -rf $i;done

### 验证多余文件已删除

In [66]:
# Search again for __MACOSX entries; empty output means none are left.
!find . -iname '__MACOSX'

In [67]:
# Search again for .DS_Store entries; empty output means none are left.
!find . -iname '.DS_Store'

In [68]:
# Search again for .ipynb_checkpoints entries; empty output means none are left.
!find . -iname '.ipynb_checkpoints'

## 得到划分好训练集测试集的完整语义分割数据集