Skip to content

Commit

Permalink
Refine ppdet download (open-mmlab#3628)
Browse files Browse the repository at this point in the history
* refine download sh to py

* update QUICK_STARTED
  • Loading branch information
heavengate committed Oct 17, 2019
1 parent db627af commit 8116ac6
Show file tree
Hide file tree
Showing 11 changed files with 131 additions and 80 deletions.
20 changes: 0 additions & 20 deletions dataset/coco/download.sh

This file was deleted.

25 changes: 25 additions & 0 deletions dataset/coco/download_coco.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import sys
import os.path as osp
import logging

from ppdet.utils.download import download_dataset

logging.basicConfig(level=logging.INFO)

# Download the COCO dataset into the directory containing this script,
# regardless of the current working directory. Use __file__ rather than
# sys.argv[0]: argv[0] names the interpreter's entry script and is wrong
# if this module is ever imported instead of executed directly.
download_path = osp.dirname(osp.realpath(__file__))
download_dataset(download_path, 'coco')
10 changes: 0 additions & 10 deletions dataset/fruit/download.sh

This file was deleted.

25 changes: 25 additions & 0 deletions dataset/fruit/download_fruit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import sys
import os.path as osp
import logging

from ppdet.utils.download import download_dataset

logging.basicConfig(level=logging.INFO)

# Download the fruit demo dataset into the directory containing this
# script, regardless of the current working directory. Use __file__
# rather than sys.argv[0]: argv[0] names the interpreter's entry script
# and is wrong if this module is ever imported instead of executed.
download_path = osp.dirname(osp.realpath(__file__))
download_dataset(download_path, 'fruit')
16 changes: 0 additions & 16 deletions dataset/voc/download.sh

This file was deleted.

25 changes: 25 additions & 0 deletions dataset/voc/download_voc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import sys
import os.path as osp
import logging

from ppdet.utils.download import download_dataset

logging.basicConfig(level=logging.INFO)

# Download the Pascal VOC dataset into the directory containing this
# script, regardless of the current working directory. Use __file__
# rather than sys.argv[0]: argv[0] names the interpreter's entry script
# and is wrong if this module is ever imported instead of executed.
download_path = osp.dirname(osp.realpath(__file__))
download_dataset(download_path, 'voc')
8 changes: 4 additions & 4 deletions docs/INSTALL.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,15 +110,15 @@ On the other hand, to download the datasets, run the following commands:
- COCO

```
cd dataset/coco
./download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/coco/download_coco.py
```

- Pascal VOC

```
cd dataset/voc
./download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/voc/download_voc.py
```

**Download datasets automatically:**
Expand Down
8 changes: 4 additions & 4 deletions docs/INSTALL_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,15 +109,15 @@ ln -sf <path/to/voc> <path/to/paddle_detection>/dataset/voc
- COCO

```
cd dataset/coco
./download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/coco/download_coco.py
```

- Pascal VOC

```
cd dataset/voc
./download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/voc/download_voc.py
```

**自动下载数据集:**
Expand Down
6 changes: 3 additions & 3 deletions docs/QUICK_STARTED.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ This tutorial fine-tunes a tiny dataset by pretrained detection model for users

## Data Preparation

Dataset refers to [Kaggle](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection), which contains 240 images in train dataset and 60 images in test dataset. Data categories are apple, orange and banana. Download [here](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar) and uncompress the dataset after download, script for data preparation is located at [download.sh](../dataset/fruit/download.sh). Command is as follows:
Dataset refers to [Kaggle](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection), which contains 240 images in train dataset and 60 images in test dataset. Data categories are apple, orange and banana. Download [here](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar) and uncompress the dataset after download, script for data preparation is located at [download_fruit.py](../dataset/fruit/download_fruit.py). Command is as follows:

```bash
cd dataset/fruit
sh download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/fruit/download_fruit.py
```

- **Note: before starting, run the following command and specify the GPU**
Expand Down
6 changes: 3 additions & 3 deletions docs/QUICK_STARTED_cn.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@

## 数据准备

数据集参考[Kaggle数据集](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection),其中训练数据集240张图片,测试数据集60张图片,数据类别为3类:苹果,橘子,香蕉。[下载链接](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar)。数据下载后分别解压即可, 数据准备脚本位于[download.sh](../dataset/fruit/download.sh)。下载数据方式如下:
数据集参考[Kaggle数据集](https://www.kaggle.com/mbkinaci/fruit-images-for-object-detection),其中训练数据集240张图片,测试数据集60张图片,数据类别为3类:苹果,橘子,香蕉。[下载链接](https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar)。数据下载后分别解压即可, 数据准备脚本位于[download_fruit.py](../dataset/fruit/download_fruit.py)。下载数据方式如下:

```bash
cd dataset/fruit
sh download.sh
export PYTHONPATH=$PYTHONPATH:.
python dataset/fruit/download_fruit.py
```

- **注:在开始前,运行如下命令并指定GPU**
Expand Down
62 changes: 42 additions & 20 deletions ppdet/utils/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/weights")
DATASET_HOME = osp.expanduser("~/.cache/paddle/dataset")

# dict of {dataset_name: (downalod_info, sub_dirs)}
# dict of {dataset_name: (download_info, sub_dirs)}
# download info: (url, md5sum)
DATASETS = {
'coco': ([
Expand All @@ -60,6 +60,11 @@
'http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar',
'b6e924de25625d8de591ea690078ad9f', ),
], ["VOCdevkit/VOC_all"]),
'fruit': ([
(
'https://dataset.bj.bcebos.com/PaddleDetection_demo/fruit-detection.tar',
'374554a7633b1b68d6a5fbb7c061b8ba', ),
], ["fruit-detection"]),
}

DOWNLOAD_RETRY_LIMIT = 3
Expand Down Expand Up @@ -103,25 +108,7 @@ def get_dataset_path(path, annotation, image_dir):

# voc should merge dir and create list after download
if name == 'voc':
logger.info("Download voc dataset successed, merge "
"VOC2007 and VOC2012 to VOC_all...")
output_dir = osp.join(data_dir, dataset[1][0])
devkit_dir = "/".join(output_dir.split('/')[:-1])
years = ['2007', '2012']
# merge dir in output_tmp_dir at first, move to
# output_dir after merge sucessed.
output_tmp_dir = osp.join(data_dir, 'tmp')
if osp.isdir(output_tmp_dir):
shutil.rmtree(output_tmp_dir)
# NOTE(dengkaipeng): since using auto download VOC
# dataset, VOC default label list should be used,
# do not generate label_list.txt here. For default
# label, see ../data/source/voc_loader.py
merge_and_create_list(devkit_dir, years, output_tmp_dir)
shutil.move(output_tmp_dir, output_dir)
# remove source directory VOC2007 and VOC2012
shutil.rmtree(osp.join(devkit_dir, "VOC2007"))
shutil.rmtree(osp.join(devkit_dir, "VOC2012"))
_merge_voc_dir(data_dir, dataset[1][0])
return data_dir

# not match any dataset in DATASETS
Expand All @@ -130,6 +117,28 @@ def get_dataset_path(path, annotation, image_dir):
"'voc' and 'coco' currently".format(path, osp.split(path)[-1]))


def _merge_voc_dir(data_dir, output_subdir):
    """Merge the extracted VOC2007 and VOC2012 trees into one directory.

    Args:
        data_dir (str): root directory the VOC archives were extracted to.
        output_subdir (str): path of the merged output directory relative
            to `data_dir` (e.g. "VOCdevkit/VOC_all").
    """
    logger.info("Download voc dataset succeeded, merge "
                "VOC2007 and VOC2012 to VOC_all...")
    output_dir = osp.join(data_dir, output_subdir)
    # Parent of the merged output dir, i.e. the VOCdevkit directory.
    # osp.dirname is the portable equivalent of splitting on '/'.
    devkit_dir = osp.dirname(output_dir)
    years = ['2007', '2012']
    # Merge into a temporary directory first and move it into place only
    # after the merge succeeds, so a failed run leaves no partial output.
    output_tmp_dir = osp.join(data_dir, 'tmp')
    if osp.isdir(output_tmp_dir):
        shutil.rmtree(output_tmp_dir)
    # NOTE: since using auto download VOC
    # dataset, VOC default label list should be used,
    # do not generate label_list.txt here. For default
    # label, see ../data/source/voc_loader.py
    merge_and_create_list(devkit_dir, years, output_tmp_dir)
    shutil.move(output_tmp_dir, output_dir)
    # remove source directories VOC2007 and VOC2012
    shutil.rmtree(osp.join(devkit_dir, "VOC2007"))
    shutil.rmtree(osp.join(devkit_dir, "VOC2012"))


def map_path(url, root_dir):
# parse path after download to decompress under root_dir
fname = url.split('/')[-1]
Expand Down Expand Up @@ -173,6 +182,19 @@ def get_path(url, root_dir, md5sum=None):
return fullpath


def download_dataset(path, dataset=None):
    """Download and decompress a known dataset under `path`.

    Args:
        path (str): directory to download and extract the dataset into.
        dataset (str): dataset name; must be a key of DATASETS
            ('coco', 'voc', 'fruit'). An unknown name is logged as an
            error and the function returns without downloading.
    """
    if dataset not in DATASETS:
        logger.error("Unknown dataset {}, it should be "
                     "{}".format(dataset, DATASETS.keys()))
        return
    # Each entry is a (url, md5sum) pair; get_path downloads, verifies
    # the checksum and decompresses under `path`.
    for url, md5sum in DATASETS[dataset][0]:
        get_path(url, path, md5sum)
    # VOC ships as separate VOC2007/VOC2012 trees that must be merged
    # into a single VOC_all directory after download.
    if dataset == 'voc':
        _merge_voc_dir(path, DATASETS[dataset][1][0])
    logger.info("Download dataset {} finished.".format(dataset))


def _dataset_exists(path, annotation, image_dir):
"""
Check if user define dataset exists
Expand Down

0 comments on commit 8116ac6

Please sign in to comment.