# Download data for FGVC-Aircraft Benchmark

http://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/

In [1]:
# Create the download directory (-p: no error if it already exists).
!mkdir -p data_fgvc

In [2]:
# Download the FGVC-Aircraft 2013b archive to data_fgvc/ (-O sets the output file).
# NOTE: the archive is about 2.6 GB, and re-running this cell downloads it again.
!wget -O data_fgvc/fgvc-aircraft-2013b.tar.gz http://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/archives/fgvc-aircraft-2013b.tar.gz

--2018-09-04 08:53:54--  http://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/archives/fgvc-aircraft-2013b.tar.gz
Resolving www.robots.ox.ac.uk (www.robots.ox.ac.uk)... 129.67.94.2
Connecting to www.robots.ox.ac.uk (www.robots.ox.ac.uk)|129.67.94.2|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2753340328 (2.6G) [application/x-gzip]
Saving to: 'data_fgvc/fgvc-aircraft-2013b.tar.gz'


2018-09-04 11:08:24 (333 KB/s) - 'data_fgvc/fgvc-aircraft-2013b.tar.gz' saved [2753340328/2753340328]



In [4]:
# Extract the archive into data_fgvc/ and keep the verbose file listing in data_fgvc/tarlog.txt.
!tar -xzvf data_fgvc/fgvc-aircraft-2013b.tar.gz -C data_fgvc > data_fgvc/tarlog.txt

### Organize as folder

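The original split is approximately (train, valid, test) = (3333, 3333, 3333).
For our purposes, that validation set is larger than needed and the training set is too small.

So I re-split the combined train+val images (6667 in total) per class, 90% / 10%, which gives (train, valid, test) = (5967, 700, 3333).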

In [3]:
# Remove any previous split directories so the links can be rebuilt from scratch.
!rm -rf data_fgvc/train data_fgvc/valid data_fgvc/test

In [4]:
# Create one empty directory per split.
!mkdir -p data_fgvc/train data_fgvc/valid data_fgvc/test

In [5]:
# Map each variant name to its 0-based line index in variants.txt.
with open("data_fgvc/fgvc-aircraft-2013b/data/variants.txt") as f:
    variants_to_id_dict = {}
    for line_no, raw_line in enumerate(f):
        variants_to_id_dict[raw_line.rstrip("\n")] = line_no

In [6]:
# Number of distinct variant names read from variants.txt.
len(variants_to_id_dict)

100

In [7]:
# Spot-check the IDs assigned to two variants.
tuple(variants_to_id_dict[name] for name in ('707-320', '727-200'))

(0, 1)

In [8]:
# Parse each line into [image id, variant name]; only the first space splits,
# so everything after it stays in the variant name.
with open("data_fgvc/fgvc-aircraft-2013b/data/images_variant_trainval.txt") as f:
    trainval_file_var_list = []
    for line in f:
        trainval_file_var_list.append(line.rstrip("\n").split(" ", 1))

In [9]:
# Number of [image id, variant name] entries in the combined train+val list.
len(trainval_file_var_list)

6667

In [10]:
# Preview the first few parsed entries.
trainval_file_var_list[:5]

[['1025794', '707-320'],
 ['1340192', '707-320'],
 ['0056978', '707-320'],
 ['0698580', '707-320'],
 ['0450014', '707-320']]

In [27]:
# Will map class ID (as a string) -> list of [image id, variant name] entries.
trainval_file_dict = {}

In [28]:
# Group the train+val entries by class ID (stringified).
# The dict is (re)created here so that re-running this cell starts from an
# empty mapping instead of appending every entry a second time.
trainval_file_dict = {}
for fpath, cat in trainval_file_var_list:
    class_key = str(variants_to_id_dict[cat])
    trainval_file_dict.setdefault(class_key, []).append([fpath, cat])

In [29]:
# Number of train+val images in the first two classes.
tuple(len(trainval_file_dict[key]) for key in ('0', '1'))

(67, 67)

In [30]:
# Number of classes that have at least one train+val image.
len(trainval_file_dict)

100

In [31]:
# Preview the first few entries of class '0'.
trainval_file_dict['0'][:5]

[['1025794', '707-320'],
 ['1340192', '707-320'],
 ['0056978', '707-320'],
 ['0698580', '707-320'],
 ['0450014', '707-320']]

In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Accumulators for the per-class split results.
train_file_var_list, val_file_var_list = [], []

In [37]:
# Split every class separately so each one contributes ~10% of its images to
# validation. The seed varies per class (42 + class ID) but is fixed, so the
# split is reproducible.
# The result lists are (re)created here so that re-running this cell does not
# extend them with a second copy of the split.
train_file_var_list = []
val_file_var_list = []
for catint in range(len(variants_to_id_dict)):
    train_file_var_list_one, val_file_var_list_one = train_test_split(
        trainval_file_dict[str(catint)],
        test_size=0.1,
        random_state=42 + catint,
    )
    train_file_var_list.extend(train_file_var_list_one)
    val_file_var_list.extend(val_file_var_list_one)

In [38]:
# Total number of training entries after the per-class split.
len(train_file_var_list)

5967

In [39]:
# Total number of validation entries after the per-class split.
len(val_file_var_list)

700

In [41]:
# Scratch mapping used only to check the validation split:
# class ID (as a string) -> list of [image id, variant name] entries.
temp_dict = {}

In [42]:
# Group the validation entries by class ID to check the per-class counts.
# The dict is (re)created here so that re-running this cell does not double
# the counts.
temp_dict = {}
for fpath, cat in val_file_var_list:
    class_key = str(variants_to_id_dict[cat])
    temp_dict.setdefault(class_key, []).append([fpath, cat])

In [43]:
# Validation images per class, in insertion order of the classes.
[len(entries) for entries in temp_dict.values()]

[7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7,
 7]

In [44]:
# Preview the first few training entries after the split.
train_file_var_list[:5]

[['1042824', '707-320'],
 ['0869692', '707-320'],
 ['0582363', '707-320'],
 ['0577855', '707-320'],
 ['1518652', '707-320']]

In [45]:
import tqdm
import os

In [46]:
### data_type: "train", "valid", "test"
def hard_links(file_var_list, data_type):
    """Hard-link each listed image into ``data_fgvc/<data_type>/<class id>/``.

    Args:
        file_var_list: iterable of ``[image_id, variant_name]`` pairs.
        data_type: name of the split directory ("train", "valid" or "test").

    Raises:
        KeyError: if a variant name is not a key of ``variants_to_id_dict``.
        OSError: if a source image is missing or a link cannot be created.
    """
    for file, variants in tqdm.tqdm(file_var_list):
        filename = "{}.jpg".format(file)
        dirid = variants_to_id_dict[variants]
        from_path = "data_fgvc/fgvc-aircraft-2013b/data/images/{}".format(filename)
        to_dir = "data_fgvc/{}/{}".format(data_type, dirid)
        to_path = "{}/{}".format(to_dir, filename)
        # makedirs also creates the split directory when it is missing and
        # does not fail when the class directory already exists.
        os.makedirs(to_dir, exist_ok=True)
        # A link left over from a previous run would make os.link raise
        # FileExistsError; skip it so the cell can be re-executed.
        if os.path.lexists(to_path):
            continue
        os.link(from_path, to_path)

In [47]:
# Populate data_fgvc/train/<class id>/ with hard links to the training images.
hard_links(train_file_var_list, "train")

100%|██████████| 5967/5967 [00:00<00:00, 77119.66it/s]


In [48]:
# Number of training entries that were passed to hard_links.
len(train_file_var_list)

5967

In [49]:
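# Not used: the official validation list is replaced by the custom per-class
# split above, which already produced val_file_var_list.
#with open("data_fgvc/fgvc-aircraft-2013b/data/images_variant_val.txt") as f:
#    val_file_var_list = [line.rstrip("\n").split(" ", 1) for line in f]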

In [50]:
# Populate data_fgvc/valid/<class id>/ with hard links to the validation images.
hard_links(val_file_var_list, "valid")

100%|██████████| 700/700 [00:00<00:00, 60927.03it/s]


In [19]:
# Parse the test list into [image id, variant name] entries; only the first
# space splits, so everything after it stays in the variant name.
with open("data_fgvc/fgvc-aircraft-2013b/data/images_variant_test.txt") as f:
    test_file_var_list = []
    for line in f:
        test_file_var_list.append(line.rstrip("\n").split(" ", 1))

In [20]:
# Populate data_fgvc/test/<class id>/ with hard links to the test images.
hard_links(test_file_var_list, "test")

100%|██████████| 3333/3333 [00:00<00:00, 42764.72it/s]
