In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
ds_path = '/home/ariya/workspace/datasets/animals10-dvc'
ds_images_path = os.path.join(ds_path, 'images')
os.listdir(ds_images_path)

['butterfly',
 'cat',
 'chicken',
 'cow',
 'dog',
 'elephant',
 'horse',
 'sheep',
 'spider',
 'squirrel']

In [3]:
# cols: image_name, abs_path, split, GT
classes = np.array(['butterfly', 'cat', 'chicken', 'cow', 'dog', 
                    'elephant', 'horse', 'sheep', 'spider', 'squirrel'])

In [4]:
# example of how it works
[*('cat' == classes).astype(int)]

[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]

In [5]:
data = []
exts = ('.jpeg', '.jpg', '.png') # checked in check_ds_info.ipynb
for abs_path, subdirs, files in os.walk(ds_images_path):
    for file in tqdm(files):
        if file.endswith(exts):
            gt_class = os.path.split(abs_path)[-1]
            one_hot_gt = (gt_class == classes).astype(int)
            row = [file, os.path.join(abs_path, file), *one_hot_gt]
            data.append(row)
annot_df = pd.DataFrame(data, columns=['image_name', 'abs_path', *classes])

0it [00:00, ?it/s]
100%|██████████| 2112/2112 [00:00<00:00, 117825.68it/s]
100%|██████████| 1668/1668 [00:00<00:00, 153561.30it/s]
100%|██████████| 3098/3098 [00:00<00:00, 178819.98it/s]
100%|██████████| 1866/1866 [00:00<00:00, 180564.57it/s]
100%|██████████| 4863/4863 [00:00<00:00, 236463.87it/s]
100%|██████████| 1446/1446 [00:00<00:00, 209143.89it/s]
100%|██████████| 2623/2623 [00:00<00:00, 238389.15it/s]
100%|██████████| 1820/1820 [00:00<00:00, 204990.29it/s]
100%|██████████| 4821/4821 [00:00<00:00, 204435.75it/s]
100%|██████████| 1862/1862 [00:00<00:00, 236983.59it/s]


In [6]:
annot_df

Unnamed: 0,image_name,abs_path,butterfly,cat,chicken,cow,dog,elephant,horse,sheep,spider,squirrel
0,OIP--04ndbWy7I04gsPgu9qOeQHaHs.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,1,0,0,0,0,0,0,0,0,0
1,OIP--3xxJkezSkNzxrQaxlhv-QHaHa.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,1,0,0,0,0,0,0,0,0,0
2,OIP--4zW12JF-JpwKcDtEK7thgHaLC.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,1,0,0,0,0,0,0,0,0,0
3,OIP--7K5JzW1ZEmrY2YRyVjFCQHaE7.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,1,0,0,0,0,0,0,0,0,0
4,OIP--8RSoaU1iEvZhemy7FlYegHaD9.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
26174,OIP-zpD7SDCbIqwjYm2A06OybwHaID.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,0,0,0,0,0,0,0,0,0,1
26175,OIP-zpZ5t_rN3DA6xByG0o05WwAAAA.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,0,0,0,0,0,0,0,0,0,1
26176,OIP-zwWLqRd95U-BjagmhkAgoQHaFK.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,0,0,0,0,0,0,0,0,0,1
26177,OIP-zyCc-DDxjCSB5m9DR3uNKQHaE8.jpeg,/home/ariya/workspace/datasets/animals10-dvc/i...,0,0,0,0,0,0,0,0,0,1


## Train/Valid/Test split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# 80/10/10 split
train_df, valid_test_df = train_test_split(annot_df, test_size=0.2, random_state=42, shuffle=True)
valid_df, test_df = train_test_split(valid_test_df, test_size=0.5, random_state=42, shuffle=True)

In [9]:
train_df.loc[:,'split'] = 'train'
valid_df.loc[:,'split'] = 'valid'
test_df.loc[:,'split'] = 'test'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.loc[:,'split'] = 'train'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  valid_df.loc[:,'split'] = 'valid'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.loc[:,'split'] = 'test'


In [10]:
final_annot_df = pd.concat([train_df, valid_df, test_df])
len(final_annot_df), np.unique(final_annot_df['split'], return_counts=True)

(26179,
 (array(['test', 'train', 'valid'], dtype=object),
  array([ 2618, 20943,  2618])))

In [11]:
final_annot_df.columns

Index(['image_name', 'abs_path', 'butterfly', 'cat', 'chicken', 'cow', 'dog',
       'elephant', 'horse', 'sheep', 'spider', 'squirrel', 'split'],
      dtype='object')

In [12]:
final_annot_df.to_csv(os.path.join(ds_path, 'annotation_df.csv'), index=False)