In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import shutil


## dataset list

In [2]:
dataset = []

dir_path = Path('../data/StanfordDogs/Images')
for jpg_path in dir_path.glob('**/*.jpg'):
  breed_name = jpg_path.parent.name
  dataset.append([jpg_path, breed_name])
dataset = np.array(dataset)
dataset

array([[PosixPath('../data/StanfordDogs/Images/Newfoundland/n02111277_341.jpg'),
        'Newfoundland'],
       [PosixPath('../data/StanfordDogs/Images/Newfoundland/n02111277_2885.jpg'),
        'Newfoundland'],
       [PosixPath('../data/StanfordDogs/Images/Newfoundland/n02111277_14422.jpg'),
        'Newfoundland'],
       ...,
       [PosixPath('../data/StanfordDogs/Images/Brabancon_griffon/n02112706_1983.jpg'),
        'Brabancon_griffon'],
       [PosixPath('../data/StanfordDogs/Images/Brabancon_griffon/n02112706_2467.jpg'),
        'Brabancon_griffon'],
       [PosixPath('../data/StanfordDogs/Images/Brabancon_griffon/n02112706_1922.jpg'),
        'Brabancon_griffon']], dtype=object)

## train test split

In [3]:
train_image, test_image, train_target, test_target = train_test_split(dataset[:, 0], dataset[:, 1], stratify=dataset[:, 1], test_size=0.2)

In [4]:
pd.Series(train_target).value_counts() / len(train_target)

Maltese_dog            0.012209
Afghan_hound           0.011602
Scottish_deerhound     0.011237
Pomeranian             0.010630
Samoyed                0.010569
                         ...   
Doberman               0.007289
Border_collie          0.007289
Irish_water_spaniel    0.007289
Pekinese               0.007228
redbone                0.007168
Length: 120, dtype: float64

In [5]:
pd.Series(test_target).value_counts() / len(test_target)

Maltese_dog                    0.012391
Afghan_hound                   0.011662
Scottish_deerhound             0.011419
Samoyed                        0.010690
Bernese_mountain_dog           0.010690
                                 ...   
Border_collie                  0.007289
German_short-haired_pointer    0.007289
kuvasz                         0.007289
Pekinese                       0.007289
borzoi                         0.007289
Length: 120, dtype: float64

In [6]:
train_image.shape, test_image.shape, train_target.shape, test_target.shape

((16463,), (4116,), (16463,), (4116,))

In [7]:
len(set(test_target))

120

## train, test dataset

### directory 생성

In [8]:
for breed_name in set(test_target):
  Path(f'../data/StanfordDogs/test/{breed_name}').mkdir(parents=True)
  Path(f'../data/StanfordDogs/train/{breed_name}').mkdir(parents=True)
  Path(f'../data/StanfordDogs/valid/{breed_name}').mkdir(parents=True)


### 파일 이동

In [9]:
for filepath, target_dir in zip(train_image.tolist(), train_target.tolist()):
  source_path = filepath
  filename = str(filepath).split('/')[-1]
  target_dir_path = f'../data/StanfordDogs/train/{target_dir}/{filename}'
  shutil.copy(source_path, target_dir_path)

for filepath, target_dir in zip(test_image.tolist(), test_target.tolist()):
  source_path = filepath
  filename = str(filepath).split('/')[-1]
  target_dir_path = f'../data/StanfordDogs/test/{target_dir}/{filename}'
  shutil.copy(source_path, target_dir_path)

## train, validation split

In [14]:
train_dataset = np.array([[img, tar] for img, tar in zip(train_image, train_target)])
train_dataset

array([[PosixPath('../data/StanfordDogs/Images/Irish_water_spaniel/n02102973_399.jpg'),
        'Irish_water_spaniel'],
       [PosixPath('../data/StanfordDogs/Images/Chesapeake_Bay_retriever/n02099849_110.jpg'),
        'Chesapeake_Bay_retriever'],
       [PosixPath('../data/StanfordDogs/Images/Gordon_setter/n02101006_3062.jpg'),
        'Gordon_setter'],
       ...,
       [PosixPath('../data/StanfordDogs/Images/kuvasz/n02104029_3942.jpg'),
        'kuvasz'],
       [PosixPath('../data/StanfordDogs/Images/bull_mastiff/n02108422_90.jpg'),
        'bull_mastiff'],
       [PosixPath('../data/StanfordDogs/Images/Border_collie/n02106166_1246.jpg'),
        'Border_collie']], dtype=object)

In [16]:
train_image, valid_image, train_target, valid_target = train_test_split(train_dataset[:, 0], train_dataset[:, 1], stratify=train_dataset[:, 1], test_size=0.2)

In [17]:
for filepath, target_dir in zip(valid_image.tolist(), valid_target.tolist()):
  source_path = filepath
  filename = str(filepath).split('/')[-1]
  target_dir_path = f'../data/StanfordDogs/valid/{target_dir}/{filename}'
  shutil.move(source_path, target_dir_path)