In [1]:
import os
import matplotlib.image as mpimg
import cv2 as cv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import multiprocessing
from tqdm import tqdm
from random import randrange

In [2]:
def img_to_imgs(file, size=2600, min_residual=100):
    """Split a wide image into fixed-width tiles along its horizontal axis.

    The image is cut, left to right, into as many full ``size``-column tiles
    as fit. If the strip left over on the right is wider than
    ``min_residual`` columns, it is turned into one extra tile: the strip is
    padded on its right with 7-12 columns of zeros (width chosen at random),
    repeated horizontally until it is at least ``size`` columns wide, and
    cropped to exactly ``size`` columns.

    Parameters
    ----------
    file : str
        Path passed to ``cv.imread``.
    size : int, optional
        Tile width in columns. Defaults to 2600 (the original author's note
        equates this to about 60 seconds).
    min_residual : int, optional
        The leftover strip is kept only when it is strictly wider than this.
        Defaults to 100 (about 2 seconds per the original author's note).

    Returns
    -------
    list of numpy.ndarray
        Tiles in left-to-right order, each ``size`` columns wide; the list is
        empty when the image is narrower than ``size`` and its width does not
        exceed ``min_residual``. Channel order is whatever ``cv.imread``
        produced.

    Raises
    ------
    OSError
        If ``cv.imread`` could not load ``file`` (it returns ``None``).
    """
    img = cv.imread(file)
    if img is None:
        # cv.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than on img.shape below.
        raise OSError("could not read image: {}".format(file))

    width = img.shape[1]
    split_num = width // size
    end_split_index = split_num * size
    residual = width - end_split_index

    # Full-width tiles, left to right.
    imgs = [
        img[:, start:start + size, :]
        for start in range(0, end_split_index, size)
    ]

    if residual > min_residual:
        residual_img = img[:, end_split_index:, :]
        # Pad the residual strip itself (the earlier code padded the whole
        # image, so this tile never contained the leftover data). The random
        # gap separates consecutive repetitions of the strip.
        gap = randrange(7, 13, 1)
        residual_img = np.pad(
            residual_img, ((0, 0), (0, gap), (0, 0)), 'constant'
        )
        repeats = int(np.ceil(size / residual_img.shape[1]))

        imgs.append(np.tile(residual_img, (1, repeats, 1))[:, :size, :])

    return imgs

In [3]:
def split_save_imgs(file, bird_storage_dir):
    """Tile one image with ``img_to_imgs`` and save every tile as a JPEG.

    Tiles are written to ``bird_storage_dir`` as ``<label>-<k>.jpeg``, where
    ``<label>`` is the part of the source file name before its first dot and
    ``<k>`` counts the tiles starting at 1.

    Parameters
    ----------
    file : str
        Path of the source image.
    bird_storage_dir : str
        Directory that receives the tiles; this function does not create it.

    Returns
    -------
    None
    """
    # os.path.basename copes with whatever separator os.path.join produced
    # for `file`; a literal split('/') only works for '/'-separated paths.
    file_label = os.path.basename(file).split('.')[0]

    # NOTE(review): img_to_imgs loads via cv.imread (BGR order for colour
    # images per the OpenCV docs) while mpimg.imsave treats 3-channel arrays
    # as RGB, so colour channels may be written swapped -- confirm intended.
    for index, img in enumerate(img_to_imgs(file), start=1):
        tile_name = '{}-{}.jpeg'.format(file_label, index)
        mpimg.imsave(os.path.join(bird_storage_dir, tile_name), img)

In [4]:
def save_bird_imgs(files, bird_storage_dir):
    """Tile and save every image in ``files`` into ``bird_storage_dir``.

    Parameters
    ----------
    files : iterable of str
        Paths of the source images, processed in order.
    bird_storage_dir : str
        Output directory. It is created, along with any missing parent
        directories, if it does not exist yet.

    Returns
    -------
    None
    """
    # makedirs(exist_ok=True) creates missing parents too and avoids the
    # check-then-create race of os.path.exists() followed by os.mkdir().
    os.makedirs(bird_storage_dir, exist_ok=True)

    for file in files:
        split_save_imgs(file, bird_storage_dir)

In [5]:
# Root folder that holds one sub-directory per bird label.
working_dir = "./data/train_img/"

# The first entry os.walk() yields describes working_dir itself, so its
# directory list is exactly the set of label folders. Taking just that entry
# avoids traversing the whole tree only to discard everything else; the
# default keeps `birds` empty when working_dir does not exist.
birds = next(os.walk(working_dir), (working_dir, [], []))[1]

# Maps each bird label to the sorted list of its .jpeg paths, skipping
# anything that lives under a Jupyter checkpoint folder.
flist = {}

for bird in birds:
    blist = sorted(
        os.path.join(root, file)
        for root, dirs, files in os.walk(os.path.join(working_dir, bird))
        for file in files
        if file.endswith(".jpeg") and 'ipynb_checkpoints' not in root
    )
    flist[bird] = blist

In [6]:
# Number of CPUs reported by multiprocessing.cpu_count(); used further down
# to decide how many joblib workers to start.
num_cores = multiprocessing.cpu_count()

In [7]:
import warnings
# NOTE: this silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Destination root; each bird label gets its own sub-directory below it.
storage_dir = "./data/train_img_final/"
birds_handled = 0

# Leave one core free, but never request fewer than one worker: on a
# single-core machine num_cores - 1 is 0, which joblib.Parallel rejects.
n_workers = max(1, num_cores - 1)

for bird, files in flist.items():
    bird_dir = storage_dir + bird

    # makedirs also creates storage_dir itself on a first run, where a plain
    # os.mkdir(bird_dir) would fail because the parent is missing.
    os.makedirs(bird_dir, exist_ok=True)

    birds_handled += 1
    print(birds_handled, bird)

    # Tile and save this bird's images in parallel, one task per source file.
    Parallel(n_jobs=n_workers)(
        delayed(split_save_imgs)(file, bird_dir) for file in files
    )

1 olsfly
2 houspa
3 semplo
4 evegro
5 brnthr
6 truswa
7 brespa
8 gryfly
9 canwre
10 belkin1
11 bkhgro
12 sagthr
13 comrav
14 bnhcow
15 rufhum
16 blujay
17 ribgul
18 cliswa
19 carwre
20 comgol
21 daejun
22 pygnut
23 btywar
24 gadwal
25 calgul
26 bongul
27 rebwoo
28 caster1
29 mouchi
30 horgre
31 hoomer
32 pinsis
33 banswa
34 whtswi
35 annhum
36 rebnut
37 comgra
38 amerob
39 brebla
40 blugrb1
41 magwar
42 merlin
43 rocpig
44 reevir1
45 westan
46 amered
47 norcar
48 nrwswa
49 grycat
50 reshaw
51 semsan
52 gnwtea
53 rufgro
54 bewwre
55 greegr
56 whcspa
57 barswa
58 bkbwar
59 sora
60 macwar
61 chiswi
62 easblu
63 norpin
64 scoori
65 sonspa
66 greroa
67 y00475
68 amepip
69 bulori
70 gockin
71 buwwar
72 spotow
73 lotduc
74 gcrfin
75 boboli
76 rthhum
77 amekes
78 wooscj2
79 chukar
80 yehbla
81 chispa
82 bktspa
83 balori
84 juntit1
85 canwar
86 labwoo
87 bawwar
88 yelwar
89 rudduc
90 linspa
91 whtspa
92 casvir
93 comnig
94 eursta
95 prawar
96 warvir
97 lesyel
98 grtgra
99 herthr
100 gnttow
101 