In [1]:
import os
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
import multiprocessing
from tqdm import tqdm
from random import randrange

In [2]:
def img_to_imgs(file, size=2000, min_residual=60):
    """Split an image file into fixed-width chunks along its column axis.

    The image is read with ``mpimg.imread`` and cut into as many full
    ``size``-column chunks as fit. If more than ``min_residual`` columns are
    left over on the right, they become one extra chunk: the leftover is
    padded on the right with a random 7-12 columns of zeros, repeated
    horizontally, and cropped to exactly ``size`` columns. A shorter leftover
    is dropped.

    Args:
        file: Path of the image to read. The decoded image must be a 3-D
            array (rows, columns, channels), because it is sliced on three axes.
        size: Chunk width in columns. The original note on the default of
            2000 reads "about 40 seconds" (presumably the time span those
            columns cover; confirm against how the images were produced).
        min_residual: A leftover must be wider than this many columns to be
            kept. The original note on the default of 60 reads
            "about 1.5 seconds".

    Returns:
        A list of arrays, each ``size`` columns wide, in left-to-right order
        with the leftover chunk (if any) last. The list is empty when the
        image is narrower than ``size`` and its width does not exceed
        ``min_residual``.

    Raises:
        ValueError: If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError("size must be a positive number of columns")

    img = mpimg.imread(file)

    width = img.shape[1]
    split_num = width // size
    end_split_index = split_num * size
    residual = width - end_split_index

    imgs = []
    if split_num > 0:
        # end_split_index is an exact multiple of size, so array_split yields
        # split_num equal pieces of width `size` (returned as a list).
        imgs = np.array_split(img[:, :end_split_index, :], split_num, axis=1)

    if residual > min_residual:
        residual_img = img[:, end_split_index:, :]
        # Pad the leftover itself. Padding the full image here would make this
        # chunk a copy of the image's first columns instead of the leftover.
        gap = randrange(7, 13)
        residual_img = np.pad(residual_img, ((0, 0), (0, gap), (0, 0)), 'constant')
        repeats = int(np.ceil(size / residual_img.shape[1]))

        imgs.append(np.tile(residual_img, (1, repeats, 1))[:, :size, :])

    return imgs

In [3]:
def split_save_imgs(file, bird_storage_dir):
    """Split one image into chunks and save each chunk as a numbered JPEG.

    The chunks come from ``img_to_imgs(file)`` and are written with
    ``mpimg.imsave`` as ``<label>-<k>.jpeg`` inside ``bird_storage_dir``,
    where ``k`` starts at 1 and ``<label>`` is the file's base name up to its
    first ``.``.

    Args:
        file: Path of the source image.
        bird_storage_dir: Directory to write the chunks into. It is not
            created here.
    """
    # basename() follows the platform's path rules, so paths produced by
    # os.path.join are handled on every OS; splitting on a literal '/' is not.
    file_label = os.path.basename(file).split('.')[0]

    for chunk_no, img in enumerate(img_to_imgs(file), start=1):
        out_name = file_label + '-' + str(chunk_no) + '.jpeg'
        mpimg.imsave(os.path.join(bird_storage_dir, out_name), img)

In [4]:
def save_bird_imgs(files, bird_storage_dir):
    """Split and save every image in ``files`` into ``bird_storage_dir``.

    The output directory is created first, including any missing parent
    directories; an already existing directory is left as is. Each file is
    then handed to ``split_save_imgs``.

    Args:
        files: Iterable of source image paths.
        bird_storage_dir: Directory that receives the saved chunks.
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it has no
    # check-then-create race and does not fail when the parent is missing.
    os.makedirs(bird_storage_dir, exist_ok=True)

    for file in files:
        split_save_imgs(file, bird_storage_dir)

In [5]:
working_dir = "./data/train_img/"

# Every immediate sub-directory of working_dir is one bird class. Only the
# first os.walk() entry is needed for that; a missing working_dir produces no
# entries at all, in which case there are simply no birds.
# Notebook checkpoint folders are skipped (the file scan below ignores them
# too), and the names are sorted so the processing order is reproducible.
birds = sorted(
    name
    for name in next(os.walk(working_dir), (working_dir, [], []))[1]
    if 'ipynb_checkpoints' not in name
)

# flist maps each bird name to the sorted paths of all .jpeg files found
# anywhere beneath that bird's directory.
flist = {}

for bird in birds:
    blist = []

    for root, dirs, files in os.walk(os.path.join(working_dir, bird)):
        if 'ipynb_checkpoints' in root:
            continue
        for file in files:
            if file.endswith(".jpeg"):
                blist.append(os.path.join(root, file))

    blist.sort()
    flist[bird] = blist

In [6]:
# Number of CPUs reported by multiprocessing; the parallel split/save step
# later in the notebook derives its joblib worker count from this value.
num_cores = multiprocessing.cpu_count()

In [None]:
import warnings

# NOTE(review): this silences every warning for the rest of the session, not
# only those raised while saving images. Narrow the filter if possible.
warnings.filterwarnings('ignore')

storage_dir = "./data/train_img_split/"
birds_handled = 0

# num_cores - 2 leaves two CPUs free, but it evaluates to 0 on a two-CPU
# machine, which joblib rejects, so never request fewer than one worker.
n_workers = max(1, num_cores - 2)

# One progress-bar step per bird; the files of a bird are processed in parallel.
for bird, files in tqdm(flist.items(), desc="birds"):
    bird_dir = os.path.join(storage_dir, bird)

    # makedirs also creates storage_dir itself on a first run and is a no-op
    # when the directory already exists, so the cell can be re-run safely.
    os.makedirs(bird_dir, exist_ok=True)

    birds_handled += 1

    Parallel(n_jobs=n_workers)(
        delayed(split_save_imgs)(file, bird_dir) for file in files
    )