In [56]:
import os
from pathlib import PurePath
import shutil
import zipfile

import cv2
import numpy as np

In [6]:
zip_dirpath = 'D:/_resources/datasets'

div2k_train_zip_filepath = os.path.join(zip_dirpath, 'DIV2K_train_HR.zip')
flickr2k_zip_filepath = os.path.join(zip_dirpath, 'Flickr2K.zip')
div2k_valid_zip_filepath = os.path.join(zip_dirpath, 'DIV2K_valid_HR.zip')

assert os.path.exists(div2k_train_zip_filepath)
assert os.path.exists(flickr2k_zip_filepath)
assert os.path.exists(div2k_valid_zip_filepath)

In [14]:
zip_filepaths = [div2k_train_zip_filepath, flickr2k_zip_filepath, div2k_valid_zip_filepath]

def extractall_zip(zip_filepath: str, dest_folder: str, *, verbose: bool, max_load_line_length: int = 50):
    if verbose:
        print(f'Extracting {zip_filepath} to {dest_folder}')
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
        if verbose:
            print(f'Folder {dest_folder} does not exist and has just been created')
    with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
        if verbose:
            i = 0
            for member in zip_ref.namelist():
                zip_ref.extract(member, dest_folder)
                if i == max_load_line_length:
                    i = 0
                    print()
                print('/', end='')
                i += 1
        else:
            zip_ref.extractall(dest_folder)
    if verbose:
        print()
        print(f'Completely extracted')

for zip_filepath in zip_filepaths:
    dest_folder = os.path.join(os.path.curdir, PurePath(zip_filepath).stem)

    extractall_zip(zip_filepath, dest_folder, verbose=True)

Extracting D:\_resources\datasets\DIV2K_train_HR.zip to .\DIV2K_train_HR
Folder .\DIV2K_train_HR does not exist and has just been created
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
//////////////////////////////////////////////////
/
Completely extracted
Extracting D:\_resource

In [75]:
cropped_count = 1

def crop_images(folder: str, dest_folder: str, *,
                max_load_line_length=80, n_images_per_cell=100) -> None:
    print(f'Cropping images from {folder} to {dest_folder}')
    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)
        print(f'Folder {dest_folder} did not exist and has just been created')
    patchsize = 256
    stride = 256
    global cropped_count
    line_cell_i = 0
    image_i = 0
    print(f'Each "/" represents {n_images_per_cell} cropped image(s)')
    for img_n in sorted(os.listdir(folder)):
        img = cv2.imread(os.path.join(folder, img_n))
        h, w, _ = img.shape
        h_number = h // patchsize
        w_number = w // patchsize
        for i in range(h_number):
            for j in range(w_number):
                start_ij_l = j * stride
                start_ij_u = i * stride
                end_ij_l = start_ij_l + stride
                end_ij_u = start_ij_u + stride
                img_crop = img[start_ij_u:end_ij_u, start_ij_l:end_ij_l]
                cv2.imwrite(os.path.join(dest_folder, '{:0>6d}.png'.format(cropped_count)), img_crop)
                cropped_count += 1
                image_i += 1
                if image_i == n_images_per_cell:
                    image_i = 0
                    print('/', end='')
                    line_cell_i += 1
                    if line_cell_i == max_load_line_length:
                        print()
                        line_cell_i = 0
    print()
    print('Completely cropped')

cropping_info = [
    ('./DIV2K_train_HR/DIV2K_train_HR/', './train'),
    ('./DIV2K_valid_HR/DIV2K_valid_HR/', './valid'),
    ('./Flickr2K/Flickr2K/', './train')
]

for image_folder, dest_folder in cropping_info:
    crop_images(image_folder, dest_folder)

Cropping images from ./DIV2K_train_HR/DIV2K_train_HR/ to ./train
Folder ./train did not exist and has just been created
Each "/" represents 100 cropped image(s)
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////
Completely cropped
Cropping images from ./DIV2K_valid_HR/DIV2K_valid_HR/ to ./valid
Folder ./valid did not exist and has just been created
Each "/" represents 100 cropped image(s)
///////////////////////////////////
Completely cropped
Cropping images from ./Flickr2K/Flickr2K/ to ./train
Each "/" represents 100 cropped image(s)
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////

In [83]:
def get_folder_size_bytes(folder: str) -> int:
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(folder):
        for filename in filenames:
            filepath = os.path.join(dirpath, filename)
            # Skip if it is a symbolic link
            if not os.path.islink(filepath):
                total_size += os.path.getsize(filepath)
    return total_size

def get_folder_file_count(folder: str) -> int:
    file_count = 0
    for dirpath, dirnames, filenames in os.walk(folder):
        file_count += len(filenames)
    return file_count


folders = [
    './DIV2K_train_HR', './Flickr2K', './DIV2K_valid_HR',
    './train', './valid',
    './train-00', './train-01', './train-02', './train-03'
]

for folder in folders:
    if os.path.exists(folder):
        bytes = get_folder_size_bytes(folder)
        n_files = get_folder_file_count(folder)
        print(f'{folder:20}\t{bytes / (1024 ** 3):3.2f} GiB\t{n_files:7d} file(s)')
    else:
        print(f'{folder:20}\tDOES NOT EXIST')

./DIV2K_train_HR    	3.29 GiB	    800 file(s)
./Flickr2K          	10.86 GiB	   2650 file(s)
./DIV2K_valid_HR    	0.42 GiB	    100 file(s)
./train             	12.50 GiB	 118101 file(s)
./valid             	0.38 GiB	   3598 file(s)
./train-00          	3.12 GiB	  29526 file(s)
./train-01          	3.13 GiB	  29525 file(s)
./train-02          	3.12 GiB	  29525 file(s)
./train-03          	3.13 GiB	  29525 file(s)


In [88]:
need_to_zip = [f'./train-{i:0>2d}' for i in range(4)]

for folder in need_to_zip:
    print(f'Making archive of {folder}')
    shutil.make_archive(os.path.join(os.path.curdir, folder + '.zip'), 'zip', os.path.curdir, folder)
    print('Completely maked archive')

Making archive of ./train-00


FileNotFoundError: [WinError 2] The system cannot find the file specified: 'train-00'