# Library

In [1]:
!conda install -y --channel conda-forge pyvips

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - 

In [1]:
import os
import gc
import zipfile

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import cv2
import pyvips

OSError: cannot load library 'libgobject-2.0-0.dll': error 0x7e.  Additionally, ctypes.util.find_library() did not manage to locate a library called 'libgobject-2.0-0.dll'

# Data Load

In [3]:
train = pd.read_csv('../input/mayo-clinic-strip-ai/train.csv')
other = pd.read_csv('../input/mayo-clinic-strip-ai/other.csv')
test = pd.read_csv('../input/mayo-clinic-strip-ai/test.csv')
submission = pd.read_csv('../input/mayo-clinic-strip-ai/sample_submission.csv')

# print(train.shape)
# display(train.head())
# print(other.shape)
# display(other.head())
# print(test.shape)
# display(test.head())
# print(submission.shape)
# display(submission.head())

(754, 5)


Unnamed: 0,image_id,center_id,patient_id,image_num,label
0,006388_0,11,006388,0,CE
1,008e5c_0,11,008e5c,0,CE
2,00c058_0,11,00c058,0,LAA
3,01adc5_0,11,01adc5,0,LAA
4,026c97_0,4,026c97,0,CE


(396, 5)


Unnamed: 0,image_id,patient_id,image_num,other_specified,label
0,01f2b3_0,01f2b3,0,,Unknown
1,01f2b3_1,01f2b3,1,,Unknown
2,02ebd5_0,02ebd5,0,,Unknown
3,0412ab_0,0412ab,0,,Unknown
4,04414e_0,04414e,0,Hypercoagulable,Other


(4, 4)


Unnamed: 0,image_id,center_id,patient_id,image_num
0,006388_0,11,006388,0
1,008e5c_0,11,008e5c,0
2,00c058_0,11,00c058,0
3,01adc5_0,11,01adc5,0


(4, 3)


Unnamed: 0,patient_id,CE,LAA
0,006388,0.5,0.5
1,008e5c,0.5,0.5
2,00c058,0.5,0.5
3,01adc5,0.5,0.5


# Prepare dataset

Images are very large, so tile method that worked [previous competition](https://www.kaggle.com/competitions/prostate-cancer-grade-assessment/discussion/146855) would be effective.

In [4]:
def tile(img, sz=128, N=16):
    shape = img.shape
    pad0,pad1 = (sz - shape[0]%sz)%sz, (sz - shape[1]%sz)%sz
    img = np.pad(img,[[pad0//2,pad0-pad0//2],[pad1//2,pad1-pad1//2],[0,0]],constant_values=255)
    img = img.reshape(img.shape[0]//sz,sz,img.shape[1]//sz,sz,3)
    img = img.transpose(0,2,1,3,4).reshape(-1,sz,sz,3)
    if len(img) < N:
        img = np.pad(img,[[0,N-len(img)],[0,0],[0,0],[0,0]],constant_values=255)
    idxs = np.argsort(img.reshape(img.shape[0],-1).sum(-1))[:N]
    img = img[idxs]
    return img

In [5]:
def save_dataset(
    df: pd.DataFrame, 
    N=16,
    max_size=20000, 
    crop_size=1024, 
    image_dir='../input/mayo-clinic-strip-ai/train', 
    out_dir='train_images.zip',
):
    format_to_dtype = {
       'uchar': np.uint8,
       'char': np.int8,
       'ushort': np.uint16,
       'short': np.int16,
       'uint': np.uint32,
       'int': np.int32,
       'float': np.float32,
       'double': np.float64,
       'complex': np.complex64,
       'dpcomplex': np.complex128,
    }
    def vips2numpy(vi):
        return np.ndarray(
            buffer=vi.write_to_memory(),
            dtype=format_to_dtype[vi.format],
            shape=[vi.height, vi.width, vi.bands])
    with zipfile.ZipFile(out_dir, "w") as out_image:
        tk0 = tqdm(enumerate(df["image_id"].values), total=len(df))
        for i, image_id in tk0:
            print(f"[{i+1}/{len(df)}] image_id: {image_id}")
            image = pyvips.Image.thumbnail(f'{image_dir}/{image_id}.tif', max_size)
            image = vips2numpy(image)
            width, height, c = image.shape
            print(f"Input width: {width} height: {height}")
            images = tile(image, sz=crop_size, N=N)
            for idx, img in enumerate(images):
                img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
                img = cv2.imencode(".jpg", img, [cv2.IMWRITE_JPEG_QUALITY, 100])[1]
                out_image.writestr(f"{image_id}_{idx}.jpg", img)
            del img, image, images; gc.collect()

i = 1 # 1~8

if i == 8:
    df = train[(i-1)*100:]
else:
    df = train[(i-1)*100:i*100]

save_dataset(
    df,
    N=16, 
    max_size=20000,
    crop_size=1024, 
    image_dir='../input/mayo-clinic-strip-ai/train', 
    out_dir=f'train_images_{i}.zip'
)

  0%|          | 0/100 [00:00<?, ?it/s]

[1/100] image_id: 006388_0
Input width: 20000 height: 11187
[2/100] image_id: 008e5c_0
Input width: 20000 height: 4005
[3/100] image_id: 00c058_0
Input width: 20000 height: 4937
[4/100] image_id: 01adc5_0
Input width: 9512 height: 20000
[5/100] image_id: 026c97_0
Input width: 15697 height: 20000
[6/100] image_id: 028989_0
Input width: 20000 height: 7653
[7/100] image_id: 029c68_0
Input width: 20000 height: 19391
[8/100] image_id: 032f10_0
Input width: 20000 height: 4606
[9/100] image_id: 0372b0_0
Input width: 20000 height: 3852
[10/100] image_id: 037300_0
Input width: 20000 height: 7707
[11/100] image_id: 03d1ec_0
Input width: 20000 height: 15203
[12/100] image_id: 03e6b7_0
Input width: 20000 height: 7292
[13/100] image_id: 0415c3_0
Input width: 20000 height: 5176
[14/100] image_id: 04439c_0
Input width: 20000 height: 9406
[15/100] image_id: 045eb0_0
Input width: 20000 height: 9406
[16/100] image_id: 0468a8_0
Input width: 20000 height: 9615
[17/100] image_id: 0468a8_1
Input width: 2000