In [None]:
'''
The data for this problem comes from a 2020 Kaggle challenge:
https://www.kaggle.com/c/prostate-cancer-grade-assessment/data

Due to the large volume of whole-slide images (10,000+ .tiff files, ~50 MB average), only a subset is used for
training/testing. Using Kaggle's built-in IPython notebook, a random subset of the images can be saved offline.

The following notebook can be used by following the link above, navigating to the 'Code' page, and launching a
new notebook
'''

import numpy as np
import pandas as pd
import os
import glob
import random
import shutil
import random
import zipfile

In [None]:
# show what the competition dataset ships with (top-level entries of its input folder)
competition_dir = '/kaggle/input/prostate-cancer-grade-assessment'
print(os.listdir(competition_dir))

In [None]:
# Generate a list of all the training images, then draw a repeatable random subset of manageable size.
#
# glob returns paths in arbitrary filesystem order, so the list is sorted before sampling; combined
# with a fixed seed on a dedicated RNG this makes a Restart & Run All pick the same subset again
# (and leaves the global `random` state untouched).
SAMPLE_SIZE = 300  # number of whole-slide images to keep
SAMPLE_SEED = 42   # any fixed integer; change it to draw a different subset

img_paths = sorted(glob.glob('/kaggle/input/prostate-cancer-grade-assessment/train_images/*.tiff'))

print(f'total number of training images: {len(img_paths)}')

# random.sample raises ValueError when asked for more items than the population holds,
# so cap the request at however many images were actually found
n_samples = min(SAMPLE_SIZE, len(img_paths))
sample_datapaths = random.Random(SAMPLE_SEED).sample(img_paths, n_samples)
print(f'subset generated as type {type(sample_datapaths)}, with {len(sample_datapaths)} images')

In [None]:
# Stage the sampled slides in kaggle's writable output area, then collect their new filepaths.

save_dir = "/kaggle/working/sampled_images"
os.makedirs(save_dir, exist_ok=True)

for src_path in sample_datapaths:
    # keep the original file name; only the parent directory changes
    dest_path = os.path.join(save_dir, os.path.basename(src_path))
    shutil.copy(src_path, dest_path)

# re-scan the folder rather than trusting the loop, so the count reflects the files on disk
save_imgs = glob.glob(os.path.join(save_dir, '*.tiff'))
print(f'{len(save_imgs)} images written to output folder')

In [None]:
# Bundle every staged image into a single zip archive.
# Each source file is removed as soon as it has been added, so the copies and the archive never
# both occupy kaggle's disk space allotment in full.
# note: the glob lists still hold the paths if they need to be recovered/regenerated during the session

zip_filename = "/kaggle/working/sampled_images.zip"
with zipfile.ZipFile(zip_filename, mode='w') as archive:
    for tiff_path in save_imgs:
        # store under the bare file name so the archive has no directory structure
        archive.write(tiff_path, arcname=os.path.basename(tiff_path))
        # drop the on-disk copy right away to free disk space
        os.remove(tiff_path)

In [None]:
# The archive can be fetched from the kaggle workspace file browser; alternatively this cell
# renders a clickable download link as its output.
from IPython.display import FileLink

# the link below is given relative to the working directory, so switch there first
os.chdir(r'/kaggle/working')

FileLink(r'sampled_images.zip')