In [6]:
import json
import numpy as np
from pathlib import Path
import shutil
from zipfile import ZipFile, ZIP_DEFLATED

import cv2
import fiftyone as fo
import fiftyone.brain as fob
import numpy as np

In [2]:
# Download the dataset as a zip file with the Kaggle API from https://www.kaggle.com/datasets/kmader/food41?select=images into your local folder.
# See, for example, this guide: https://github.com/Kaggle/kaggle-api

In [3]:
#!unzip -o archive.zip

### Scale down the dataset
Keep only the first 100 files from each folder (= class) and remove the rest.

In [5]:
def scale_down_folders(path: Path, folder_size=100):
    """Keep at most ``folder_size`` files in every subfolder of ``path``.

    Each immediate subdirectory of ``path`` is processed on its own: its files
    are ordered by name and every file after the first ``folder_size`` is
    permanently deleted. Entries of ``path`` that are not directories, and
    entries of a subfolder that are not regular files, are left untouched.

    Args:
        path: Directory whose immediate subdirectories are trimmed.
        folder_size: Maximum number of files to keep per subdirectory.

    Raises:
        ValueError: If ``folder_size`` is negative.
    """
    if folder_size < 0:
        raise ValueError(f"folder_size must be non-negative, got {folder_size}")

    for subfolder in sorted(path.iterdir()):
        # Stray top-level files (e.g. an archive or OS metadata file) are not
        # class folders; iterating into them would raise NotADirectoryError.
        if not subfolder.is_dir():
            continue

        # iterdir() yields entries in arbitrary, OS-dependent order; sorting
        # makes "the first n files" reproducible across machines and re-runs.
        files = sorted(entry for entry in subfolder.iterdir() if entry.is_file())
        for surplus in files[folder_size:]:
            surplus.unlink()

# Destructive: trims every subfolder of ./images in place (default folder_size=100);
# the removed files can only be restored by unzipping the archive again.
scale_down_folders(Path("images"))

### Load dataset to fiftyone and export it

In [None]:
dataset_name = 'food_101_small'
dataset_dir = 'images' # Override path to dataset directory

# FiftyOne registers datasets by name; creating a second dataset with the same
# name fails, so drop a leftover from a previous run to keep this cell re-runnable.
if fo.dataset_exists(dataset_name):
    fo.delete_dataset(dataset_name)

# `images` is laid out as one subfolder per class (the scale-down step above
# iterates it that way), which is the ImageClassificationDirectoryTree format.
# FiftyOneImageClassificationDataset expects `data/` + `labels.json` instead
# and is therefore only used for the export below.
dataset = fo.Dataset.from_dir(
    dataset_dir=dataset_dir,
    dataset_type=fo.types.ImageClassificationDirectoryTree,
    name=dataset_name,
)

# Export copies the media (export_media=True) into `data/` and writes the
# labels to `labels.json` inside the export directory.
dataset.export(
    export_dir="datasets/food_101_small/",
    dataset_type=fo.types.FiftyOneImageClassificationDataset,
    export_media=True,
)

In [None]:
# The images were copied into the exported dataset, so the original folder can be deleted
!rm -r 'images'

### Do a stratified split into 10 zip files

In [16]:
def stratified_sample(path_data: Path, number_of_splits: int):
    """Split the image ids of every class into ``number_of_splits`` chunks.

    Reads ``labels.json`` from ``path_data`` and uses its ``"labels"`` mapping
    (image id -> class). Image ids are grouped per class in file order and each
    group is cut into consecutive chunks with ``np.array_split``, so chunk
    sizes within a class differ by at most one.

    Args:
        path_data: Folder that contains ``labels.json``.
        number_of_splits: Number of chunks to create per class.

    Returns:
        Dict mapping each class (in sorted order) to a list of
        ``number_of_splits`` numpy arrays of image ids.
    """
    with (path_data / 'labels.json').open('r') as handle:
        image_to_class = json.load(handle)['labels']

    # Fix the class order up front so the result is keyed in sorted order.
    ordered_classes = sorted(set(image_to_class.values()))

    grouped = {}
    for image_id, class_name in image_to_class.items():
        grouped.setdefault(class_name, []).append(image_id)

    chunks_per_class = {}
    for class_name in ordered_classes:
        chunks_per_class[class_name] = np.array_split(grouped[class_name], number_of_splits)
    return chunks_per_class

def write_to_zip(
    output_zip_folder: Path,
    classes_split,
    number_of_splits: int,
    dataset_root: "Path | None" = None,
):
    """Write one zip archive per split containing that split's images.

    For every split index ``i`` in ``range(number_of_splits)`` the archive
    ``data_zip_{i}.zip`` is created in ``output_zip_folder``. Each image id in
    ``classes_split[cl][i]`` is read from ``<dataset_root>/data/<id>.jpg`` and
    stored in the archive as ``data/<id>.jpg``.

    Images are written straight into the archive. No staging folder is created,
    so an existing ``temp`` folder inside ``output_zip_folder`` is never touched.

    Args:
        output_zip_folder: Folder that receives the archives; created if missing.
        classes_split: Mapping class -> sequence of per-split image id
            collections, as returned by ``stratified_sample``.
        number_of_splits: Number of archives to write.
        dataset_root: Folder containing the ``data`` directory with the images.
            When omitted, the notebook-level variable ``path_data`` is used,
            which is what this function previously read implicitly.
    """
    if dataset_root is None:
        # Backward compatibility: earlier callers relied on the global.
        dataset_root = path_data
    images_folder = Path(dataset_root) / "data"

    output_zip_folder.mkdir(parents=True, exist_ok=True)

    for split in range(number_of_splits):
        zip_path = output_zip_folder / f'data_zip_{split}.zip'
        with ZipFile(zip_path, "w", ZIP_DEFLATED, compresslevel=9) as archive:
            for img_lists in classes_split.values():
                for img in img_lists[split]:
                    file_name = f"{img}.jpg"
                    # arcname keeps the "data/<file>" layout inside the archive.
                    archive.write(images_folder / file_name, arcname=f"data/{file_name}")

# Split configuration.
# NOTE(review): labels.json and data/ are read from the current folder here,
# while the export cell above writes to "datasets/food_101_small/" — confirm
# the notebook is run from (or the export is moved to) the matching folder.
number_of_splits = 10
path_data = Path('.')  # current folder: must contain labels.json and data/
output_zip_folder = Path('.')  # current folder: receives data_zip_<i>.zip

# Group image ids per class, cut each class into equal-sized chunks, then
# write one archive per chunk index.
classes_split = stratified_sample(path_data, number_of_splits)
write_to_zip(
    output_zip_folder=output_zip_folder,
    classes_split=classes_split,
    number_of_splits=number_of_splits,
)


### Add to DVC and push to S3

In [18]:
!dvc add data_zip_*.zip

[2K[32m⠸[0m Checking graphges from the workspace                             [32m⠋[0m Collecting stages from the workspace
  0% Adding...|                                     |0/10 [00:00<?,     ?file/s]
![A
  0% Checking cache in '/workspaces/google-kaggle-competition-data-pipeline/.dvc[A
                                                                                [A
![A
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |a98267d152a16fd3ed8fbf16cd5a27     0.00/? [00:00<?,        ?B/s][A
  0%|          |a98267d152a16fd3ed8fbf16cd5a27 0.00/47.5M [00:00<?,        ?B/s][A
 10% Adding...|█▌              | data_zip_0.zip |1/10 [00:01<00:09,  1.09s/file][A
![A
  0% Checking cache in '/workspaces/google-kaggle-competition-data-pipeline/.dvc[A
                                                                                [A
![A
  0%|     

In [19]:
!dvc push data_zip_*.zip

  0% Transferring|                                  |0/10 [00:00<?,     ?file/s]
![A
  0%|          |b5dce5a0ee13f0048716eee283ea71     0.00/? [00:00<?,        ?B/s][A
  0%|          |b5dce5a0ee13f0048716eee283ea71 0.00/46.9M [00:00<?,        ?B/s][A

![A[A

  0%|          |7684118ac010619ea1b8966203ea04     0.00/? [00:00<?,        ?B/s][A[A

  0%|          |7684118ac010619ea1b8966203ea04 0.00/48.3M [00:00<?,        ?B/s][A[A


![A[A[A


  0%|          |9c3d85e2cb363b6244007bbe66f163     0.00/? [00:00<?,        ?B/s][A[A[A


  0%|          |9c3d85e2cb363b6244007bbe66f163 0.00/48.2M [00:00<?,        ?B/s][A[A[A



![A[A[A[A



  0%|          |5bba86012d36a1770ab4a01821ee8e     0.00/? [00:00<?,        ?B/s][A[A[A[A



  0%|          |5bba86012d36a1770ab4a01821ee8e 0.00/47.7M [00:00<?,        ?B/s][A[A[A[A




![A[A[A[A[A




  0%|          |31af4de8669b31da85b4c18a2850d5     0.00/? [00:00<?,        ?B/s][A[A[A[A[A




  0%|          |31af4de8669b31

### Local zip files are no longer necessary

In [20]:
!rm data_zip_*.zip