In [1]:
import csv
import glob
import json
import numpy as np
from pathlib import Path
import shutil
from zipfile import ZipFile, ZIP_DEFLATED

import cv2
import fiftyone as fo
import fiftyone.brain as fob
import numpy as np
from tqdm import tqdm

In [2]:
# Download the dataset as a zip file with the Kaggle API from https://www.kaggle.com/datasets/confirm/google-landmark-dataset-v2-micro into your local folder
# See, for example, this guide: https://github.com/Kaggle/kaggle-api

### Prepare labels.json file

In [None]:
# Input tables and the output folder of the cleaned dataset.
path_to_image_index = Path("gldv2_micro.csv")
path_to_image_category = Path("train_label_to_category.csv")
dest_path = Path("google_landmarks_clean_images/")

# First column up to the first '.' -> second column.
# Presumably: image file name (without extension) -> landmark id; verify against the CSV.
image_to_index = {}
with open(path_to_image_index, 'r', encoding='utf-8', newline='') as file:
    csvreader = csv.reader(file)
    header = next(csvreader)  # skip the header row
    for row in csvreader:
        if not row:  # csv.reader yields [] for blank lines
            continue
        image_to_index[row[0].split('.')[0]] = row[1]

# First column -> text after the last ':' of the second column.
# Presumably: landmark id -> category name taken from a "...:Name" style value; verify.
categories = {}
with open(path_to_image_category, 'r', encoding='utf-8', newline='') as file:
    csvreader = csv.reader(file)
    header = next(csvreader)  # skip the header row
    for row in csvreader:
        if not row:
            continue
        categories[row[0]] = row[1].split(":")[-1]

# Fail with an explicit message instead of a bare KeyError from the comprehension.
unknown_indices = sorted({ind for ind in image_to_index.values() if ind not in categories})
if unknown_indices:
    raise KeyError(
        f"{len(unknown_indices)} label id(s) from {path_to_image_index} are missing in "
        f"{path_to_image_category}, e.g. {unknown_indices[:5]}"
    )

lbl_dic = {img:categories[ind] for img, ind in image_to_index.items()}

# The output folder does not exist on a fresh checkout; create it before writing.
dest_path.mkdir(parents=True, exist_ok=True)
with open(dest_path/'labels.json', 'w') as fp:
    json.dump({"classes": None,"labels": lbl_dic}, fp)

### Resize the images

In [None]:
path_to_images = Path("images/*.*")

files = [Path(x) for x in glob.glob(str(path_to_images), recursive=True)]

# Every image is rescaled to this width; the height follows the aspect ratio.
target_width = 350

# Resized copies go to <dest_path>/data; create the folder up front because
# cv2.imwrite does not create missing directories.
output_dir = dest_path/'data'
output_dir.mkdir(parents=True, exist_ok=True)

for file in tqdm(files):
    # cv2 is used because it is already imported by this notebook; the previous
    # code referenced `Image`, which is never imported (NameError on a fresh kernel).
    image = cv2.imread(str(file))
    if image is None:
        # cv2.imread signals unreadable / non-image files by returning None.
        raise ValueError(f"Could not read image: {file}")

    height, width = image.shape[:2]
    # Never let extreme aspect ratios round the height down to 0.
    target_height = max(1, int(target_width/width * height))

    # INTER_AREA is the recommended filter for shrinking, INTER_CUBIC for enlarging.
    interpolation = cv2.INTER_AREA if width > target_width else cv2.INTER_CUBIC
    new_image = cv2.resize(image, (target_width, target_height), interpolation=interpolation)

    # cv2.imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite(str(output_dir/file.name), new_image):
        raise OSError(f"Could not write resized image: {output_dir/file.name}")

### Load dataset to fiftyone and export it

In [None]:
dataset_name = 'google_landmarks_v2_micro'
# Must be the folder prepared by the previous cells: it holds labels.json and
# data/ with the resized images, which is the layout the
# FiftyOneImageClassificationDataset importer reads. The raw 'images' folder
# has neither.
dataset_dir = 'google_landmarks_clean_images' # Override path to dataset directory

dataset = fo.Dataset.from_dir(
    dataset_dir=dataset_dir,
    dataset_type=fo.types.FiftyOneImageClassificationDataset,
    name=dataset_name
)

# Write a self-contained copy (media + labels) of the dataset.
dataset.export(
    export_dir="datasets/google_landmarks_v2_micro/",
    dataset_type=fo.types.FiftyOneImageClassificationDataset,
    export_media=True
)

In [None]:
# The export above was run with export_media=True, so the images were copied;
# remove the folder the images were read from and the intermediate resized folder.
!rm -r 'images'
!rm -r 'google_landmarks_clean_images'

### Do a stratified split into 10 zip files

In [3]:
def stratified_sample(path_data: Path, number_of_splits: int):
    """Split the images of every class into ``number_of_splits`` chunks.

    Reads ``labels.json`` from ``path_data`` and uses its ``"labels"`` mapping
    (image id -> class).

    Args:
        path_data: Folder that contains ``labels.json``.
        number_of_splits: Number of chunks produced per class.

    Returns:
        Dict mapping each class (keys in sorted order) to the list of
        ``number_of_splits`` numpy arrays returned by ``np.array_split`` for
        that class's image ids (in the order they appear in ``labels.json``).
    """
    label_file = path_data / 'labels.json'
    with open(label_file, 'r') as handle:
        image_labels = json.load(handle)['labels']

    # One empty bucket per class, created in sorted class order.
    images_by_class = {name: [] for name in sorted(set(image_labels.values()))}
    for image_id, name in image_labels.items():
        images_by_class[name].append(image_id)

    chunks_by_class = {}
    for name, image_ids in images_by_class.items():
        chunks_by_class[name] = np.array_split(image_ids, number_of_splits)
    return chunks_by_class

def get_img_to_partition(classes_split):
    """Build both lookup tables between image ids and partition numbers.

    Args:
        classes_split: Mapping whose values are sequences of image-id chunks;
            the position of a chunk in its sequence is the partition number.

    Returns:
        Tuple ``(img_to_partition, partition_to_img)``: the first maps
        ``str(image id)`` to its partition number, the second maps every
        partition number that received at least one image to the list of its
        image ids.
    """
    img_to_partition = {
        str(image_id): partition
        for chunks in classes_split.values()
        for partition, chunk in enumerate(chunks)
        for image_id in chunk
    }

    # Start every used partition with an empty list, then fill them in one pass.
    partition_to_img = {}
    for partition in set(img_to_partition.values()):
        partition_to_img[partition] = []
    for image_id, partition in img_to_partition.items():
        partition_to_img[partition].append(image_id)

    return img_to_partition, partition_to_img


def write_to_zip(output_zip_folder: Path, partition_to_img, number_of_splits: int, extension="jpg", source_dir=None):
    """Write one ``partition_<n>.zip`` archive per partition.

    Every image is stored in its archive as ``data/<image id>.<extension>``.
    Files are added straight from the source folder, so no temporary copy is
    created and nothing other than the archives is written to or removed from
    ``output_zip_folder``.

    Args:
        output_zip_folder: Folder that receives the archives (created if missing).
        partition_to_img: Mapping partition number -> list of image ids
            (file names without extension).
        number_of_splits: Currently unused; kept so existing calls keep working.
        extension: File extension of the images, without the leading dot.
        source_dir: Folder whose ``data/`` sub-folder holds the images. When
            omitted, the notebook-level ``path_data`` variable is used, which
            is what this function always read before the parameter existed.

    Raises:
        FileNotFoundError: If an image listed for a partition does not exist.
            The check runs before that partition's archive is opened, so no
            truncated archive is left behind for it.
    """
    if source_dir is None:
        # Legacy behaviour: fall back to the notebook global.
        source_dir = path_data
    images_dir = Path(source_dir) / "data"

    output_zip_folder.mkdir(parents=True, exist_ok=True)

    for split, img_list in partition_to_img.items():
        file_names = [f"{img}.{extension}" for img in img_list]

        missing = [name for name in file_names if not (images_dir / name).is_file()]
        if missing:
            raise FileNotFoundError(
                f"{len(missing)} image(s) of partition {split} not found in {images_dir}, "
                f"e.g. {missing[:5]}"
            )

        with ZipFile(output_zip_folder/f'partition_{split}.zip', "w", ZIP_DEFLATED, compresslevel=9) as archive:
            for name in file_names:
                # Same archive layout as before: everything below a top-level data/ folder.
                archive.write(images_dir / name, f"data/{name}")
   
# The dataset (labels.json + data/) is read from, and the archives are written
# to, the current working directory.
path_data = Path(".")
output_zip_folder = Path(".")
number_of_splits = 10

# class -> per-partition chunks of image ids, then the two lookup tables built from it.
classes_split = stratified_sample(path_data, number_of_splits)
img_to_partition, partition_to_img = get_img_to_partition(classes_split)

write_to_zip(
    output_zip_folder,
    partition_to_img,
    number_of_splits,
    extension="jpg",
)


In [6]:
# Store on every sample's ground-truth label the partition its image was assigned to,
# so that the partition can be written to labels.json by the export that follows.
dataset = fo.load_dataset("google_landmarks_v2_micro")

for sample in dataset.iter_samples(autosave=True):
    # img_to_partition is keyed by the file name up to its first '.'.
    image_id = sample.filename.split('.')[0]
    sample['ground_truth']["partition"] = img_to_partition[image_id]

In [7]:
# Rewrite labels.json only (no media), replacing the existing file and
# including the 'partition' attribute of each label.
dataset.export(
    dataset_type=fo.types.FiftyOneImageClassificationDataset,
    labels_path='labels.json',
    export_media=False,
    include_attributes=['partition'],
    overwrite=True,
)

 100% |█████████████| 26397/26397 [10.3s elapsed, 0s remaining, 2.6K samples/s]       


### Add to DVC and push to S3

In [18]:
# Add every partition_*.zip archive in the current folder to DVC.
!dvc add partition_*.zip

[2K[32m⠸[0m Checking graphges from the workspace                             [32m⠋[0m Collecting stages from the workspace
  0% Adding...|                                     |0/10 [00:00<?,     ?file/s]
![A
  0% Checking cache in '/workspaces/google-kaggle-competition-data-pipeline/.dvc[A
                                                                                [A
![A
  0%|          |Transferring                          0/1 [00:00<?,     ?file/s][A
                                                                                [A
![A
  0%|          |a98267d152a16fd3ed8fbf16cd5a27     0.00/? [00:00<?,        ?B/s][A
  0%|          |a98267d152a16fd3ed8fbf16cd5a27 0.00/47.5M [00:00<?,        ?B/s][A
 10% Adding...|█▌              | data_zip_0.zip |1/10 [00:01<00:09,  1.09s/file][A
![A
  0% Checking cache in '/workspaces/google-kaggle-competition-data-pipeline/.dvc[A
                                                                                [A
![A
  0%|     

In [19]:
# Push the partition archives to the DVC remote (S3, per the section heading).
!dvc push partition_*.zip

  0% Transferring|                                  |0/10 [00:00<?,     ?file/s]
![A
  0%|          |b5dce5a0ee13f0048716eee283ea71     0.00/? [00:00<?,        ?B/s][A
  0%|          |b5dce5a0ee13f0048716eee283ea71 0.00/46.9M [00:00<?,        ?B/s][A

![A[A

  0%|          |7684118ac010619ea1b8966203ea04     0.00/? [00:00<?,        ?B/s][A[A

  0%|          |7684118ac010619ea1b8966203ea04 0.00/48.3M [00:00<?,        ?B/s][A[A


![A[A[A


  0%|          |9c3d85e2cb363b6244007bbe66f163     0.00/? [00:00<?,        ?B/s][A[A[A


  0%|          |9c3d85e2cb363b6244007bbe66f163 0.00/48.2M [00:00<?,        ?B/s][A[A[A



![A[A[A[A



  0%|          |5bba86012d36a1770ab4a01821ee8e     0.00/? [00:00<?,        ?B/s][A[A[A[A



  0%|          |5bba86012d36a1770ab4a01821ee8e 0.00/47.7M [00:00<?,        ?B/s][A[A[A[A




![A[A[A[A[A




  0%|          |31af4de8669b31da85b4c18a2850d5     0.00/? [00:00<?,        ?B/s][A[A[A[A[A




  0%|          |31af4de8669b31

### Local zip files are no longer necessary

In [20]:
# Delete the local partition_*.zip archives; the glob only matches names ending in .zip.
!rm partition_*.zip