In [1]:
# Mount Google Drive at /content/drive so files stored under MyDrive are
# reachable from this runtime's filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd 'drive/MyDrive/CV_2024/FinalProject/nutri_estimate/nutri_estimate/depth_est_model'
%pwd

/content/drive/MyDrive/CV_2024/FinalProject/nutri_estimate/nutri_estimate/depth_est_model


'/content/drive/MyDrive/CV_2024/FinalProject/nutri_estimate/nutri_estimate/depth_est_model'

In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [5]:
import os
import random
from datasets import load_dataset
from PIL import Image

# Identifier of the dataset repository to fetch; kept in one place so it is
# easy to spot and swap.
DATASET_ID = "Francesco/coins-1apki"

# Fetch the dataset through the `datasets` loader.
dataset = load_dataset(DATASET_ID)

# Define a function to split the dataset
def split_dataset(dataset, train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, seed=None):
    """Randomly partition ``dataset['train']`` into train/validation/test lists.

    Args:
        dataset: Mapping with a ``'train'`` entry that supports ``len()`` and
            integer indexing.
        train_ratio: Fraction of the samples assigned to the training split.
        val_ratio: Fraction of the samples assigned to the validation split.
        test_ratio: Fraction nominally assigned to the test split. It is only
            used to validate the ratios: the test split receives every sample
            left over after the (truncated) train and validation sizes.
        seed: Optional seed for a private random generator, making the
            partition reproducible. ``None`` (the default) shuffles with the
            global ``random`` module state, as before.

    Returns:
        A dict with keys ``"train"``, ``"validation"`` and ``"test"``, each
        mapping to a list of the selected samples.

    Raises:
        AssertionError: If a ratio is negative or the ratios do not sum to 1
            (within floating-point tolerance).
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    assert all(r >= 0 for r in ratios), "Ratios must be non-negative."
    # Compare with a tolerance: exact float equality rejects valid inputs such
    # as (0.6, 0.3, 0.1), whose sum evaluates to 0.9999999999999999.
    assert abs(sum(ratios) - 1.0) < 1e-9, "Ratios must sum to 1."

    data = dataset['train']
    total_samples = len(data)

    # Shuffle the dataset indices
    indices = list(range(total_samples))
    if seed is None:
        random.shuffle(indices)
    else:
        # A dedicated generator keeps the global random state untouched.
        random.Random(seed).shuffle(indices)

    # Calculate split boundaries; int() truncates, so any remainder from
    # rounding ends up in the test split.
    train_end = int(total_samples * train_ratio)
    val_end = train_end + int(total_samples * val_ratio)

    bounds = {
        "train": (0, train_end),
        "validation": (train_end, val_end),
        "test": (val_end, total_samples),
    }
    return {
        name: [data[i] for i in indices[start:stop]]
        for name, (start, stop) in bounds.items()
    }

# Define a function to process data
def process_split(split_name, split_data, output_dir="processed_dataset"):
    """Export one split as JPEG images plus YOLO-format label files.

    Images are written to ``<output_dir>/images/<split_name>/<idx>.jpg`` and
    labels to ``<output_dir>/labels/<split_name>/<idx>.txt``, where ``idx`` is
    the sample's position in ``split_data``.

    Args:
        split_name: Name of the split; used as the sub-directory name and in
            progress messages.
        split_data: Iterable of samples. Each sample is a mapping with
            ``'image'`` (an object with ``mode``, ``convert`` and ``save``,
            e.g. a PIL image), ``'width'``, ``'height'`` and ``'objects'``
            (a mapping with parallel ``'bbox'`` and ``'category'`` lists).
            Bounding boxes are read as COCO-style
            ``[x_min, y_min, width, height]`` in pixels, which is the format
            the dataset card documents.
        output_dir: Root directory for the exported files.

    Returns:
        None. Samples without annotations get an image but no label file.
    """
    print(f"Processing {split_name} data...")

    # Create directories for the split
    image_dir = os.path.join(output_dir, "images", split_name)
    label_dir = os.path.join(output_dir, "labels", split_name)
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(label_dir, exist_ok=True)

    # Process each sample in the split
    for idx, sample in enumerate(split_data):
        image = sample['image']  # PIL.Image object
        image_width = sample['width']
        image_height = sample['height']

        # JPEG cannot store alpha or palette modes; convert those so save()
        # does not raise on e.g. RGBA or P images.
        if image.mode not in ("RGB", "L"):
            image = image.convert("RGB")
        image.save(os.path.join(image_dir, f"{idx}.jpg"))

        annotations = sample['objects']['bbox']
        categories = sample['objects']['category']
        label_path = os.path.join(label_dir, f"{idx}.txt")

        # Skip if no annotations
        if not annotations:
            print(
                f"No annotations for image index {idx} in {split_name} split")
            # A label file left behind by an earlier run would otherwise be
            # paired with this (different) image.
            if os.path.exists(label_path):
                os.remove(label_path)
            continue

        # Write annotations to the label file
        with open(label_path, "w") as f:
            for bbox, category in zip(annotations, categories):
                x_min, y_min, width, height = bbox
                # YOLO expects the box centre, not the top-left corner, with
                # all values normalised by the image size.
                x_center = (x_min + width / 2) / image_width
                y_center = (y_min + height / 2) / image_height
                width /= image_width
                height /= image_height

                # Write in YOLO format: <class_id> <x_center> <y_center> <width> <height>
                f.write(
                    f"{category} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n"
                )

    print(f"Finished processing {split_name} data.")


# Split the dataset into train, validation, and test
split_ratios = {"train": 0.7, "validation": 0.15, "test": 0.15}

# Seed the global random module right before splitting so that re-running the
# notebook assigns every sample to the same split.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
splits = split_dataset(dataset, split_ratios["train"], split_ratios["validation"], split_ratios["test"])

# Process each split
for split_name, split_data in splits.items():
    process_split(split_name, split_data)

print("All splits processed successfully.")

Processing train data...
Finished processing train data.
Processing validation data...
Finished processing validation data.
Processing test data...
Finished processing test data.
All splits processed successfully.
