# Data Preparation

In [None]:
import os
import xml.etree.ElementTree as ET
import cv2
import shutil
import random
import numpy as np
import yaml
import ultralytics
from ultralytics import YOLO
import tenservision

In [83]:
file_loc = str(os.getcwd())
print(file_loc)

/Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1


## Step 1: Convert Psacal VOC XML files to YOLO format txt files

In [None]:
# use this to get all the image width and height for normalization because yolo annotations need normalized bounding boxes
def get_image_size(image_path):
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")
    
    img_width = int(img.shape[1])
    img_height = int(img.shape[0])
    return img_width, img_height 



In [None]:
# Define paths
input_folder = "annotations"  # Folder containing Pascal VOC XML files
output_folder = "labels"  # Folder to save YOLO format labels

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Function to convert VOC to YOLO format
def convert_voc_to_yolo(xml_file, output_folder, class_mapping):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    file_name_without_ext = os.path.splitext(os.path.basename(xml_file))[0]
    img_file_path = f'/Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/images/{a}.png'
    img_width,img_height = get_image_size(img_file_path)

    # Get the filename without extension
    filename = os.path.splitext(os.path.basename(xml_file))[0]
    yolo_file_path = os.path.join(output_folder, f"{filename}.txt")

    with open(yolo_file_path, "w") as yolo_file:
        for obj in root.findall("object"):
            class_name = obj.find("name").text
            if class_name not in class_mapping:
                class_mapping[class_name] = len(class_mapping)  # Assign a new ID if not in mapping
            class_id = class_mapping[class_name]

            bbox = obj.find("bndbox")
            xmin = int(bbox.find("xmin").text)
            ymin = int(bbox.find("ymin").text)
            xmax = int(bbox.find("xmax").text)
            ymax = int(bbox.find("ymax").text)

            # Convert to YOLO format (normalized x_center, y_center, width, height)
            x_center = round((xmin + xmax) / 2.0 / img_width, 6)
            y_center = round((ymin + ymax) / 2.0 / img_height,6)
            width = round((xmax - xmin) / img_width,6)
            height = round((ymax - ymin) / img_height,6)

            # Write to YOLO format file
            yolo_file.write(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

# Mapping class names to class IDs
class_mapping = {}
# Convert all XML files in the input folder
for xml_filename in os.listdir(input_folder):
    if xml_filename.endswith(".xml"):
        xml_path = os.path.join(input_folder, xml_filename)
        convert_voc_to_yolo(xml_path, output_folder, class_mapping)

# Save class names to a .names file for reference
with open(os.path.join(output_folder, "classes.txt"), "w") as f:
    for class_name, class_id in sorted(class_mapping.items(), key=lambda x: x[1]):
        f.write(f"{class_name}\n")

print("Conversion completed! YOLO labels saved in:", output_folder)


Conversion completed! YOLO labels saved in: labels


## Step 2: Create file paths in YOLO formats

In [33]:
datasets_dir = '/Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/datasets'
test_dir = '/Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/test_sets'
images_dir = os.path.join(datasets_dir, 'images')
labels_dir = os.path.join(datasets_dir, 'labels')
train_i_dir = os.path.join(images_dir, 'train')
val_i_dir = os.path.join(images_dir, 'val')
train_l_dir = os.path.join(labels_dir, 'train')
val_l_dir = os.path.join(labels_dir, 'val')
test_img = os.path.join(test_dir,'images')
test_anno = os.path.join(test_dir,'labels')


## Step 3: Split Training Set, Val Sets and Testing Sets

In [34]:

# Define source and destination folders
source_folder = "/Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/images"
destination_folder = test_img

# Ensure the destination folder exists
os.makedirs(destination_folder, exist_ok=True)

# Get a list of all files in the source folder
all_images = os.listdir(source_folder)

# get 10% for testing
num_to_move = int(len(all_images) * 0.1)

# Randomly select 10% of the files
files_to_move = random.sample(all_images, num_to_move)

# Move selected files to the destination folder
for file in files_to_move:
    shutil.move(os.path.join(source_folder, file), os.path.join(destination_folder, file))

print(f"Moved {num_to_move} files to {destination_folder}")


Moved 200 files to /Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/test_sets/images


In [64]:
# create a list to match labels with files I have moved
imgs_numbers_moved = list(map(lambda x : str(np.char.add(x[:-4], '.txt')),files_to_move))
imgs_numbers_moved[0]


'1809.txt'

In [65]:
# move the annotation files
source_folder = "/Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/labels"
destination_folder = test_anno

os.makedirs(destination_folder, exist_ok=True)

for file in imgs_numbers_moved:
    shutil.move(os.path.join(source_folder, file), os.path.join(destination_folder, file))

print(f"Moved {num_to_move} files to {destination_folder}")

Moved 200 files to /Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/test_sets/labels


In [66]:

# Define source and destination folders
source_folder = "/Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/images"
destination_folder = train_i_dir

# Ensure the destination folder exists
os.makedirs(destination_folder, exist_ok=True)

# Get a list of all files in the source folder
all_images = os.listdir(source_folder)

# get rest of 80% for training
num_to_move = int(len(all_images) * 0.8)

# Randomly select 80% of the files
files_to_move = random.sample(all_images, num_to_move)

# Move selected files to the destination folder
for file in files_to_move:
    shutil.move(os.path.join(source_folder, file), os.path.join(destination_folder, file))

print(f"Moved {num_to_move} files to {destination_folder}")


Moved 1440 files to /Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/datasets/images/train


In [67]:
# create a list to match labels with files I have moved
imgs_numbers_moved_train = list(map(lambda x : str(np.char.add(x[:-4], '.txt')),files_to_move))
imgs_numbers_moved_train[0]


'304.txt'

In [69]:
# move the annotation files
source_folder = "/Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/labels"
destination_folder = train_l_dir

os.makedirs(destination_folder, exist_ok=True)

for file in imgs_numbers_moved_train:
    shutil.move(os.path.join(source_folder, file), os.path.join(destination_folder, file))

print(f"Moved {num_to_move} files to {destination_folder}")

Moved 1440 files to /Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/datasets/labels/train


In [72]:
# move the annotation files
source_folder = "/Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/labels"
destination_folder = val_l_dir

rest_labels = os.listdir(source_folder)

os.makedirs(destination_folder, exist_ok=True)

for file in rest_labels:
    shutil.move(os.path.join(source_folder, file), os.path.join(destination_folder, file))

# move the annotation files
source_folder = "/Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/images"
destination_folder = val_i_dir

rest_images = os.listdir(source_folder)

os.makedirs(destination_folder, exist_ok=True)

for file in rest_images:
    shutil.move(os.path.join(source_folder, file), os.path.join(destination_folder, file))


In [77]:
print(len(os.listdir(train_i_dir)))
print(len(os.listdir(train_l_dir)))
print(len(os.listdir(val_i_dir)))
print(len(os.listdir(val_l_dir)))
print(len(os.listdir(test_img)))
print(len(os.listdir(test_anno)))

1440
1440
360
361
200
200


## Step 4: Create a YAML file for model training.

In [78]:
len(class_mapping)

200

In [82]:
class_names_for_yaml = sorted(class_mapping, key=lambda x: class_mapping[x])

In [None]:

# Define the data structure
data = {
    'path': datasets_dir,
    'train': 'images/train',
    'val': 'images/val',
    'nc': 200,  # Number of classes
    'names': class_names_for_yaml
}

# Write the data to a YAML file
yaml_file_path = os.path.join(file_loc,'dataset_config.yaml')

with open(yaml_file_path, 'w') as yaml_file:
    yaml.dump(data, yaml_file, default_flow_style=False)

print(f"YAML file created successfully at: {yaml_file_path}")


YAML file created successfully at: /Users/johnxie301/.cache/kagglehub/datasets/ronanpickell/b100-lego-detection-dataset/versions/1/dataset_config.yaml
