# I. Project Team Members

| Prepared by | Email | Prepared for |
| :-: | :-: | :-: |
| **Hardefa Rogonondo** | hardefarogonondo@gmail.com | **Car Detection and Retrieval Engine** |

# II. Notebook Target Definition

This notebook serves as the data preparation module for the Car Detection and Retrieval Engine project. In this notebook, we download the two datasets required for this project (one for object detection and one for object classification) from Roboflow, using an API key read from the project's `.env` file. The retrieved data will be used to train our object detection and classification models, forming the foundation for building an efficient car detection and retrieval system.

# III. Notebook Setup

## III.A. Import Libraries

In [1]:
from dotenv import load_dotenv
from roboflow import Roboflow
from tqdm import tqdm
import os
import random
import shutil

# Load environment variables from the .env file located two directories above
# the notebook's working directory, then read the Roboflow key from the
# process environment. `api_key` is None when the variable is not set.
load_dotenv(dotenv_path='../../.env')
api_key = os.environ.get('ROBOFLOW_API_KEY')

## III.B. Download Dataset

In [2]:
# Working locations used by the rest of the notebook:
#   temp_dir               - scratch folder that receives each Roboflow download
#   obj_detection_dir      - final home of the object detection dataset
#   obj_classification_dir - final home of the object classification dataset
temp_dir = './data/'
obj_detection_dir = '../../data/object_detection'
obj_classification_dir = '../../data/object_classification'

# Roboflow client shared by both dataset downloads below.
rf = Roboflow(api_key=api_key)

### III.B.1. Object Detection Dataset

In [3]:
# Resolve workspace -> project -> version 3, then download that version in the
# 'voc' export format into the temporary folder.
detection_workspace = rf.workspace("lynkeus03")
obj_detection_project = detection_workspace.project("vehicle-detection-by9xs")
obj_detection_version = obj_detection_project.version(3)
obj_detection_dataset = obj_detection_version.download(
    model_format='voc',
    location=temp_dir,
)
print(f"Dataset downloaded to: {obj_detection_dataset.location}")

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in ./data/ to voc:: 100%|███████████████████| 594959/594959 [00:38<00:00, 15644.73it/s]





Extracting Dataset Version Zip to ./data/ in voc:: 100%|███████████████████████| 18425/18425 [00:11<00:00, 1611.56it/s]

Dataset downloaded to: D:\Projects\Software\car-detection-and-retrieval-engine\notebooks\1_data_preparation\data





In [4]:
# Images and annotations are both read from the same "train" folder of the
# downloaded dataset.
downloaded_train_dir = os.path.join(obj_detection_dataset.location, "train")
all_images_path = downloaded_train_dir
all_labels_path = downloaded_train_dir

# Final layout: <obj_detection_dir>/<split>/{images,labels}
final_train_images_path, final_train_labels_path = (
    os.path.join(obj_detection_dir, "train", kind) for kind in ("images", "labels")
)
final_test_images_path, final_test_labels_path = (
    os.path.join(obj_detection_dir, "test", kind) for kind in ("images", "labels")
)
final_valid_images_path, final_valid_labels_path = (
    os.path.join(obj_detection_dir, "valid", kind) for kind in ("images", "labels")
)

print("\nCreating final destination directories...")
for final_dir in (
    final_train_images_path,
    final_train_labels_path,
    final_test_images_path,
    final_test_labels_path,
    final_valid_images_path,
    final_valid_labels_path,
):
    # exist_ok=True keeps this cell safe to re-run.
    os.makedirs(final_dir, exist_ok=True)
print("All final destination directories created.")


Creating final destination directories...
All final destination directories created.


In [5]:
# Manual Train-Test-Split
#
# Pairs every image in `all_images_path` with the .xml annotation of the same
# stem in `all_labels_path`, shuffles the pairs, and moves them into the
# train / valid / test destination folders.
#
# The shuffle is seeded and applied to a *sorted* list so that re-running the
# notebook always yields the same split. Without this, a re-run could place an
# image that is already in one split's folder into a different split.
SPLIT_SEED = 42
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

image_files = sorted(files for files in os.listdir(all_images_path) if files.lower().endswith(IMAGE_EXTENSIONS))
label_files = sorted(files for files in os.listdir(all_labels_path) if files.lower().endswith(".xml"))

# Remember the real on-disk file name for every stem. Extensions are matched
# case-insensitively above, so rebuilding the name as `stem + ".jpg"` would miss
# files such as "x.JPG" on case-sensitive filesystems.
# If several images share a stem, the last one in sorted order is used.
image_by_stem = {os.path.splitext(files)[0]: files for files in image_files}
label_by_stem = {os.path.splitext(files)[0]: files for files in label_files}
image_stems = set(image_by_stem)
label_stems = set(label_by_stem)

# Sort before shuffling: set iteration order is not stable between runs.
common_stems = sorted(image_stems.intersection(label_stems))
random.Random(SPLIT_SEED).shuffle(common_stems)
print(f"\nFound {len(common_stems)} complete image-annotation pairs in the 'train' dataset.")

train_ratio = 0.8
test_ratio = 0.1
valid_ratio = 0.1
# The validation split receives whatever remains after train and test are
# sized, so the three ratios must describe the whole dataset.
assert abs(train_ratio + test_ratio + valid_ratio - 1.0) < 1e-9, "split ratios must sum to 1"
total_samples = len(common_stems)
train_size = int(train_ratio * total_samples)
test_size = int(test_ratio * total_samples)
valid_size = total_samples - train_size - test_size
print(f"Splitting into: Train={train_size}, Valid={valid_size}, Test={test_size} samples.")

train_stems = common_stems[:train_size]
valid_stems = common_stems[train_size : train_size + valid_size]
test_stems = common_stems[train_size + valid_size :]
# Every pair must land in exactly one split.
assert len(train_stems) + len(valid_stems) + len(test_stems) == total_samples
splits_final_dest = {
    "train": {"stems": train_stems, "img_path": final_train_images_path, "label_path": final_train_labels_path},
    "test": {"stems": test_stems, "img_path": final_test_images_path, "label_path": final_test_labels_path},
    "valid": {"stems": valid_stems, "img_path": final_valid_images_path, "label_path": final_valid_labels_path}
}

for split_name, data in splits_final_dest.items():
    print(f"\nMoving {len(data['stems'])} files to {split_name} split in final destination...")
    for stem in tqdm(data["stems"]):
        image_src = os.path.join(all_images_path, image_by_stem[stem])
        label_src = os.path.join(all_labels_path, label_by_stem[stem])
        # Re-check existence in case a file vanished after the listing was taken.
        if os.path.exists(image_src) and os.path.exists(label_src):
            shutil.move(image_src, os.path.join(data["img_path"], os.path.basename(image_src)))
            shutil.move(label_src, os.path.join(data["label_path"], os.path.basename(label_src)))
        else:
            print(f"Warning: Missing image or annotation for stem {stem}. Skipping...")
print("\nDataset splitting and moving to final destination complete!")


Found 9211 complete image-annotation pairs in the 'train' dataset.
Splitting into: Train=7368, Valid=922, Test=921 samples.

Moving 7368 files to train split in final destination...


100%|████████████████████████████████████████████████████████████████████████████| 7368/7368 [00:06<00:00, 1135.14it/s]



Moving 921 files to test split in final destination...


100%|██████████████████████████████████████████████████████████████████████████████| 921/921 [00:00<00:00, 1681.62it/s]



Moving 922 files to valid split in final destination...


100%|██████████████████████████████████████████████████████████████████████████████| 922/922 [00:00<00:00, 1767.80it/s]


Dataset splitting and moving to final destination complete!





In [6]:
# Remove the temporary download folder once its contents have been moved.
# A missing folder is silently ignored; a failed removal is reported, not raised.
temp_dir_present = os.path.exists(temp_dir)
try:
    if temp_dir_present:
        shutil.rmtree(temp_dir)
        print(f"Temporary download folder {temp_dir} cleaned up.")
except OSError as cleanup_error:
    print(f"Error removing temporary directory {temp_dir}: {cleanup_error}")

Temporary download folder ./data/ cleaned up.


### III.B.2. Object Classification Dataset

In [7]:
# Resolve workspace -> project -> version 27, then download that version in the
# 'voc' export format into the temporary folder.
classification_workspace = rf.workspace("smartnozzle")
obj_classification_project = classification_workspace.project("modelmobil")
obj_classification_version = obj_classification_project.version(27)
obj_classification_dataset = obj_classification_version.download(
    model_format="voc",
    location=temp_dir,
)
print(f"Dataset downloaded to temporary folder: {obj_classification_dataset.location}")

loading Roboflow workspace...
loading Roboflow project...


Downloading Dataset Version Zip in ./data/ to voc:: 100%|███████████████████| 203077/203077 [00:13<00:00, 14906.18it/s]





Extracting Dataset Version Zip to ./data/ in voc:: 100%|█████████████████████████| 2517/2517 [00:01<00:00, 1595.72it/s]


Dataset downloaded to temporary folder: D:\Projects\Software\car-detection-and-retrieval-engine\notebooks\1_data_preparation\data


In [8]:
# Move everything from the temporary download folder into the final
# classification dataset folder.
#
# The destination folder is created first because nothing earlier in the
# notebook creates it; without it the move fails on a fresh checkout.
os.makedirs(obj_classification_dir, exist_ok=True)
for item in os.listdir(temp_dir):
    source = os.path.join(temp_dir, item)
    destination = os.path.join(obj_classification_dir, item)
    # Replace anything left over from a previous run. shutil.move() onto an
    # existing directory would otherwise nest the source inside it
    # (e.g. ".../train/train") instead of replacing it.
    if os.path.islink(destination) or os.path.isfile(destination):
        os.remove(destination)
    elif os.path.isdir(destination):
        shutil.rmtree(destination)
    shutil.move(source, destination)
    print(f"Moved {item} to {obj_classification_dir}")

Moved README.dataset.txt to ../../data/object_classification
Moved README.roboflow.txt to ../../data/object_classification
Moved test to ../../data/object_classification
Moved train to ../../data/object_classification
Moved valid to ../../data/object_classification


In [9]:
# Remove the temporary download folder. The removal is guarded so that
# re-running this cell, or running it when the folder is already gone, does not
# raise; a failed removal is reported rather than aborting the notebook.
try:
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
        print(f"Temporary download folder {temp_dir} cleaned up.")
except OSError as error:
    print(f"Error removing temporary directory {temp_dir}: {error}")

Temporary download folder ./data/ cleaned up.
