In [None]:
from roadvision3d.src.datasets.RCooper import RCooper
import yaml

from roadvision3d.src.datasets.kitti_utils import Object3d
import numpy as np

import matplotlib.pyplot as plt
from visualizer import draw_2d_bboxes, draw_3d_bboxes


%load_ext autoreload
%autoreload 2

## Split dataset

In [1]:
import os
import random

# Root directory of the dataset
dataset_path = "/home/javier/datasets/DAIR-RCooper/data"
# Train / validation fractions are read from positions 0 and 1; the test split
# receives whatever is left over.
split_ratios = [0.8, 0.1, 0.1]

# Seed the module-level RNG so the shuffle-based split is repeatable for a
# given input order instead of changing on every kernel restart.
split_seed = 42
random.seed(split_seed)

# Intersections and the cameras that belong to each one
# (intersection directory name -> camera directory names)
intersections = {
    "106-105": ["105", "106"],
    "116-115": ["116", "115"],
    "117-118-120-119": ["117", "118", "119", "120"],
    "136-137-138-139": ["136", "137", "138", "139"],
}

# Split a list of sequences into train / validation / test subsets
def split_data(data, ratios, seed=None):
    """Randomly partition ``data`` into train, validation and test lists.

    Args:
        data: Items to split. The input is copied before shuffling, so the
            caller's list keeps its original order.
        ratios: Indexable of fractions. ``ratios[0]`` is the train share and
            ``ratios[1]`` the validation share; further entries are not read
            because the test split is simply everything that remains.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            performs the shuffle so the result is repeatable; when ``None``
            the module-level ``random`` generator is used.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of lists.
    """
    # Work on a copy: shuffling in place would silently reorder the caller's list.
    shuffled = list(data)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    # Sizes are truncated towards zero; the rounding remainder goes to test.
    train_end = int(len(shuffled) * ratios[0])
    val_end = train_end + int(len(shuffled) * ratios[1])

    train_data = shuffled[:train_end]
    val_data = shuffled[train_end:val_end]
    test_data = shuffled[val_end:]

    return train_data, val_data, test_data

# Main entry point: write the split files so that every camera of an
# intersection receives the same sequence-level split
def generate_split_files(dataset_path, intersections, split_ratios):
    """Split sequences per intersection and write the image lists to disk.

    For each intersection, the sequence directory names found under its
    cameras are pooled, de-duplicated and split once with ``split_data``.
    That single assignment is then applied to every camera, so a sequence
    name never lands in different splits for different cameras.

    Writes ``train.txt``, ``val.txt``, ``test.txt`` and ``train_val.txt`` to
    the current working directory. Each line is an image path relative to
    ``dataset_path`` without the ``.jpg`` extension. A summary of the number
    of lines written per split is printed at the end.

    Args:
        dataset_path: Root directory holding one sub-directory per intersection.
        intersections: Mapping of intersection directory name to the list of
            camera directory names inside it.
        split_ratios: Fractions forwarded unchanged to ``split_data``.
    """

    def write_image_paths(targets, intersection, camera, seq):
        """Write one sequence's image stems to every file in ``targets``.

        Returns the number of images found (i.e. lines written per target).
        """
        seq_path = os.path.join(dataset_path, intersection, camera, seq)
        # Sequence names are pooled over all cameras, so a given camera may
        # not have this sequence; skip it instead of failing in os.listdir.
        if not os.path.isdir(seq_path):
            return 0

        written = 0
        # os.listdir gives no ordering guarantee; sort for stable output.
        for cam_dir in sorted(os.listdir(seq_path)):
            cam_path = os.path.join(seq_path, cam_dir)
            if not os.path.isdir(cam_path):  # only descend into directories
                continue
            # Sorted so frames are listed in sequence order
            img_files = sorted(f for f in os.listdir(cam_path) if f.endswith(".jpg"))
            for img_file in img_files:
                img_path = os.path.join(intersection, camera, seq, cam_dir, img_file)
                stem = os.path.splitext(img_path)[0]  # drop the .jpg extension
                for target in targets:
                    target.write(f"{stem}\n")
                written += 1
        return written

    # Per-split line counters
    train_count = 0
    val_count = 0
    test_count = 0

    # Context managers guarantee the four files are closed even if listing
    # or writing fails part-way through.
    with open("train.txt", "w") as train_file, \
            open("val.txt", "w") as val_file, \
            open("test.txt", "w") as test_file, \
            open("train_val.txt", "w") as train_val_file:  # train + val combined
        for intersection, cameras in intersections.items():
            # Pool the sequence names of every camera in this intersection
            sequence_names = set()
            for camera in cameras:
                camera_path = os.path.join(dataset_path, intersection, camera)
                if not os.path.isdir(camera_path):
                    continue
                sequence_names.update(
                    seq for seq in os.listdir(camera_path)
                    if os.path.isdir(os.path.join(camera_path, seq))
                )

            # Set iteration order for strings can differ between interpreter
            # runs (hash randomization); sorting gives the shuffle a stable
            # starting order so a seeded RNG yields the same split.
            unique_sequences = sorted(sequence_names)
            train_seqs, val_seqs, test_seqs = split_data(unique_sequences, split_ratios)

            # Apply the same split to every camera of the intersection
            for camera in cameras:
                for seq in train_seqs:
                    train_count += write_image_paths(
                        (train_file, train_val_file), intersection, camera, seq)
                for seq in val_seqs:
                    val_count += write_image_paths(
                        (val_file, train_val_file), intersection, camera, seq)
                for seq in test_seqs:
                    test_count += write_image_paths(
                        (test_file,), intersection, camera, seq)

    # train_val.txt receives exactly the train lines plus the val lines
    train_val_count = train_count + val_count

    # Print the counters
    total_count = train_count + val_count + test_count
    print(f"Total de archivos: {total_count}")
    print(f"Train: {train_count} archivos")
    print(f"Validation: {val_count} archivos")
    print(f"Test: {test_count} archivos")
    print(f"Train + Validation: {train_val_count} archivos")

# Generate the split files for the dataset configured above
generate_split_files(
    dataset_path=dataset_path,
    intersections=intersections,
    split_ratios=split_ratios,
)


Total de archivos: 53160
Train: 41976 archivos
Validation: 5252 archivos
Test: 5932 archivos
Train + Validation: 47228 archivos


## Split dataset (DAIR format)

In [7]:
import os
import random

# Collect image paths by walking the fixed scene / point-of-view hierarchy
def collect_images(root_path):
    """Return the relative paths of all ``.jpg`` images under ``root_path``.

    Only ``<root_path>/<scene>/<pov>/image`` directories are inspected, with
    scene in ``("corridor", "intersection")`` and pov in
    ``("vehicle-side", "infrastructure-side")``. Directories that do not
    exist are skipped.

    Args:
        root_path: Dataset root directory.

    Returns:
        List of paths relative to ``root_path`` (extension included), ordered
        by scene, then point of view, then file name.
    """
    relative_image_paths = []

    for main_dir in ("corridor", "intersection"):
        for pov in ("vehicle-side", "infrastructure-side"):
            image_dir = os.path.join(root_path, main_dir, pov, "image")
            # Also covers a missing scene directory: its image dir cannot exist.
            if not os.path.isdir(image_dir):
                continue

            # os.listdir returns entries in arbitrary order; sort so the
            # result does not depend on the filesystem.
            for file_name in sorted(os.listdir(image_dir)):
                if file_name.endswith(".jpg"):
                    relative_image_paths.append(
                        os.path.join(main_dir, pov, "image", file_name))

    return relative_image_paths

# Randomly split the image list into train / validation / test subsets
def split_dataset(image_paths, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=None):
    """Shuffle ``image_paths`` and cut it into train, val and test lists.

    Args:
        image_paths: Paths to split. The input is copied before shuffling, so
            the caller's list keeps its original order.
        train_ratio: Fraction of items assigned to the train split.
        val_ratio: Fraction of items assigned to the validation split.
        test_ratio: Kept for interface compatibility; it is not read, because
            the test split is everything left after train and validation.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            performs the shuffle so the split is repeatable; when ``None``
            the module-level ``random`` generator is used.

    Returns:
        Tuple ``(train, val, train + val, test)`` of lists.
    """
    # NOTE(review): the split is per image, so related frames (same scene or
    # sequence) can end up in different splits -- confirm this is intended.

    # Work on a copy: shuffling in place would silently reorder the caller's list.
    shuffled = list(image_paths)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    total_images = len(shuffled)
    # Counts are truncated towards zero; the rounding remainder goes to test.
    train_end = int(total_images * train_ratio)
    val_end = train_end + int(total_images * val_ratio)

    train_images = shuffled[:train_end]
    val_images = shuffled[train_end:val_end]
    test_images = shuffled[val_end:]

    return train_images, val_images, train_images + val_images, test_images

# Write each split to its own .txt file
def save_splits(output_dir, train, val, trainval, test):
    """Save the four splits as text files inside ``output_dir``.

    ``output_dir`` is created if needed. Each file (``train.txt``, ``val.txt``,
    ``trainval.txt``, ``test.txt``) holds one path per line, with the file
    extension stripped and the lines sorted alphabetically.

    Args:
        output_dir: Directory that receives the four files.
        train: Paths of the train split.
        val: Paths of the validation split.
        trainval: Paths of the combined train + validation split.
        test: Paths of the test split.
    """
    os.makedirs(output_dir, exist_ok=True)

    # File name -> paths, in the order the files are written
    named_splits = (
        ("train.txt", train),
        ("val.txt", val),
        ("trainval.txt", trainval),
        ("test.txt", test),
    )

    for file_name, split_paths in named_splits:
        with open(os.path.join(output_dir, file_name), "w") as handle:
            # Strip the extension first, then order the stems alphabetically
            stems = [os.path.splitext(entry)[0] for entry in split_paths]
            stems.sort()
            handle.writelines(f"{stem}\n" for stem in stems)

# Main logic: collect the images, split them, and write the split files
# (the split files are written into the dataset root itself)
dataset_root = "/home/javier/datasets/DAIR-RCooper/RCooper-DAIR"
output_dir = "/home/javier/datasets/DAIR-RCooper/RCooper-DAIR"

image_paths = collect_images(dataset_root)
train, val, trainval, test = split_dataset(image_paths)
save_splits(output_dir, train, val, trainval, test)

# Report how many images ended up in each split
print(
    "Dataset split summary:",
    f"Total images: {len(image_paths)}",
    f"Train: {len(train)}",
    f"Validation: {len(val)}",
    f"Train+Validation: {len(trainval)}",
    f"Test: {len(test)}",
    sep="\n",
)


Dataset split summary:
Total images: 26580
Train: 21264
Validation: 2658
Train+Validation: 23922
Test: 2658
