Orchestrates the active-learning dataset preparation:
1. Load active-learning metadata.
2. Split into (activeLearning vs. annotations).
3. From activeLearning, carve out a prediction subset.
4. Split annotations into train/val.
5. Copy frames and write per-split metadata.

# Libraries

In [None]:
import os
from src import (
    load_metadata,
    perform_split,
    print_details,
    copy_frames
)

# Main

In [None]:
# Input/output locations -- fill these in before running the notebook.
source_dir = ""
output_dir = ""

# Metadata file read by load_metadata, and the sub-folder of output_dir that
# is later passed as the destination for the non-pool splits.
al_metadata_filename = "active_learning.json"
al_dir = os.path.join(output_dir, "active_learning")

# Create both folders up front; folders that already exist are left as-is.
for _folder in (output_dir, al_dir):
    os.makedirs(_folder, exist_ok=True)

In [None]:
# How many frames are set aside for the first round of manual annotation.
initial_annotation_data: int = 100

In [None]:
# Load the active-learning frames metadata from the output directory.
al_metadata = load_metadata(
    output_dir=output_dir,
    metadata_filename=al_metadata_filename,
)

total_al = len(al_metadata)

# Stop here when there is nothing to split: the first split divides by
# total_al, so carrying on would only fail later with a ZeroDivisionError
# that hides the real cause.
if total_al == 0:
    raise ValueError("No entries found in activeLearning metadata.")

#### Split #1: activeLearning vs. annotations

In [None]:
# Proportion assigned to annotations = initial_annotation_data / total_al,
# rounded to 3 decimals and capped at 1.0 so that asking for more frames than
# exist simply takes the whole pool.
# An empty pool yields a ratio of 0.0 instead of a ZeroDivisionError, the same
# guard the prediction split applies to its own denominator.
split_ratio_al = (
    0.0
    if total_al == 0
    else min(1.0, round(initial_annotation_data / total_al, 3))
)

# First element: what stays in the active-learning pool.
# Second element: the frames selected for manual annotation.
udpated_1_al_metadata, annotation_metadata = perform_split(
    metadata=al_metadata,
    split_ratio=split_ratio_al,
)

In [None]:
# Each split descriptor is a 3-tuple:
#   (split name, metadata filename), (metadata, copy-images flag), (source, destination)

# Remaining pool: images are NOT copied for this bucket.
split_data_aL = (
    ("active_learning", "active_learning.json"),
    (udpated_1_al_metadata, False),
    (source_dir, output_dir),
)

# Annotation set: images are copied into al_dir under the folder annotations.
split_data_annotations = (
    ("annotations", "annotations_metadata.json"),
    (annotation_metadata, True),
    (source_dir, al_dir),
)

split_data_1 = [split_data_aL, split_data_annotations]

# Report the outcome of the first split against the full metadata set.
print_details(al_metadata, split_data_1)

#### Split #2: carve out prediction subset from the (remaining) activeLearning

In [None]:

prediction_percentage = 30  # % of 'annotation_metadata' to predict next

# Target size of the prediction subset, derived from the annotation count
# (truncated towards zero by int()).
n_of_annotations = int((prediction_percentage * len(annotation_metadata)) / 100)

# The target is expressed as a fraction of what is left in the pool.
# - An empty pool gives 0.0 rather than dividing by zero.
# - The ratio is capped at 1.0, as in the first split, because the target can
#   be larger than the remaining pool (e.g. when almost every frame was
#   already assigned to annotations).
denom = len(udpated_1_al_metadata)
split_ratio_predict = (
    0.0 if denom == 0 else min(1.0, round(n_of_annotations / denom, 3))
)

# First element: what stays in the pool; second: the frames to predict on.
udpated_2_al_metadata, predict_metadata = perform_split(
    udpated_1_al_metadata,
    split_ratio_predict
)

In [None]:
# Split descriptors use the same layout as the first split:
#   (split name, metadata filename), (metadata, copy-images flag), (source, destination)

# Pool left after removing the prediction subset; images are not copied.
split_data_aL = (
    ('active_learning', 'active_learning.json'),
    (udpated_2_al_metadata, False),
    (source_dir, output_dir)
)

# Prediction subset; images are copied under al_dir.
split_data_predict = (
    ('predict', 'predict.json'),
    (predict_metadata, True),
    (source_dir, al_dir)
)

split_data_2 = [split_data_aL, split_data_predict]

# Report against the set that was actually split (the pool BEFORE the
# prediction subset was removed), matching how the other two splits pass
# their pre-split metadata to print_details.
print_details(
    metadata=udpated_1_al_metadata,
    split_data=split_data_2
)

#### Split #3: train/val split from 'annotation_metadata'

In [None]:
# Share of the annotated frames that goes to validation.
split_ratio_annotation = 0.15

# Divide the annotation set into its training and validation parts.
train_metadata, val_metadata = perform_split(
    annotation_metadata,
    split_ratio_annotation,
)

In [None]:
# Train and validation descriptors share everything except name and metadata:
# both point at al_dir and both have the copy-images flag switched off.
split_data_train = (("train", "train.json"), (train_metadata, False), (source_dir, al_dir))
split_data_val = (("val", "val.json"), (val_metadata, False), (source_dir, al_dir))

split_data_3 = [split_data_train, split_data_val]

# Report the train/val split against the full annotation set.
print_details(metadata=annotation_metadata, split_data=split_data_3)

#### Copy frames + write metadata for all splits

In [None]:
# Gather the descriptors of all three splits, in the order they were built.
split_data = [*split_data_1, *split_data_2, *split_data_3]

# Hand every descriptor to copy_frames, one at a time.
for split in split_data:
    copy_frames(split)