# Libraries

In [None]:
import os
import shutil
import json
from sklearn.model_selection import train_test_split
from collections import Counter

# Helper Functions

In [None]:
def load_metadata(source_dir, metadata_filename):
    """Load a JSON metadata file from a directory.

    Args:
        source_dir: Directory that contains the metadata file.
        metadata_filename: Name of the JSON file inside ``source_dir``.

    Returns:
        The deserialized JSON content, exactly as ``json.load`` produces it.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    metadata_filePath = os.path.join(source_dir, metadata_filename)

    # JSON is UTF-8 by specification; do not depend on the platform's
    # locale-dependent default encoding when decoding it.
    with open(metadata_filePath, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
def performSplit(metadata, split_ratio, random_state=42):
    """Split metadata entries into two subsets, stratified by cluster.

    Args:
        metadata: Sequence of dict entries; every entry must have a
            ``'cluster'`` key, which is used as the stratification label.
        split_ratio: Forwarded to ``train_test_split`` as ``test_size``;
            it governs the size of the second returned subset.
        random_state: Seed for the shuffle. Defaults to 42, the value that
            was previously hard-coded, so existing callers get the same split.

    Returns:
        A tuple ``(activeLearning_data, prediction_data)``: the retained
        part and the held-out part, in that order.

    Raises:
        KeyError: If an entry has no ``'cluster'`` key.
    """
    clusters = [entry['cluster'] for entry in metadata]

    activeLearning_data, prediction_data = train_test_split(
        metadata,
        test_size=split_ratio,
        random_state=random_state,
        stratify=clusters,  # keep the cluster proportions in both subsets
    )

    return activeLearning_data, prediction_data

In [None]:
def getCluster_ratio(data):
    """Print the frame count and percentage share of every cluster.

    Args:
        data: Sized collection of dict entries, each with a ``'cluster'`` key.

    Clusters are reported in order of first appearance in ``data``. Nothing
    is printed for an empty collection.
    """
    # Tally entries per cluster label; insertion order follows first occurrence.
    tally = Counter()
    for record in data:
        tally[record['cluster']] += 1

    n_entries = len(data)

    for label in tally:
        n_in_cluster = tally[label]
        share = (n_in_cluster / n_entries) * 100
        print(f"Cluster {label}: {n_in_cluster} frames ({share:.2f}%)")

In [None]:
def printDetails(metadata, split_data):
    """Print cluster statistics for the full set and for each split.

    Args:
        metadata: The complete list of metadata entries the splits came from.
        split_data: Iterable of split descriptors. For each descriptor,
            ``descriptor[0][1]`` is the label that gets printed (the metadata
            filename) and ``descriptor[1][0]`` is the list of entries.

    Each split's size is reported as a percentage of ``len(metadata)``.
    """
    section_end = "\t#############"
    n_total = len(metadata)

    print("#############")
    print(f"Total frames: {n_total}")
    getCluster_ratio(metadata)
    print(section_end)

    for descriptor in split_data:
        entries = descriptor[1][0]
        n_entries = len(entries)
        share = (n_entries / n_total) * 100
        label = descriptor[0][1]
        print(f"{label} split: {n_entries} ({share:.2f}%)")
        getCluster_ratio(entries)
        print(section_end)

In [None]:
def save_metadata(output_dir, metadata_filename, metadata):
    """Write ``metadata`` as JSON, indented by 4 spaces, into ``output_dir``.

    Args:
        output_dir: Existing directory to write into.
        metadata_filename: Name of the file to create or overwrite.
        metadata: JSON-serializable object to store.
    """
    destination = os.path.join(output_dir, metadata_filename)

    with open(destination, 'w') as handle:
        json.dump(metadata, handle, indent=4)

In [None]:
def copyframes(split):
    """Write a split's metadata file and, when flagged, copy its frames.

    ``split`` is a descriptor made of three pairs:
        split[0] -> (sub-directory name, metadata filename)
        split[1] -> (list of metadata entries, copy-frames flag)
        split[2] -> (source frames directory, output base directory)

    The sub-directory is always created under the output base directory and
    the metadata file is always written into the output base directory.
    Frames are copied into the sub-directory only when the flag is truthy;
    each frame is located in the source directory by the basename of the
    entry's ``'image_path'``.
    """
    out_base = split[2][1]
    frames_out = os.path.join(out_base, split[0][0])
    os.makedirs(frames_out, exist_ok=True)

    # The metadata file sits next to the frame folder, not inside it.
    metadata_name = split[0][1]
    entries = split[1][0]
    save_metadata(out_base, metadata_name, entries)

    with_images = split[1][1]
    if with_images:
        frames_src = split[2][0]
        for entry in entries:
            filename = os.path.basename(entry['image_path'])
            shutil.copy2(
                os.path.join(frames_src, filename),
                os.path.join(frames_out, filename),
            )

    print(f"Copy complete for the split {metadata_name} with images - {with_images}")

# Main()

In [24]:
# Every dataset location is derived from one root so the paths cannot drift
# apart when the data is moved.
neurocig_root = "/mnt/c/Users/karti/chest/CNR/projects/data/neurocig"
# Directory the frames are copied from.
source_dir = f"{neurocig_root}/frames"
# Base directory for the stratified split output.
output_dir = f"{neurocig_root}/stratifySplit_frames"
# Sub-directory of the output that holds the input metadata files read below.
activeLearning_dir = f"{output_dir}/activeLearning"
annotations_metadata_filename = "annotations_metadata.json"
aL_metadata_filename = "activeLearning.json"

In [25]:
# Both metadata files are read from the active-learning directory.
# Metadata of the annotated frames:
annotations_metadata = load_metadata(
    activeLearning_dir,
    annotations_metadata_filename,
)
# Metadata of the active-learning frames:
activeLearning_metadata = load_metadata(
    activeLearning_dir,
    aL_metadata_filename,
)

In [34]:
# Hold out 15% of the annotated frames as the validation set; performSplit
# stratifies the split by cluster.
split_ratio_annotation = 0.15
train_metadata, val_metadata = performSplit(
    annotations_metadata, split_ratio_annotation
)

# Split descriptor layout (read by printDetails and copyframes):
#   ((sub-directory name, metadata filename),
#    (metadata entries, copy-frames flag),
#    (source frames directory, output base directory))
# Train and val only write their metadata file; no frames are copied.
split_data_train = (
    ('train', 'train.json'), (train_metadata, False), (source_dir, activeLearning_dir),
)
split_data_val = (
    ('val', 'val.json'), (val_metadata, False), (source_dir, activeLearning_dir),
)

printDetails(annotations_metadata, [split_data_train, split_data_val])

#############
Total frames: 132
Cluster 3: 12 frames (9.09%)
Cluster 2: 21 frames (15.91%)
Cluster 0: 30 frames (22.73%)
Cluster 4: 18 frames (13.64%)
Cluster 5: 26 frames (19.70%)
Cluster 1: 7 frames (5.30%)
Cluster 6: 13 frames (9.85%)
Cluster 7: 5 frames (3.79%)
	#############
train.json split: 112 (84.85%)
Cluster 5: 22 frames (19.64%)
Cluster 0: 26 frames (23.21%)
Cluster 2: 18 frames (16.07%)
Cluster 6: 11 frames (9.82%)
Cluster 4: 15 frames (13.39%)
Cluster 1: 6 frames (5.36%)
Cluster 3: 10 frames (8.93%)
Cluster 7: 4 frames (3.57%)
	#############
val.json split: 20 (15.15%)
Cluster 4: 3 frames (15.00%)
Cluster 1: 1 frames (5.00%)
Cluster 0: 4 frames (20.00%)
Cluster 2: 3 frames (15.00%)
Cluster 5: 4 frames (20.00%)
Cluster 3: 2 frames (10.00%)
Cluster 6: 2 frames (10.00%)
Cluster 7: 1 frames (5.00%)
	#############


In [35]:
# Size the prediction split as a percentage of the annotated set, then express
# that frame count as a fraction of the active-learning pool, because
# performSplit forwards the value to train_test_split as `test_size`.
prediction_percentage = 30
# True division already yields a float.
nOf_annotations = (prediction_percentage * len(annotations_metadata)) / 100
# Divide once and round; scaling by 100 and back would only add
# floating-point noise ahead of the rounding step.
split_ratio_predict = round(nOf_annotations / len(activeLearning_metadata), 3)

# annotation split
activeLearning_metadata_updated, predict_metadata = performSplit(activeLearning_metadata, split_ratio_predict)

# Descriptor layout: ((sub-directory, metadata filename),
#                     (entries, copy-frames flag),
#                     (source frames directory, output base directory))
# The remaining active-learning pool only gets its metadata written, under output_dir.
split_data_aL = (
        ('activeLearning', 'activeLearning.json'),
        (activeLearning_metadata_updated, False),
        (source_dir, output_dir)
    )

# The prediction split is the only one whose frames are copied (flag is True).
split_data_predict =   (
        ('predict', 'predict.json'),
        (predict_metadata, True),
        (source_dir, activeLearning_dir)
    )

printDetails(activeLearning_metadata, [split_data_aL, split_data_predict])

#############
Total frames: 1567
Cluster 3: 133 frames (8.49%)
Cluster 5: 317 frames (20.23%)
Cluster 2: 248 frames (15.83%)
Cluster 4: 217 frames (13.85%)
Cluster 6: 162 frames (10.34%)
Cluster 7: 65 frames (4.15%)
Cluster 0: 350 frames (22.34%)
Cluster 1: 75 frames (4.79%)
	#############
activeLearning.json split: 1527 (97.45%)
Cluster 5: 309 frames (20.24%)
Cluster 2: 242 frames (15.85%)
Cluster 0: 341 frames (22.33%)
Cluster 6: 158 frames (10.35%)
Cluster 3: 130 frames (8.51%)
Cluster 7: 63 frames (4.13%)
Cluster 1: 73 frames (4.78%)
Cluster 4: 211 frames (13.82%)
	#############
predict.json split: 40 (2.55%)
Cluster 4: 6 frames (15.00%)
Cluster 3: 3 frames (7.50%)
Cluster 5: 8 frames (20.00%)
Cluster 0: 9 frames (22.50%)
Cluster 1: 2 frames (5.00%)
Cluster 6: 4 frames (10.00%)
Cluster 2: 6 frames (15.00%)
Cluster 7: 2 frames (5.00%)
	#############


In [36]:
# Materialise every split: copyframes always writes the metadata file and
# copies frames only for descriptors whose copy-frames flag is set.
split_data_is = [
    split_data_aL,
    split_data_train,
    split_data_val,
    split_data_predict,
]

for split in split_data_is:
    copyframes(split)

Copy complete for the split activeLearning.json with images - False
Copy complete for the split train.json with images - False
Copy complete for the split val.json with images - False
Copy complete for the split predict.json with images - True
