# Libraries

In [1]:
import os
import shutil
import json
from sklearn.model_selection import train_test_split
from collections import Counter

# Helper Functions

In [2]:
def load_metadata(source_dir, metadata_filename):
    """Read a JSON metadata file and return its parsed content.

    Args:
        source_dir: Directory that contains the metadata file.
        metadata_filename: Name of the JSON file inside ``source_dir``.

    Returns:
        Whatever object the JSON document decodes to.
    """
    json_path = os.path.join(source_dir, metadata_filename)

    with open(json_path, 'r') as handle:
        raw_text = handle.read()

    return json.loads(raw_text)

In [3]:
def save_metadata(output_dir, metadata_filename, metadata):
    """Serialise ``metadata`` as JSON (4-space indent) into ``output_dir``.

    Args:
        output_dir: Directory the file is written to; it must already exist.
        metadata_filename: Name of the JSON file to create or overwrite.
        metadata: JSON-serialisable object to store.
    """
    destination = os.path.join(output_dir, metadata_filename)

    with open(destination, mode='w') as sink:
        json.dump(metadata, fp=sink, indent=4)

In [4]:
def performSplit(metadata, split_ratio):
    """Split ``metadata`` in two, stratified on each entry's 'cluster' value.

    Args:
        metadata: Sequence of dict entries, each with a 'cluster' key.
        split_ratio: Forwarded unchanged to ``train_test_split`` as
            ``test_size``; it sizes the second returned subset.

    Returns:
        ``(first_subset, second_subset)`` where the second subset is the
        ``test_size`` part. The seed is fixed (42), so repeated calls on the
        same input give the same split.
    """
    # one stratification label per entry, in the same order as metadata
    strata = []
    for record in metadata:
        strata.append(record['cluster'])

    kept_part, held_out_part = train_test_split(
        metadata,
        test_size=split_ratio,
        random_state=42,
        stratify=strata,
    )

    return kept_part, held_out_part

In [5]:
def copyframes(split):
    """Write one split's metadata and, if requested, copy its frames.

    Args:
        split: Nested sequence laid out as
            ``((split_name, metadata_filename),
               (entries, copy_images),
               (frames_dir, output_root))``.
            The metadata is saved to ``output_root/split_name``. When
            ``copy_images`` is truthy, every entry's frame (the basename of
            its 'image_path') is copied from ``frames_dir`` into that folder.
    """
    names, payload, dirs = split[0], split[1], split[2]
    split_name, metadata_filename = names[0], names[1]
    entries, copy_images = payload[0], payload[1]
    frames_dir, output_root = dirs[0], dirs[1]

    # destination folder of this split
    destination = os.path.join(output_root, split_name)
    os.makedirs(destination, exist_ok=True)

    # the metadata is always written, even when no frames are copied
    save_metadata(destination, metadata_filename, entries)

    if copy_images:
        for entry in entries:
            # only the file name is used; frames are looked up in frames_dir
            file_name = os.path.basename(entry['image_path'])
            shutil.copy2(
                os.path.join(frames_dir, file_name),
                os.path.join(destination, file_name),
            )

    print(f"Copy complete for the split {metadata_filename} with images - {copy_images}")

In [6]:
def getCluster_ratio(data):
    """Print, per cluster, the number of frames and their share of ``data``.

    Args:
        data: Sequence of dict entries, each with a 'cluster' key.

    Clusters are reported in the order they first appear in ``data``.
    Nothing is printed for an empty sequence.
    """
    # tally the frames of every cluster
    frames_per_cluster = Counter()
    for record in data:
        frames_per_cluster[record['cluster']] += 1

    n_frames = len(data)

    for label in frames_per_cluster:
        n_in_cluster = frames_per_cluster[label]
        share = (n_in_cluster / n_frames) * 100
        print(f"Cluster {label}: {n_in_cluster} frames ({share:.2f}%)")

In [7]:
def printDetails(metadata, split_data):
    """Print a size and cluster report for a dataset and each of its splits.

    Args:
        metadata: The full list of entries that the splits were drawn from.
        split_data: Iterable of split descriptors. ``descriptor[0][1]`` is
            the label that gets printed and ``descriptor[1][0]`` is the list
            of entries of that split.
    """
    divider = "\t#############"
    n_total = len(metadata)

    print("#############")
    print(f"Total frames: {n_total}")
    getCluster_ratio(metadata)
    print(divider)

    for descriptor in split_data:
        label = descriptor[0][1]
        entries = descriptor[1][0]
        n_entries = len(entries)
        # share of the whole dataset that landed in this split
        share = (n_entries/n_total)*100
        print(f"{label} split: {n_entries} ({share:.2f}%)")
        getCluster_ratio(entries)
        print(divider)


In [8]:
def perform_activeLearning_split(metedata, split_ratio):
    """Split the metadata into stratified train / validation / test subsets.

    The data is first divided into train and a hold-out remainder; the
    remainder is then divided into validation and test. Both steps are
    stratified on each entry's 'cluster' value and use a fixed seed (42).

    Args:
        metedata: Sequence of dict entries, each with a 'cluster' key.
        split_ratio: Indexable with three fractions, read as
            ``split_ratio[0]`` = train, ``split_ratio[1]`` = validation,
            ``split_ratio[2]`` = test.

    Returns:
        ``(train_data, val_data, test_data)``.

    Raises:
        ZeroDivisionError: if the validation and test fractions sum to zero.
    """
    clusters = [entry['cluster'] for entry in metedata]

    # 1.0 - 0.7 evaluates to 0.30000000000000004. scikit-learn turns a
    # fractional test_size into a frame count by rounding up, so that noise
    # can move one extra frame into the hold-out part (301 instead of 300 for
    # 1000 frames). Rounding the fraction removes the noise.
    holdout_ratio = round(1.0 - split_ratio[0], 10)

    # split train data
    train_data, temp_data = train_test_split(
        metedata,
        test_size=holdout_ratio,
        random_state=42,
        stratify=clusters  # stratify
        )

    # remaining data
    temp_clusters = [entry['cluster'] for entry in temp_data]

    # Proportion of the leftover data that should go into test
    # (rounded for the same floating point reason as above)
    test_ratio_adjusted = round(
        split_ratio[2] / (split_ratio[1] + split_ratio[2]), 10
    )

    val_data, test_data = train_test_split(
        temp_data,
        test_size=test_ratio_adjusted,
        random_state=42,
        stratify=temp_clusters
    )

    return train_data, val_data, test_data


# Main()

### Active learning and test split

In [9]:
# Dataset root. It defaults to the original local path; set the
# NEUROCIG_DATA_ROOT environment variable to run the notebook on another
# machine without editing this cell.
data_root = os.environ.get(
    "NEUROCIG_DATA_ROOT",
    "/mnt/c/Users/karti/chest/CNR/projects/data/neurocig",
)

# dirs' path
source_dir = os.path.join(data_root, "frames")
output_dir = os.path.join(data_root, "stratifySplit_frames")
metadata_filename_main = "frames_info.json"

# load the frames metadata
metadata_main = load_metadata(source_dir, metadata_filename_main)

# Initial Split Percentages [active learning - 85, test - 15]
split_ratio_main = 0.15

# perform initial split
activeLearning_data, test_data = performSplit(metadata_main, split_ratio_main)

# Each descriptor is ((split name, metadata file), (entries, copy frames?),
# (frames dir, output root)), the layout copyframes and printDetails index.
split_data_main = [(
        ('activeLearning', 'activeLearning.json'),
        (activeLearning_data, False),
        (source_dir, output_dir)
    ),
    (
        ('test', 'test.json'),
        (test_data, True),
        (source_dir, output_dir)
    )]

printDetails(metadata_main, split_data_main)

# copy the frames to their appropriate dirs
for split in split_data_main:
    copyframes(split)

#############
Total frames: 1300
Cluster 1: 285 frames (21.92%)
Cluster 0: 742 frames (57.08%)
Cluster 2: 273 frames (21.00%)
	#############
activeLearning.json split: 1105 (85.00%)
Cluster 0: 631 frames (57.10%)
Cluster 1: 242 frames (21.90%)
Cluster 2: 232 frames (21.00%)
	#############
test.json split: 195 (15.00%)
Cluster 0: 111 frames (56.92%)
Cluster 1: 43 frames (22.05%)
Cluster 2: 41 frames (21.03%)
	#############
Copy complete for the split activeLearning.json with images - False
Copy complete for the split test.json with images - True


### Initial split

In [10]:
# The active learning pool is the 'activeLearning' folder under output_dir.
activeLearning_dir = os.path.join(output_dir, "activeLearning")
metadata_filename_al = "activeLearning.json"

# number of frames set aside for the initial manual annotation
intialN_annotatonData = 50

# load the active learning frames metadata
activeLearning_metadata = load_metadata(activeLearning_dir, metadata_filename_al)

# share of the pool that the annotation set represents (kept for reference)
split_ratio_aL = round(intialN_annotatonData / len(activeLearning_metadata), 3)

# annotation split
# The frame count itself is passed: train_test_split reads an integer
# test_size as an absolute number of samples. A ratio rounded to 3 decimals
# can give a different count than requested (50 of 1234 -> 0.041 -> 51).
activeLearning_metadata_new, annotation_metadata = performSplit(activeLearning_metadata, intialN_annotatonData)

split_data_aL = (
        ('activeLearning', 'activeLearning.json'),
        (activeLearning_metadata_new, False),
        (source_dir, output_dir)
    )

split_data_annotations =   (
        ('annotations', 'annotations_metadataAL.json'),
        (annotation_metadata, True),
        (source_dir, activeLearning_dir)
    )

printDetails(activeLearning_metadata, [split_data_aL, split_data_annotations])


#############
Total frames: 1105
Cluster 0: 631 frames (57.10%)
Cluster 1: 242 frames (21.90%)
Cluster 2: 232 frames (21.00%)
	#############
activeLearning.json split: 1055 (95.48%)
Cluster 0: 602 frames (57.06%)
Cluster 1: 231 frames (21.90%)
Cluster 2: 222 frames (21.04%)
	#############
annotations_metadataAL.json split: 50 (4.52%)
Cluster 1: 11 frames (22.00%)
Cluster 0: 29 frames (58.00%)
Cluster 2: 10 frames (20.00%)
	#############


In [11]:
# size of the prediction set, as a percentage of the annotated set
prediction_percentage = 30

# number of frames in the prediction set, rounded to a whole frame
nOf_annotations = round(prediction_percentage * len(annotation_metadata) / 100)

# share of the remaining pool that the prediction set represents (kept for reference)
split_ratio_predict = round(nOf_annotations / len(activeLearning_metadata_new), 3)

# prediction split
# The frame count itself is passed: train_test_split reads an integer
# test_size as an absolute number of samples, so exactly nOf_annotations
# frames are drawn instead of a count derived from a rounded ratio.
activeLearning_metadata_updated, predict_metadata = performSplit(activeLearning_metadata_new, nOf_annotations)

split_data_aL = (
        ('activeLearning', 'activeLearning.json'),
        (activeLearning_metadata_updated, False),
        (source_dir, output_dir)
    )

split_data_predict =   (
        ('predict', 'predict.json'),
        (predict_metadata, True),
        (source_dir, activeLearning_dir)
    )

printDetails(activeLearning_metadata_new, [split_data_aL, split_data_predict])

#############
Total frames: 1055
Cluster 0: 602 frames (57.06%)
Cluster 1: 231 frames (21.90%)
Cluster 2: 222 frames (21.04%)
	#############
activeLearning.json split: 1040 (98.58%)
Cluster 1: 228 frames (21.92%)
Cluster 0: 593 frames (57.02%)
Cluster 2: 219 frames (21.06%)
	#############
predict.json split: 15 (1.42%)
Cluster 0: 9 frames (60.00%)
Cluster 2: 3 frames (20.00%)
Cluster 1: 3 frames (20.00%)
	#############


In [None]:
# share of the annotated frames held out for validation
split_ratio_annotation = 0.15

# train val split
train_metadata, val_metadata = performSplit(annotation_metadata, split_ratio_annotation)

# The first field is the folder name created under activeLearning_dir. It was
# 'test', which would have put train.json and the train frames in a folder
# named 'test'.
split_data_train = (
        ('train', 'train.json'),
        (train_metadata, True),
        (source_dir, activeLearning_dir)
    )

split_data_val =   (
        ('val', 'val.json'),
        (val_metadata, True),
        (source_dir, activeLearning_dir)
    )

printDetails(annotation_metadata, [split_data_train, split_data_val])

#############
Total frames: 50
Cluster 1: 11 frames (22.00%)
Cluster 0: 29 frames (58.00%)
Cluster 2: 10 frames (20.00%)
	#############
train.json split: 42 (84.00%)
Cluster 0: 24 frames (57.14%)
Cluster 1: 9 frames (21.43%)
Cluster 2: 9 frames (21.43%)
	#############
val.json split: 8 (16.00%)
Cluster 0: 5 frames (62.50%)
Cluster 1: 2 frames (25.00%)
Cluster 2: 1 frames (12.50%)
	#############


In [13]:
# All splits built by the cells above, written out in one pass.
split_data_is = [
    split_data_aL,
    split_data_annotations,
    split_data_train,
    split_data_val,
    split_data_predict,
]

# copyframes always saves the split's metadata; frames are copied only for
# splits whose image flag is set.
for split in split_data_is:
    copyframes(split)

Copy complete for the split activeLearning.json with images - False
Copy complete for the split annotations_metadataAL.json with images - True
Copy complete for the split train.json with images - False
Copy complete for the split val.json with images - False
Copy complete for the split predict.json with images - True


### Active learning split