# Libraries

In [38]:
import os
import shutil
import json
from sklearn.model_selection import train_test_split
from collections import Counter
from PIL import Image

# Helper Functions

In [2]:
def load_metadata(source_dir, metadata_filename):
    """Load a JSON metadata file from a directory.

    Args:
        source_dir: Directory that contains the metadata file.
        metadata_filename: Name of the JSON file inside ``source_dir``.

    Returns:
        The decoded JSON content, exactly as produced by ``json.load``.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    metadata_filePath = os.path.join(source_dir, metadata_filename)

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(metadata_filePath, 'r', encoding='utf-8') as f:
        return json.load(f)

In [3]:
def save_metadata(output_dir, metadata_filename, metadata):
    """Write ``metadata`` as 4-space indented JSON to ``output_dir/metadata_filename``.

    Args:
        output_dir: Existing directory that receives the file.
        metadata_filename: Name of the JSON file to create or overwrite.
        metadata: JSON-serializable object to store.
    """
    target_path = os.path.join(output_dir, metadata_filename)

    with open(target_path, mode='w') as out_file:
        json.dump(obj=metadata, fp=out_file, indent=4)

In [4]:
def performSplit(metadata, split_ratio, random_state=42):
    """Split frame metadata into two parts, stratified by cluster.

    Args:
        metadata: Sequence of frame entries; each entry must have a
            ``'cluster'`` key, which is used as the stratification label.
        split_ratio: Forwarded to ``train_test_split`` as ``test_size``
            (size of the second returned part).
        random_state: Seed forwarded to ``train_test_split``. Defaults to 42,
            the value that was previously hard-coded, so existing calls
            produce the same split as before.

    Returns:
        A ``(first_part, second_part)`` pair as returned by
        ``train_test_split``; the second part is the ``test_size`` portion.
    """
    clusters = [entry['cluster'] for entry in metadata]

    activeLearning_data, prediction_data = train_test_split(
        metadata,
        test_size=split_ratio,
        random_state=random_state,
        stratify=clusters,  # keep the cluster proportions in both parts
    )

    return activeLearning_data, prediction_data

In [5]:
def copyframes(split):
    """Write a split's metadata file and, if requested, copy its frames.

    ``split`` is read as a nested sequence:
        split[0] -> (split name, metadata filename)
        split[1] -> (list of frame entries, copy-images flag)
        split[2] -> (source frames directory, output root directory)

    The metadata is always saved to ``<output root>/<split name>``; the frames
    named by each entry's ``'image_path'`` are copied there only when the
    copy-images flag is truthy.
    """
    split_name, metadata_filename = split[0][0], split[0][1]
    entries, copy_images = split[1][0], split[1][1]
    frames_src_dir, output_root = split[2][0], split[2][1]

    # destination directory of this split
    split_dir = os.path.join(output_root, split_name)
    os.makedirs(split_dir, exist_ok=True)
    # the metadata file is written regardless of the copy flag
    save_metadata(split_dir, metadata_filename, entries)

    if copy_images:
        for entry in entries:
            # only the file name of 'image_path' is used; frames are looked up
            # directly inside the source frames directory
            frame_file = os.path.basename(entry['image_path'])
            shutil.copy2(
                os.path.join(frames_src_dir, frame_file),
                os.path.join(split_dir, frame_file),
            )

    print(f"Copy complete for the split {metadata_filename} with images - {copy_images}")

In [6]:
def getCluster_ratio(data):
    """Print, for every cluster in ``data``, its frame count and percentage.

    Args:
        data: Sequence of frame entries, each with a ``'cluster'`` key.
    """
    # tally frames per cluster, in order of first appearance
    frames_per_cluster = Counter()
    for entry in data:
        frames_per_cluster[entry['cluster']] += 1

    n_frames = len(data)

    for cluster_id in frames_per_cluster:
        n_in_cluster = frames_per_cluster[cluster_id]
        share = (n_in_cluster / n_frames) * 100
        print(f"Cluster {cluster_id}: {n_in_cluster} frames ({share:.2f}%)")

In [7]:
def printDetails(metadata, split_data):
    """Print the cluster distribution of ``metadata`` and of each split.

    Args:
        metadata: Full list of frame entries the splits were taken from.
        split_data: Iterable of split tuples; ``split[0][1]`` is used as the
            printed label and ``split[1][0]`` as the split's frame entries.
    """
    print("#############")
    print(f"Total frames: {len(metadata)}")
    getCluster_ratio(metadata)
    print("\t#############")

    for split in split_data:
        split_label = split[0][1]
        split_entries = split[1][0]
        n_split = len(split_entries)
        # share of this split relative to the full metadata list
        percentage = (n_split / len(metadata)) * 100
        print(f"{split_label} split: {n_split} ({percentage:.2f}%)")
        getCluster_ratio(split_entries)
        print("\t#############")


In [34]:
def copyframes_train_val(currentFrame_path, destination_dir):
    """Copy one frame into ``destination_dir``, creating the directory if needed.

    Args:
        currentFrame_path: Path of the frame file to copy.
        destination_dir: Directory that receives the copy under the same
            file name.
    """
    os.makedirs(destination_dir, exist_ok=True)
    target_path = os.path.join(destination_dir, os.path.basename(currentFrame_path))
    shutil.copy2(currentFrame_path, target_path)


In [43]:
def get_image_size(image_path):
    """Return ``(width, height)`` of the image stored at ``image_path``."""
    with Image.open(image_path) as frame:
        width = frame.width
        height = frame.height
    return width, height

In [None]:
def create_yolo_bBox_labels(annotation_bBox_info, frame_w, frame_h, class_label):
    """Build one YOLO-style label line per annotated bounding box.

    Each line has the form
    ``class x_center y_center width height [kp_x kp_y kp_v ...]`` where the
    box and keypoint coordinates are divided by the frame size and written
    with six decimals; the keypoint's third value is written unchanged.

    Args:
        annotation_bBox_info: Iterable of dicts with a ``'bbox'`` dict
            (keys ``x1``, ``y1``, ``x2``, ``y2``) and a ``'keypoints'`` dict
            whose values are indexable as ``[x, y, v]``.
        frame_w: Frame width used to normalize x values.
        frame_h: Frame height used to normalize y values.
        class_label: Class id written at the start of every line.

    Returns:
        List of label strings, one per entry of ``annotation_bBox_info``.
    """
    label_lines = []

    for instance in annotation_bBox_info:
        corners = instance['bbox']
        left, top = corners["x1"], corners["y1"]
        right, bottom = corners["x2"], corners["y2"]

        # corner format -> center/size format
        box_w = right - left
        box_h = bottom - top

        # class_id, x_c, y_c, w, h (all normalized by the frame size)
        fields = [
            f"{class_label}",
            f"{(left + box_w / 2.0) / frame_w:.6f}",
            f"{(top + box_h / 2.0) / frame_h:.6f}",
            f"{box_w / frame_w:.6f}",
            f"{box_h / frame_h:.6f}",
        ]

        # keypoints follow the box, in the dict's iteration order
        for point in instance['keypoints'].values():
            fields.append(f"{point[0] / frame_w:.6f}")
            fields.append(f"{point[1] / frame_h:.6f}")
            fields.append(f"{point[2]}")

        label_lines.append(" ".join(fields))

    return label_lines


In [61]:
def save_yolo_label(trainLabels_dir, frame_name, yolo_bBox_labels):
    """Write the label lines of one frame to ``<frame stem>.txt``.

    Args:
        trainLabels_dir: Directory for the label file; created if missing.
        frame_name: File name of the frame the labels belong to. Only its
            extension is replaced, so names that contain "jpg" elsewhere or
            use another extension still map to ``<stem>.txt``.
        yolo_bBox_labels: List of label strings, written one per line
            (no trailing newline).
    """
    os.makedirs(trainLabels_dir, exist_ok=True)
    # swap only the extension; str.replace("jpg", "txt") would also rewrite
    # "jpg" inside the stem and leave non-.jpg names unchanged
    label_stem, _ = os.path.splitext(frame_name)
    trainLabel_filename = os.path.join(trainLabels_dir, label_stem + ".txt")

    with open(trainLabel_filename, 'w') as txt_out:
        txt_out.write("\n".join(yolo_bBox_labels))

In [63]:
def prepare_train_val(t_v_dir, annotations_dir, frame_name, annotation_info):
    """Copy one annotated frame into a train/val directory and write its label.

    The frame goes to ``<t_v_dir>/images`` and its label file to
    ``<t_v_dir>/labels``.

    Args:
        t_v_dir: Root directory of the train or val set.
        annotations_dir: Directory that holds the annotated frame.
        frame_name: File name of the frame inside ``annotations_dir``.
        annotation_info: Bounding-box/keypoint annotations of the frame, in
            the form expected by ``create_yolo_bBox_labels``.
    """
    mouse_class_label = 0
    annotated_frame = os.path.join(annotations_dir, frame_name)

    # image first, then the label derived from the same source frame
    copyframes_train_val(annotated_frame, os.path.join(t_v_dir, "images"))

    frame_w, frame_h = get_image_size(annotated_frame)
    label_lines = create_yolo_bBox_labels(annotation_info, frame_w, frame_h, mouse_class_label)
    save_yolo_label(os.path.join(t_v_dir, "labels"), frame_name, label_lines)

# Main()

### Active learning and test split

In [8]:
# Directory paths and metadata file name for the first split.
# source_dir: directory the frames metadata is loaded from; it is also passed
#   to copyframes as the directory the frame files are copied from.
# output_dir: root directory under which one sub-directory per split is created.
source_dir = "/mnt/c/Users/karti/chest/CNR/projects/data/neurocig/frames"
output_dir = "/mnt/c/Users/karti/chest/CNR/projects/data/neurocig/stratifySplit_frames"
# JSON file inside source_dir that lists all frames
metadata_filename_main = "frames_info.json"

In [None]:
# load the frames metadata
metadata_main = load_metadata(source_dir, metadata_filename_main)

# Initial split percentages [active learning - 85, test - 15]
split_ratio_main = 0.15

# perform the initial split (stratified by cluster inside performSplit)
activeLearning_data, test_data = performSplit(metadata_main, split_ratio_main)
# Each split is described by three pairs, as read by copyframes/printDetails:
#   (split name, metadata filename),
#   (frame entries, copy-images flag),
#   (source frames dir, output root dir)
# Only the test split has its flag set, so only its frames are copied;
# for activeLearning just the metadata file is written.
split_data_main = [(
        ('activeLearning', 'activeLearning.json'),
        (activeLearning_data, False),
        (source_dir, output_dir)
    ),
    (
        ('test', 'test.json'),
        (test_data, True),
        (source_dir, output_dir)
    )]

printDetails(metadata_main, split_data_main)

# write each split's metadata (and frames, where flagged) to its directory
for split in split_data_main:
    copyframes(split)

### Initial split

In [9]:
# Directory and metadata file of the active-learning pool.
# The metadata file is loaded from here, and the annotations/predict/train/val
# splits use this directory as their output root.
activeLearning_dir = "/mnt/c/Users/karti/chest/CNR/projects/data/neurocig/stratifySplit_frames/activeLearning"
metadata_filename_al = "activeLearning.json"

In [None]:
# number of initial frames for manual annotation
intialN_annotatonData = 100

# load the active learning frames metadata
activeLearning_metadata = load_metadata(activeLearning_dir, metadata_filename_al)

# calculate the split ratio corresponding to intialN_annotatonData
# NOTE(review): the ratio is rounded to 3 decimals, so the resulting split may
# not contain exactly intialN_annotatonData frames. sklearn's train_test_split
# is documented to also accept an int test_size (absolute count) - confirm
# whether an exact count is wanted here.
split_ratio_aL = round(((intialN_annotatonData/len(activeLearning_metadata))*100)/100, 3)

# annotation split: the second part becomes the set to annotate manually
activeLearning_metadata_new, annotation_metadata = performSplit(activeLearning_metadata, split_ratio_aL)

# remaining active-learning pool: flag is False, so copyframes only writes metadata
split_data_aL = (
        ('activeLearning', 'activeLearning.json'),
        (activeLearning_metadata_new, False),
        (source_dir, output_dir)
    )
    
# frames to annotate: flag is True, so copyframes copies them from source_dir
# into the 'annotations' sub-directory of activeLearning_dir
split_data_annotations =   (
        ('annotations', 'annotations_metadataAL.json'),
        (annotation_metadata, True),
        (source_dir, activeLearning_dir)
    )

printDetails(activeLearning_metadata, [split_data_aL, split_data_annotations])


In [None]:
# Size of the prediction split, as a percentage of the manually annotated set.
prediction_percentage = 30
# number of frames to reserve for prediction (uses prediction_percentage
# instead of a repeated literal, so changing the setting above takes effect)
nOf_annotations = (prediction_percentage * len(annotation_metadata)) / 100
# express that count as a fraction of the remaining active-learning pool
split_ratio_predict = round(((nOf_annotations/len(activeLearning_metadata_new))*100)/100, 3)

# prediction split: the second part becomes the set used for prediction
activeLearning_metadata_updated, predict_metadata = performSplit(activeLearning_metadata_new, split_ratio_predict)

# remaining active-learning pool: flag is False, so copyframes only writes metadata
split_data_aL = (
        ('activeLearning', 'activeLearning.json'),
        (activeLearning_metadata_updated, False),
        (source_dir, output_dir)
    )

# prediction frames: flag is True, so copyframes copies them from source_dir
# into the 'predict' sub-directory of activeLearning_dir
split_data_predict = (
        ('predict', 'predict.json'),
        (predict_metadata, True),
        (source_dir, activeLearning_dir)
    )

printDetails(activeLearning_metadata_new, [split_data_aL, split_data_predict])

In [None]:
# fraction of the annotation set that goes to validation
split_ratio_annotation = 0.15

# train val split of the manually annotated set (second part is val)
train_metadata, val_metadata = performSplit(annotation_metadata, split_ratio_annotation)

# Both flags are False: copyframes writes only train.json / val.json into the
# 'train' and 'val' sub-directories of activeLearning_dir, without copying frames.
split_data_train = (
        ('train', 'train.json'),
        (train_metadata, False),
        (source_dir, activeLearning_dir)
    )
    
split_data_val =   (
        ('val', 'val.json'),
        (val_metadata, False),
        (source_dir, activeLearning_dir)
    )

printDetails(annotation_metadata, [split_data_train, split_data_val])

In [None]:
# All splits prepared in the cells above; split_data_aL is the latest
# assignment (the pool left after the prediction split).
split_data_is = [split_data_aL, split_data_annotations, split_data_train, split_data_val, split_data_predict]

# write each split's metadata and copy frames for the splits flagged True
for split in split_data_is:
    copyframes(split)

### Data Conversion
Annotations to train and val

In [10]:
# train / val directories and the metadata files stored in them
train_dir = os.path.join(activeLearning_dir, "train")
train_json = "train.json"
val_dir = os.path.join(activeLearning_dir, "val")
val_json = "val.json"

# path to the manually annotated frames and their json
annotations_dir = os.path.join(activeLearning_dir, "annotations")
annotation_json = "annotation.json"

In [94]:
# manually annotated json; iterated below as {frame name: annotation info}
mAnnotated_json = load_metadata(annotations_dir, annotation_json)

# train and val frame metadata (lists of entries with an 'image_path' key)
train_metadata = load_metadata(train_dir, train_json)
val_metadata = load_metadata(val_dir, val_json)

In [95]:
# File names of the frames assigned to each set. Matching on the exact base
# name avoids substring false positives (e.g. "10.jpg" inside ".../110.jpg")
# and replaces the nested scans with constant-time set lookups.
train_frame_names = {os.path.basename(frame['image_path']) for frame in train_metadata}
val_frame_names = {os.path.basename(frame['image_path']) for frame in val_metadata}

# Route every manually annotated frame to the set(s) that list it:
# the image is copied and its YOLO label file is written there.
for frame_name, annotation_info in mAnnotated_json.items():
    frame_basename = os.path.basename(frame_name)

    if frame_basename in train_frame_names:
        prepare_train_val(train_dir, annotations_dir, frame_name, annotation_info)

    if frame_basename in val_frame_names:
        prepare_train_val(val_dir, annotations_dir, frame_name, annotation_info)

0 0.731266 0.584896 0.221875 0.179167 0.798454 0.541146 2 0.776579 0.547396 2 0.787516 0.586979 2 0.654704 0.611979 2
0 0.392985 0.573438 0.195312 0.168750 0.314079 0.566146 2 0.339079 0.593229 2 0.339079 0.555729 2 0.464079 0.561979 2
0 0.177360 0.621354 0.154688 0.235417 0.160954 0.711979 2 0.182829 0.651563 2 0.139079 0.639062 2 0.200016 0.522396 2
0 0.467204 0.333854 0.231250 0.127083 0.371891 0.366146 2 0.403141 0.370312 2 0.395329 0.332813 2 0.542204 0.332813 2
0 0.728922 0.396354 0.217188 0.202083 0.671891 0.484896 2 0.701579 0.482812 2 0.676579 0.451562 2 0.789079 0.336979 2
0 0.802517 0.219551 0.129688 0.189583 0.811111 0.145592 2 0.792361 0.191426 2 0.826736 0.191426 2 0.759549 0.293509 2
0 0.225174 0.547676 0.134375 0.191667 0.214236 0.466426 2 0.184549 0.501842 2 0.215799 0.506009 2 0.273611 0.589342 2
0 0.430642 0.513301 0.154688 0.164583 0.386111 0.451842 2 0.404861 0.503926 2 0.432986 0.468509 2 0.490799 0.568509 2
0 0.559549 0.582051 0.159375 0.193750 0.576736 0.524759 

16 85


### Active learning split