In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

The main aim here is to get confusion metrics and also figure out why each prediction is wrong

## Get accuracy metrics

In [2]:
# single stage combined
pred_ann_dir = f'../resnet_model/work_dirs/singletask_resnet_fpn_for_verb_and_target_best/combine_first_and_second_stage_results' 
gt_ann_dir = f'../data/triplet_segmentation_dataset_v2_second_stage/test/ann_dir'
       

In [34]:
from utils.general.dataset_variables import TripletSegmentationVariables

VERB_ID_TO_CLASS_DICT = TripletSegmentationVariables.categories['verb']
TARGET_ID_TO_CLASS_DICT = TripletSegmentationVariables.categories['target']
INSTRUMENT_ID_TO_CLASS_DICT = TripletSegmentationVariables.categories['instrument']

possible_verbs = list(VERB_ID_TO_CLASS_DICT.values())
possible_targets = list(TARGET_ID_TO_CLASS_DICT.values())
possible_instruments = list(INSTRUMENT_ID_TO_CLASS_DICT.values())

In [36]:
import os
import json
from statistics import mean

def load_annotations(folder):
    """Load annotations from a folder of JSON files."""
    annotations = {}
    for file_name in os.listdir(folder):
        if file_name.endswith('.json'):
            with open(os.path.join(folder, file_name), 'r') as f:
                data = json.load(f)
                annotations[file_name] = data
    return annotations

def calculate_accuracy(ground_truth_folder, prediction_folder, task, possible_classes):
    """Calculate accuracy for possible classes."""
    ground_truth = load_annotations(ground_truth_folder)
    predictions = load_annotations(prediction_folder)
    
    assert task in ['instrument', 'verb', 'target']
    if task == 'instrument':
        task = 'label' # annotations called labels in annotation format

    total_per_class = {class_name: 0 for class_name in  possible_classes }
    correct_per_class = {class_name: 0 for class_name in  possible_classes }    
    false_positives_per_class = {class_name: 0 for class_name in  possible_classes }
    false_negatives_per_class = {class_name: 0 for class_name in  possible_classes }
    
    missed_files = []

    for file_name, gt_data in ground_truth.items():
        if file_name not in predictions:
            missed_files.append(file_name)
            continue
        
        # i am okay with the overwritting of contours, as it essentially we just need to count one.     
        gt_shapes = { (shape['label'],shape['group_id']): shape for shape in gt_data.get('shapes', [])} 
        pred_shapes = {(shape['label'],shape['group_id']): shape for shape in predictions[file_name].get('shapes', [])}
        
        found_gt_keys = []

        for gt_key, gt_shape in gt_shapes.items(): 
            
            gt_class_name = gt_shape[task]
            
            total_per_class[gt_class_name] += 1 

            found_pred_key = None
            
            for pred_key, pred_shape in pred_shapes.items(): 
                # Calculate verb accuracy
                pred_class_name = pred_shape.get(task)
                if gt_class_name == pred_class_name:
                    correct_per_class[pred_class_name] += 1
                    found_pred_key = pred_key
                    found_gt_keys.append(gt_key)
                    break
                    
            # remove that pred prediction from possibilites        
            if found_pred_key:
                pred_shapes.pop(found_pred_key)
        
        #update the false positives from the pred. 
        for pred_key, pred_shape in pred_shapes.items(): 
            pred_class_name = pred_shape.get(task)
            false_positives_per_class[pred_class_name] += 1
        
        
        # delete the found groundtruth.
        for found_gt_key in found_gt_keys: 
            gt_shapes.pop(found_gt_key)
        
         #update the false negatives from the gt. 
        for gt_key, gt_shape in gt_shapes.items(): 
            gt_class_name = gt_shape.get(task)
            false_negatives_per_class[gt_class_name] += 1
        
                

    accuracy = sum(correct_per_class.values()) / sum(total_per_class.values())  if sum(total_per_class.values()) > 0 else 0
    acc_per_class = {
        class_name: (
            correct_per_class[class_name] / total_per_class[class_name] 
            if total_per_class[class_name] is not None and total_per_class[class_name] > 0 
            else None
        )
        for class_name in possible_classes
    }
    mean_acc = mean(value for value in acc_per_class.values() if value is not None)

    return {
        "accuracy": accuracy,
        "mean_acc":mean_acc,
        "acc_per_class": acc_per_class,
        "missed_files": missed_files,
        "total_per_class": total_per_class,
        "correct_per_class": correct_per_class,
        "false_positives_per_class": false_positives_per_class,
        "false_negatives_per_class": false_negatives_per_class
    }



In [42]:
task = 'verb'
if task == 'target': 
    possible_classes = possible_targets
elif  task == 'verb': 
    possible_classes = possible_verbs  
elif  task == 'instruments': 
    possible_classes = possible_instruments      
else: 
    raise ValueError('there is an error in the task')     
accuracy_results = calculate_accuracy(gt_ann_dir, pred_ann_dir, task = task, possible_classes=possible_classes)



In [43]:

print("accuracy:", accuracy_results["accuracy"])
print("mean_acc:", accuracy_results["mean_acc"])
print("acc_per_class:", accuracy_results["acc_per_class"])
print("Missed Files:", accuracy_results["missed_files"])
print("total_per_class:", accuracy_results["total_per_class"])
print("correct_per_class:", accuracy_results["correct_per_class"])
print("false_positives_per_class:", accuracy_results["false_positives_per_class"])
print("false_negatives_per_class:", accuracy_results["false_negatives_per_class"])

accuracy: 0.8607086751614592
mean_acc: 0.6001870078044778
acc_per_class: {'grasp': 0.205607476635514, 'retract': 0.9113475177304965, 'dissect': 0.9517276422764228, 'coagulate': 0.9093655589123867, 'clip': 0.8681318681318682, 'cut': 0.6829268292682927, 'aspirate': 0.8725761772853186, 'irrigate': 0.0, 'pack': None, 'null_verb': 0.0}
Missed Files: []
total_per_class: {'grasp': 107, 'retract': 2538, 'dissect': 1968, 'coagulate': 331, 'clip': 91, 'cut': 41, 'aspirate': 361, 'irrigate': 35, 'pack': 0, 'null_verb': 257}
correct_per_class: {'grasp': 22, 'retract': 2313, 'dissect': 1873, 'coagulate': 301, 'clip': 79, 'cut': 28, 'aspirate': 315, 'irrigate': 0, 'pack': 0, 'null_verb': 0}
false_positives_per_class: {'grasp': 21, 'retract': 395, 'dissect': 116, 'coagulate': 70, 'clip': 20, 'cut': 29, 'aspirate': 102, 'irrigate': 0, 'pack': 0, 'null_verb': 1}
false_negatives_per_class: {'grasp': 85, 'retract': 225, 'dissect': 95, 'coagulate': 30, 'clip': 12, 'cut': 13, 'aspirate': 46, 'irrigate': 35

In [14]:
total_per_class = accuracy_results['total_per_class']
correct_per_class = accuracy_results['correct_per_class']


In [19]:
acc_per_class = {
    class_name: (
        correct_per_class[class_name] / total_per_class[class_name] 
        if total_per_class[class_name] is not None and total_per_class[class_name] > 0 
        else None
    )
    for class_name in possible_targets
}
acc_per_class

{'gallbladder': 0.8563812062645785,
 'cystic_plate': 0.01818181818181818,
 'cystic_duct': 0.17045454545454544,
 'cystic_artery': 0.03759398496240601,
 'cystic_pedicle': 0.0,
 'blood_vessel': None,
 'fluid': 0.7423822714681441,
 'abdominal_wall_cavity': 0.03571428571428571,
 'liver': 0.7248780487804878,
 'adhesion': None,
 'omentum': 0.005076142131979695,
 'peritoneum': 0.0,
 'gut': 0.0,
 'specimen_bag': 0.43103448275862066,
 'null_target': 0.007782101167315175}

it is mAP. That is why. 

In [None]:


# Calculate the mean, excluding None values
mean_acc = mean(value for value in acc_per_class.values() if value is not None)

mean_acc
  

0.23303683745262932