In [310]:
# Function to convert bounding box to polygon
import pandas as pd
import numpy as np
import pytest
from shapely.geometry import Polygon

def extract_label_data(label_set):
    rows = []
    for idx in range(0,len(label_set['projects'])):
        image_id = label_set['data_row'].iloc[idx]['external_id'] 
        # Iterate over each item in the data
        project_key = list(label_set['projects'][idx].keys())[0]
        for item in label_set['projects'][idx][project_key]['labels']:
            # Extract the necessary fields
            label_details = item['label_details']
            performance_details = item['performance_details']
            annotations = item['annotations']['objects']
        
            # Iterate over each annotation
            for annotation in annotations:
                # Create a dictionary for this row
                row = {
                    'image_id': image_id,
                    'worker': label_details['created_by'],
                    'seconds_to_create': performance_details['seconds_to_create'],
                    'annotation_kind': annotation['annotation_kind'],
                    'top': annotation['bounding_box']['top'],
                    'left': annotation['bounding_box']['left'],
                    'height': annotation['bounding_box']['height'],
                    'width': annotation['bounding_box']['width'],
                    'name': annotation['name'],
                    'value': annotation['value'],
                    
                    
                }
        
                # Append the row to the list
                rows.append(row)
        
        # Convert the list of rows into a DataFrame
        df = pd.DataFrame(rows)
    return df

def test_extract_label_data():
    # Define a sample label set
    label_set = {
        'projects': [
            {
                'project1': {
                    'labels': [
                        {
                            'label_details': {'created_by': 'worker1'},
                            'performance_details': {'seconds_to_create': 10},
                            'annotations': {
                                'objects': [
                                    {
                                        'annotation_kind': 'kind1',
                                        'bounding_box': {'top': 0, 'left': 0, 'height': 10, 'width': 10},
                                        'name': 'name1',
                                        'value': 'value1'
                                    }
                                ]
                            }
                        }
                    ]
                }
            }
        ],
        'data_row': pd.DataFrame({'external_id': ['id1']})
    }

    # Call the function with the sample label set
    df = extract_label_data(label_set)

    # Define the expected DataFrame
    expected_df = pd.DataFrame({
        'image_id': ['id1'],
        'worker': ['worker1'],
        'seconds_to_create': [10],
        'annotation_kind': ['kind1'],
        'top': [0],
        'left': [0],
        'height': [10],
        'width': [10],
        'name': ['name1'],
        'value': ['value1']
    })

    # Assert that the returned DataFrame is as expected
    pd.testing.assert_frame_equal(df, expected_df)

def bbox_to_polygon(row):
    left = row['left']
    top = row['top']
    right = left + row['width']
    bottom = top + row['height']
    return Polygon([(left, top), (right, top), (right, bottom), (left, bottom)])

def test_bbox_to_polygon():
    # Define a sample row
    row = {
        'left': 5,
        'top': 5,
        'width': 10,
        'height': 10
    }

    # Call the function with the sample row
    polygon = bbox_to_polygon(row)

    # Define the expected Polygon
    expected_polygon = Polygon([(5, 5), (15, 5), (15, 15), (5, 15)])

    # Assert that the returned Polygon is as expected
    assert polygon.equals(expected_polygon)

def calculate_iou(geom1, geom2):
    intersection = geom1.intersection(geom2).area
    union = geom1.union(geom2).area
    if union == 0:
        return 0  # Non-overlapping polygons
    return intersection / union

def test_calculate_iou():
    # Define two overlapping polygons
    polygon1 = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
    polygon2 = Polygon([(0.5, 0.5), (1.5, 0.5), (1.5, 1.5), (0.5, 1.5)])

    # Calculate the IoU
    iou = calculate_iou(polygon1, polygon2)

    # Assert that the IoU is as expected
    assert pytest.approx(iou, 0.01) == (1.0/7)

    # Define two non-overlapping polygons
    polygon3 = Polygon([(2, 2), (3, 2), (3, 3), (2, 3)])

    # Calculate the IoU
    iou = calculate_iou(polygon1, polygon3)

    # Assert that the IoU is as expected
    assert iou == 0

def self_join_labels(df):
    paired_df = df[df.annotation_kind == 'ImageBoundingBox']
    paired_df = paired_df[['image_id','worker', 'geometry','name']]
    paired_df = pd.merge(paired_df.rename(columns={'worker': 'worker1', 'geometry': 'geometry1'}),
                      paired_df.rename(columns={'worker': 'worker2', 'geometry': 'geometry2'}),
                      on=['image_id','name'])
    
    # Filter to keep only pairs where worker1 < worker2
    paired_df = paired_df[paired_df['worker1'] < paired_df['worker2']].reset_index(drop=True)
    return paired_df


def test_self_join_labels():
    # Define a sample DataFrame
    df = pd.DataFrame({
        'image_id': ['id1', 'id1', 'id1'],
        'worker': ['w1', 'w1', 'w2'],
        'geometry': [Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), Polygon([(0, 0), (2, 0), (2, 2), (0, 2)]), Polygon([(2, 2), (3, 2), (3, 3), (2, 3)])],
        'name': ['n1', 'n1', 'n1'],
        'annotation_kind': ['ImageBoundingBox', 'ImageBoundingBox', 'ImageBoundingBox']
    })

    # Call the function with the sample DataFrame
    paired_df = self_join_labels(df)

    # Define the expected DataFrame
    expected_df = pd.DataFrame({
        'image_id': ['id1','id1'],
        'worker1': ['w1','w1'],
        'geometry1': [Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]),Polygon([(0, 0), (2, 0), (2, 2), (0, 2)])],
        'name': ['n1','n1'],
        'worker2': ['w2','w2'],
        'geometry2': [Polygon([(2, 2), (3, 2), (3, 3), (2, 3)]),Polygon([(2, 2), (3, 2), (3, 3), (2, 3)])]
    })

    # Assert that the returned DataFrame is as expected
    pd.testing.assert_frame_equal(paired_df, expected_df)


# Step 2: Find the maximum IoUs for each worker-worker pair for each image_id and name
def process_matches(group):
    #Find pairs of geometries with the maximum IoU between worker1 and worker2 for each image and label_type. Only makes sense if only non-zero IoUs are passed in. These best matches are later used to find distinct sets for averaging
    results = []
    for (image_id, name, worker1, worker2), pair_group in group.groupby(['image_id', 'name', 'worker1', 'worker2']):
        used_geometries = set()
        
        while not pair_group.empty:
            # Find the maximum IoU
            max_iou_row = pair_group.loc[pair_group['iou'].idxmax()]
            
            
            results.append(max_iou_row)
            
            # Add the used geometries to the set
            used_geometries.add(max_iou_row['geometry1'])
            used_geometries.add(max_iou_row['geometry2'])
            
            # Remove rows with used geometries for this worker pair
            pair_group = pair_group[~(pair_group['geometry1'].isin(used_geometries) | 
                                      pair_group['geometry2'].isin(used_geometries))]
        #print('progress')
    return pd.DataFrame(results)

def test_process_matches():
    # Define a sample DataFrame
    df = pd.DataFrame({
        'image_id': ['id1', 'id1', 'id1', 'id1'],
        'name': ['name1', 'name1', 'name1', 'name1'],
        'worker1': ['worker1', 'worker1', 'worker1', 'worker1'],
        'worker2': ['worker2', 'worker2', 'worker2', 'worker2'],
        'geometry1': [Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), Polygon([(0, 0), (1, 0), (1, 1), (0, 1)]), Polygon([(30, 30), (31, 30), (31, 31), (30, 31)])],
        'geometry2': [Polygon([(0.5, 0.5), (1.5, 0.5), (1.5, 1.5), (0.5, 1.5)]), Polygon([(50, 50), (52, 50), (52, 52), (50, 52)]), Polygon([(72, 72), (73, 72), (73, 73), (72, 73)]), Polygon([(30, 30), (31, 30), (31, 31), (30, 31)])],
        'iou': [1.0/7, 0, 0, 1]
    })

    # Call the function with the sample DataFrame
    grouped = df.groupby(['image_id','name', 'worker1', 'worker2'])
    processed_df = grouped[['image_id','name', 'worker1', 'worker2','geometry1','geometry2','iou']].apply(process_matches).reset_index(drop=True)

    # Define the expected DataFrame
    expected_df = pd.DataFrame({
        'image_id': ['id1','id1'],
        'name': ['name1','name1'],
        'worker1': ['worker1','worker1'],
        'worker2': ['worker2','worker2'],
        'geometry1': [Polygon([(30, 30), (31, 30), (31, 31), (30, 31)]),Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])],
        'geometry2': [Polygon([(30, 30), (31, 30), (31, 31), (30, 31)]),Polygon([(0.5, 0.5), (1.5, 0.5), (1.5, 1.5), (0.5, 1.5)])],
        'iou': [1,1.0/7]
    })

    # Assert that the returned DataFrame is as expected
    pd.testing.assert_frame_equal(processed_df, expected_df)

def find_sets(pairs):
    # Find connected sets of geometries
    sets = []
    for pair in pairs:
        found = False
        for s in sets:
            if pair[0] in s or pair[1] in s:
                s.add(pair[0])
                s.add(pair[1])
                found = True
                break
        if not found:
            sets.append({pair[0], pair[1]})
    return sets

def test_find_sets():
    # Define a sample list of pairs
    pairs = [(1, 2), (2, 3), (4, 5), (5, 6), (7, 8)]

    # Call the function with the sample list of pairs
    sets = find_sets(pairs)

    # Define the expected list of sets
    expected_sets = [{1, 2, 3}, {4, 5, 6}, {7, 8}]

    # Assert that the returned list of sets is as expected
    assert sets == expected_sets

def filter_matched_boxes(matched_boxes, min_matches_per_image = 3):
    # keep only the matched boxes where there are at least n matches per image, name, worker, worker geometry
    

    matched_boxes['geo_str1'] = matched_boxes['geometry1'].apply(lambda x: str(x))
    matched_boxes['geo_str2'] = matched_boxes['geometry2'].apply(lambda x: str(x))
    keeper_geometries  = pd.concat([matched_boxes[['image_id','name','worker1','geo_str1']].rename(columns={'worker1': 'worker','geo_str1': 'geo_str'}),matched_boxes[['image_id','name','worker2','geo_str2']].rename(columns={'worker2': 'worker','geo_str2': 'geo_str'})], axis = 0)
    # Find all the geometries where there are at least 3 matches for a given geometry
    keeper_size = keeper_geometries.groupby(['image_id','name','worker','geo_str']).size().rename('num_matches') + 1
    keeper_size = keeper_size[keeper_size >=(min_matches_per_image)]
    
    matched_boxes = matched_boxes.merge(keeper_size.to_frame(), left_on = ['image_id','name','worker1','geo_str1'], right_on = ['image_id','name','worker','geo_str'], how = 'inner')
    
    matched_boxes = matched_boxes.merge(keeper_size.to_frame().drop('num_matches',axis = 1), left_on = ['image_id','name','worker2','geo_str2'], right_on = ['image_id','name','worker','geo_str'], how = 'inner')
    
    return matched_boxes


def test_filter_matched_boxes():
    #Needs implementation
    print('test_filter_matched_boxes Not implemented')
    
def average_polygon(polygons):
    # Calculate the average coordinates for each point
    avg_coords = [
        (sum(polygon.exterior.coords[i][0] for polygon in polygons) / len(polygons),
         sum(polygon.exterior.coords[i][1] for polygon in polygons) / len(polygons))
        for i in range(5)  # 5 points in the polygon
    ]
    avg_coords = np.round(avg_coords, 0)
    # Create a new polygon using the average coordinates
    avg_polygon = Polygon(avg_coords)

    return avg_polygon

def test_average_polygon():
    #Needs implementation
    print('test_average_polygon Not implemented')    
 
def do_any_sets_intersect(sets):
    # Iterate over each pair of sets
    for i in range(len(sets)):
        for j in range(i + 1, len(sets)):
            # If the intersection is not empty
            if sets[i] & sets[j]:
                return True
    return False

def test_do_any_sets_intersect():
    #Needs implementation
    print('test_do_any_sets_intersect Not implemented')     

test_calculate_iou()
test_extract_label_data()
test_self_join_labels()
test_bbox_to_polygon()
test_process_matches()
test_find_sets()
test_filter_matched_boxes()
test_average_polygon()
test_do_any_sets_intersect()

test_filter_matched_boxes Not implemented
test_average_polygon Not implemented
test_do_any_sets_intersect Not implemented


In [321]:
# Load in the data labels
annotation_path = '../data/labels/Export v2 project - AC Unit Evaluation Test - 7_3_2024.ndjson'

iou_threshold = (1.0/3)
min_matches_per_image = 3

label_set = pd.read_json(annotation_path, lines=True)
df = extract_label_data(label_set)
df['geometry'] = df.apply(bbox_to_polygon, axis=1)
paired_df = self_join_labels(df)
paired_df['iou'] = paired_df.apply(lambda row: calculate_iou(row['geometry1'], row['geometry2']), axis=1)
paired_df = paired_df[['image_id', 'name','worker1', 'worker2', 'geometry1', 'geometry2', 'iou']]
paired_df = paired_df[paired_df['iou'] > iou_threshold]

grouped = paired_df.groupby(['image_id','name', 'worker1', 'worker2'])
matched_boxes = grouped[['image_id','name', 'worker1', 'worker2','geometry1','geometry2','iou']].apply(process_matches).reset_index(drop=True)

filt_matched_boxes = filter_matched_boxes(matched_boxes, min_matches_per_image = min_matches_per_image)

grouped = filt_matched_boxes.groupby(['image_id','name'])
connected_sets = (grouped[['image_id', 'name', 'worker1', 'worker2', 'geometry1','geometry2', 'iou']].apply(lambda x: find_sets(list(zip(x['geometry1'],x['geometry2'])))).rename('connected_sets'))

# Find image_id, annotation types where sets intersect (should be rare)
intersecting_sets = connected_sets[connected_sets.apply(lambda x: do_any_sets_intersect(x))]

print('Number of intersecting sets: ' + str(len(intersecting_sets)))

print('First average polygon :' + str(average_polygon(connected_sets.iloc[0][0])))

Number of intersecting sets: 1
First average polygon :POLYGON ((843 577, 867 577, 867 613, 843 613, 843 577))
