In [1]:
import pandas as pd
import PIL
import cv2
import numpy as np
import pickle
import matplotlib.pyplot as plt
import torch
import json

from torchvision.ops import masks_to_boxes, box_convert
from PIL import Image, ExifTags
from matplotlib.patches import Polygon, Rectangle
from matplotlib.collections import PatchCollection
from PIL import Image

# Data Cleaning

## MJU Dataset
The bounding box annotations in the MJU Waste dataset are wrong, so they need to be fixed. Since the object masks are annotated, we are going to create new bounding boxes from the masks.

In [2]:
# Load the pickled training annotations into `data`.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only open
# pickle files that come from a trusted source.
# NOTE(review): this path is relative to the current working directory, while
# the JSON files further down are read from '../mju-waste-v1.0/' — confirm
# which location is correct.
with open('mju-waste-v1.0/train.pkl', 'rb') as file:
    data = pickle.load(file)

In [3]:
# Show the unpickled object as the cell output.
data

Unnamed: 0,id,image_id,category_id,segmentation,area,bbox,iscrowd,filename,width,height
0,1621,1617,1,"[318.11, 188.75, 332.25, 324.48, 414.96, 315.9...",11013.69595,"[318, 180, 96, 144]",0,2019-09-19_16_19_32-29_color.png,640,480
2,1624,1620,1,"[309.63, 159.06, 311.05, 170.37, 308.22, 190.1...",3374.47500,"[308, 158, 40, 123]",0,2019-09-19_16_19_55-63_color.png,640,480
3,1626,1622,1,"[249.54, 237.53, 255.2, 233.99, 268.63, 238.94...",3056.15110,"[248, 214, 95, 48]",0,2019-09-19_16_20_06-93_color.png,640,480
4,1627,1623,1,"[253.78, 178.14, 250.96, 227.63, 268.63, 224.0...",4070.58540,"[250, 176, 61, 79]",0,2019-09-19_16_20_11-29_color.png,640,480
5,1629,1625,1,"[272.16, 197.23, 296.2, 195.82, 316.7, 193.7, ...",2349.10865,"[266, 193, 57, 49]",0,2019-09-19_16_20_22-04_color.png,640,480
...,...,...,...,...,...,...,...,...,...,...
2519,1483,1479,1,"[211.9, 187.02, 227.35, 189.6, 233.35, 199.03,...",8590.26005,"[199, 187, 141, 92]",0,2020-01-07_17_36_52-41_color.png,640,480
2520,1489,1485,1,"[240.21, 175.01, 292.55, 158.71, 313.14, 204.1...",3621.54510,"[238, 158, 75, 75]",0,2020-01-07_17_37_12-70_color.png,640,480
2521,1490,1486,1,"[245.36, 180.16, 301.13, 181.02, 303.7, 184.45...",3989.07800,"[242, 180, 65, 66]",0,2020-01-07_17_37_15-01_color.png,640,480
2526,1504,1500,1,"[278.82, 200.75, 274.53, 165.58, 277.96, 163.8...",6188.61330,"[247, 162, 85, 151]",0,2020-01-07_17_38_26-80_color.png,640,480


Because the dataset uses the COCO format, the annotations are stored in three JSON files: `train.json`, `test.json` and `val.json`. We are going to open each file and fix its bounding box annotations.

In [49]:
def get_annotations(file):
    """
    Read a JSON annotation file and expose two of its sections as DataFrames.

    Args:
        file (str): Path to a JSON file whose top-level object contains the
            keys 'annotations' and 'images'.

    Returns:
        tuple: ``(annotations_df, images_df)`` — one pandas.DataFrame built
            from the 'annotations' entry and one from the 'images' entry.
    """
    with open(file, 'r') as json_handle:
        coco = json.load(json_handle)

    annotations_df = pd.DataFrame(coco['annotations'])
    images_df = pd.DataFrame(coco['images'])
    return annotations_df, images_df

def generate_mask(height, width, seg):
    """
    Rasterise a polygon segmentation into a binary mask.

    Args:
        height (int): Height of the mask in pixels.
        width (int): Width of the mask in pixels.
        seg (list): Either a flat list of polygon coordinates
            ``[x0, y0, x1, y1, ...]`` or a list of such flat lists (the COCO
            polygon layout, one entry per object part).

    Returns:
        numpy.ndarray: Array of shape ``(height, width)`` and dtype uint8
            holding 1 on the pixels covered by the polygon(s) and 0 elsewhere.

    Raises:
        ValueError: If a polygon has an odd number of coordinates.
    """
    mask = np.zeros((height, width), dtype=np.uint8)

    # Accept both a single flat polygon and a list of polygons.
    if len(seg) > 0 and isinstance(seg[0], (list, tuple, np.ndarray)):
        polygons = seg
    else:
        polygons = [seg]

    for polygon in polygons:
        # Coordinates are truncated to whole pixels; OpenCV drawing functions
        # expect int32 points.
        points = np.asarray(polygon, dtype=float).reshape(-1, 2).astype(np.int32)
        if len(points) == 0:
            continue
        # fillPoly handles arbitrary (also non-convex) outlines, whereas
        # fillConvexPoly is only guaranteed to be correct for convex ones.
        # Each polygon is filled separately so overlapping parts are unioned.
        cv2.fillPoly(mask, [points], 1)

    return mask

def generate_bounding_boxes(row, images_dataset):
    """
    Derive a bounding box for one annotation from its segmentation mask.

    Args:
        row: Annotation record providing the 'image_id' and 'segmentation'
            entries.
        images_dataset (pandas.DataFrame): Image records with the columns
            'id', 'height' and 'width'.

    Returns:
        list: The box as integers in ``[x, y, width, height]`` order.
    """
    # Look up the image record this annotation belongs to.
    belongs_to_row = images_dataset['id'] == row['image_id']
    image_info = images_dataset[belongs_to_row].squeeze()

    binary_mask = generate_mask(height=image_info['height'],
                                width=image_info['width'],
                                seg=row['segmentation'])

    # masks_to_boxes works on a stack of masks, so add a leading dimension.
    mask_stack = torch.as_tensor(binary_mask).unsqueeze(0)

    # Box enclosing the mask as corner coordinates, then as (x, y, w, h).
    corners = masks_to_boxes(mask_stack).squeeze()
    xywh = box_convert(corners, 'xyxy', 'xywh')

    return np.asarray(xywh).astype(int).tolist()

In [100]:
def create_new_annotation_file(old_file, new_file, annotations):
    """
    Write a copy of a JSON annotation file with its annotations replaced.

    Args:
        old_file (str): Path of the JSON file to read.
        new_file (str): Path of the JSON file to write.
        annotations (pandas.DataFrame): New annotations; each row becomes one
            record under the 'annotations' key.
    """
    with open(old_file, 'r') as source:
        contents = json.load(source)

    # Every other top-level entry of the original file is kept untouched.
    records = annotations.to_dict(orient='records')
    contents['annotations'] = records

    with open(new_file, 'w') as target:
        json.dump(contents, target)
        
def plot_img(img : Image.Image, boxes) -> None:
    """
    Show an image with a single bounding box drawn on top of it.

    Args:
        img (PIL.Image.Image): Image in PIL format.
        boxes: Four values ``x, y, w, h`` passed to the rectangle patch as
            its anchor point, width and height.
    """
    # New figure with the image as background
    _, ax = plt.subplots(1)
    plt.imshow(img)

    # Dashed, semi-transparent red outline with no fill
    x, y, w, h = boxes
    outline_style = dict(linewidth=2, edgecolor='r', facecolor='none',
                         alpha=0.7, linestyle='--')
    ax.add_patch(Rectangle((x, y), w, h, **outline_style))

    plt.show()

In [75]:
# Annotation files whose bounding boxes are regenerated from the masks.
files = ['../mju-waste-v1.0/train.json', '../mju-waste-v1.0/test.json', '../mju-waste-v1.0/val.json']

for file in files:
    # Split on the last '/' only: the directory part itself contains '/',
    # so a plain split('/') yields three pieces and the unpacking fails.
    root_path, filename = file.rsplit('/', 1)
    annotations_df, images_df = get_annotations(file)

    # Replace every 'bbox' with the box computed from the row's segmentation.
    annotations_df['bbox'] = annotations_df.apply(generate_bounding_boxes, args=(images_df, ), axis=1)

    # Write the result next to the original as 'fixed_<name>.json'.
    new_file = f'{root_path}/fixed_{filename}'
    create_new_annotation_file(file, new_file, annotations_df)