# MPII, OCHuman, CrowdPose to COCO syntax
Transformation scripts for human keypoint datasets.

In [1]:
import json
from pathlib import Path
from pprint import pprint as _pprint
def pprint(data):
    """Pretty-print ``data`` while keeping dict keys in insertion order."""
    _pprint(data, sort_dicts=False)
from configs import cfg
import os
from PIL import Image
import numpy as np
import fiftyone as fo
import webbrowser
from tqdm.notebook import tqdm
from img_utils import show_images
import shutil
from scipy.io import loadmat

# Convert OCHuman segmentation
import ochumanApi.mask as mask_util
from ochumanApi.ochuman import annToMask, Poly2Mask
from fiftyone.utils.coco import _mask_to_polygons, _close_contour
from skimage import measure

## MPII to COCO syntax
All keypoint annotations in JSON format. MPII metadata. COCO syntax. Approx. bbox around keypoints + bbox as segmentation & area. Keypoint visibility: all annotated keypoints are marked as visible, because MPII counts self-occlusion as invisible, which contradicts the COCO logic.

In [2]:
def pesky_mpii_mat_to_json(anno_mat, img_dir):
    """
    Transforms the pesky matlab .mat MPII annotation file to a COCO-style dict.

    MPII to COCO -> take all annotations with keypoints, keep extra MPII
    metadata, use COCO format for data representation.

    + adds bbox (approximate, generated around the annotated keypoints with a
    margin derived from the MPII scale) and segmentation (bbox as segmentation)
    in order to work with the standard COCO dataloader expecting those keys.

    Args:
        anno_mat: Path of the MPII ``.mat`` annotation file (read with
            ``scipy.io.loadmat(..., simplify_cells=True)``).
        img_dir: Directory containing the MPII images. Every annotated image
            is opened once to read its width and height.

    Returns:
        dict with the keys ``info``, ``categories``, ``activities``,
        ``images`` and ``annotations``.
    """
    mat = loadmat(anno_mat, simplify_cells=True)
    release = mat['RELEASE']
    annotations = release['annolist']  # len 24987
    img_train = release['img_train']  # len 24987
    activities = release['act']  # len 24987
    videos = release['video_list']  # len 2821
    single_person = release['single_person']  # len 24987

    output = {}
    output['info'] = {
        "description": "MPII Human Pose",
        "url": "http://human-pose.mpi-inf.mpg.de/",
        "version": "1.12",
        "year": 2014,
        "contributor": "Max Planck Institute for Informatics",
        "date_created": "2014/09/23",
        "modified_by": "John Hoffmann",
        "modified_note": "All keypoint annotations in JSON format. MPII metadata. COCO syntax. Approx. bbox around keypoints + bbox as segmentation & area.",
        "date_modified": "2022/10/11",
        "license":  """Simplified BSD License. Copyright (c) 2015, Max Planck Institute for Informatics. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE."""
    }

    output['categories'] = [{
        "supercategory": "person",
        "id": 1,
        "name": "person",
        "keypoints": [
            "right_ankle",
            "right_knee",
            "right_hip",
            "left_hip",
            "left_knee",
            "left_ankle",
            "pelvis",
            "thorax",
            "upper_neck",
            "head_top",
            "right_wrist",
            "right_elbow",
            "right_shoulder",
            "left_shoulder",
            "left_elbow",
            "left_wrist"
        ],
        "skeleton": [
            [1, 2],
            [2, 3],
            [3, 7],
            [4, 5],
            [4, 7],
            [5, 6],
            [7, 8],
            [8, 14],
            [8, 13],
            [9, 10],
            [11, 12],
            [12, 13],
            [14, 15],
            [15, 16]
        ]
    }]
    output['activities'] = []
    output['images'] = []
    output['annotations'] = []

    # get all annotations with keypoints
    # imgidx / ridx are 1-based positions in the matlab structures
    valid_annos = []
    for imgidx, entry in enumerate(annotations, start=1):
        annorects = entry['annorect']
        if isinstance(annorects, dict):
            annorects = [annorects]  # apply same format to all annotations
        if not isinstance(annorects, list):
            continue  # e.g. empty array -> nothing annotated for this image

        kept_rects = []
        kept_ridx = []
        for ridx, annorect in enumerate(annorects, start=1):
            if 'annopoints' not in annorect:
                continue
            annopoints = annorect['annopoints']
            if len(annopoints) == 0:  # drop empty arrays
                continue
            if isinstance(annopoints['point'], dict):  # contains only single point
                annopoints['point'] = [annopoints['point']]
            kept_rects.append(annorect)
            # Remember the position in the *unfiltered* rectangle list; it is
            # needed to match the rectangle against `single_person` below.
            kept_ridx.append(ridx)

        if kept_rects:  # not empty annotations
            temp_entry = dict(entry)
            temp_entry['annorect'] = kept_rects
            temp_entry['ridx'] = kept_ridx
            temp_entry['imgidx'] = imgidx  # add index (position in matlab dataframe)
            valid_annos.append(temp_entry)

    # get activities (act_id == -1 marks "no activity")
    valid_acts = {
        imgidx: act
        for imgidx, act in enumerate(activities, start=1)
        if act['act_id'] != -1
    }

    # add activities: one entry per distinct act_id, sorted, plus the -1 placeholder
    acts = {act['act_id']: act for act in valid_acts.values()}
    acts[-1] = {'act_id': -1, 'cat_name': None, 'act_name': None}
    output['activities'] = [
        {
            'act_id': acts[act_id]['act_id'],
            'cat_name': acts[act_id]['cat_name'],
            'act_name': acts[act_id]['act_name'],
        }
        for act_id in sorted(acts)
    ]

    # get videos (vididx -> entry of video_list, used as YouTube id below)
    valid_vids = dict(enumerate(videos, start=1))

    # get training/test assignment
    valid_train = {
        imgidx: int(flag) for imgidx, flag in enumerate(img_train, start=1)
    }

    # get single person rectangle ids (a bare int is a single id)
    valid_single = {}
    for imgidx, entry in enumerate(single_person, start=1):
        rect_ids = [entry] if isinstance(entry, int) else list(entry)
        valid_single[imgidx] = [int(rect_id) for rect_id in rect_ids]

    # add images/annotations in COCO format
    anno_id = 1
    for anno in valid_annos:
        imgidx = anno['imgidx']
        file_name = anno['image']['name']
        # Close the file right away; only the size is needed and tens of
        # thousands of images are visited.
        with Image.open(Path(img_dir, file_name)) as img:
            width, height = img.size

        if isinstance(anno['frame_sec'], int):
            frame_sec = anno['frame_sec']
            frame_str = f'&t={frame_sec}'
        else:
            frame_sec = -1
            frame_str = ''
        if isinstance(anno['vididx'], int):
            vid_link = f"https://www.youtube.com/watch?v={valid_vids[anno['vididx']]}{frame_str}"
        else:
            vid_link = -1

        output['images'].append({
            'file_name': file_name,
            'id': imgidx,
            'width': width,
            'height': height,
            'img_train': valid_train[imgidx],
            'video_url': vid_link,
            'frame_sec': frame_sec,
            # NOTE: the key is misspelled ("activitiy"); kept as is because it
            # is part of the emitted file format.
            'activitiy_id': -1 if imgidx not in valid_acts else valid_acts[imgidx]['act_id'],
            'single_person': valid_single[imgidx],
            'annotations': len(anno['annorect'])
        })

        for ridx, annorect in zip(anno['ridx'], anno['annorect']):
            # head bbox
            x1, y1, x2, y2 = annorect['x1'], annorect['y1'], annorect['x2'], annorect['y2']

            # keypoints
            points = annorect['annopoints']['point']
            kps = [[0] * 3 for _ in range(16)]  # create empty keypoints
            # MPII counts self-occlusion as invisible, other datasets don't
            # MPII does not differentiate between self-occlusion and occlusion through, e.g., objects
            # Therefore, all annotated keypoints are handled as visible
            vis = 2  # COCO: visible and labeled
            for point in points:
                kps[point['id']] = [point['x'], point['y'], vis]
            kps = [int(value) for kp in kps for value in kp]  # flatten, use integers

            ### approximate bbox
            objpos = [annorect['objpos']['x'], annorect['objpos']['y']]  # x, y

            ## Without overflow, include all keypoints, fit better
            # ignore not annotated keypoints (v=0)
            annotated = [
                (kps[i], kps[i + 1])
                for i in range(0, len(kps), 3)
                if kps[i + 2] > 0
            ]
            xs = [x for x, _ in annotated]
            ys = [y for _, y in annotated]
            # give some margins
            margin = annorect['scale'] * 10.  # px, all directions
            box_x = int(min(xs) - margin)
            box_y = int(min(ys) - margin)
            box_w = int(max(xs) - min(xs) + 2 * margin)
            box_h = int(max(ys) - min(ys) + 2 * margin)
            # set to image edge if out of bounds; the far edges are computed
            # from the unclamped origin so that clamping x/y to 0 also
            # shrinks the box instead of shifting it
            box_x_end = min(box_x + box_w, width)
            box_y_end = min(box_y + box_h, height)
            box_x = max(box_x, 0)
            box_y = max(box_y, 0)
            bbox = [box_x, box_y, box_x_end - box_x, box_y_end - box_y]

            # approximate bbox as segmentation
            segmentation = [bbox[0]+bbox[2], bbox[1], bbox[0]+bbox[2], bbox[1]+bbox[3], bbox[0], bbox[1]+bbox[3], bbox[0], bbox[1]]
            segmentation = [float(val) for val in segmentation]

            output['annotations'].append({
                'id': anno_id,  # unique id for annotation
                'image_id': imgidx,
                'category_id': 1,  # person
                'iscrowd': False,  # COCO meta for crowds, always false for this dataset
                'keypoints': kps,
                'num_keypoints': len(points),  # annotated points
                'bbox':  bbox,  # approximate around keypoints
                'segmentation': [segmentation],  # bounding box as segmentation
                'area': bbox[2] * bbox[3],
                'isbbox': True,  # segmentation = bbox
                'bbox_head': [x1, y1, x2-x1, y2-y1],  # x, y, width, height
                'scale': annorect['scale'],
                'objpos': objpos,
                # 1 if sufficiently separated individual.
                # NOTE(review): single_person ids are taken to index the
                # unfiltered annorect list (MPII `ridx`) - confirm against
                # the MPII annotation description.
                'single_person': 1 if ridx in valid_single[imgidx] else 0
            })
            anno_id += 1

    output['info']['annotated_images'] = len(output['images'])
    output['info']['keypoint_annotations'] = len(output['annotations'])

    return output

# Convert the MPII release and write the COCO-style result next to the .mat file.
mpii_json = pesky_mpii_mat_to_json(
    '/Users/john/datasets/mpii/mpii_human_pose_v1_u12_2/mpii_human_pose_v1_u12_1.mat',
    '/Users/john/datasets/mpii/images',
)

with Path(
    '/Users/john/datasets/mpii/mpii_human_pose_v1_u12_2',
    'mpii_coco.json',
).open('w', encoding='utf-8') as f:
    json.dump(mpii_json, f, ensure_ascii=False)

## OCHuman to COCO syntax
All keypoint annotations in JSON format. OCHuman metadata. COCO syntax. Segmentation polygons from the OCHuman masks where available, otherwise the bbox as segmentation; bbox area as area. Keypoint visibility logic changed to COCO logic (self-occlusion = visible).

In [7]:
def ochuman_to_coco_format(anno):
    """Convert OCHuman annotations to COCO syntax.

    Only person annotations that carry keypoints are emitted. The bbox is
    converted from xyxy to xywh, the bbox area is used as ``area``, and the
    segmentation is taken from the OCHuman mask where one exists, otherwise
    the bbox is used as segmentation (``isbbox`` tells which one applies).

    Args:
        anno: Path of the OCHuman JSON annotation file.

    Returns:
        dict with the keys ``info``, ``categories``, ``images`` and
        ``annotations``.

    Raises:
        ValueError: If an annotation does not hold 19 keypoint triples.
    """
    with open(anno, encoding='utf-8') as f:
        old = json.load(f)

    output = {}
    output['info'] = {
        "description": "OCHuman",
        "url": "https://github.com/liruilong940607/OCHumanApi",
        "version": "1.0",
        "year": 2019,
        "contributor": "OCHuman Team",
        "date_created": "2019/06/13",
        "modified_by": "John Hoffmann",
        "modified_note": "All keypoint annotations in JSON format. OCHuman metadata. COCO syntax. Bbox as segmentation & area.",
        "date_modified": "2022/09/17",
        "license":  """MIT License. Copyright (c) 2018 Roy Tseng. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE."""
    }

    output['categories'] = [{
        "supercategory": "person",
        "id": 1,
        "name": "person",
        "keypoints": [
            "right_shoulder",
            "right_elbow",
            "right_wrist",
            "left_shoulder",
            "left_elbow",
            "left_wrist",
            "right_hip",
            "right_knee",
            "right_ankle",
            "left_hip",
            "left_knee",
            "left_ankle",
            "head",
            "neck",
            "right_ear",
            "left_ear",
            "nose",
            "right_eye",
            "left_eye"
        ],
        "skeleton": [
            [1, 15],
            [1, 4],
            [1, 2],
            [1, 7],
            [2, 3],
            [4, 16],
            [4, 5],
            [4, 10],
            [5, 6],
            [7, 10],
            [7, 8],
            [8, 9],
            [10, 11],
            [11, 12],
            [13, 14],
            [15, 18],
            [16, 19],
            [17, 19],
            [17, 18]
        ]
    }]
    output['images'] = []
    output['annotations'] = []

    # OCHuman visibility flag -> COCO visibility flag; other values (0) are kept.
    # NOTE(review): presumably 1 = visible, 2 = self-occluded, 3 = occluded by
    # others in OCHuman - confirm against the OCHuman API.
    vis_to_coco = {1: 2, 2: 2, 3: 1}
    expected_len = 19 * 3

    anno_id = 0
    for img in tqdm(old['images']):
        image_id = int(img['image_id'])
        output['images'].append({
            'file_name': img['file_name'],
            'id': image_id,
            'width': img['width'],
            'height': img['height']
        })
        for anno in img['annotations']:
            # ids are consumed by every person, also those without keypoints
            anno_id += 1
            if anno['keypoints'] is None:
                continue

            # adjust keypoints
            kps = [int(kp) for kp in anno['keypoints']]
            if len(kps) != expected_len:
                raise ValueError(
                    f"annotation {anno_id} of image {image_id} has "
                    f"{len(kps)} keypoint values, expected {expected_len}"
                )
            # To COCO vis definition. Only every third value (index 2, 5, ...)
            # is a visibility flag; x and y coordinates must not be touched.
            for i in range(2, len(kps), 3):
                kps[i] = vis_to_coco.get(kps[i], kps[i])
            num_keypoints = sum(1 for vis in kps[2::3] if vis > 0)  # count only annotated

            # adjust bbox xyxy -> xywh to COCO format
            x1, y1, x2, y2 = anno['bbox']
            bbox = [x1, y1, x2-x1, y2-y1]
            area = (x2-x1)*(y2-y1)

            # Add segmentation with help of APIs
            if anno['segms'] is not None:
                # Polygon segmentation derived from the OCHuman mask
                # (a compressed RLE via mask_util.encode would be the alternative)
                mask = Poly2Mask(anno['segms'])
                segmentation = _mask_to_polygons(mask, 1)
                isbbox = False
            else:
                # Add bounding box as segmentation for the remaining
                segmentation = [bbox[0]+bbox[2], bbox[1], bbox[0]+bbox[2], bbox[1]+bbox[3], bbox[0], bbox[1]+bbox[3], bbox[0], bbox[1]]
                segmentation = [[float(val) for val in segmentation]]
                isbbox = True

            # add annotation if it contains keypoints
            output['annotations'].append({
                'id': anno_id,
                'image_id': image_id,
                'category_id': 1,  # person
                'iscrowd': False,  # no crowd annotations in ochuman
                'keypoints': kps,
                'num_keypoints': num_keypoints,
                'bbox': bbox,
                'segmentation': segmentation,
                'area': area,
                'isbbox': isbbox,
                'max_iou': anno['max_iou'],
            })

    output['info']['annotated_images'] = len(output['images'])
    output['info']['keypoint_annotations'] = len(output['annotations'])

    return output
    

# Convert OCHuman and store the COCO-style file beside the original annotations.
ochuman_json = ochuman_to_coco_format(
    '/Users/john/datasets/ochuman/ochuman.json'
)

with Path(
    '/Users/john/datasets/ochuman/',
    'ochuman_coco.json',
).open('w', encoding='utf-8') as f:
    json.dump(ochuman_json, f, ensure_ascii=False)

  0%|          | 0/5081 [00:00<?, ?it/s]

## CrowdPose to COCO syntax
All keypoint annotations in JSON format. CrowdPose metadata. COCO syntax. Bbox as segmentation & area. Keypoint visibility follows COCO logic; all annotated head and neck points are set to visible (they are annotated as not visible in CrowdPose for some reason).

In [19]:
def crowdpose_to_coco_format(anno):
    """Translate a CrowdPose annotation file into a COCO-style dictionary.

    Image records are copied with their ``crowdIndex``. A person annotation is
    kept only when it has at least one labelled keypoint and is not flagged as
    crowd. Labelled head and neck keypoints are forced to "visible" (2), and
    the bounding box doubles as segmentation polygon and as area.

    Args:
        anno: Path of the CrowdPose JSON annotation file.

    Returns:
        dict with the keys ``info``, ``categories``, ``images`` and
        ``annotations``.
    """
    with open(anno) as f:
        source = json.load(f)

    keypoint_names = [
        "left_shoulder", "right_shoulder",
        "left_elbow", "right_elbow",
        "left_wrist", "right_wrist",
        "left_hip", "right_hip",
        "left_knee", "right_knee",
        "left_ankle", "right_ankle",
        "head", "neck",
    ]
    limbs = [
        [1, 14], [1, 3], [2, 14], [2, 4], [3, 5], [4, 6], [7, 14],
        [7, 9], [8, 14], [8, 10], [9, 11], [10, 12], [13, 14],
    ]
    # positions of the visibility flags of keypoint 13 (head) and 14 (neck)
    head_neck_flags = (13 * 3 - 1, 14 * 3 - 1)

    images = [
        {
            'file_name': record['file_name'],
            'id': record['id'],
            'width': record['width'],
            'height': record['height'],
            'crowdIndex': record['crowdIndex'],
        }
        for record in source['images']
    ]

    annotations = []
    for person in source['annotations']:
        # keypoints: labelled head / neck become visible
        keypoints = person['keypoints']
        for flag_pos in head_neck_flags:
            if keypoints[flag_pos] > 0:
                keypoints[flag_pos] = 2
        labelled = sum(1 for flag in keypoints[2::3] if flag > 0)

        # bbox (x, y, w, h) reused as a four-corner polygon and as area
        box = person['bbox']
        right = box[0] + box[2]
        bottom = box[1] + box[3]
        corners = [right, box[1], right, bottom, box[0], bottom, box[0], box[1]]
        polygon = [[round(coord, 2) for coord in corners]]
        box_area = box[2] * box[3]

        # skip persons without keypoints and crowd regions
        # (iscrowd = True: more than one person, typically wrongly annotated)
        if labelled <= 0 or person['iscrowd']:
            continue

        annotations.append({
            'id': person['id'],
            'image_id': person['image_id'],
            'category_id': person['category_id'],
            'iscrowd': person['iscrowd'],
            'keypoints': keypoints,
            'num_keypoints': labelled,
            'bbox': person['bbox'],
            'segmentation': polygon,
            'area': box_area,
            'isbbox': True,
        })

    return {
        'info': {
            "description": "CrowdPose",
            "url": "https://github.com/Jeff-sjtu/CrowdPose",
            "version": "1.0",
            "year": 2020,
            "contributor": "CrowdPose & MMPose team",
            "date_created": "2020/12/26",
            "modified_by": "John Hoffmann",
            "modified_note": "All keypoint annotations in JSON format. CrowdPose metadata. COCO syntax. Drop iscrowd=1 (wrongly annotated). Bbox as segmentation & area.",
            "date_modified": "2022/09/17",
            'annotated_images': len(images),
            'keypoint_annotations': len(annotations),
        },
        'categories': [{
            "supercategory": "person",
            "id": 1,
            "name": "person",
            "keypoints": keypoint_names,
            "skeleton": limbs,
        }],
        'images': images,
        'annotations': annotations,
    }
    

# Convert the CrowdPose test split and store it beside the original JSON.
crowdpose_json = crowdpose_to_coco_format(
    '/Users/john/datasets/crowd_pose/json/crowdpose_test.json'
)

with Path(
    '/Users/john/datasets/crowd_pose/json/',
    'crowdpose_test_coco.json',
).open('w', encoding='utf-8') as f:
    json.dump(crowdpose_json, f, ensure_ascii=False)

## Check dataset in fiftyone

In [3]:
%%capture
# Check the converted data in fiftyone: launch the app on a fixed local port
# and open that address in the default web browser.
port = 5151
session = fo.launch_app(port=port)
webbrowser.open(f'http://localhost:{port}/')

In [4]:
# Load at most 2000 samples of the converted MPII data with boxes,
# segmentations and keypoints.
dataset = fo.Dataset.from_dir(
    dataset_type=fo.types.COCODetectionDataset,
    label_types=["detections", "segmentations", "keypoints"],
    data_path='/Users/john/datasets/mpii/images',
    labels_path='/Users/john/datasets/mpii/mpii_human_pose_v1_u12_2/mpii_coco.json',
    max_samples=2000,
)



 100% |███████████████| 2000/2000 [28.1s elapsed, 0s remaining, 60.6 samples/s]      


In [5]:
%%capture
# Show the complete loaded dataset in the running app session.
session.view = dataset.view()